## Setup 1 of N: environment and workspace

Goal
- Install core dependencies for scraping, embeddings, and Chroma
- Create a clean workspace in `/content/anti_echo`
- Print basic environment info so collaborators can debug quickly

Notes
- Google Drive is not mounted; everything lives under `/content/anti_echo` for this session only
- Keep installs minimal and pinned where sensible


In [1]:
# Setup 1 of N: environment and workspace
# Colab safe. No Drive mount.

import os
import sys
import subprocess
import textwrap
from pathlib import Path

def pip_install(pkgs):
    """Install packages into the running interpreter's environment via pip.

    Args:
        pkgs: A single requirement specifier string, or any iterable of
            them (for example ``["requests>=2.31.0,<3.0"]``).

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # A bare string would otherwise be iterated character by character.
    pkgs = [pkgs] if isinstance(pkgs, str) else list(pkgs)
    if not pkgs:
        # pip fails when given no requirements, so an empty request is a no-op.
        return
    # sys.executable makes pip target the interpreter running this code.
    cmd = [sys.executable, "-m", "pip", "install", "-q", *pkgs]
    print("Installing:", " ".join(pkgs))
    subprocess.check_call(cmd)

# Core deps
# One requirement specifier per line. The block is split on whitespace, so a
# specifier must not contain spaces.
pip_install("""
    feedparser==6.0.10
    trafilatura>=1.6.2,<2.0
    sentence-transformers>=2.6.1,<3.0
    chromadb>=0.5.5,<0.6.0
    huggingface_hub>=0.24.0,<0.28.0
    pyyaml>=6.0.1,<7.0
    numpy>=1.26.4,<3.0
    tqdm>=4.66.0,<5.0
    requests>=2.31.0,<3.0
    rapidfuzz>=3.6.0,<4.0
""".split())

# Optional but helpful
# torch is installed only when importing it fails for any reason; an
# importable torch is left as it is.
try:
    import torch  # noqa: F401  (imported only to test availability)
except Exception:
    pip_install(["torch>=2.2.0,<3.0"])

# Workspace layout
# All session artifacts live under a single root directory.
PROJECT_ROOT = Path("/content/anti_echo").resolve()
SUBDIRS = ["raw", "batches", "chroma_db", "logs", "feeds", "tmp"]

# Creates the root and each subdirectory; exist_ok keeps re-runs harmless.
for d in SUBDIRS:
    os.makedirs(PROJECT_ROOT / d, exist_ok=True)

# Environment tweaks
# Set for this process before later cells import tokenizer-based libraries.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Diagnostics
import platform, json
from importlib.metadata import version, PackageNotFoundError

def v(name):
    """Return the installed version of distribution *name*.

    Returns the string "not-installed" when no distribution with that name
    is found, so the result can always be placed in the diagnostics report.
    """
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not-installed"
    return installed

# Snapshot of the runtime (interpreter, platform, package versions, workspace
# paths, CUDA status), printed as JSON so it can be pasted into bug reports.
info = {
    "python": sys.version.split()[0],
    "platform": platform.platform(),
    # Default value; overwritten below once torch has been probed.
    "cuda_available": False,
    "packages": {
        # Report label -> version looked up by distribution name.
        "feedparser": v("feedparser"),
        "trafilatura": v("trafilatura"),
        "sentence-transformers": v("sentence-transformers"),
        "chromadb": v("chromadb"),
        "huggingface_hub": v("huggingface-hub"),
        "pyyaml": v("PyYAML"),
        "numpy": v("numpy"),
        "rapidfuzz": v("rapidfuzz"),
        "torch": v("torch"),
        "tqdm": v("tqdm"),
        "requests": v("requests"),
    },
    "paths": {
        "project_root": str(PROJECT_ROOT),
        # Derived from SUBDIRS so the report cannot drift from the
        # directories that were actually created.
        **{name: str(PROJECT_ROOT / name) for name in SUBDIRS},
    },
}

try:
    import torch
    info["cuda_available"] = bool(torch.cuda.is_available())
    if info["cuda_available"]:
        info["cuda_device_name"] = torch.cuda.get_device_name(0)
except Exception as exc:
    # Best effort: probing must never fail the setup cell, but the reason is
    # recorded so a "cuda_available": false result can be diagnosed.
    info["cuda_probe_error"] = f"{type(exc).__name__}: {exc}"

print(json.dumps(info, indent=2))

# Place a small README in the workspace for orientation
workspace_readme = PROJECT_ROOT / "README_WORKSPACE.txt"
# Written only when absent, so an existing README is never overwritten.
if not workspace_readme.exists():
    readme_text = textwrap.dedent("""
        anti echo chamber - Colab workspace
        This directory is ephemeral per session.
        Do not commit files from here directly.
        Subdirs:
          raw        - local scraped texts and meta for this session
          batches    - locally packaged batches before HF upload
          chroma_db  - local Chroma rebuild target
          logs       - run logs
          feeds      - runtime feed artifacts
          tmp        - scratch space
    """)
    # Trim the blank first/last lines of the literal and end with one newline.
    workspace_readme.write_text(readme_text.strip() + "\n", encoding="utf-8")
print(f"Workspace ready at {PROJECT_ROOT}")


Installing: feedparser==6.0.10 trafilatura>=1.6.2,<2.0 sentence-transformers>=2.6.1,<3.0 chromadb>=0.5.5,<0.6.0 huggingface_hub>=0.24.0,<0.28.0 pyyaml>=6.0.1,<7.0 numpy>=1.26.4,<3.0 tqdm>=4.66.0,<5.0 requests>=2.31.0,<3.0 rapidfuzz>=3.6.0,<4.0
{
  "python": "3.12.11",
  "platform": "Linux-6.6.97+-x86_64-with-glibc2.35",
  "cuda_available": true,
  "packages": {
    "feedparser": "6.0.10",
    "trafilatura": "1.8.1",
    "sentence-transformers": "2.7.0",
    "chromadb": "0.5.23",
    "huggingface_hub": "0.27.1",
    "pyyaml": "6.0.3",
    "numpy": "1.26.4",
    "rapidfuzz": "3.14.1",
    "torch": "2.8.0+cu126",
    "tqdm": "4.67.1",
    "requests": "2.32.4"
  },
  "paths": {
    "project_root": "/content/anti_echo",
    "raw": "/content/anti_echo/raw",
    "batches": "/content/anti_echo/batches",
    "chroma_db": "/content/anti_echo/chroma_db",
    "logs": "/content/anti_echo/logs",
    "feeds": "/content/anti_echo/feeds",
    "tmp": "/content/anti_echo/tmp"
  },
  "cuda_device_name": "