## Setup 1 of N: environment and workspace

Goal
- Install core dependencies for scraping, embeddings, and Chroma
- Create a clean workspace in `/content/anti_echo`
- Print basic environment info so collaborators can debug quickly

Notes
- No Drive mount
- Keep installs minimal and pinned where sensible


In [1]:
# Setup 1 of N: environment and workspace
# Colab safe. No Drive mount.

import os
import sys
import subprocess
import textwrap
from pathlib import Path

def pip_install(pkgs):
    """Install the given requirement specifiers with pip, quietly.

    pip is invoked through the running interpreter (``sys.executable -m pip``)
    with ``-q``. ``pkgs`` is a list of requirement strings; the list is echoed
    before the install starts. ``subprocess.check_call`` raises
    ``CalledProcessError`` when pip exits with a non-zero status.
    """
    base_command = [sys.executable, "-m", "pip", "install", "-q"]
    full_command = base_command + pkgs
    print("Installing:", " ".join(pkgs))
    subprocess.check_call(full_command)

# Core deps
# All requirements go through a single pip_install call. feedparser is pinned
# to an exact version; every other requirement is a bounded range.
pip_install([
    "feedparser==6.0.10",
    "trafilatura>=1.6.2,<2.0",
    "sentence-transformers>=2.6.1,<3.0",
    "chromadb>=0.5.5,<0.6.0",
    "huggingface_hub>=0.24.0,<0.28.0",
    "pyyaml>=6.0.1,<7.0",
    "numpy>=1.26.4,<3.0",
    "tqdm>=4.66.0,<5.0",
    "requests>=2.31.0,<3.0",
    "rapidfuzz>=3.6.0,<4.0"
])

# Optional but helpful
# torch is installed only when importing it fails for any reason; an already
# importable torch is left as it is.
try:
    import torch
except Exception:
    pip_install(["torch>=2.2.0,<3.0"])

# Workspace layout
# Everything for this session lives under PROJECT_ROOT; each name in SUBDIRS
# becomes a directory directly beneath it.
PROJECT_ROOT = Path("/content/anti_echo").resolve()
SUBDIRS = [
    "raw",
    "batches",
    "chroma_db",
    "logs",
    "feeds",
    "tmp"
]

# parents=True also creates PROJECT_ROOT itself; exist_ok=True makes the cell
# safe to re-run.
for d in SUBDIRS:
    (PROJECT_ROOT / d).mkdir(parents=True, exist_ok=True)

# Environment tweaks
# NOTE(review): presumably silences the Hugging Face tokenizers parallelism
# warning in forked workers - confirm against the tokenizers docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Diagnostics
import platform, json
from importlib.metadata import version, PackageNotFoundError

def v(name):
    """Return the installed version of distribution ``name``.

    Falls back to the string ``"not-installed"`` when the distribution
    metadata cannot be found.
    """
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not-installed"
    return installed

# Diagnostics payload: interpreter, platform, package versions and workspace
# paths. "cuda_available" starts as False and is only flipped by the probe
# further down.
info = {
    "python": sys.version.split()[0],
    "platform": platform.platform(),
    "cuda_available": False,
    "packages": {
        # (label shown in the report, distribution name passed to v)
        label: v(dist_name)
        for label, dist_name in (
            ("feedparser", "feedparser"),
            ("trafilatura", "trafilatura"),
            ("sentence-transformers", "sentence-transformers"),
            ("chromadb", "chromadb"),
            ("huggingface_hub", "huggingface-hub"),
            ("pyyaml", "PyYAML"),
            ("numpy", "numpy"),
            ("rapidfuzz", "rapidfuzz"),
            ("torch", "torch"),
            ("tqdm", "tqdm"),
            ("requests", "requests"),
        )
    },
    "paths": {
        "project_root": str(PROJECT_ROOT),
        **{
            name: str(PROJECT_ROOT / name)
            for name in ("raw", "batches", "chroma_db", "logs", "feeds", "tmp")
        },
    },
}

# Best-effort CUDA probe: any failure (torch missing, driver trouble) leaves
# the report as it stands rather than aborting the cell.
try:
    import torch
    if torch.cuda.is_available():
        info["cuda_available"] = True
        info["cuda_device_name"] = torch.cuda.get_device_name(0)
except Exception:
    pass

print(json.dumps(info, indent=2))

# Place a small README in the workspace for orientation
# The file is written only when it does not exist yet, so an existing README
# is never overwritten on re-run.
workspace_readme = PROJECT_ROOT / "README_WORKSPACE.txt"
if not workspace_readme.exists():
    # dedent + strip removes the literal's indentation and surrounding blank
    # lines; a single trailing newline is then appended.
    workspace_readme.write_text(textwrap.dedent("""
        anti echo chamber - Colab workspace
        This directory is ephemeral per session.
        Do not commit files from here directly.
        Subdirs:
          raw        - local scraped texts and meta for this session
          batches    - locally packaged batches before HF upload
          chroma_db  - local Chroma rebuild target
          logs       - run logs
          feeds      - runtime feed artifacts
          tmp        - scratch space
    """).strip() + "\n", encoding="utf-8")
print(f"Workspace ready at {PROJECT_ROOT}")


Installing: feedparser==6.0.10 trafilatura>=1.6.2,<2.0 sentence-transformers>=2.6.1,<3.0 chromadb>=0.5.5,<0.6.0 huggingface_hub>=0.24.0,<0.28.0 pyyaml>=6.0.1,<7.0 numpy>=1.26.4,<3.0 tqdm>=4.66.0,<5.0 requests>=2.31.0,<3.0 rapidfuzz>=3.6.0,<4.0
{
  "python": "3.12.11",
  "platform": "Linux-6.6.97+-x86_64-with-glibc2.35",
  "cuda_available": true,
  "packages": {
    "feedparser": "6.0.10",
    "trafilatura": "1.8.1",
    "sentence-transformers": "2.7.0",
    "chromadb": "0.5.23",
    "huggingface_hub": "0.27.1",
    "pyyaml": "6.0.3",
    "numpy": "1.26.4",
    "rapidfuzz": "3.14.1",
    "torch": "2.8.0+cu126",
    "tqdm": "4.67.1",
    "requests": "2.32.4"
  },
  "paths": {
    "project_root": "/content/anti_echo",
    "raw": "/content/anti_echo/raw",
    "batches": "/content/anti_echo/batches",
    "chroma_db": "/content/anti_echo/chroma_db",
    "logs": "/content/anti_echo/logs",
    "feeds": "/content/anti_echo/feeds",
    "tmp": "/content/anti_echo/tmp"
  },
  "cuda_device_name": "

## Setup 2 of N: config and paths bootstrap (robust fetch)

Goal
- Load shared config from GitHub with fallback paths
- Cache config locally for this session
- Initialize runtime paths and print key settings

Note
- Tries multiple candidate filenames for `stance_axes` and `topic_labels` (`.json`, `.yaml`, `.yml`, or no extension) and uses the first one that returns content


In [5]:
# Setup 2 of N: config and paths bootstrap (robust fetch)

import os
import json
import yaml
import requests
from pathlib import Path

# Workspace root and the per-session directory where fetched config files are
# cached. The cache directory is created right away (idempotent).
PROJECT_ROOT = Path("/content/anti_echo").resolve()
CONFIG_CACHE = PROJECT_ROOT / "config_cache"
CONFIG_CACHE.mkdir(parents=True, exist_ok=True)

# GitHub coordinates that raw_url() combines into raw.githubusercontent.com URLs.
REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

def raw_url(path: str) -> str:
    """Build the raw.githubusercontent.com URL for a repo-relative ``path``.

    Leading slashes on ``path`` are removed so exactly one separator follows
    the branch name.
    """
    relative = path.lstrip("/")
    base = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}"
    return f"{base}/{relative}"

def fetch_text_first(paths):
    """Return the first candidate repo path that downloads with content.

    Each entry of ``paths`` is converted to a raw GitHub URL with ``raw_url``
    and fetched in order (20 second timeout per request). The first response
    with HTTP 200 and a non-blank body wins.

    Returns:
        tuple: ``(text, path, url)`` for the winning candidate.

    Raises:
        RuntimeError: if no candidate succeeds. The message lists every URL
            that was tried together with the reason it was rejected (HTTP
            status, empty body, or the exception raised), followed by the
            last exception seen, if any.
    """
    last_err = None
    attempts = []
    for p in paths:
        url = raw_url(p)
        try:
            r = requests.get(url, timeout=20)
        except Exception as e:
            # Best-effort fallback chain: a failed request just moves on to
            # the next candidate, but the reason is kept for the final report.
            last_err = e
            attempts.append(f"{url} -> {type(e).__name__}: {e}")
            continue
        if r.status_code != 200:
            attempts.append(f"{url} -> HTTP {r.status_code}")
        elif not r.text.strip():
            attempts.append(f"{url} -> empty body")
        else:
            return r.text, p, url
    msg = "Could not fetch any of the candidate paths.\nTried:\n" + "\n".join(attempts)
    if last_err is not None:
        msg += f"\nLast error: {type(last_err).__name__}: {last_err}"
    raise RuntimeError(msg)

# Candidate repo paths
# Each list is tried in order by fetch_text_first; the first path that
# downloads with non-blank content is used.
CFG_CANDIDATES = [
    "config/config.yaml",
    "config/config.yml",
    "config/config.json",
]
STANCE_CANDIDATES = [
    "config/stance_axes.json",
    "config/stance_axes.yaml",
    "config/stance_axes.yml",
    "config/stance_axes",
]
TOPIC_CANDIDATES = [
    "config/topic_labels.json",
    "config/topic_labels.yaml",
    "config/topic_labels.yml",
    "config/topic_labels",
]

# Fetch config files
# Each call returns (text, winning repo path, full URL) or raises RuntimeError.
cfg_txt, cfg_path, cfg_url = fetch_text_first(CFG_CANDIDATES)
stance_txt, stance_path, stance_url = fetch_text_first(STANCE_CANDIDATES)
topic_txt, topic_path, topic_url = fetch_text_first(TOPIC_CANDIDATES)

# Cache copies
# Files are stored under their repo filename. When the winning stance/topic
# path has no extension, the cached copy is named "<name>.json" regardless of
# the actual format of the content.
(CONFIG_CACHE / Path(cfg_path).name).write_text(cfg_txt, encoding="utf-8")
(CONFIG_CACHE / (Path(stance_path).name if Path(stance_path).suffix else "stance_axes.json")).write_text(stance_txt, encoding="utf-8")
(CONFIG_CACHE / (Path(topic_path).name if Path(topic_path).suffix else "topic_labels.json")).write_text(topic_txt, encoding="utf-8")

# Parse helpers
def parse_maybe_json_or_yaml(txt: str):
    """Parse ``txt`` as JSON, falling back to YAML.

    Surrounding whitespace is stripped first. JSON is attempted before YAML;
    the YAML fallback uses ``yaml.safe_load``.

    Returns:
        The parsed Python object (whatever the document decodes to).

    Raises:
        ValueError: if the text parses as neither JSON nor YAML. The
            underlying failure is chained as ``__cause__`` so the original
            traceback stays visible.
    """
    txt = txt.strip()
    # try json first; a decode failure is expected for YAML input
    try:
        return json.loads(txt)
    except ValueError:
        pass
    # then yaml
    try:
        return yaml.safe_load(txt)
    except Exception as e:
        raise ValueError(f"Failed to parse as JSON or YAML: {e}") from e

# Parse into Python objects
# Only a config path ending in .json is read as JSON; .yaml, .yml and any
# other name are read as YAML.
if cfg_path.endswith(".json"):
    CONFIG = json.loads(cfg_txt)
else:
    CONFIG = yaml.safe_load(cfg_txt)

# Stance axes and topic labels may be stored in either format, so the parser
# tries both.
STANCE_AXES = parse_maybe_json_or_yaml(stance_txt)
TOPIC_LABELS = parse_maybe_json_or_yaml(topic_txt)

# Validate minimum keys
# An empty or scalar YAML document parses to None/str/int rather than a
# mapping. Reject it up front so the failure names the real problem instead of
# surfacing as a TypeError from the membership test below.
if not isinstance(CONFIG, dict):
    raise ValueError(
        f"Config must be a mapping at the top level, got {type(CONFIG).__name__}"
    )
required_cfg_keys = ["hf_dataset_id", "chroma_collections", "embeddings", "batch", "ids", "chroma"]
missing = [k for k in required_cfg_keys if k not in CONFIG]
if missing:
    raise ValueError(f"Missing required config keys: {missing}")

# Create runtime subdirs
# A bare `logging:` key in YAML parses to None, so fall back to an empty
# mapping before reading save_dir. The value is computed once and reused for
# both the directory creation and the summary below.
_logs_dir = (CONFIG.get("logging") or {}).get("save_dir", "logs")
for path in (
    "raw",
    CONFIG["batch"]["base_dir"],
    CONFIG["chroma"]["dir"],
    _logs_dir,
    "tmp",
):
    (PROJECT_ROOT / path).mkdir(parents=True, exist_ok=True)

# Print a concise summary
# Key order is kept stable so the printed JSON is easy to compare across runs.
summary = {
    "hf_dataset_id": CONFIG["hf_dataset_id"],
    "collections": CONFIG["chroma_collections"],
    "embeddings": {
        k: CONFIG["embeddings"][k]
        for k in ("topic_model", "stance_model", "dim", "dtype", "pooling", "chunk_tokens")
    },
    "summarizer": CONFIG.get("summarizer", {}),
    "batch_files": {
        k: CONFIG["batch"][k]
        for k in ("topic_file", "stance_file", "metadata_file", "manifest_name", "base_dir")
    },
    "id_policy": CONFIG["ids"],
    "paths": {
        "project_root": str(PROJECT_ROOT),
        "config_cache": str(CONFIG_CACHE),
        "raw": str(PROJECT_ROOT / "raw"),
        "batches": str(PROJECT_ROOT / CONFIG["batch"]["base_dir"]),
        "chroma_db": str(PROJECT_ROOT / CONFIG["chroma"]["dir"]),
        "logs": str(PROJECT_ROOT / _logs_dir),
        "tmp": str(PROJECT_ROOT / "tmp")
    },
    "loaded": {
        # Only sized containers get a count; anything else is reported as "unknown".
        "stance_axes_count": len(STANCE_AXES) if isinstance(STANCE_AXES, (list, dict)) else "unknown",
        "topic_labels_count": len(TOPIC_LABELS) if isinstance(TOPIC_LABELS, (list, dict)) else "unknown"
    },
    "source_urls": {
        "config": cfg_url,
        "stance_axes": stance_url,
        "topic_labels": topic_url
    }
}

print(json.dumps(summary, indent=2))

# Make HF dataset id available to later cells
os.environ["HF_DATASET_ID"] = CONFIG["hf_dataset_id"]


{
  "hf_dataset_id": "zanimal/anti-echo-artifacts",
  "collections": {
    "topic": "news_topic",
    "stance": "news_stance"
  },
  "embeddings": {
    "topic_model": "sentence-transformers/all-MiniLM-L6-v2",
    "stance_model": "sentence-transformers/all-MiniLM-L6-v2",
    "dim": 384,
    "dtype": "float16",
    "pooling": "mean",
    "chunk_tokens": 512
  },
  "summarizer": {
    "model": "facebook/bart-large-cnn",
    "target_sentences": 5,
    "truncation": 2048
  },
  "batch_files": {
    "topic_file": "embeddings_topic.npz",
    "stance_file": "embeddings_stance.npz",
    "metadata_file": "metadata.jsonl",
    "manifest_name": "manifest.json",
    "base_dir": "batches"
  },
  "id_policy": {
    "scheme": "domain-slug-sha12",
    "hash": "sha256",
    "normalize_whitespace": true,
    "lowercase": true
  },
  "paths": {
    "project_root": "/content/anti_echo",
    "config_cache": "/content/anti_echo/config_cache",
    "raw": "/content/anti_echo/raw",
    "batches": "/content/ant