Setup 1 of N — environment and workspace

Purpose
Install core dependencies, set up a clean workspace at /content/anti_echo, and print diagnostics so collaborators can debug quickly.

Why this matters
Pinned installs reduce breakage. A consistent folder layout keeps artifacts predictable. Diagnostics help when the runtime changes.

Outputs
Directories created under /content/anti_echo, environment flags set, package versions printed.

In [None]:
# Setup 1 of N: environment and workspace
# Colab safe. No Drive mount. Heavy comments for clarity.

import os
import sys
import subprocess
import textwrap
from pathlib import Path

def pip_install(pkgs):
    """Install the requirement specifiers in ``pkgs`` with the running interpreter's pip.

    The specifiers are echoed first so the notebook output records what was
    requested. pip runs quietly; ``subprocess.check_call`` raises
    ``CalledProcessError`` when pip exits with a non-zero status.
    """
    pip_prefix = [sys.executable, "-m", "pip", "install", "-q"]
    full_cmd = pip_prefix + pkgs
    print("Installing:", " ".join(pkgs))
    subprocess.check_call(full_cmd)

# Core dependencies with sane pins or upper bounds.
# One requirement specifier per entry; the trailing comment names its role.
CORE_REQUIREMENTS = [
    "feedparser==6.0.10",                    # RSS parsing
    "trafilatura>=1.6.2,<2.0",               # robust article extraction
    "sentence-transformers>=2.6.1,<3.0",     # embeddings
    "chromadb>=0.5.5,<0.6.0",                # local vector store
    "huggingface_hub>=0.24.0,<0.28.0",       # HF dataset and file ops
    "pyyaml>=6.0.1,<7.0",                    # config parsing
    "numpy>=1.26.4,<3.0",                    # arrays
    "tqdm>=4.66.0,<5.0",                     # progress
    "requests>=2.31.0,<3.0",                 # HTTP
    "rapidfuzz>=3.6.0,<4.0",                 # dedupe or fuzzy utils
    "scikit-learn>=1.4.0,<2.0",              # clustering for topics
    "transformers>=4.43.0,<5.0",             # summarization model
    "nltk>=3.8.1,<4.0"                       # sentence splitting
]
pip_install(CORE_REQUIREMENTS)

# Optional accelerator: torch is only installed when importing it fails,
# then imported again so the name is available to the rest of the cell.
try:
    import torch
except Exception:
    pip_install(["torch>=2.2.0,<3.0"])
    import torch

# One root folder holds every artifact produced in this session
PROJECT_ROOT = Path("/content/anti_echo").resolve()

# Workspace layout: one sub-folder per kind of artifact
SUBDIRS = [
    "raw",        # scraped article text and sidecar metadata
    "batches",    # packaged embeddings and manifest before HF upload
    "chroma_db",  # local persistent Chroma store
    "logs",       # run logs
    "feeds",      # index and feed state
    "tmp"         # scratch space
]
for subdir_name in SUBDIRS:
    workspace_dir = PROJECT_ROOT / subdir_name
    # parents/exist_ok make the cell safe to re-run
    workspace_dir.mkdir(parents=True, exist_ok=True)

# Environment flags to reduce noise and avoid accidental multithreading bugs
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "CHROMA_TELEMETRY_ENABLED": "false",
    "ANONYMIZED_TELEMETRY": "false",
})

# Print environment diagnostics for reproducibility and easy debugging
import platform, json
from importlib.metadata import version, PackageNotFoundError

def v(name):
    """Return the installed version of distribution ``name``.

    Yields the literal string "not-installed" when importlib.metadata cannot
    find the distribution, so the diagnostics report never raises.
    """
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not-installed"
    return installed

# (label shown in the report, distribution name passed to v) in report order
_REPORTED_PACKAGES = [
    ("feedparser", "feedparser"),
    ("trafilatura", "trafilatura"),
    ("sentence-transformers", "sentence-transformers"),
    ("chromadb", "chromadb"),
    ("huggingface_hub", "huggingface-hub"),
    ("PyYAML", "PyYAML"),
    ("numpy", "numpy"),
    ("rapidfuzz", "rapidfuzz"),
    ("torch", "torch"),
    ("tqdm", "tqdm"),
    ("requests", "requests"),
    ("scikit-learn", "scikit-learn"),
    ("transformers", "transformers"),
    ("nltk", "nltk"),
]

# Sub-folders listed under "paths", after the project root itself
_REPORTED_SUBDIRS = ("raw", "batches", "chroma_db", "logs", "feeds", "tmp")

info = {
    "python": sys.version.split()[0],
    "platform": platform.platform(),
    "packages": {label: v(dist) for label, dist in _REPORTED_PACKAGES},
    "paths": {
        "project_root": str(PROJECT_ROOT),
        **{sub: str(PROJECT_ROOT / sub) for sub in _REPORTED_SUBDIRS},
    },
}

# CUDA info is helpful to know if summarization can use GPU
has_cuda = bool(torch.cuda.is_available())
info["cuda_available"] = has_cuda
if has_cuda:
    info["cuda_device_name"] = torch.cuda.get_device_name(0)

# Dump the whole report as indented JSON for easy copy/paste when debugging
print(json.dumps(info, indent=2))

# Drop a short orientation file into the workspace, but never overwrite one
# that is already there.
workspace_readme = PROJECT_ROOT / "README_WORKSPACE.txt"
if not workspace_readme.exists():
    readme_body = textwrap.dedent("""
        anti echo chamber - Colab workspace
        This directory is ephemeral per session.
        Do not commit files from here directly.
        Subdirs:
          raw        - local scraped texts and meta for this session
          batches    - locally packaged batches before HF upload
          chroma_db  - local Chroma rebuild target
          logs       - run logs
          feeds      - runtime feed artifacts
          tmp        - scratch space
    """).strip() + "\n"
    workspace_readme.write_text(readme_body, encoding="utf-8")
print(f"Workspace ready at {PROJECT_ROOT}")


Installing: feedparser==6.0.10 trafilatura>=1.6.2,<2.0 sentence-transformers>=2.6.1,<3.0 chromadb>=0.5.5,<0.6.0 huggingface_hub>=0.24.0,<0.28.0 pyyaml>=6.0.1,<7.0 numpy>=1.26.4,<3.0 tqdm>=4.66.0,<5.0 requests>=2.31.0,<3.0 rapidfuzz>=3.6.0,<4.0
{
  "python": "3.12.11",
  "platform": "Linux-6.6.97+-x86_64-with-glibc2.35",
  "cuda_available": true,
  "packages": {
    "feedparser": "6.0.10",
    "trafilatura": "1.12.2",
    "sentence-transformers": "2.7.0",
    "chromadb": "0.5.23",
    "huggingface_hub": "0.27.1",
    "pyyaml": "6.0.3",
    "numpy": "2.0.2",
    "rapidfuzz": "3.14.1",
    "torch": "2.8.0+cu126",
    "tqdm": "4.67.1",
    "requests": "2.32.4"
  },
  "paths": {
    "project_root": "/content/anti_echo",
    "raw": "/content/anti_echo/raw",
    "batches": "/content/anti_echo/batches",
    "chroma_db": "/content/anti_echo/chroma_db",
    "logs": "/content/anti_echo/logs",
    "feeds": "/content/anti_echo/feeds",
    "tmp": "/content/anti_echo/tmp"
  },
  "cuda_device_name": "

Setup 2 of N — config and paths bootstrap

Purpose
Pull config and label maps from your GitHub repo, cache them locally in the Colab session, validate required keys, and create runtime dirs from config.

Why this matters
Single source of truth for model names, dims, batch filenames, and collection names. Keeps the notebook aligned with the repo.

Outputs
CONFIG, STANCE_AXES, TOPIC_LABELS in memory, directories created, and HF_DATASET_ID exported to env.


In [None]:
# Setup 2 of N: config and paths bootstrap
# Reads config files from your GitHub repo and prepares runtime paths.

import os
import json
import yaml
import requests
from pathlib import Path

# Session workspace root, plus the folder where fetched config files are
# cached; the folder is created here if it does not exist yet.
PROJECT_ROOT = Path("/content/anti_echo").resolve()
CONFIG_CACHE = PROJECT_ROOT / "config_cache"
CONFIG_CACHE.mkdir(parents=True, exist_ok=True)

# Your repo location for configuration files.
# Owner, repository, and branch are combined into raw GitHub URLs by raw_url().
REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

def raw_url(path: str) -> str:
    """Build the raw.githubusercontent.com URL for ``path`` in the configured repo.

    Leading slashes on ``path`` are dropped so the URL has exactly one slash
    between the branch and the file path.
    """
    relative = path.lstrip("/")
    return f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/{relative}"

def fetch_text_first(paths):
    """Fetch the first candidate file that exists in the repo.

    Args:
        paths: repo-relative candidate paths, tried in the given order.

    Returns:
        A ``(text, path, url)`` tuple for the first candidate answered with
        HTTP 200 and a non-blank body.

    Raises:
        RuntimeError: when no candidate could be fetched. The message names
            why each candidate failed (HTTP status, empty body, or the
            network error), not only the last raised exception.
    """
    failures = []
    for p in paths:
        url = raw_url(p)
        try:
            r = requests.get(url, timeout=20)
        except requests.RequestException as e:
            # Network-level problem: remember it and try the next candidate
            failures.append(f"{p}: {type(e).__name__}: {e}")
            continue
        if r.status_code == 200 and r.text.strip():
            return r.text, p, url
        # A non-200 status or a blank file is also a failure worth reporting
        reason = "empty body" if r.status_code == 200 else f"HTTP {r.status_code}"
        failures.append(f"{p}: {reason}")
    last_err = failures[-1] if failures else None
    raise RuntimeError(
        f"Could not fetch any of {paths}. Last error: {last_err}. All attempts: {failures}"
    )

# Candidate paths allow json or yaml variants without editing the notebook.
# Each list is tried in order by fetch_text_first.
CFG_CANDIDATES = [
    "config/config.yaml",
    "config/config.yml",
    "config/config.json",
]
STANCE_CANDIDATES = [
    "config/stance_axes.json",
    "config/stance_axes.yaml",
    "config/stance_axes.yml",
]
TOPIC_CANDIDATES = [
    "config/topic_labels.json",
    "config/topic_labels.yaml",
    "config/topic_labels.yml",
]

# Fetch the three files; each call yields (text, matched path, source url)
cfg_txt, cfg_path, cfg_url = fetch_text_first(CFG_CANDIDATES)
stance_txt, stance_path, stance_url = fetch_text_first(STANCE_CANDIDATES)
topic_txt, topic_path, topic_url = fetch_text_first(TOPIC_CANDIDATES)

# Cache a local copy in the Colab session under the file's own basename
for fetched_path, fetched_text in (
    (cfg_path, cfg_txt),
    (stance_path, stance_txt),
    (topic_path, topic_txt),
):
    cache_file = CONFIG_CACHE / Path(fetched_path).name
    cache_file.write_text(fetched_text, encoding="utf-8")

def parse_json_or_yaml(txt: str):
    """Decode ``txt`` as JSON, falling back to YAML if JSON decoding fails.

    Surrounding whitespace is removed before either parser sees the text.
    """
    stripped = txt.strip()
    try:
        parsed = json.loads(stripped)
    except Exception:
        # Anything the JSON parser rejects gets a second chance as YAML
        parsed = yaml.safe_load(stripped)
    return parsed

# Parse into Python structures. The config parser is picked from the file
# extension; the two label maps accept either format.
if cfg_path.endswith((".yaml", ".yml")):
    CONFIG = yaml.safe_load(cfg_txt)
else:
    CONFIG = json.loads(cfg_txt)
STANCE_AXES = parse_json_or_yaml(stance_txt)
TOPIC_LABELS = parse_json_or_yaml(topic_txt)

# Validate that essential top-level keys exist to avoid surprises later
required_cfg_keys = ["hf_dataset_id","chroma_collections","embeddings","batch","ids","chroma"]
missing = [k for k in required_cfg_keys if k not in CONFIG]
if missing:
    raise ValueError(f"Missing required config keys: {missing}")

# Create directories from config for batches, chroma, and logs.
# Each path is resolved and created before the next one is looked up.
batches_dir = PROJECT_ROOT / CONFIG["batch"]["base_dir"]
batches_dir.mkdir(parents=True, exist_ok=True)
chroma_dir = PROJECT_ROOT / CONFIG["chroma"]["dir"]
chroma_dir.mkdir(parents=True, exist_ok=True)
# Logging config is optional; the folder defaults to "logs"
logs_dir = PROJECT_ROOT / CONFIG.get("logging", {}).get("save_dir", "logs")
logs_dir.mkdir(parents=True, exist_ok=True)

# Print a short summary and export dataset id into the environment
summary = {
    "hf_dataset_id": CONFIG["hf_dataset_id"],
    "collections": CONFIG["chroma_collections"],
    "embeddings": CONFIG["embeddings"],
    "paths": {
        "batches": str(batches_dir),
        "chroma_db": str(chroma_dir),
    },
    "source_urls": {"config": cfg_url, "stance_axes": stance_url, "topic_labels": topic_url},
}
print(json.dumps(summary, indent=2))

os.environ["HF_DATASET_ID"] = CONFIG["hf_dataset_id"]


{
  "hf_dataset_id": "zanimal/anti-echo-artifacts",
  "collections": {
    "topic": "news_topic",
    "stance": "news_stance"
  },
  "embeddings": {
    "topic_model": "sentence-transformers/all-MiniLM-L6-v2",
    "stance_model": "sentence-transformers/all-MiniLM-L6-v2",
    "dim": 384,
    "dtype": "float16",
    "pooling": "mean",
    "chunk_tokens": 512
  },
  "summarizer": {
    "model": "facebook/bart-large-cnn",
    "target_sentences": 5,
    "truncation": 2048
  },
  "batch_files": {
    "topic_file": "embeddings_topic.npz",
    "stance_file": "embeddings_stance.npz",
    "metadata_file": "metadata.jsonl",
    "manifest_name": "manifest.json",
    "base_dir": "batches"
  },
  "id_policy": {
    "scheme": "domain-slug-sha12",
    "hash": "sha256",
    "normalize_whitespace": true,
    "lowercase": true
  },
  "paths": {
    "project_root": "/content/anti_echo",
    "config_cache": "/content/anti_echo/config_cache",
    "raw": "/content/anti_echo/raw",
    "batches": "/content/ant

Setup 3 of N — authentication for Hugging Face and GitHub

Purpose
Load tokens into environment and verify access. Later cells use these tokens to push to HF and update your GitHub registry.

Why this matters
Catching auth issues early prevents failing halfway through a run.

Outputs
Logged in to HF, GitHub token validated.

In [None]:
# Setup 3 of N: auth for Hugging Face and GitHub
# Prompts only if tokens are not already in the environment.

import os
import requests
from getpass import getpass
from huggingface_hub import login, whoami

def need(envvar, prompt):
    """Make sure ``envvar`` is populated, prompting once if it is missing or blank.

    The value is read with getpass so it is not echoed. A line reporting
    whether the variable ends up non-empty is printed; the secret is not.
    """
    current = os.environ.get(envvar, "")
    if current.strip() == "":
        os.environ[envvar] = getpass(prompt)
    is_loaded = bool(os.environ.get(envvar))
    print(f"{envvar} loaded:", is_loaded)

# Gather tokens (prompts only for the ones not already in the environment)
for _envvar, _prompt in (
    ("HF_TOKEN", "Enter your Hugging Face token: "),
    ("GITHUB_TOKEN", "Enter your GitHub Personal Access Token: "),
):
    need(_envvar, _prompt)

# Sign in to HF so upload_file works later.
# A failure is only reported here; the cell keeps going.
try:
    hf_token = os.environ["HF_TOKEN"]
    login(token=hf_token, add_to_git_credential=False)
    hf_identity = whoami(token=hf_token)
    print("HF login OK:", hf_identity.get("name","(ok)"))
except Exception as e:
    print("HF login failed:", e)

# Quick GitHub check: call GET /user with the token and print the HTTP status
try:
    gh_headers = {"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"}
    r = requests.get("https://api.github.com/user", headers=gh_headers, timeout=15)
    print("GitHub auth status:", r.status_code)
except Exception as e:
    print("GitHub auth check failed:", e)


Enter your Hugging Face token: ··········
HF_TOKEN loaded: True
Enter your GitHub Personal Access Token: ··········
GITHUB_TOKEN loaded: True
GitHub auth status: 200
HF user: zanimal


Setup 4 of N — restore feed index and feed state

Purpose
Restore feeds/index.json and feeds/feeds_state.json from HF latest pointers, fallback to GitHub if missing, or reconstruct from HF batch metadata if neither exists.

Why this matters
Prevents re-scraping the same URLs and keeps numbering and batching consistent across runs.

Outputs
feeds/index.json and feeds/feeds_state.json present locally with a quick summary.

In [None]:
# Setup 4 of N: restore feed index and state
# Guarantees local index and state exist before scraping.

import os, json, shutil, requests, datetime as dt, re, hashlib
from pathlib import Path
from huggingface_hub import hf_hub_download

# Session workspace root and the folder holding feed bookkeeping files;
# the folder is created here if it does not exist yet.
PROJECT_ROOT = Path("/content/anti_echo").resolve()
FEEDS_DIR = PROJECT_ROOT / "feeds"
FEEDS_DIR.mkdir(parents=True, exist_ok=True)

# Local targets of the restore, and the HF dataset to restore from.
# HF_DATASET_ID must already be in the environment (KeyError otherwise).
INDEX_PATH = FEEDS_DIR / "index.json"
STATE_PATH = FEEDS_DIR / "feeds_state.json"
HF_DATASET_ID = os.environ["HF_DATASET_ID"]

# GitHub repo used for the fallback restore and the artifacts registry
REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

def try_hf_restore() -> bool:
    """Restore the feed state and index from the "latest" pointers on HF.

    Both files are downloaded before either is copied into place. Returns
    True when both were restored; on any exception the error is printed and
    False is returned.
    """
    # Prefer HF because it is the single source of truth in this design
    try:
        state_src = hf_hub_download(HF_DATASET_ID, "feeds/feeds_state_latest.json", repo_type="dataset")
        index_src = hf_hub_download(HF_DATASET_ID, "feeds/feed_index_latest.json", repo_type="dataset")
        for downloaded, target in ((state_src, STATE_PATH), (index_src, INDEX_PATH)):
            shutil.copy(downloaded, target)
    except Exception as e:
        print("HF latest not found:", e)
        return False
    print("Restored feed state from HF latest")
    return True

def try_github_restore() -> bool:
    """Fallback restore of the feed state and index from the GitHub repo.

    Both files must be available: the cell reads INDEX_PATH and STATE_PATH
    right after the restore, so reporting success with only one of them
    would crash later. Nothing is written locally unless both downloads
    succeed.

    Returns:
        True when both files were fetched and written, False otherwise
        (missing file, blank body, or any request/write error).
    """
    base = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/feeds"
    targets = [("feeds_state_latest.json", STATE_PATH), ("feed_index_latest.json", INDEX_PATH)]
    try:
        fetched = []
        for src, dst in targets:
            r = requests.get(f"{base}/{src}", timeout=20)
            if r.status_code != 200 or not r.text.strip():
                print(f"GitHub restore incomplete: {src} unavailable (HTTP {r.status_code})")
                return False
            fetched.append((dst, r.text))
        # Write only after every download succeeded, so state and index
        # always come from the same source
        for dst, text in fetched:
            dst.write_text(text, encoding="utf-8")
    except Exception as e:
        print("GitHub restore failed:", e)
        return False
    print("Restored feed state from GitHub latest")
    return True

# Prefer the HF pointers; GitHub is only queried when HF did not restore
restored = try_hf_restore() or try_github_restore()

if not restored:
    # Reconstruct from HF batch metadata listed in artifacts_registry.json
    print("No latest feed state found. Attempting reconstruction from HF batches...")
    REGISTRY_URL = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/artifacts/artifacts_registry.json"
    registry_resp = requests.get(REGISTRY_URL, timeout=20)
    # Surface an HTTP error directly rather than a JSON decode error on the error page
    registry_resp.raise_for_status()
    REGISTRY = registry_resp.json()

    # Collect metadata jsonl from each batch
    metas = []
    for b in REGISTRY.get("batches", []):
        meta_rel = (b.get("hf_paths") or b.get("paths") or {}).get("metadata")
        if not meta_rel:
            continue
        try:
            metas.append(Path(hf_hub_download(HF_DATASET_ID, meta_rel, repo_type="dataset")))
        except Exception as e:
            print("Skip meta fetch:", e)

    # Build a minimal index of known URLs to prevent re-scraping
    items = {}
    def norm(txt): return re.sub(r"\s+"," ", txt.strip().lower())
    # Hash the URL exactly as stored: the scraper cell appends
    # sha256(url)[:12] of the un-normalized URL to the ring buffers, so the
    # reconstructed hashes must be computed the same way to be comparable.
    def sha256_text(txt): return hashlib.sha256(txt.encode()).hexdigest()

    # One timestamp for the whole reconstruction
    reconstructed_at = dt.datetime.now(dt.timezone.utc).isoformat()
    for fp in metas:
        for line in fp.read_text(encoding="utf-8").splitlines():
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except ValueError:
                # Skip lines that are not valid JSON
                continue
            if not isinstance(obj, dict):
                # A JSON scalar or list carries no "url" field
                continue
            u = obj.get("url")
            if u and u not in items:
                items[u] = {"status": "ok", "fetched_at": reconstructed_at}

    INDEX_PATH.write_text(json.dumps({"last_updated": reconstructed_at,"items": items}, indent=2), encoding="utf-8")

    # Create a simple ring buffer structure for each feed.
    # Every reconstructed feed gets the tail of the same hash list, cut to its own maximum.
    url_hashes = [sha256_text(u)[:12] for u in items]
    feeds_block = {
        "commentisfree": {"feed_url": None, "recent_url_hashes": url_hashes[-1000:], "recent_url_hashes_max": 1000},
        "theguardian": {"feed_url": None, "recent_url_hashes": url_hashes[-500:], "recent_url_hashes_max": 500},
    }
    STATE_PATH.write_text(json.dumps({"version":1,"updated_at": reconstructed_at,"feeds": feeds_block}, indent=2), encoding="utf-8")
    print("Reconstruction complete")

# Echo a small summary so you can verify
ix = json.loads(INDEX_PATH.read_text(encoding="utf-8"))
st = json.loads(STATE_PATH.read_text(encoding="utf-8"))
print({"index_items": len(ix.get("items",{})), "feeds": list(st.get("feeds",{}).keys())})


Enter your Hugging Face token: ··········
HF_TOKEN set in environment for this session (will reset when runtime restarts).


Setup 5 of N — rebuild Chroma from HF batches

Purpose
Create or refresh local Chroma collections from the HF dataset using artifacts/artifacts_registry.json as the ledger. HF is the source of truth. Local Chroma is a cache.

Why this matters
Ensures your retrieval state is consistent before adding new data. No duplicated rows. Clean numbering follows registry order.

Outputs
Two Chroma collections present with counts: topic and stance.

In [None]:
# Setup 5 of N: rebuild Chroma from HF
# Uses HF-hosted batches to fully restore local Chroma before any new run.

import os, json, numpy as np
from pathlib import Path
from huggingface_hub import hf_hub_download
import chromadb, requests

# Everything below is read from CONFIG, which this cell does not define;
# it must already exist in the session when the cell runs.
PROJECT_ROOT = Path("/content/anti_echo").resolve()
CHROMA_DIR = PROJECT_ROOT / CONFIG["chroma"]["dir"]       # on-disk location of the persistent Chroma store
HF_DATASET_ID = CONFIG["hf_dataset_id"]                  # dataset repo the batches are downloaded from
COLL_TOPIC = CONFIG["chroma_collections"]["topic"]       # collection name for topic embeddings
COLL_STANCE = CONFIG["chroma_collections"]["stance"]     # collection name for stance embeddings
EMB_DIM = int(CONFIG["embeddings"]["dim"])               # expected vector width, enforced by load_npz

def ensure_chroma():
    """Open the persistent Chroma store and get-or-create both collections.

    Returns a ``(client, topic_collection, stance_collection)`` tuple. Both
    collections are requested with cosine as the HNSW space.
    """
    # Persistent Chroma so later cells see the same state
    client = chromadb.PersistentClient(path=str(CHROMA_DIR))
    topic_and_stance = [
        client.get_or_create_collection(name=coll_name, metadata={"hnsw:space": "cosine"})
        for coll_name in (COLL_TOPIC, COLL_STANCE)
    ]
    return (client, *topic_and_stance)

def read_metadata_jsonl(fp: Path):
    """Read a JSON-lines metadata file into parallel id and record lists.

    Blank lines are skipped. Every remaining line must be a JSON object with
    an "id" key; the full object is kept as that row's metadata.

    Returns:
        ``(ids, metas)`` in file order.
    """
    rows = []
    with fp.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            if raw_line.strip():
                record = json.loads(raw_line)
                rows.append((record["id"], record))
    ids = [row_id for row_id, _ in rows]
    metas = [record for _, record in rows]
    return ids, metas

def load_npz(fp: Path, dim: int):
    """Load a 2-D embedding matrix from an ``.npz`` (or plain ``.npy``) file.

    For an ``.npz`` archive the first stored array is used. The archive is
    closed as soon as that array has been read into memory, so no zip handle
    is left open.

    Args:
        fp: path of the file to load.
        dim: required size of the second axis.

    Returns:
        The array, validated to have shape ``[N, dim]`` and finite values.

    Raises:
        ValueError: the archive holds no arrays, the shape is not
            ``[N, dim]``, or the data contains NaN/inf.
    """
    loaded = np.load(fp, allow_pickle=False)
    if isinstance(loaded, np.lib.npyio.NpzFile):
        # NpzFile keeps the underlying zip open until closed
        with loaded as archive:
            keys = list(archive.files)
            if not keys:
                raise ValueError(f"No arrays found in {fp.name}")
            loaded = archive[keys[0]]
    vecs = np.asarray(loaded)
    if vecs.ndim != 2 or vecs.shape[1] != dim:
        raise ValueError(f"Bad embedding shape in {fp.name}. Got {vecs.shape}, expected [N,{dim}]")
    if not np.isfinite(vecs).all():
        raise ValueError(f"Non finite values in {fp.name}")
    return vecs

def upsert_chunks(coll, ids, vecs, metas, chunk=2048):
    """Upsert rows into ``coll`` in windows of at most ``chunk`` rows.

    ``ids``, ``vecs`` and ``metas`` are parallel; each window of ``vecs`` is
    converted with ``.tolist()`` before being handed to ``coll.upsert``.
    """
    # Bulk upsert in chunks to avoid payload size limits
    for start in range(0, len(ids), chunk):
        window = slice(start, start + chunk)
        coll.upsert(
            ids=ids[window],
            embeddings=vecs[window].tolist(),
            metadatas=metas[window],
        )

# Pull the registry from GitHub to know which batches exist
REGISTRY_URL = "https://raw.githubusercontent.com/AHMerrill/anti-echo-chamber/main/artifacts/artifacts_registry.json"
registry_resp = requests.get(REGISTRY_URL, timeout=20)
# Surface an HTTP error directly rather than a JSON decode error on the error page
registry_resp.raise_for_status()
REGISTRY = registry_resp.json()

client, topic_coll, stance_coll = ensure_chroma()
added = 0

# Every batch must list all four artifact paths to be replayed
REQUIRED_BATCH_PATHS = ["embeddings_topic","embeddings_stance","metadata","manifest"]

# Replay all batches in registry order
for b in REGISTRY.get("batches", []):
    paths = b.get("hf_paths") or b.get("paths") or {}
    absent = [k for k in REQUIRED_BATCH_PATHS if k not in paths]
    if absent:
        # Say which batch was left out so a short count is explainable
        print(f"Skipping {b.get('batch_id')}: registry entry lacks {absent}")
        continue

    # Download artifacts for this batch
    t_local = Path(hf_hub_download(HF_DATASET_ID, paths["embeddings_topic"], repo_type="dataset"))
    s_local = Path(hf_hub_download(HF_DATASET_ID, paths["embeddings_stance"], repo_type="dataset"))
    m_local = Path(hf_hub_download(HF_DATASET_ID, paths["metadata"], repo_type="dataset"))

    # Load ids, metadata, and vectors with shape checks
    ids, metas = read_metadata_jsonl(m_local)
    t_vecs = load_npz(t_local, EMB_DIM)
    s_vecs = load_npz(s_local, EMB_DIM)
    # Vectors and metadata are matched by position, so the row counts must agree
    if t_vecs.shape[0] != len(ids) or s_vecs.shape[0] != len(ids):
        raise ValueError(f"Row count mismatch in batch {b.get('batch_id')}")

    # Upsert both collections under the same ids and metadata
    upsert_chunks(topic_coll, ids, t_vecs, metas)
    upsert_chunks(stance_coll, ids, s_vecs, metas)
    added += len(ids)
    print(f"Ingested {b.get('batch_id')} +{len(ids)}")

# "docs_added" counts rows sent to upsert in this run; the two counts are the collection sizes
print({"topic_count": topic_coll.count(), "stance_count": stance_coll.count(), "docs_added": added})


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Hugging Face login: OK
HF dataset found: zanimal/anti-echo-artifacts
{
  "registry_version": 2,
  "models": {
    "topic": "sentence-transformers/all-MiniLM-L6-v2",
    "stance": "sentence-transformers/all-MiniLM-L6-v2",
    "dim": 384
  },
  "batch_count": 1
}
Batches overview:
- batch_20251011T232938Z_283ca40f | docs=None | created_at=20251011T232938Z


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Setup 6 of N — scraping tunables and Guardian feeds

Purpose
Define scrape quotas, date floor, and feed list. Export to environment so the scraper reads configuration without edits.

Why this matters
Allows you to adjust scrape size and distribution per run from a single cell.

Outputs
Environment variables set and a printed summary.

In [None]:
# Setup 6 of N: scraping tunables and feeds
# Adjust only this cell to change scrape size and feed mix.

import os, json

# Total number of articles to save in this run, across all feeds
MAX_ARTICLES = 250

# Optional hard cap per feed. None removes the cap.
MAX_PER_FEED = None

# Lower bound (ISO date) on the publication date. "" disables the filter.
DATE_FROM = "2025-07-01"

# When True, URLs already present in index.json are fetched again
FORCE_REFETCH = False

# When True, the quota is split evenly across feeds and the remainder goes
# to QUOTA_REMAINDER_TO
EVEN_SPLIT = True
QUOTA_REMAINDER_TO = "commentisfree"

# Guardian RSS feeds as (section name, feed url). Add or remove as needed.
GUARDIAN_FEEDS = [
    ("world","https://www.theguardian.com/world/rss"),
    ("uk-news","https://www.theguardian.com/uk-news/rss"),
    ("us-news","https://www.theguardian.com/us-news/rss"),
    ("politics","https://www.theguardian.com/politics/rss"),
    ("environment","https://www.theguardian.com/uk/environment/rss"),
    ("climate-crisis","https://www.theguardian.com/environment/climate-crisis/rss"),
    ("technology","https://www.theguardian.com/uk/technology/rss"),
    ("science","https://www.theguardian.com/science/rss"),
    ("business","https://www.theguardian.com/uk/business/rss"),
    ("culture","https://www.theguardian.com/uk/culture/rss"),
    ("commentisfree","https://www.theguardian.com/commentisfree/rss")
]

# Export everything as strings so the scraper cell can read it from the
# environment; "unset" values are encoded as the empty string.
os.environ.update({
    "MAX_ARTICLES": str(MAX_ARTICLES),
    "MAX_PER_FEED": str(MAX_PER_FEED) if MAX_PER_FEED is not None else "",
    "DATE_FROM": DATE_FROM or "",
    "FORCE_REFETCH": str(bool(FORCE_REFETCH)).lower(),
    "EVEN_SPLIT": str(bool(EVEN_SPLIT)).lower(),
    "QUOTA_REMAINDER_TO": QUOTA_REMAINDER_TO,
    "GUARDIAN_FEEDS_JSON": json.dumps(GUARDIAN_FEEDS),
})

print("Feeds configured:", len(GUARDIAN_FEEDS))
print("MAX_ARTICLES:", MAX_ARTICLES, "MAX_PER_FEED:", MAX_PER_FEED, "DATE_FROM:", DATE_FROM)


ValueError: An instance of Chroma already exists for /content/anti_echo/chroma_db with different settings

Setup 7 of N — Guardian scraper with dedupe

Purpose
Scrape Guardian feeds, skip URLs already in feeds/index.json, save raw/{id}.txt and {id}.meta.json, and update both feeds/index.json and feeds/feeds_state.json.

Why this matters
Prevents duplicates, keeps state consistent across runs, and prepares clean inputs for embedding.

Outputs
New articles saved under raw/, updated feeds/index.json and feeds/feeds_state.json, and a summary.

In [None]:
# Setup 7 of N: scraper with dedupe
# Reads tunables from env and updates local index and feed state.

import os, re, json, hashlib, datetime as dt
from pathlib import Path
from urllib.parse import urlparse
from email.utils import parsedate_to_datetime
import feedparser, trafilatura

# Workspace locations used by the scraper
PROJECT_ROOT = Path("/content/anti_echo").resolve()
RAW_DIR = PROJECT_ROOT / "raw"
FEEDS_DIR = PROJECT_ROOT / "feeds"
FEEDS_DIR.mkdir(parents=True, exist_ok=True)

INDEX_PATH = FEEDS_DIR / "index.json"
STATE_PATH = FEEDS_DIR / "feeds_state.json"


def _env_flag(key, default):
    """Read a boolean tunable: true only when the value is "true" in any letter case."""
    return os.environ.get(key, default).lower() == "true"


# Load tunables from previous cell via environment.
# The feed list is mandatory (KeyError if absent); the rest have defaults.
GUARDIAN_FEEDS = json.loads(os.environ["GUARDIAN_FEEDS_JSON"])
MAX_ARTICLES = int(os.environ.get("MAX_ARTICLES", "30"))
# An empty string means "no per-feed cap"
_max_per_feed_raw = os.environ.get("MAX_PER_FEED","")
MAX_PER_FEED = int(_max_per_feed_raw) if _max_per_feed_raw != "" else None
# An empty string means "no date floor"
DATE_FROM = os.environ.get("DATE_FROM","") or None
FORCE_REFETCH = _env_flag("FORCE_REFETCH", "false")
EVEN_SPLIT = _env_flag("EVEN_SPLIT", "true")
QUOTA_REMAINDER_TO = os.environ.get("QUOTA_REMAINDER_TO","commentisfree")

def now_iso():
    """Return the current UTC time as an ISO 8601 string (with a +00:00 offset)."""
    current_utc = dt.datetime.now(tz=dt.timezone.utc)
    return current_utc.isoformat()

def load_index():
    """Load the URL index from INDEX_PATH, or start a fresh one.

    The index tracks URL-level status so known URLs are not scraped again.
    An unreadable or malformed file no longer fails silently: a warning is
    printed, because continuing with an empty index means every URL will be
    treated as new.

    Returns:
        A dict that always has an "items" mapping.
    """
    fresh = {"last_updated": None, "items": {}}
    if not INDEX_PATH.exists():
        return fresh
    try:
        loaded = json.loads(INDEX_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both JSON and text decoding errors
        print(f"warning: could not read {INDEX_PATH} ({type(e).__name__}: {e}); starting with an empty index")
        return fresh
    if not isinstance(loaded, dict):
        print(f"warning: {INDEX_PATH} does not hold a JSON object; starting with an empty index")
        return fresh
    # Callers index into ["items"] directly, so guarantee the key exists
    loaded.setdefault("items", {})
    return loaded

def save_index(idx):
    """Stamp ``idx`` with the current time and write it to INDEX_PATH as indented JSON.

    ``idx`` is modified in place: its "last_updated" entry is overwritten.
    """
    idx["last_updated"] = now_iso()
    serialized = json.dumps(idx, indent=2)
    INDEX_PATH.write_text(serialized, encoding="utf-8")

def load_state():
    """Load the per-feed state from STATE_PATH, or start a fresh one.

    feeds_state holds per-feed ring buffers and timestamps. An unreadable or
    malformed file no longer fails silently: a warning is printed before the
    empty default is returned.

    Returns:
        A dict; the default shape is
        ``{"version": 1, "updated_at": None, "feeds": {}}``.
    """
    fresh = {"version":1,"updated_at":None,"feeds":{}}
    if not STATE_PATH.exists():
        return fresh
    try:
        loaded = json.loads(STATE_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both JSON and text decoding errors
        print(f"warning: could not read {STATE_PATH} ({type(e).__name__}: {e}); starting with empty feed state")
        return fresh
    if not isinstance(loaded, dict):
        # The caller uses dict methods on the result
        print(f"warning: {STATE_PATH} does not hold a JSON object; starting with empty feed state")
        return fresh
    return loaded

index = load_index()
feeds_state = load_state()

# Initialize per-feed state, filling in whatever is missing.
# Defaults are merged key by key because a restored state file may hold
# partial entries: the reconstruction path in this notebook writes only
# feed_url (as None) and the ring buffer fields.
fs = feeds_state.setdefault("feeds", {})
for name, feed_url in GUARDIAN_FEEDS:
    feed_entry_defaults = {
        "feed_url": feed_url,
        "last_cursor_iso": None,
        "recent_url_hashes": [],
        "recent_url_hashes_max": 1000 if name == "commentisfree" else 500,
        "last_run_at": None,
        "last_run_by": "colab"
    }
    feed_entry = fs.setdefault(name, {})
    for field, default_value in feed_entry_defaults.items():
        feed_entry.setdefault(field, default_value)
    # A placeholder None/empty feed_url is replaced by the configured one
    if not feed_entry.get("feed_url"):
        feed_entry["feed_url"] = feed_url

def parse_date(entry):
    """Return the entry's "published" date, else its "updated" date, else None.

    Each field is looked up as an attribute first and through ``.get``
    second, then parsed with ``email.utils.parsedate_to_datetime``. A field
    that is empty or fails to parse is skipped.
    """
    for field in ("published", "updated"):
        raw_value = getattr(entry, field, None) or entry.get(field)
        if not raw_value:
            continue
        try:
            return parsedate_to_datetime(raw_value)
        except Exception:
            # Unparseable date: fall through to the next field
            continue
    return None

def in_range(d, lower_iso):
    """Tell whether datetime ``d`` is on or after the ISO floor ``lower_iso``.

    The filter is permissive: it returns True when no floor is given, when
    the floor cannot be parsed, or when ``d`` is None. Naive values on
    either side are taken to be UTC. A floor that carries its own UTC offset
    keeps that offset instead of being relabelled as UTC, which would shift
    the cut-off instant.

    Args:
        d: datetime to test, naive or aware, or None.
        lower_iso: ISO 8601 date or datetime string, or a falsy value.

    Returns:
        bool
    """
    if not lower_iso:
        return True
    try:
        floor = dt.datetime.fromisoformat(lower_iso)
    except (TypeError, ValueError):
        return True
    if d is None:
        return True
    if floor.tzinfo is None:
        floor = floor.replace(tzinfo=dt.timezone.utc)
    if d.tzinfo is None:
        d = d.replace(tzinfo=dt.timezone.utc)
    return d >= floor

def normalize_text(txt):
    """Trim ``txt``, lower-case it, and collapse each whitespace run to one space."""
    trimmed_lower = txt.strip().lower()
    return re.sub(r"\s+"," ", trimmed_lower)
def sha256_text(txt):
    """Return the hexadecimal SHA-256 digest of ``txt`` (encoded with str.encode's default)."""
    hasher = hashlib.sha256()
    hasher.update(txt.encode())
    return hasher.hexdigest()

def slugify(text, maxlen=60):
    """Turn ``text`` into a lower-case, hyphen-separated slug usable in filenames.

    Runs of characters other than ASCII letters and digits become a single
    hyphen, outer hyphens are trimmed, and the result is cut to ``maxlen``
    characters. An empty result becomes "untitled".
    """
    hyphenated = re.sub(r"[^a-zA-Z0-9]+","-", text)
    slug = hyphenated.strip("-").lower()[:maxlen]
    return slug if slug else "untitled"

def get_title(html, fallback="Untitled"):
    """Extract the text of the first ``<title>`` element found in ``html``.

    Matching is case-insensitive and spans newlines; whitespace runs inside
    the title are collapsed and the result is trimmed. ``fallback`` is
    returned when no ``<title>...</title>`` pair is found (or ``html`` is
    empty/None).
    """
    found = re.search(r"<title>(.*?)</title>", html or "", flags=re.I|re.S)
    if found is None:
        return fallback
    collapsed = re.sub(r"\s+"," ", found.group(1))
    return collapsed.strip()

def fetch_article(url):
    """Download ``url`` and return ``(title, text)`` for the article.

    The page is fetched and its main text extracted with trafilatura
    (comments and tables excluded); the title comes from get_title.

    Raises:
        RuntimeError: "fetch failed" when nothing was downloaded, or
            "extraction empty" when no text could be extracted.
    """
    html = trafilatura.fetch_url(url, no_ssl=False)
    if not html:
        raise RuntimeError("fetch failed")
    extracted = trafilatura.extract(html, include_comments=False, include_tables=False)
    text = extracted or ""
    title = get_title(html, "Untitled")
    if text.strip():
        return title, text
    raise RuntimeError("extraction empty")

def already_cached(url):
    """Return True when the index records ``url`` with status "ok"."""
    known_items = index["items"]
    if url not in known_items:
        return False
    return known_items[url].get("status") == "ok"

def mark(url, status):
    """Record ``status`` for ``url`` in the index and write the index to disk.

    Any earlier entry for the URL is replaced; the entry is stamped with the
    current time.
    """
    outcome = {"status": status, "fetched_at": now_iso()}
    index["items"][url] = outcome
    save_index(index)

# Compute quotas so we do not oversample a single feed
feed_names = [n for n,_ in GUARDIAN_FEEDS]
if not feed_names:
    # With no feeds there is nothing to divide by; the scrape loop will simply not run
    print("warning: GUARDIAN_FEEDS is empty, nothing will be scraped")
    quotas = {}
else:
    # Feed that receives the remainder (even split) or the whole budget
    # (no split); falls back to the first feed when the name is unknown
    remainder_feed = QUOTA_REMAINDER_TO if QUOTA_REMAINDER_TO in feed_names else feed_names[0]
    if EVEN_SPLIT:
        base, rem = divmod(MAX_ARTICLES, len(feed_names))
        quotas = {n: base for n in feed_names}
        quotas[remainder_feed] += rem
    else:
        quotas = {n: 0 for n in feed_names}
        quotas[remainder_feed] = MAX_ARTICLES
# The optional per-feed cap is applied last, so the total may drop below MAX_ARTICLES
if isinstance(MAX_PER_FEED, int):
    quotas = {k: min(v, MAX_PER_FEED) for k,v in quotas.items()}
print("Quotas:", quotas)

saved_global = 0
errors_global = 0
seen_global = set()  # guard against duplicates across feeds

# Articles are written into RAW_DIR below; make sure it exists even when
# the workspace setup cell was not run in this session
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Sort key stand-in for entries without a date. It is timezone-aware so it
# can be compared with the aware datetimes parsed from the feeds.
_UNDATED_SORT_KEY = dt.datetime.min.replace(tzinfo=dt.timezone.utc)

def _published_sort_key(item):
    """Return an aware datetime for ordering; undated items sort as oldest."""
    published = item["published"]
    if published is None:
        return _UNDATED_SORT_KEY
    if published.tzinfo is None:
        # Naive dates are treated as UTC so they compare with aware ones
        return published.replace(tzinfo=dt.timezone.utc)
    return published

for name, feed_url in GUARDIAN_FEEDS:
    if saved_global >= MAX_ARTICLES:
        break
    quota = quotas.get(name, 0)
    if quota <= 0:
        continue

    # Parse RSS and keep the entries that have a link and pass the date floor
    fp = feedparser.parse(feed_url)
    items = []
    for feed_item in fp.entries:
        url = getattr(feed_item, "link", None)
        if not url:
            continue
        pub = parse_date(feed_item)
        if in_range(pub, DATE_FROM):
            items.append({"url": url, "published": pub})

    # De-duplicate within the feed and order newest first
    uniq = []
    seen = set()
    for it in sorted(items, key=_published_sort_key, reverse=True):
        if it["url"] in seen:
            continue
        seen.add(it["url"])
        uniq.append(it)

    saved_this = 0
    for it in uniq:
        if saved_global >= MAX_ARTICLES or saved_this >= quota:
            break
        url = it["url"]
        if url in seen_global:
            continue
        seen_global.add(url)

        if already_cached(url) and not FORCE_REFETCH:
            print(f"skip (cached) [{name}]: {url}")
            continue

        try:
            # Fetch and extract
            title, text = fetch_article(url)

            # Build deterministic id using domain, title slug, and text hash
            domain = urlparse(url).netloc
            slug = slugify(title)
            h = sha256_text(normalize_text(text))
            art_id = f"{domain}-{slug}-{h[:12]}"

            # Save artifacts: the text plus a sidecar JSON describing it
            txt_path = RAW_DIR / f"{art_id}.txt"
            meta_path = RAW_DIR / f"{art_id}.meta.json"
            txt_path.write_text(text, encoding="utf-8")
            meta = {
                "id": art_id,
                "url": url,
                "title": title,
                "source": "theguardian",
                "section": name,
                "domain": domain,
                "published": it["published"].isoformat() if it["published"] else None,
                "sha256": h,
                "chars": len(text),
                "saved_at": now_iso()
            }
            meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")

            # Update indexes: URL status first, then the feed's ring buffer,
            # trimmed to its configured maximum
            mark(url, "ok")
            ring = fs[name]["recent_url_hashes"]
            ring.append(sha256_text(url)[:12])
            if len(ring) > fs[name]["recent_url_hashes_max"]:
                fs[name]["recent_url_hashes"] = ring[-fs[name]["recent_url_hashes_max"]:]
            fs[name]["last_run_at"] = now_iso()

            saved_this += 1
            saved_global += 1
            print(f"saved [{name}]: {txt_path.name} | {title[:90]}")
        except Exception as e:
            # Log error and move on; the URL is recorded as failed
            mark(url, "error")
            errors_global += 1
            print(f"error [{name}]: {url} | {type(e).__name__}: {str(e)[:140]}")

# Persist the feeds_state after the run
feeds_state["updated_at"] = now_iso()
STATE_PATH.write_text(json.dumps(feeds_state, indent=2), encoding="utf-8")

# Final summary. The item count comes from the in-memory index, which
# mark() writes to disk on every change, so the file is not re-read here.
print(json.dumps({
    "saved_total": saved_global,
    "errors_total": errors_global,
    "index_items": len(index.get("items",{})),
    "feeds_state_path": str(STATE_PATH)
}, indent=2))


Tunables and Guardian feeds set.
Feeds configured: 42
MAX_ARTICLES=250, MAX_PER_FEED=None, DATE_FROM=2025-07-01, EVEN_SPLIT=True, REMAINDER_TO=commentisfree


Setup 8 of N — persist feed state to HF and GitHub

Purpose
Snapshot feeds/feeds_state.json and feeds/index.json to HF as timestamped copies and latest pointers, and commit the same to GitHub. This keeps HF as the single source of truth while providing Git history.

Why this matters
Future runs and collaborators can always restore state. The UI you build later can also read the latest pointers.

Outputs
Four files on HF and two files in GitHub updated, with a short summary.

In [None]:
# Setup 8 of N: persist feed state to HF and GitHub
# Uploads timestamped snapshots and maintains latest pointers on HF and GitHub.

import os, json, base64
from datetime import datetime, timezone
from pathlib import Path
from huggingface_hub import upload_file
import requests
from getpass import getpass

# Local feed files that get snapshotted
PROJECT_ROOT = Path("/content/anti_echo").resolve()
FEEDS_DIR = PROJECT_ROOT / "feeds"
STATE_PATH = FEEDS_DIR / "feeds_state.json"
INDEX_PATH = FEEDS_DIR / "index.json"

# Inputs: the dataset id must be in the environment; tokens fall back to an
# interactive prompt when the environment variable is missing or blank
HF_DATASET_ID = os.environ["HF_DATASET_ID"]
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
if not HF_TOKEN:
    HF_TOKEN = getpass("Enter HF token: ")
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()
if not GITHUB_TOKEN:
    GITHUB_TOKEN = getpass("Enter GitHub token: ")
REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

# Each local file is published twice: once under a UTC-timestamped name and
# once under a stable "latest" name. Order: timestamped pair, then latest pair.
ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
uploads = [
    (source, f"feeds/{stem}_{suffix}.json")
    for suffix in (ts, "latest")
    for stem, source in (("feeds_state", STATE_PATH), ("feed_index", INDEX_PATH))
]

# Upload to HF dataset
print("Uploading feed state to HF...")
hf_target = {"repo_id": HF_DATASET_ID, "repo_type": "dataset", "token": HF_TOKEN}
for local, remote in uploads:
    upload_file(path_or_fileobj=str(local), path_in_repo=remote, **hf_target)
print("HF upload complete")

# Helper to upsert files in GitHub repo via REST API
def gh_put(local_path: Path, repo_path: str, message: str):
    """Create or update one file in the GitHub repo through the contents API.

    Args:
        local_path: Local file whose bytes become the new file content.
        repo_path: Destination path inside the repository.
        message: Commit message for the resulting commit.

    Raises:
        RuntimeError: If the sha lookup returns anything other than 200 or 404,
            or if the PUT does not return 200 or 201.
    """
    url = f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/contents/{repo_path}"
    headers = {"Authorization": f"Bearer {GITHUB_TOKEN}", "Accept": "application/vnd.github+json"}
    content = local_path.read_bytes()

    # Look up the existing blob sha on the branch we are about to write to.
    # Without `ref` the API answers for the default branch, which yields the
    # wrong sha whenever BRANCH is not the default.
    r = requests.get(url, headers=headers, params={"ref": BRANCH}, timeout=20)
    if r.status_code == 200:
        sha = r.json().get("sha")
    elif r.status_code == 404:
        # File does not exist yet on this branch: create it
        sha = None
    else:
        # Auth or server errors must not be mistaken for "file missing"
        raise RuntimeError(f"GitHub lookup failed for {repo_path}: {r.status_code} {r.text[:300]}")

    payload = {"message": message, "content": base64.b64encode(content).decode(), "branch": BRANCH}
    if sha:
        payload["sha"] = sha
    resp = requests.put(url, headers=headers, json=payload, timeout=30)
    if resp.status_code not in (200, 201):
        raise RuntimeError(f"GitHub push failed for {repo_path}: {resp.status_code} {resp.text[:300]}")

print("Committing feed state to GitHub...")
commit_msg = f"Update feed state and index {ts}"
# GitHub receives exactly the same (local file, repo path) pairs, in the same
# order, that were just uploaded to HF, so reuse the `uploads` list.
for local, repo_path in uploads:
    gh_put(local, repo_path, commit_msg)
print("GitHub commit complete")


RuntimeError: Found no downloadable metadata files in registry.

Setup 9 of N — topic embeddings to Chroma

Purpose
Generate multi-topic embeddings per article and upsert them into the topic collection. This models what the article is about. Each article may get multiple topic vectors (up to eight).

Why this matters
Topical neighbors power the first half of contrastive retrieval.

Outputs
Vectors upserted into the topic collection with structured metadata.

In [None]:
# Setup 9 of N: topic embeddings

import json, time, numpy as np, nltk, torch
from pathlib import Path
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import chromadb

PROJECT_ROOT = Path("/content/anti_echo").resolve()
RAW_DIR = PROJECT_ROOT / "raw"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Topic embedding settings come from the shared CONFIG
emb_cfg = CONFIG["embeddings"]
topic_model_name = emb_cfg["topic_model"]
topic_dim = int(emb_cfg["dim"])
topic_dtype = emb_cfg["dtype"]
chunk_tokens = int(emb_cfg["chunk_tokens"])

# Download the NLTK sentence tokenizer data only when it is not already present
for pkg in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{pkg}")
    except LookupError:
        nltk.download(pkg)

# Tokenizer is used for token counting and chunking; embedder produces the vectors
tokenizer = AutoTokenizer.from_pretrained(topic_model_name, use_fast=True)
embedder = SentenceTransformer(topic_model_name, device=device)

# Open the persistent Chroma store and fetch the existing topic collection
chroma_dir = PROJECT_ROOT / CONFIG["chroma"]["dir"]
client = chromadb.PersistentClient(path=str(chroma_dir))
topic_coll = client.get_collection(CONFIG["chroma_collections"]["topic"])

def sent_split(text):
    """Split text into sentences with NLTK, trimmed and with blank ones dropped."""
    trimmed = (sentence.strip() for sentence in nltk.sent_tokenize(text))
    return [sentence for sentence in trimmed if sentence]

def encode(texts, batch=16):
    """Embed a list of texts with the module-level embedder, `batch` texts at a time.

    Requests NumPy output and keeps the progress bar off.
    """
    encode_options = {
        "batch_size": batch,
        "convert_to_numpy": True,
        "show_progress_bar": False,
    }
    return embedder.encode(texts, **encode_options)

def chunk_by_tokens(text, max_tokens=512, overlap=64):
    """Split text into overlapping token windows and decode each back to a string.

    Consecutive windows start `max_tokens - overlap` tokens apart and hold at
    most `max_tokens` tokens. Windows that decode to blank text are dropped.

    Args:
        text: Text to tokenize with the module-level `tokenizer`.
        max_tokens: Maximum number of tokens per window.
        overlap: Number of tokens shared between consecutive windows.

    Returns:
        List of decoded chunk strings, in document order.

    Raises:
        ValueError: If `overlap >= max_tokens`, since the window could not advance.
    """
    step = max_tokens - overlap
    if step <= 0:
        # A zero step makes range() fail with an opaque message and a negative
        # step silently yields no chunks; reject both up front.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_tokens ({max_tokens})"
        )
    ids = tokenizer(text, add_special_tokens=False, return_attention_mask=False)["input_ids"]
    total = len(ids)
    chunks = []
    for i in range(0, total, step):
        j = min(i + max_tokens, total)
        piece = tokenizer.decode(ids[i:j], skip_special_tokens=True)
        if piece.strip():
            chunks.append(piece)
        if j == total:
            # This window already reached the last token. Another iteration
            # would only re-emit the overlap tail, duplicating content and
            # skewing any mean taken over the chunks.
            break
    return chunks

def sanitize(meta: dict):
    """Return a copy of `meta` whose values are all str, int, float or bool.

    None becomes the empty string, primitive values pass through unchanged,
    and anything else is replaced by its `str()` representation.
    """
    primitive_types = (str, int, float, bool)
    cleaned = {}
    for key, value in meta.items():
        if value is None:
            cleaned[key] = ""
        elif isinstance(value, primitive_types):
            cleaned[key] = value
        else:
            cleaned[key] = str(value)
    return cleaned

def topic_vecs(text):
    """Build the topic vectors for one article.

    Sentences are embedded and grouped with agglomerative clustering into
    between 1 and 8 clusters (one per 8 sentences). Each cluster's sentences
    are joined, truncated to 512 tokens, chunked, and the chunk embeddings are
    averaged into a single vector. Vectors are cast to float16 when the
    configured dtype is "float16", otherwise to float32.

    Returns:
        A list of 1-D NumPy vectors; empty when the text has no sentences.
    """
    target_dtype = np.float16 if topic_dtype == "float16" else np.float32

    sentences = sent_split(text)
    if not sentences:
        return []
    if len(sentences) < 2:
        # Too few sentences to cluster: embed the text as a single topic
        single = encode([" ".join(sentences)])[0]
        return [single.astype(target_dtype)]

    sentence_emb = encode(sentences)
    n_clusters = min(max(1, len(sentences) // 8), 8)
    labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(sentence_emb)

    vectors = []
    for cluster_label in sorted(set(labels)):
        # Rebuild the cluster's text from its member sentences, in document order
        members = [s for s, lab in zip(sentences, labels) if lab == cluster_label]
        segment = " ".join(members)
        token_ids = tokenizer(segment, add_special_tokens=False)["input_ids"][:512]
        truncated = tokenizer.decode(token_ids, skip_special_tokens=True)
        pieces = chunk_by_tokens(truncated, chunk_tokens, 64)
        if not pieces:
            continue
        mean_vec = encode(pieces).mean(axis=0)
        vectors.append(mean_vec.astype(target_dtype))
    return vectors

def upsert_in_chunks(collection, ids, vectors, metadatas, chunk=2048):
    """Upsert parallel ids / vectors / metadatas into `collection` in slices.

    Each call to `collection.upsert` receives at most `chunk` records.
    `vectors` must support slicing followed by `.tolist()`.
    """
    for lo in range(0, len(ids), chunk):
        # Slicing clamps at the end of the sequence, so the last window may be short
        window = slice(lo, lo + chunk)
        collection.upsert(
            ids=ids[window],
            embeddings=vectors[window].tolist(),
            metadatas=metadatas[window],
        )

start = time.time()
added = 0

# Walk every raw article that has both a text file and a metadata sidecar
for txt_path in RAW_DIR.glob("*.txt"):
    meta_path = txt_path.with_suffix(".meta.json")
    if not meta_path.exists():
        continue
    text = txt_path.read_text(encoding="utf-8").strip()
    if not text:
        continue
    meta = json.loads(meta_path.read_text(encoding="utf-8"))

    vecs = topic_vecs(text)
    if not vecs:
        continue

    # One record per topic vector: the id carries the topic index, and the
    # metadata is the article metadata plus topic index and model name
    topic_indices = range(len(vecs))
    ids = [f"{meta['id']}::topic::{i}" for i in topic_indices]
    metas = [
        sanitize({**meta, "topic_index": i, "topic_model": topic_model_name})
        for i in topic_indices
    ]
    upsert_in_chunks(topic_coll, ids, np.vstack(vecs), metas)
    added += len(vecs)

elapsed = round(time.time() - start, 2)
print(f"Topic upserts: {added} in {elapsed}s")


HF restore not available: EntryNotFoundError: 404 Client Error. (Request ID: Root=1-68eb32ef-5aa20d691554c8a514566010;5837e9de-b70c-4d5e-a8a4-714a1a262107)

Entry Not Found for url: https://huggingface.co/datasets/zanimal/anti-echo-artifacts/resolve/main/feeds/feeds_state_latest.json.
GitHub latest feed state not found.
No prior feed state found. Starting fresh.


Setup 10 of N — stance embeddings to Chroma

Purpose
Summarize each article, embed the summary, and upsert one stance vector per article. This models how the article argues. The stance space will support later contrast queries.

Why this matters
Enables similarity in topic while allowing opposition in stance during retrieval.

Outputs
One stance vector per article upserted with the summary stored in metadata.

In [None]:
# Setup 10 of N: stance embeddings

import json, time, numpy as np, torch
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import chromadb

PROJECT_ROOT = Path("/content/anti_echo").resolve()
RAW_DIR = PROJECT_ROOT / "raw"
has_cuda = torch.cuda.is_available()
device = "cuda" if has_cuda else "cpu"

stance_model_name = CONFIG["embeddings"]["stance_model"]
stance_dtype = CONFIG["embeddings"]["dtype"]
# With CUDA, use the configured summarizer (default facebook/bart-large-cnn);
# without CUDA, always use the distilled model regardless of CONFIG
if has_cuda:
    summarizer_name = CONFIG.get("summarizer", {}).get("model", "facebook/bart-large-cnn")
else:
    summarizer_name = "sshleifer/distilbart-cnn-12-6"

# Summarizer (tokenizer + seq2seq model) and the stance embedder
tok_sum = AutoTokenizer.from_pretrained(summarizer_name)
model_sum = AutoModelForSeq2SeqLM.from_pretrained(summarizer_name).to(device)
embedder = SentenceTransformer(stance_model_name, device=device)

# Open the persistent Chroma store and fetch the existing stance collection
chroma_dir = PROJECT_ROOT / CONFIG["chroma"]["dir"]
client = chromadb.PersistentClient(path=str(chroma_dir))
stance_coll = client.get_collection(CONFIG["chroma_collections"]["stance"])

def sanitize(meta: dict):
    """Coerce metadata values to primitives: None -> "", non-primitives -> str(value)."""

    def _coerce(value):
        # Keep str/int/float/bool untouched; everything else is flattened
        if value is None:
            return ""
        if isinstance(value, (str, int, float, bool)):
            return value
        return str(value)

    return {key: _coerce(value) for key, value in meta.items()}

def summarize(text, max_in=1024, max_out=150):
    """Summarize `text` with the module-level seq2seq model.

    The input is truncated to `max_in` tokens and generation uses 4-beam
    search with early stopping and `max_length=max_out`. Returns the decoded
    summary with surrounding whitespace stripped.
    """
    encoded = tok_sum([text], return_tensors="pt", truncation=True, max_length=max_in)
    encoded = encoded.to(device)
    # Inference only: no gradients needed
    with torch.no_grad():
        generated = model_sum.generate(
            **encoded,
            max_length=max_out,
            num_beams=4,
            early_stopping=True,
        )
    decoded = tok_sum.batch_decode(generated, skip_special_tokens=True)
    return decoded[0].strip()

added = 0
skipped = 0

# Walk every raw article that has both a text file and a metadata sidecar
for txt_path in RAW_DIR.glob("*.txt"):
    meta_path = txt_path.with_suffix(".meta.json")
    if not meta_path.exists():
        continue
    meta = json.loads(meta_path.read_text(encoding="utf-8"))
    text = txt_path.read_text(encoding="utf-8").strip()
    if not text:
        continue

    # Any failure for one article is counted and reported; the run continues
    try:
        summary = summarize(text)
        if not summary:
            skipped += 1
            continue
        target_dtype = np.float16 if stance_dtype == "float16" else np.float32
        vec = embedder.encode([summary], convert_to_numpy=True)[0].astype(target_dtype)
        stance_meta = sanitize({
            **meta,
            "stance_summary": summary,
            "stance_model": stance_model_name,
            "summary_model": summarizer_name,
        })
        # Exactly one stance vector per article, hence the fixed "::stance::0" suffix
        stance_coll.upsert(
            ids=[f"{meta['id']}::stance::0"],
            embeddings=[vec.tolist()],
            metadatas=[stance_meta],
        )
        added += 1
    except Exception as e:
        skipped += 1
        print("skip:", meta.get("id"), e)

print({"stance_upserts": added, "skipped": skipped})


Per feed quotas for this run:
{
  "world": 5,
  "uk-news": 5,
  "us-news": 5,
  "politics": 5,
  "europe": 5,
  "americas": 5,
  "asia": 5,
  "australia-news": 5,
  "business": 5,
  "money": 5,
  "technology": 5,
  "science": 5,
  "global-development": 5,
  "environment": 5,
  "wildlife": 5,
  "pollution": 5,
  "climate-crisis": 5,
  "sport": 5,
  "football": 5,
  "cricket": 5,
  "tennis": 5,
  "golf": 5,
  "formulaone": 5,
  "cycling": 5,
  "rugby-union": 5,
  "culture": 5,
  "film": 5,
  "music": 5,
  "artanddesign": 5,
  "books": 5,
  "tv-and-radio": 5,
  "lifestyle": 5,
  "family": 5,
  "health": 5,
  "inequality": 5,
  "obituaries": 5,
  "travel": 5,
  "fashion": 5,
  "games": 5,
  "stage": 5,
  "crosswords": 5,
  "commentisfree": 45
}
saved [world]: www.theguardian.com-egypt-confirms-international-leaders-summit-on-monday-to-dis-0efa690f2f6e.txt | Egypt confirms international leaders’ summit on Monday to discuss Gaza ceasefire - live | 
saved [world]: www.theguardian.com-trump-sa

Setup 11 of N — package and push batch to HF, update registry on Git

Purpose
Package current Chroma collections into batch files, upload to HF under batches/{batch_id}/, then update artifacts/artifacts_registry.json in your GitHub repo. HF is the single source of truth. The registry keeps a chronological ledger for rebuilds.

Why this matters
Gives you versioned, reconstructable artifacts and a single source of truth for future runs and for a UI.

Outputs
topic_embeddings.npz, stance_embeddings.npz, metadata.jsonl, manifest.json uploaded to HF and registry updated on GitHub.

In [None]:
# Setup 11 of N: package and push batch, update registry

import os, json, time, uuid, warnings, logging, requests, base64
from datetime import datetime, timezone
from pathlib import Path
import numpy as np
from huggingface_hub import upload_file
import chromadb

# Silence chromadb log output below ERROR and hide DeprecationWarnings for this cell
logging.getLogger("chromadb").setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=DeprecationWarning)

PROJECT_ROOT = Path("/content/anti_echo").resolve()
BATCH_DIR = PROJECT_ROOT / CONFIG["batch"]["base_dir"]
HF_DATASET_ID = CONFIG["hf_dataset_id"]
REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

# Open the persistent Chroma store and fetch both existing collections
client = chromadb.PersistentClient(path=str(PROJECT_ROOT / CONFIG["chroma"]["dir"]))
topic_coll = client.get_collection(CONFIG["chroma_collections"]["topic"])
stance_coll = client.get_collection(CONFIG["chroma_collections"]["stance"])

# Batch id = UTC timestamp plus 8 random hex chars; it names the local batch
# directory and the batches/{batch_id}/ prefix used for the HF upload below
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
batch_id = f"batch_{timestamp}_{uuid.uuid4().hex[:8]}"
batch_path = BATCH_DIR / batch_id
batch_path.mkdir(parents=True, exist_ok=True)

# Export all current vectors and metadata from both collections
topic_data = topic_coll.get(include=["embeddings","metadatas"])
stance_data = stance_coll.get(include=["embeddings","metadatas"])

# Vectors are stored as float16 regardless of the dtype they were upserted with.
# NOTE(review): `topic_vecs` rebinds the name of the topic_vecs() function defined
# in the Setup 9 cell when both run in the same session — confirm this is acceptable.
topic_vecs = np.array(topic_data["embeddings"], dtype=np.float16)
stance_vecs = np.array(stance_data["embeddings"], dtype=np.float16)
# NOTE(review): only the topic collection's metadata is exported. The stance
# metadata fetched above is never written, and neither collection's ids are
# saved, so rows in the stance array have no aligned metadata in this batch —
# confirm whether downstream rebuilds need them.
meta_records = topic_data["metadatas"]  # topic and stance share base ids in this scheme

# File names come from CONFIG["batch"]; all four files live in the batch directory
topic_npz = batch_path / CONFIG["batch"]["topic_file"]
stance_npz = batch_path / CONFIG["batch"]["stance_file"]
meta_path = batch_path / CONFIG["batch"]["metadata_file"]
manifest_path = batch_path / CONFIG["batch"]["manifest_name"]

# Write local artifacts: two compressed arrays (stored positionally) and one
# JSON object per line for the metadata
np.savez_compressed(topic_npz, topic_vecs)
np.savez_compressed(stance_npz, stance_vecs)
with meta_path.open("w", encoding="utf-8") as f:
    for m in meta_records:
        json.dump(m, f)
        f.write("\n")

# The manifest records what the batch contains and where each file lives on HF;
# the same dict is appended to the GitHub registry later in this cell
manifest = {
    "batch_id": batch_id,
    "created_at": timestamp,
    "models": CONFIG["embeddings"],
    "counts": {"topic": len(topic_vecs), "stance": len(stance_vecs)},
    "hf_dataset_id": HF_DATASET_ID,
    "paths": {
        "embeddings_topic": f"batches/{batch_id}/{topic_npz.name}",
        "embeddings_stance": f"batches/{batch_id}/{stance_npz.name}",
        "metadata": f"batches/{batch_id}/{meta_path.name}",
        "manifest": f"batches/{batch_id}/{manifest_path.name}",
    }
}
manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
print("Manifest:", json.dumps(manifest, indent=2))

# Upload the four artifacts to the HF dataset under batches/{batch_id}/.
# HF_TOKEN must already be set in the environment; there is no prompt fallback here.
print("Uploading batch to HF...")
for fpath in [topic_npz, stance_npz, meta_path, manifest_path]:
    upload_file(
        path_or_fileobj=str(fpath),
        path_in_repo=f"batches/{batch_id}/{fpath.name}",
        repo_id=HF_DATASET_ID,
        repo_type="dataset",
        token=os.environ["HF_TOKEN"]
    )
print("HF batch upload complete")

# Update registry on GitHub
#
# The registry is a ledger that is rewritten in full on every push, so the copy
# we append to must be the real current file. It is therefore read through the
# authenticated contents API (the same call that yields the sha needed for the
# update) rather than the raw URL, which may serve a cached copy. A fresh
# registry is started ONLY when the file does not exist (404) or is empty; any
# other read failure aborts instead of overwriting the existing ledger.
REGISTRY_URL = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/artifacts/artifacts_registry.json"  # kept for reference; no longer used for the read
url = f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/contents/artifacts/artifacts_registry.json"
headers = {"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}", "Accept": "application/vnd.github+json"}

r = requests.get(url, headers=headers, params={"ref": BRANCH}, timeout=20)
if r.status_code == 200:
    existing = r.json()
    sha = existing.get("sha")
    if existing.get("encoding") == "base64" and existing.get("content"):
        registry_text = base64.b64decode(existing["content"]).decode("utf-8")
    else:
        # Large files come back without inline content; request the raw body
        # from the same endpoint using the raw media type.
        raw_headers = {**headers, "Accept": "application/vnd.github.raw+json"}
        raw = requests.get(url, headers=raw_headers, params={"ref": BRANCH}, timeout=30)
        if raw.status_code != 200:
            raise RuntimeError(f"GitHub registry read failed: {raw.status_code} {raw.text[:300]}")
        registry_text = raw.content.decode("utf-8")
    if registry_text.strip():
        # A corrupt registry raises here on purpose: better to stop than to
        # replace the ledger with a single-batch file.
        registry = json.loads(registry_text)
    else:
        registry = {"version": 1, "models": {}, "batches": []}
elif r.status_code == 404:
    # No registry on this branch yet: start a new one
    sha = None
    registry = {"version": 1, "models": {}, "batches": []}
else:
    raise RuntimeError(f"GitHub registry read failed: {r.status_code} {r.text[:300]}")

if not isinstance(registry, dict):
    raise RuntimeError("GitHub registry has unexpected structure: expected a JSON object")

# Append this batch and bump the integer version (reset to 1 if it is not an int)
registry.setdefault("batches", []).append(manifest)
if isinstance(registry.get("version"), int):
    registry["version"] += 1
else:
    registry["version"] = 1

new_registry_bytes = json.dumps(registry, indent=2).encode("utf-8")
payload = {
    "message": f"Update artifacts registry {timestamp}",
    "content": base64.b64encode(new_registry_bytes).decode(),
    "branch": BRANCH
}
if sha:
    # Updating an existing file requires the sha of the blob being replaced
    payload["sha"] = sha
resp = requests.put(url, headers=headers, json=payload, timeout=30)
if resp.status_code not in (200, 201):
    raise RuntimeError(f"GitHub registry push failed: {resp.status_code} {resp.text[:300]}")
print("Registry updated on GitHub")


Setup 12 of N — quick sanity query

Purpose
Verify both Chroma collections are populated and aligned. Useful when wiring a future UI that compares similar topics but dissimilar stances.

Why this matters
Catches empty collections or mismatched counts early.

Outputs
Counts for both collections and the collection names.

In [None]:
# Setup 12 of N: quick sanity query

import chromadb
from pathlib import Path

PROJECT_ROOT = Path("/content/anti_echo").resolve()

# Re-open the persistent Chroma store and fetch both collections by configured name
chroma_dir = PROJECT_ROOT / CONFIG["chroma"]["dir"]
client = chromadb.PersistentClient(path=str(chroma_dir))
collection_names = CONFIG["chroma_collections"]
topic_coll = client.get_collection(collection_names["topic"])
stance_coll = client.get_collection(collection_names["stance"])

# Report record counts so an empty or lopsided store is visible immediately
sanity_report = {
    "topic_count": topic_coll.count(),
    "stance_count": stance_coll.count(),
    "collections": CONFIG["chroma_collections"],
}
print(sanity_report)


Embedding model: sentence-transformers/all-MiniLM-L6-v2, dim=384, dtype=float16, device=cuda


Token indices sequence length is longer than the specified maximum sequence length for this model (2486 > 512). Running this sequence through the model will result in indexing errors


Upserted 871 topic embeddings to collection news_topic in 19.87s
