## Setup 1 of N: environment and workspace

Goal
- Install core dependencies for scraping, embeddings, and Chroma
- Create a clean workspace in `/content/anti_echo`
- Print basic environment info so collaborators can debug quickly

Notes
- No Drive mount
- Keep installs minimal and pinned where sensible


In [1]:
# Setup 1 of N: environment and workspace
# Colab safe. No Drive mount.

import os
import sys
import subprocess
import textwrap
from pathlib import Path

def pip_install(pkgs):
    """Install requirements with pip into the running interpreter's environment.

    Args:
        pkgs: Requirement specifiers (e.g. ``"tqdm>=4.66.0,<5.0"``). Any
            iterable of strings is accepted; a single string is treated as
            one requirement. An empty iterable is a no-op.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # A bare string would otherwise be split into characters by list().
    if isinstance(pkgs, str):
        pkgs = [pkgs]
    # Materialize once so tuples and generators can be printed and passed on.
    requirements = list(pkgs)
    if not requirements:
        # `pip install` with no requirement exits non-zero; nothing to do.
        return
    print("Installing:", " ".join(requirements))
    # Invoke pip through this interpreter so packages land in its environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *requirements])

# Core deps
# One pip invocation for the whole set; feedparser is pinned exactly, the rest
# are bounded ranges.
pip_install([
    "feedparser==6.0.10",
    "trafilatura>=1.6.2,<2.0",
    "sentence-transformers>=2.6.1,<3.0",
    "chromadb>=0.5.5,<0.6.0",
    "huggingface_hub>=0.24.0,<0.28.0",
    "pyyaml>=6.0.1,<7.0",
    "numpy>=1.26.4,<3.0",
    "tqdm>=4.66.0,<5.0",
    "requests>=2.31.0,<3.0",
    "rapidfuzz>=3.6.0,<4.0"
])

# Optional but helpful
# torch is installed only when importing it fails, so an already-present build
# is left untouched. Any import failure (not just ImportError) triggers install.
try:
    import torch
except Exception:
    pip_install(["torch>=2.2.0,<3.0"])

# Workspace layout: every session directory lives under a single root
PROJECT_ROOT = Path("/content/anti_echo").resolve()
SUBDIRS = ["raw", "batches", "chroma_db", "logs", "feeds", "tmp"]

# Create each subdirectory (and the root with it); existing ones are kept
for subdir_name in SUBDIRS:
    subdir_path = PROJECT_ROOT / subdir_name
    subdir_path.mkdir(parents=True, exist_ok=True)

# Environment tweaks
# NOTE(review): presumably read by the Hugging Face tokenizers library - confirm
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Diagnostics
import platform, json
from importlib.metadata import version, PackageNotFoundError

def v(name):
    """Return the installed version of distribution *name*, or "not-installed"."""
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not-installed"
    return installed

# Environment report: interpreter, platform, package versions, workspace paths.
# Each package pair is (label in the report, distribution name to look up).
info = {
    "python": sys.version.split()[0],
    "platform": platform.platform(),
    "cuda_available": False,
    "packages": {
        label: v(dist_name)
        for label, dist_name in (
            ("feedparser", "feedparser"),
            ("trafilatura", "trafilatura"),
            ("sentence-transformers", "sentence-transformers"),
            ("chromadb", "chromadb"),
            ("huggingface_hub", "huggingface-hub"),
            ("pyyaml", "PyYAML"),
            ("numpy", "numpy"),
            ("rapidfuzz", "rapidfuzz"),
            ("torch", "torch"),
            ("tqdm", "tqdm"),
            ("requests", "requests"),
        )
    },
    "paths": {
        "project_root": str(PROJECT_ROOT),
        **{
            leaf: str(PROJECT_ROOT / leaf)
            for leaf in ("raw", "batches", "chroma_db", "logs", "feeds", "tmp")
        },
    },
}

# Best-effort GPU probe: any failure leaves "cuda_available" at False
try:
    import torch
    has_cuda = bool(torch.cuda.is_available())
    info["cuda_available"] = has_cuda
    if has_cuda:
        info["cuda_device_name"] = torch.cuda.get_device_name(0)
except Exception:
    pass

print(json.dumps(info, indent=2))

# Drop an orientation README into the workspace; an existing one is never overwritten
workspace_readme = PROJECT_ROOT / "README_WORKSPACE.txt"
readme_text = textwrap.dedent("""
        anti echo chamber - Colab workspace
        This directory is ephemeral per session.
        Do not commit files from here directly.
        Subdirs:
          raw        - local scraped texts and meta for this session
          batches    - locally packaged batches before HF upload
          chroma_db  - local Chroma rebuild target
          logs       - run logs
          feeds      - runtime feed artifacts
          tmp        - scratch space
    """).strip() + "\n"
if not workspace_readme.exists():
    workspace_readme.write_text(readme_text, encoding="utf-8")
print(f"Workspace ready at {PROJECT_ROOT}")


Installing: feedparser==6.0.10 trafilatura>=1.6.2,<2.0 sentence-transformers>=2.6.1,<3.0 chromadb>=0.5.5,<0.6.0 huggingface_hub>=0.24.0,<0.28.0 pyyaml>=6.0.1,<7.0 numpy>=1.26.4,<3.0 tqdm>=4.66.0,<5.0 requests>=2.31.0,<3.0 rapidfuzz>=3.6.0,<4.0
{
  "python": "3.12.11",
  "platform": "Linux-6.6.97+-x86_64-with-glibc2.35",
  "cuda_available": true,
  "packages": {
    "feedparser": "6.0.10",
    "trafilatura": "1.8.1",
    "sentence-transformers": "2.7.0",
    "chromadb": "0.5.23",
    "huggingface_hub": "0.27.1",
    "pyyaml": "6.0.3",
    "numpy": "1.26.4",
    "rapidfuzz": "3.14.1",
    "torch": "2.8.0+cu126",
    "tqdm": "4.67.1",
    "requests": "2.32.4"
  },
  "paths": {
    "project_root": "/content/anti_echo",
    "raw": "/content/anti_echo/raw",
    "batches": "/content/anti_echo/batches",
    "chroma_db": "/content/anti_echo/chroma_db",
    "logs": "/content/anti_echo/logs",
    "feeds": "/content/anti_echo/feeds",
    "tmp": "/content/anti_echo/tmp"
  },
  "cuda_device_name": "

## Setup 2 of N: config and paths bootstrap (robust fetch)

Goal
- Load shared config from GitHub with fallback paths
- Cache config locally for this session
- Initialize runtime paths and print key settings

Note
- Tries multiple candidate filenames for `stance_axes` and `topic_labels` in case they are saved with a .json, .yaml, or .yml extension, or with no extension at all


In [5]:
# Setup 2 of N: config and paths bootstrap (robust fetch)

import os
import json
import yaml
import requests
from pathlib import Path

# GitHub repository and branch that host the shared config files
REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

# Session workspace; fetched config files are cached in a subdirectory of it
PROJECT_ROOT = Path("/content/anti_echo").resolve()
CONFIG_CACHE = PROJECT_ROOT / "config_cache"
CONFIG_CACHE.mkdir(parents=True, exist_ok=True)

def raw_url(path: str) -> str:
    """Return the raw.githubusercontent.com URL for a repo-relative *path*.

    Leading slashes on *path* are dropped so the URL never contains "//".
    """
    repo_base = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}"
    relative = path.lstrip("/")
    return f"{repo_base}/{relative}"

def fetch_text_first(paths):
    """Fetch the first candidate repo path that returns non-empty content.

    Args:
        paths: Repo-relative paths to try, in priority order.

    Returns:
        A tuple ``(text, path, url)`` for the first candidate that answers
        HTTP 200 with a non-blank body.

    Raises:
        RuntimeError: If no candidate succeeds. The message lists every URL
            tried together with the reason it was rejected (HTTP status,
            empty body, or the exception raised).
    """
    last_err = None
    attempts = []
    for p in paths:
        url = raw_url(p)
        try:
            r = requests.get(url, timeout=20)
            body = r.text
        except Exception as e:
            # Keep going: a network failure on one candidate should not stop
            # the remaining candidates from being tried.
            last_err = e
            attempts.append(f"{url} -> {type(e).__name__}: {e}")
            continue
        if r.status_code == 200 and body.strip():
            return body, p, url
        # Record why this candidate was skipped so the final error is actionable.
        reason = "empty body" if r.status_code == 200 else f"HTTP {r.status_code}"
        attempts.append(f"{url} -> {reason}")
    msg = "Could not fetch any of the candidate paths.\nTried:\n" + "\n".join(attempts)
    if last_err:
        msg += f"\nLast error: {type(last_err).__name__}: {last_err}"
    raise RuntimeError(msg) from last_err

# Candidate repo paths, listed in the order they are tried
CFG_CANDIDATES = [f"config/config{ext}" for ext in (".yaml", ".yml", ".json")]
STANCE_CANDIDATES = [f"config/stance_axes{ext}" for ext in (".json", ".yaml", ".yml", "")]
TOPIC_CANDIDATES = [f"config/topic_labels{ext}" for ext in (".json", ".yaml", ".yml", "")]

# Fetch config files
cfg_txt, cfg_path, cfg_url = fetch_text_first(CFG_CANDIDATES)
stance_txt, stance_path, stance_url = fetch_text_first(STANCE_CANDIDATES)
topic_txt, topic_path, topic_url = fetch_text_first(TOPIC_CANDIDATES)

# Cache copies. An extensionless source file is stored under a .json name.
cfg_cache_name = Path(cfg_path).name
stance_cache_name = Path(stance_path).name if Path(stance_path).suffix else "stance_axes.json"
topic_cache_name = Path(topic_path).name if Path(topic_path).suffix else "topic_labels.json"
for cache_name, cache_text in (
    (cfg_cache_name, cfg_txt),
    (stance_cache_name, stance_txt),
    (topic_cache_name, topic_txt),
):
    (CONFIG_CACHE / cache_name).write_text(cache_text, encoding="utf-8")

# Parse helpers
def parse_maybe_json_or_yaml(txt: str):
    """Parse *txt* as JSON, falling back to YAML when JSON decoding fails.

    Surrounding whitespace is stripped first.

    Raises:
        ValueError: If the text parses as neither JSON nor YAML.
    """
    body = txt.strip()
    # JSON gets the first attempt; any failure falls through to YAML
    try:
        decoded = json.loads(body)
    except Exception:
        pass
    else:
        return decoded
    try:
        decoded = yaml.safe_load(body)
    except Exception as e:
        raise ValueError(f"Failed to parse as JSON or YAML: {e}")
    return decoded

# Parse into Python objects
# Only an explicit .json name is parsed as JSON; .yaml/.yml and anything else
# is treated as YAML.
if cfg_path.endswith(".json"):
    CONFIG = json.loads(cfg_txt)
else:
    CONFIG = yaml.safe_load(cfg_txt)

STANCE_AXES = parse_maybe_json_or_yaml(stance_txt)
TOPIC_LABELS = parse_maybe_json_or_yaml(topic_txt)

# A comment-only or scalar YAML document parses to None/str. Fail with a clear
# message instead of a TypeError from the key-membership check below.
if not isinstance(CONFIG, dict):
    raise ValueError(
        f"Config at {cfg_url} must be a mapping at the top level, got {type(CONFIG).__name__}"
    )

# Validate minimum keys
required_cfg_keys = ["hf_dataset_id", "chroma_collections", "embeddings", "batch", "ids", "chroma"]
missing = [k for k in required_cfg_keys if k not in CONFIG]
if missing:
    raise ValueError(f"Missing required config keys: {missing}")

# Create runtime subdirs: fixed names plus the locations named in CONFIG
runtime_dirs = [
    "raw",
    CONFIG["batch"]["base_dir"],
    CONFIG["chroma"]["dir"],
    CONFIG.get("logging", {}).get("save_dir", "logs"),
    "tmp",
]
for rel_dir in runtime_dirs:
    (PROJECT_ROOT / rel_dir).mkdir(parents=True, exist_ok=True)

# Print a concise summary of the loaded settings
emb_cfg = CONFIG["embeddings"]
batch_cfg = CONFIG["batch"]
logs_dir = CONFIG.get("logging", {}).get("save_dir", "logs")

summary = {
    "hf_dataset_id": CONFIG["hf_dataset_id"],
    "collections": CONFIG["chroma_collections"],
    "embeddings": {
        field: emb_cfg[field]
        for field in ("topic_model", "stance_model", "dim", "dtype", "pooling", "chunk_tokens")
    },
    "summarizer": CONFIG.get("summarizer", {}),
    "batch_files": {
        field: batch_cfg[field]
        for field in ("topic_file", "stance_file", "metadata_file", "manifest_name", "base_dir")
    },
    "id_policy": CONFIG["ids"],
    "paths": {
        "project_root": str(PROJECT_ROOT),
        "config_cache": str(CONFIG_CACHE),
        "raw": str(PROJECT_ROOT / "raw"),
        "batches": str(PROJECT_ROOT / batch_cfg["base_dir"]),
        "chroma_db": str(PROJECT_ROOT / CONFIG["chroma"]["dir"]),
        "logs": str(PROJECT_ROOT / logs_dir),
        "tmp": str(PROJECT_ROOT / "tmp"),
    },
    # Entry counts are only meaningful for list/dict payloads
    "loaded": {
        label: (len(loaded_obj) if isinstance(loaded_obj, (list, dict)) else "unknown")
        for label, loaded_obj in (
            ("stance_axes_count", STANCE_AXES),
            ("topic_labels_count", TOPIC_LABELS),
        )
    },
    "source_urls": {
        "config": cfg_url,
        "stance_axes": stance_url,
        "topic_labels": topic_url,
    },
}

print(json.dumps(summary, indent=2))

# Make HF dataset id available to later cells
os.environ["HF_DATASET_ID"] = CONFIG["hf_dataset_id"]


{
  "hf_dataset_id": "zanimal/anti-echo-artifacts",
  "collections": {
    "topic": "news_topic",
    "stance": "news_stance"
  },
  "embeddings": {
    "topic_model": "sentence-transformers/all-MiniLM-L6-v2",
    "stance_model": "sentence-transformers/all-MiniLM-L6-v2",
    "dim": 384,
    "dtype": "float16",
    "pooling": "mean",
    "chunk_tokens": 512
  },
  "summarizer": {
    "model": "facebook/bart-large-cnn",
    "target_sentences": 5,
    "truncation": 2048
  },
  "batch_files": {
    "topic_file": "embeddings_topic.npz",
    "stance_file": "embeddings_stance.npz",
    "metadata_file": "metadata.jsonl",
    "manifest_name": "manifest.json",
    "base_dir": "batches"
  },
  "id_policy": {
    "scheme": "domain-slug-sha12",
    "hash": "sha256",
    "normalize_whitespace": true,
    "lowercase": true
  },
  "paths": {
    "project_root": "/content/anti_echo",
    "config_cache": "/content/anti_echo/config_cache",
    "raw": "/content/anti_echo/raw",
    "batches": "/content/ant

## Setup 3 of N: Hugging Face auth and registry pull

Goal
- Authenticate to Hugging Face with HF_TOKEN
- Fetch the batch registry from GitHub
- Validate the registry schema and summarize batches

Notes
- If HF_TOKEN is not set, you can still proceed to read public data but uploads will fail later
- The registry lives at artifacts/artifacts_registry.json in your GitHub repo


In [7]:
# --- temporary auth cell for Colab session ---
import os
from getpass import getpass

# Prompt only when the environment holds no usable (non-blank) token
if not os.environ.get("HF_TOKEN", "").strip():
    # Strip stray whitespace/newlines that often come along with a pasted token
    entered_token = getpass("Enter your Hugging Face token: ").strip()
    if entered_token:
        os.environ["HF_TOKEN"] = entered_token

# Report the actual outcome instead of claiming success after an empty entry
if os.environ.get("HF_TOKEN", "").strip():
    print("HF_TOKEN set in environment for this session (will reset when runtime restarts).")
else:
    print("Warning: no Hugging Face token entered. HF_TOKEN is not set for this session.")


Enter your Hugging Face token: ··········
HF_TOKEN set in environment for this session (will reset when runtime restarts).


In [8]:
# Setup 3 of N: Hugging Face auth and registry pull

import os
import json
import requests
from pathlib import Path
from huggingface_hub import login, HfApi

# GitHub repository and branch that host the batch registry
REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

# Session workspace; the downloaded registry is cached in a subdirectory of it
PROJECT_ROOT = Path("/content/anti_echo").resolve()
CACHE_DIR = PROJECT_ROOT / "registry_cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

def raw_url(path: str) -> str:
    """Return the raw.githubusercontent.com URL for a repo-relative *path*.

    Leading slashes on *path* are dropped so the URL never contains "//".
    """
    repo_base = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}"
    relative = path.lstrip("/")
    return f"{repo_base}/{relative}"

# 1) HF auth: a missing token or failed login only warns, it never aborts the cell
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
if not HF_TOKEN:
    print("Warning: HF_TOKEN not set. You can read public artifacts but cannot upload.")
else:
    try:
        login(token=HF_TOKEN, add_to_git_credential=False)
    except Exception as e:
        print(f"Warning: HF login failed: {type(e).__name__}: {e}")
    else:
        print("Hugging Face login: OK")

# 2) Validate the dataset exists (a failed lookup is reported, not fatal)
HF_DATASET_ID = os.environ.get("HF_DATASET_ID", "").strip()
if HF_DATASET_ID == "":
    raise RuntimeError("HF_DATASET_ID not set in environment. It should have been set by Setup 2 from config.")
try:
    api = HfApi()
    ds_info = api.repo_info(HF_DATASET_ID, repo_type="dataset")
except Exception as e:
    print(f"Warning: Could not verify HF dataset {HF_DATASET_ID}: {type(e).__name__}: {e}")
else:
    print(f"HF dataset found: {HF_DATASET_ID}")

# 3) Pull registry from GitHub
REGISTRY_URL = raw_url("artifacts/artifacts_registry.json")
r = requests.get(REGISTRY_URL, timeout=20)
if r.status_code != 200:
    raise RuntimeError(f"Failed to fetch registry from {REGISTRY_URL}. Status {r.status_code}")
registry_txt = r.text

# Parse before caching: an unparseable download must not be written to the
# cache file, because the Chroma rebuild cell falls back to that path.
try:
    REGISTRY = json.loads(registry_txt)
except Exception as e:
    raise ValueError(f"Registry JSON parse failed: {e}") from e

REGISTRY_CACHE_PATH = CACHE_DIR / "artifacts_registry.json"
REGISTRY_CACHE_PATH.write_text(registry_txt, encoding="utf-8")

# 4) Minimal schema checks and summary
# The registry must be a JSON object; a list or scalar would break the
# key checks and .get() calls below with an unhelpful error.
if not isinstance(REGISTRY, dict):
    raise ValueError(f"Registry must be a JSON object, got {type(REGISTRY).__name__}")

required_top = ["version", "models", "batches"]
missing = [k for k in required_top if k not in REGISTRY]
if missing:
    raise ValueError(f"Registry missing required keys: {missing}")

# `or` also covers keys that are present but null in the JSON
models_block = REGISTRY.get("models") or {}
batches = REGISTRY.get("batches") or []
model_summary = {
    "topic": models_block.get("topic"),
    "stance": models_block.get("stance"),
    "dim": models_block.get("dim")
}

summary = {
    "registry_version": REGISTRY.get("version"),
    "models": model_summary,
    "batch_count": len(batches),
}

# Print concise summary
print(json.dumps(summary, indent=2))

# Show one line per registered batch, or say that there are none
if not batches:
    print("No batches listed yet in artifacts_registry.json")
else:
    rows = [
        {
            "batch_id": b.get("batch_id"),
            "docs": b.get("counts", {}).get("docs"),
            "created_at": b.get("created_at"),
        }
        for b in batches
    ]
    print("Batches overview:")
    for row in rows:
        print(f"- {row['batch_id']} | docs={row['docs']} | created_at={row['created_at']}")

# Make available to later cells
os.environ["REGISTRY_PATH"] = str(REGISTRY_CACHE_PATH)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Hugging Face login: OK
HF dataset found: zanimal/anti-echo-artifacts
{
  "registry_version": 1,
  "models": {
    "topic": "sentence-transformers/all-MiniLM-L6-v2",
    "stance": "sentence-transformers/all-MiniLM-L6-v2",
    "dim": 384
  },
  "batch_count": 0
}
No batches listed yet in artifacts_registry.json


## Setup 4 of N: Chroma rebuild or initialize

Goal
- Create a persistent Chroma client under `/content/anti_echo/chroma_db`
- Ensure two collections exist: `news_topic` and `news_stance`
- If batches are listed in the registry, download and ingest them in order
- If no batches yet, initialize empty collections and print a clear summary

Notes
- Uses `artifacts/artifacts_registry.json` as the source of truth
- Validates shapes and dims before inserting
- Safe to re-run


In [9]:
# Setup 4 of N: Chroma rebuild or initialize

import os
import io
import json
import numpy as np
from pathlib import Path
from typing import List, Dict, Tuple
from huggingface_hub import hf_hub_download
import chromadb

# CONFIG must already be loaded in memory by Setup 2
try:
    CONFIG
except NameError:
    # `from None` hides the NameError so only the actionable message is shown
    raise RuntimeError("CONFIG is not defined. Please run Setup 2 first.") from None

# Inputs from prior cells
PROJECT_ROOT = Path("/content/anti_echo").resolve()
# Use the same configured directory that Setup 2 creates and reports, falling
# back to the previous hard-coded name when the config does not specify one.
CHROMA_DIR = PROJECT_ROOT / CONFIG.get("chroma", {}).get("dir", "chroma_db")
REGISTRY_PATH = Path(os.environ.get("REGISTRY_PATH", PROJECT_ROOT / "registry_cache" / "artifacts_registry.json"))
HF_DATASET_ID = os.environ["HF_DATASET_ID"]

COLL_TOPIC = CONFIG["chroma_collections"]["topic"]
COLL_STANCE = CONFIG["chroma_collections"]["stance"]
EMB_DIM = int(CONFIG["embeddings"]["dim"])

def load_registry(path: Path) -> Dict:
    """Read and decode the registry JSON file at *path*.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if path.exists():
        registry_text = path.read_text(encoding="utf-8")
        return json.loads(registry_text)
    raise FileNotFoundError(f"Registry not found at {path}")

def read_metadata_jsonl(fp: Path) -> Tuple[List[str], List[Dict]]:
    """Read a JSON Lines metadata file into parallel id and record lists.

    Blank lines are skipped. Each remaining line must be a JSON object with
    an ``"id"`` key.

    Args:
        fp: Path to the ``.jsonl`` file.

    Returns:
        ``(ids, metas)`` where ``ids[i]`` is ``metas[i]["id"]`` and ``metas``
        holds the full decoded records in file order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the file and line number.
        KeyError: If a record is not an object or has no ``"id"`` key. The
            message names the file and line number.
    """
    ids: List[str] = []
    metas: List[Dict] = []
    with fp.open("r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                # Same exception type as before, now pointing at the bad line
                raise json.JSONDecodeError(
                    f"{fp.name} line {line_no}: {e.msg}", e.doc, e.pos
                ) from e
            if not isinstance(obj, dict) or "id" not in obj:
                raise KeyError(f"{fp.name} line {line_no}: record has no 'id' field")
            ids.append(obj["id"])
            metas.append(obj)
    return ids, metas

def load_npz_vectors(fp: Path, expected_dim: int) -> np.ndarray:
    """Load a 2-D embedding matrix from a ``.npz`` archive or a bare ``.npy`` file.

    For an archive, the array stored under ``arr_0`` is used when present;
    otherwise the first array in the archive is used.

    Args:
        fp: Path to the ``.npz`` or ``.npy`` file.
        expected_dim: Required size of the second axis.

    Returns:
        The embedding matrix with shape ``[N, expected_dim]``.

    Raises:
        ValueError: If the archive holds no arrays, the array is not 2-D with
            ``expected_dim`` columns, or it contains NaN/inf values.
    """
    # Load once (the file was previously opened up to three times and never
    # closed). allow_pickle=False: never unpickle objects from artifact files.
    loaded = np.load(fp, allow_pickle=False)
    if isinstance(loaded, np.ndarray):
        # A bare .npy file comes back as an array, not an archive
        arr = loaded
    else:
        # Close the archive's file handle once the array has been read out
        with loaded as npz:
            keys = list(npz.files)
            if not keys:
                raise ValueError(f"No arrays found in {fp.name}")
            arr = npz["arr_0" if "arr_0" in keys else keys[0]]
    vecs = np.asarray(arr)
    if vecs.ndim != 2 or vecs.shape[1] != expected_dim:
        raise ValueError(f"Bad embedding shape in {fp.name}. Got {vecs.shape}, expected [N, {expected_dim}]")
    if not np.isfinite(vecs).all():
        raise ValueError(f"Non finite values found in {fp.name}")
    return vecs

def ensure_chroma():
    """Open the persistent Chroma store and return ``(client, topic, stance)``.

    Both collections are fetched or created by name with cosine set as the
    HNSW space in their metadata.
    """
    client = chromadb.PersistentClient(path=str(CHROMA_DIR))
    topic, stance = [
        client.get_or_create_collection(name=coll_name, metadata={"hnsw:space": "cosine"})
        for coll_name in (COLL_TOPIC, COLL_STANCE)
    ]
    return client, topic, stance

def upsert_in_chunks(collection, ids: List[str], vectors: np.ndarray, metadatas: List[Dict], chunk: int = 2048):
    """Upsert ids, embeddings and metadata into *collection* in slices of *chunk* rows.

    The three inputs are sliced in lockstep; embeddings are converted to
    plain Python lists before being handed to ``collection.upsert``.
    """
    for start in range(0, len(ids), chunk):
        # Slicing past the end simply yields the shorter final chunk
        window = slice(start, start + chunk)
        collection.upsert(
            ids=ids[window],
            embeddings=vectors[window].tolist(),
            metadatas=metadatas[window],
        )

def ingest_batch_record(batch: Dict, topic_coll, stance_coll) -> Dict:
    """Download one registry batch from the HF dataset and upsert it into both collections.

    The batch's ``hf_paths`` entry must name the topic embeddings, stance
    embeddings, metadata and manifest files. Each value is passed as
    ``filename`` to ``hf_hub_download``. The manifest is downloaded but its
    content is not read here.

    Returns:
        A dict with the batch id, the number of docs in the batch, and the
        resulting totals of both collections.

    Raises:
        ValueError: If any ``hf_paths`` entry is missing or empty, or if the
            metadata row count differs from either embedding matrix.
    """
    batch_id = batch.get("batch_id")
    hf_paths = batch.get("hf_paths", {})
    remote = {
        key: hf_paths.get(key)
        for key in ("embeddings_topic", "embeddings_stance", "metadata", "manifest")
    }
    # Refuse to download anything unless all four artifacts are named
    if not all(remote.values()):
        raise ValueError(f"Incomplete hf_paths in registry for batch {batch_id}")

    def _fetch(filename):
        # Local helper: download one file of the dataset repo, return its local path
        return Path(hf_hub_download(repo_id=HF_DATASET_ID, repo_type="dataset", filename=filename))

    topic_file = _fetch(remote["embeddings_topic"])
    stance_file = _fetch(remote["embeddings_stance"])
    meta_file = _fetch(remote["metadata"])
    _fetch(remote["manifest"])

    # Load metadata and embeddings
    ids, metas = read_metadata_jsonl(meta_file)
    topic_vecs = load_npz_vectors(topic_file, EMB_DIM)
    stance_vecs = load_npz_vectors(stance_file, EMB_DIM)

    # Every metadata row needs exactly one vector in each matrix
    n_docs = len(ids)
    if n_docs != topic_vecs.shape[0] or n_docs != stance_vecs.shape[0]:
        raise ValueError(f"Row count mismatch in batch {batch_id}")

    # The same ids and metadata go into both collections
    for coll, vecs in ((topic_coll, topic_vecs), (stance_coll, stance_vecs)):
        upsert_in_chunks(coll, ids, vecs, metas)

    return {
        "batch_id": batch_id,
        "docs": n_docs,
        "topic_count": topic_coll.count(),
        "stance_count": stance_coll.count(),
    }

# Run: load the cached registry, open the store, then ingest or just report
REGISTRY = load_registry(REGISTRY_PATH)
batches = REGISTRY.get("batches", [])

client, topic_coll, stance_coll = ensure_chroma()

if batches:
    print(f"Ingesting {len(batches)} batch(es) from HF dataset {HF_DATASET_ID}")
    totals = []
    for b in batches:
        # One failing batch is reported and skipped; the rest still run
        try:
            res = ingest_batch_record(b, topic_coll, stance_coll)
        except Exception as e:
            print(f"Warning: failed to ingest batch {b.get('batch_id')}: {type(e).__name__}: {e}")
            continue
        print(f"Ingested batch {res['batch_id']}: +{res['docs']} docs")
        totals.append(res)

    print("Chroma rebuild summary:")
    print({
        "topic_count": topic_coll.count(),
        "stance_count": stance_coll.count(),
        "batches_ingested": len(totals),
        "store": str(CHROMA_DIR),
    })
else:
    print("No batches in registry. Initialized empty Chroma collections.")
    print({
        "topic_collection": COLL_TOPIC,
        "stance_collection": COLL_STANCE,
        "topic_count": topic_coll.count(),
        "stance_count": stance_coll.count(),
        "store": str(CHROMA_DIR),
    })


ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


No batches in registry. Initialized empty Chroma collections.
{'topic_collection': 'news_topic', 'stance_collection': 'news_stance', 'topic_count': 0, 'stance_count': 0, 'store': '/content/anti_echo/chroma_db'}


## Setup 5A: tunables and Guardian feeds

Use this cell to:
- Set how many total articles to scrape per run
- Set optional per feed caps
- Pick the date floor
- Control even distribution across feeds (with remainder to a preferred feed)
- Define the full Guardian feed list in one place

Notes
- The current default (`MAX_ARTICLES = 250`) will scrape 250 total articles, evenly split across the feeds below, with any remainder going to Comment is Free.
- If MAX_ARTICLES is smaller than the number of feeds, many feeds will get 0 for that run. Increase MAX_ARTICLES to cover more feeds per run.
- We will wire the scraper to read these values from the environment and JSON so you can change them here only.


In [10]:
# Setup 5A: tunables and Guardian feeds

import os, json

# ---- How many articles and distribution policy ----
# These values are exported to environment variables as strings further down
# in this cell; None is exported as an empty string where that is supported.
MAX_ARTICLES = 250              # total across all feeds this run
MAX_PER_FEED = None            # None for no hard cap, or set an int (e.g., 3)
DATE_FROM = "2025-07-01"       # ISO-8601 UTC lower bound; set None to ignore
FORCE_REFETCH = False          # True to re-download even if cached
EVEN_SPLIT = True              # True to evenly split MAX_ARTICLES across feeds
QUOTA_REMAINDER_TO = "commentisfree"  # where to send the remainder from the even split

# ---- Guardian feeds (edit here to add/remove) ----
GUARDIAN_FEEDS = [
    ("world",           "https://www.theguardian.com/world/rss"),
    ("uk-news",         "https://www.theguardian.com/uk-news/rss"),
    ("us-news",         "https://www.theguardian.com/us-news/rss"),
    ("politics",        "https://www.theguardian.com/politics/rss"),
    ("europe",          "https://www.theguardian.com/world/europe/rss"),
    ("americas",        "https://www.theguardian.com/world/americas/rss"),
    ("asia",            "https://www.theguardian.com/world/asia/rss"),
    ("australia-news",  "https://www.theguardian.com/australia-news/rss"),
    ("business",        "https://www.theguardian.com/uk/business/rss"),
    ("money",           "https://www.theguardian.com/uk/money/rss"),
    ("technology",      "https://www.theguardian.com/uk/technology/rss"),
    ("science",         "https://www.theguardian.com/science/rss"),
    ("global-development","https://www.theguardian.com/global-development/rss"),
    ("environment",     "https://www.theguardian.com/uk/environment/rss"),
    ("wildlife",        "https://www.theguardian.com/environment/wildlife/rss"),
    ("pollution",       "https://www.theguardian.com/environment/pollution/rss"),
    ("climate-crisis",  "https://www.theguardian.com/environment/climate-crisis/rss"),
    ("sport",           "https://www.theguardian.com/uk/sport/rss"),
    ("football",        "https://www.theguardian.com/football/rss"),
    ("cricket",         "https://www.theguardian.com/sport/cricket/rss"),
    ("tennis",          "https://www.theguardian.com/sport/tennis/rss"),
    ("golf",            "https://www.theguardian.com/sport/golf/rss"),
    ("formulaone",      "https://www.theguardian.com/sport/formulaone/rss"),
    ("cycling",         "https://www.theguardian.com/sport/cycling/rss"),
    ("rugby-union",     "https://www.theguardian.com/sport/rugby-union/rss"),
    ("culture",         "https://www.theguardian.com/uk/culture/rss"),
    ("film",            "https://www.theguardian.com/uk/film/rss"),
    ("music",           "https://www.theguardian.com/music/rss"),
    ("artanddesign",    "https://www.theguardian.com/artanddesign/rss"),
    ("books",           "https://www.theguardian.com/books/rss"),
    ("tv-and-radio",    "https://www.theguardian.com/uk/tv-and-radio/rss"),
    ("lifestyle",       "https://www.theguardian.com/uk/lifeandstyle/rss"),
    ("family",          "https://www.theguardian.com/lifeandstyle/family/rss"),
    ("health",          "https://www.theguardian.com/lifeandstyle/health-and-wellbeing/rss"),
    ("inequality",      "https://www.theguardian.com/inequality/rss"),
    ("obituaries",      "https://www.theguardian.com/tone/obituaries/rss"),
    ("travel",          "https://www.theguardian.com/uk/travel/rss"),
    ("fashion",         "https://www.theguardian.com/fashion/rss"),
    ("games",           "https://www.theguardian.com/games/rss"),
    ("stage",           "https://www.theguardian.com/stage/rss"),
    ("crosswords",      "https://www.theguardian.com/crosswords/rss"),
    ("commentisfree",   "https://www.theguardian.com/commentisfree/rss")  # opinion
]

# ---- Export to environment so the scraper can read without edits ----
# Environment values must be strings: None becomes "", booleans "true"/"false",
# and the feed list is serialized to JSON as a list of [name, url].
exported_env = {
    "MAX_ARTICLES": str(MAX_ARTICLES),
    "MAX_PER_FEED": "" if MAX_PER_FEED is None else str(MAX_PER_FEED),
    "DATE_FROM": "" if DATE_FROM in (None, "") else DATE_FROM,
    "FORCE_REFETCH": "true" if FORCE_REFETCH else "false",
    "EVEN_SPLIT": "true" if EVEN_SPLIT else "false",
    "QUOTA_REMAINDER_TO": QUOTA_REMAINDER_TO,
    "GUARDIAN_FEEDS_JSON": json.dumps(GUARDIAN_FEEDS),
}
os.environ.update(exported_env)

print("Tunables and Guardian feeds set.")
print(f"Feeds configured: {len(GUARDIAN_FEEDS)}")
print(f"MAX_ARTICLES={MAX_ARTICLES}, MAX_PER_FEED={MAX_PER_FEED}, DATE_FROM={DATE_FROM}, EVEN_SPLIT={EVEN_SPLIT}, REMAINDER_TO={QUOTA_REMAINDER_TO}")


Tunables and Guardian feeds set.
Feeds configured: 42
MAX_ARTICLES=250, MAX_PER_FEED=None, DATE_FROM=2025-07-01, EVEN_SPLIT=True, REMAINDER_TO=commentisfree
