## Setup 1 of N: environment and workspace

Goal
- Install core dependencies for scraping, embeddings, and Chroma
- Create a clean workspace in `/content/anti_echo`
- Print basic environment info so collaborators can debug quickly

Notes
- No Drive mount
- Keep installs minimal and pinned where sensible


In [1]:
# Setup 1 of N: environment and workspace
# Colab safe. No Drive mount.

import os
import sys
import subprocess
import textwrap
from pathlib import Path

def pip_install(pkgs):
    cmd = [sys.executable, "-m", "pip", "install", "-q"] + pkgs
    print("Installing:", " ".join(pkgs))
    subprocess.check_call(cmd)

# Core deps
pip_install([
    "feedparser==6.0.10",
    "trafilatura>=1.6.2,<2.0",
    "sentence-transformers>=2.6.1,<3.0",
    "chromadb>=0.5.5,<0.6.0",
    "huggingface_hub>=0.24.0,<0.28.0",
    "pyyaml>=6.0.1,<7.0",
    "numpy>=1.26.4,<3.0",
    "tqdm>=4.66.0,<5.0",
    "requests>=2.31.0,<3.0",
    "rapidfuzz>=3.6.0,<4.0"
])

# Optional but helpful
try:
    import torch
except Exception:
    pip_install(["torch>=2.2.0,<3.0"])

# Workspace layout
PROJECT_ROOT = Path("/content/anti_echo").resolve()
SUBDIRS = [
    "raw",
    "batches",
    "chroma_db",
    "logs",
    "feeds",
    "tmp"
]

for d in SUBDIRS:
    (PROJECT_ROOT / d).mkdir(parents=True, exist_ok=True)

# Environment tweaks
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Diagnostics
import platform, json
from importlib.metadata import version, PackageNotFoundError

def v(name):
    try:
        return version(name)
    except PackageNotFoundError:
        return "not-installed"

info = {
    "python": sys.version.split()[0],
    "platform": platform.platform(),
    "cuda_available": False,
    "packages": {
        "feedparser": v("feedparser"),
        "trafilatura": v("trafilatura"),
        "sentence-transformers": v("sentence-transformers"),
        "chromadb": v("chromadb"),
        "huggingface_hub": v("huggingface-hub"),
        "pyyaml": v("PyYAML"),
        "numpy": v("numpy"),
        "rapidfuzz": v("rapidfuzz"),
        "torch": v("torch"),
        "tqdm": v("tqdm"),
        "requests": v("requests"),
    },
    "paths": {
        "project_root": str(PROJECT_ROOT),
        "raw": str(PROJECT_ROOT / "raw"),
        "batches": str(PROJECT_ROOT / "batches"),
        "chroma_db": str(PROJECT_ROOT / "chroma_db"),
        "logs": str(PROJECT_ROOT / "logs"),
        "feeds": str(PROJECT_ROOT / "feeds"),
        "tmp": str(PROJECT_ROOT / "tmp"),
    }
}

try:
    import torch
    info["cuda_available"] = bool(torch.cuda.is_available())
    if info["cuda_available"]:
        info["cuda_device_name"] = torch.cuda.get_device_name(0)
except Exception:
    pass

print(json.dumps(info, indent=2))

# Place a small README in the workspace for orientation
workspace_readme = PROJECT_ROOT / "README_WORKSPACE.txt"
if not workspace_readme.exists():
    workspace_readme.write_text(textwrap.dedent("""
        anti echo chamber - Colab workspace
        This directory is ephemeral per session.
        Do not commit files from here directly.
        Subdirs:
          raw        - local scraped texts and meta for this session
          batches    - locally packaged batches before HF upload
          chroma_db  - local Chroma rebuild target
          logs       - run logs
          feeds      - runtime feed artifacts
          tmp        - scratch space
    """).strip() + "\n", encoding="utf-8")
print(f"Workspace ready at {PROJECT_ROOT}")


Installing: feedparser==6.0.10 trafilatura>=1.6.2,<2.0 sentence-transformers>=2.6.1,<3.0 chromadb>=0.5.5,<0.6.0 huggingface_hub>=0.24.0,<0.28.0 pyyaml>=6.0.1,<7.0 numpy>=1.26.4,<3.0 tqdm>=4.66.0,<5.0 requests>=2.31.0,<3.0 rapidfuzz>=3.6.0,<4.0
{
  "python": "3.12.11",
  "platform": "Linux-6.6.97+-x86_64-with-glibc2.35",
  "cuda_available": true,
  "packages": {
    "feedparser": "6.0.10",
    "trafilatura": "1.12.2",
    "sentence-transformers": "2.7.0",
    "chromadb": "0.5.23",
    "huggingface_hub": "0.27.1",
    "pyyaml": "6.0.3",
    "numpy": "2.0.2",
    "rapidfuzz": "3.14.1",
    "torch": "2.8.0+cu126",
    "tqdm": "4.67.1",
    "requests": "2.32.4"
  },
  "paths": {
    "project_root": "/content/anti_echo",
    "raw": "/content/anti_echo/raw",
    "batches": "/content/anti_echo/batches",
    "chroma_db": "/content/anti_echo/chroma_db",
    "logs": "/content/anti_echo/logs",
    "feeds": "/content/anti_echo/feeds",
    "tmp": "/content/anti_echo/tmp"
  },
  "cuda_device_name": "

## Setup 2 of N: config and paths bootstrap (robust fetch)

Goal
- Load shared config from GitHub with fallback paths
- Cache config locally for this session
- Initialize runtime paths and print key settings

Note
- Tries multiple candidate filenames for `stance_axes` and `topic_labels` in case they are saved with .json or .yaml


In [2]:
# Setup 2 of N: config and paths bootstrap (robust fetch)

import os
import json
import yaml
import requests
from pathlib import Path

PROJECT_ROOT = Path("/content/anti_echo").resolve()
CONFIG_CACHE = PROJECT_ROOT / "config_cache"
CONFIG_CACHE.mkdir(parents=True, exist_ok=True)

REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

def raw_url(path: str) -> str:
    return f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/{path.lstrip('/')}"

def fetch_text_first(paths):
    last_err = None
    tried = []
    for p in paths:
        url = raw_url(p)
        tried.append(url)
        try:
            r = requests.get(url, timeout=20)
            if r.status_code == 200 and r.text.strip():
                return r.text, p, url
        except Exception as e:
            last_err = e
    msg = "Could not fetch any of the candidate paths.\nTried:\n" + "\n".join(tried)
    if last_err:
        msg += f"\nLast error: {type(last_err).__name__}: {last_err}"
    raise RuntimeError(msg)

# Candidate repo paths
CFG_CANDIDATES = [
    "config/config.yaml",
    "config/config.yml",
    "config/config.json",
]
STANCE_CANDIDATES = [
    "config/stance_axes.json",
    "config/stance_axes.yaml",
    "config/stance_axes.yml",
    "config/stance_axes",
]
TOPIC_CANDIDATES = [
    "config/topic_labels.json",
    "config/topic_labels.yaml",
    "config/topic_labels.yml",
    "config/topic_labels",
]

# Fetch config files
cfg_txt, cfg_path, cfg_url = fetch_text_first(CFG_CANDIDATES)
stance_txt, stance_path, stance_url = fetch_text_first(STANCE_CANDIDATES)
topic_txt, topic_path, topic_url = fetch_text_first(TOPIC_CANDIDATES)

# Cache copies
(CONFIG_CACHE / Path(cfg_path).name).write_text(cfg_txt, encoding="utf-8")
(CONFIG_CACHE / (Path(stance_path).name if Path(stance_path).suffix else "stance_axes.json")).write_text(stance_txt, encoding="utf-8")
(CONFIG_CACHE / (Path(topic_path).name if Path(topic_path).suffix else "topic_labels.json")).write_text(topic_txt, encoding="utf-8")

# Parse helpers
def parse_maybe_json_or_yaml(txt: str):
    txt = txt.strip()
    # try json first
    try:
        return json.loads(txt)
    except Exception:
        pass
    # then yaml
    try:
        return yaml.safe_load(txt)
    except Exception as e:
        raise ValueError(f"Failed to parse as JSON or YAML: {e}")

# Parse into Python objects
if cfg_path.endswith((".yaml", ".yml")):
    CONFIG = yaml.safe_load(cfg_txt)
elif cfg_path.endswith(".json"):
    CONFIG = json.loads(cfg_txt)
else:
    # default to YAML for config
    CONFIG = yaml.safe_load(cfg_txt)

STANCE_AXES = parse_maybe_json_or_yaml(stance_txt)
TOPIC_LABELS = parse_maybe_json_or_yaml(topic_txt)

# Validate minimum keys
required_cfg_keys = ["hf_dataset_id", "chroma_collections", "embeddings", "batch", "ids", "chroma"]
missing = [k for k in required_cfg_keys if k not in CONFIG]
if missing:
    raise ValueError(f"Missing required config keys: {missing}")

# Create runtime subdirs
for key, path in {
    "raw": "raw",
    "batches": CONFIG["batch"]["base_dir"],
    "chroma_db": CONFIG["chroma"]["dir"],
    "logs": CONFIG.get("logging", {}).get("save_dir", "logs"),
    "tmp": "tmp"
}.items():
    (PROJECT_ROOT / path).mkdir(parents=True, exist_ok=True)

# Print a concise summary
summary = {
    "hf_dataset_id": CONFIG["hf_dataset_id"],
    "collections": CONFIG["chroma_collections"],
    "embeddings": {
        "topic_model": CONFIG["embeddings"]["topic_model"],
        "stance_model": CONFIG["embeddings"]["stance_model"],
        "dim": CONFIG["embeddings"]["dim"],
        "dtype": CONFIG["embeddings"]["dtype"],
        "pooling": CONFIG["embeddings"]["pooling"],
        "chunk_tokens": CONFIG["embeddings"]["chunk_tokens"]
    },
    "summarizer": CONFIG.get("summarizer", {}),
    "batch_files": {
        "topic_file": CONFIG["batch"]["topic_file"],
        "stance_file": CONFIG["batch"]["stance_file"],
        "metadata_file": CONFIG["batch"]["metadata_file"],
        "manifest_name": CONFIG["batch"]["manifest_name"],
        "base_dir": CONFIG["batch"]["base_dir"]
    },
    "id_policy": CONFIG["ids"],
    "paths": {
        "project_root": str(PROJECT_ROOT),
        "config_cache": str(CONFIG_CACHE),
        "raw": str(PROJECT_ROOT / "raw"),
        "batches": str(PROJECT_ROOT / CONFIG["batch"]["base_dir"]),
        "chroma_db": str(PROJECT_ROOT / CONFIG["chroma"]["dir"]),
        "logs": str(PROJECT_ROOT / CONFIG.get("logging", {}).get("save_dir", "logs")),
        "tmp": str(PROJECT_ROOT / "tmp")
    },
    "loaded": {
        "stance_axes_count": len(STANCE_AXES) if isinstance(STANCE_AXES, (list, dict)) else "unknown",
        "topic_labels_count": len(TOPIC_LABELS) if isinstance(TOPIC_LABELS, (list, dict)) else "unknown"
    },
    "source_urls": {
        "config": cfg_url,
        "stance_axes": stance_url,
        "topic_labels": topic_url
    }
}

print(json.dumps(summary, indent=2))

# Make HF dataset id available to later cells
os.environ["HF_DATASET_ID"] = CONFIG["hf_dataset_id"]


{
  "hf_dataset_id": "zanimal/anti-echo-artifacts",
  "collections": {
    "topic": "news_topic",
    "stance": "news_stance"
  },
  "embeddings": {
    "topic_model": "sentence-transformers/all-MiniLM-L6-v2",
    "stance_model": "sentence-transformers/all-MiniLM-L6-v2",
    "dim": 384,
    "dtype": "float16",
    "pooling": "mean",
    "chunk_tokens": 512
  },
  "summarizer": {
    "model": "facebook/bart-large-cnn",
    "target_sentences": 5,
    "truncation": 2048
  },
  "batch_files": {
    "topic_file": "embeddings_topic.npz",
    "stance_file": "embeddings_stance.npz",
    "metadata_file": "metadata.jsonl",
    "manifest_name": "manifest.json",
    "base_dir": "batches"
  },
  "id_policy": {
    "scheme": "domain-slug-sha12",
    "hash": "sha256",
    "normalize_whitespace": true,
    "lowercase": true
  },
  "paths": {
    "project_root": "/content/anti_echo",
    "config_cache": "/content/anti_echo/config_cache",
    "raw": "/content/anti_echo/raw",
    "batches": "/content/ant

## Setup 3 of N: Hugging Face auth and registry pull

Goal
- Authenticate to Hugging Face with HF_TOKEN
- Fetch the batch registry from GitHub
- Validate the registry schema and summarize batches

Notes
- If HF_TOKEN is not set, you can still proceed to read public data but uploads will fail later
- The registry lives at artifacts/artifacts_registry.json in your GitHub repo


In [3]:
# Setup 3A: load tokens once for the whole notebook (HF + GitHub)

import os
from getpass import getpass
import requests

def need(envvar, prompt):
    if not os.environ.get(envvar, "").strip():
        os.environ[envvar] = getpass(prompt)
    print(f"{envvar} loaded:", bool(os.environ.get(envvar)))

# Enter each once per Colab session
need("HF_TOKEN", "Enter your Hugging Face token: ")
need("GITHUB_TOKEN", "Enter your GitHub Personal Access Token: ")

# Quick sanity checks (optional)
try:
    r = requests.get("https://api.github.com/user",
                     headers={"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
                              "Accept": "application/vnd.github+json"},
                     timeout=15)
    print("GitHub auth status:", r.status_code)
except Exception as e:
    print("GitHub auth check failed:", type(e).__name__, e)

try:
    from huggingface_hub import whoami
    me = whoami(token=os.environ["HF_TOKEN"])
    print("HF user:", me.get("name") or me.get("email") or "(ok)")
except Exception as e:
    print("HF auth check failed:", type(e).__name__, e)


Enter your Hugging Face token: ··········
HF_TOKEN loaded: True
Enter your GitHub Personal Access Token: ··········
GITHUB_TOKEN loaded: True
GitHub auth status: 200
HF user: zanimal


In [None]:
# # --- temporary auth cell for Colab session ---
# import os
# from getpass import getpass

# if "HF_TOKEN" not in os.environ or not os.environ["HF_TOKEN"].strip():
#     os.environ["HF_TOKEN"] = getpass("Enter your Hugging Face token: ")

# print("HF_TOKEN set in environment for this session (will reset when runtime restarts).")


Enter your Hugging Face token: ··········
HF_TOKEN set in environment for this session (will reset when runtime restarts).


In [4]:
# Setup 3 of N: Hugging Face auth and registry pull

import os
import json
import requests
from pathlib import Path
from huggingface_hub import login, HfApi

PROJECT_ROOT = Path("/content/anti_echo").resolve()
CACHE_DIR = PROJECT_ROOT / "registry_cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

def raw_url(path: str) -> str:
    return f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/{path.lstrip('/')}"

# 1) HF auth
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
if HF_TOKEN:
    try:
        login(token=HF_TOKEN, add_to_git_credential=False)
        print("Hugging Face login: OK")
    except Exception as e:
        print(f"Warning: HF login failed: {type(e).__name__}: {e}")
else:
    print("Warning: HF_TOKEN not set. You can read public artifacts but cannot upload.")

# 2) Validate the dataset exists
HF_DATASET_ID = os.environ.get("HF_DATASET_ID", "").strip()
if not HF_DATASET_ID:
    raise RuntimeError("HF_DATASET_ID not set in environment. It should have been set by Setup 2 from config.")
try:
    api = HfApi()
    ds_info = api.repo_info(HF_DATASET_ID, repo_type="dataset")
    print(f"HF dataset found: {HF_DATASET_ID}")
except Exception as e:
    print(f"Warning: Could not verify HF dataset {HF_DATASET_ID}: {type(e).__name__}: {e}")

# 3) Pull registry from GitHub
REGISTRY_URL = raw_url("artifacts/artifacts_registry.json")
r = requests.get(REGISTRY_URL, timeout=20)
if r.status_code != 200:
    raise RuntimeError(f"Failed to fetch registry from {REGISTRY_URL}. Status {r.status_code}")
registry_txt = r.text
(REGISTRY_CACHE_PATH := CACHE_DIR / "artifacts_registry.json").write_text(registry_txt, encoding="utf-8")

try:
    REGISTRY = json.loads(registry_txt)
except Exception as e:
    raise ValueError(f"Registry JSON parse failed: {e}")

# 4) Minimal schema checks and summary
required_top = ["version", "models", "batches"]
missing = [k for k in required_top if k not in REGISTRY]
if missing:
    raise ValueError(f"Registry missing required keys: {missing}")

models_block = REGISTRY.get("models", {})
batches = REGISTRY.get("batches", [])
model_summary = {
    "topic": models_block.get("topic"),
    "stance": models_block.get("stance"),
    "dim": models_block.get("dim")
}

summary = {
    "registry_version": REGISTRY.get("version"),
    "models": model_summary,
    "batch_count": len(batches),
}

# Print concise summary
print(json.dumps(summary, indent=2))

# If batches exist, show a compact table
if batches:
    rows = []
    for b in batches:
        rows.append({
            "batch_id": b.get("batch_id"),
            "docs": b.get("counts", {}).get("docs"),
            "created_at": b.get("created_at"),
        })
    # Keep it readable
    print("Batches overview:")
    for row in rows:
        print(f"- {row['batch_id']} | docs={row['docs']} | created_at={row['created_at']}")
else:
    print("No batches listed yet in artifacts_registry.json")

# Make available to later cells
os.environ["REGISTRY_PATH"] = str(REGISTRY_CACHE_PATH)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Hugging Face login: OK
HF dataset found: zanimal/anti-echo-artifacts
{
  "registry_version": 2,
  "models": {
    "topic": "sentence-transformers/all-MiniLM-L6-v2",
    "stance": "sentence-transformers/all-MiniLM-L6-v2",
    "dim": 384
  },
  "batch_count": 1
}
Batches overview:
- batch_20251011T232938Z_283ca40f | docs=None | created_at=20251011T232938Z


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Setup 4 of N: Chroma rebuild or initialize

Goal
- Create a persistent Chroma client under `/content/anti_echo/chroma_db`
- Ensure two collections exist: `news_topic` and `news_stance`
- If batches are listed in the registry, download and ingest them in order
- If no batches yet, initialize empty collections and print a clear summary

Notes
- Uses `artifacts/artifacts_registry.json` as the source of truth
- Validates shapes and dims before inserting
- Safe to re run


In [8]:
# Hotfix: tolerant Chroma rebuild that accepts either 'hf_paths' or 'paths' in registry

import os, json
from pathlib import Path
import numpy as np
from huggingface_hub import hf_hub_download
import chromadb

PROJECT_ROOT = Path("/content/anti_echo").resolve()
CHROMA_DIR = PROJECT_ROOT / "chroma_db"
REGISTRY_PATH = Path(os.environ["REGISTRY_PATH"])
HF_DATASET_ID = os.environ["HF_DATASET_ID"]

COLL_TOPIC = CONFIG["chroma_collections"]["topic"]
COLL_STANCE = CONFIG["chroma_collections"]["stance"]
EMB_DIM = int(CONFIG["embeddings"]["dim"])

# Disable Chroma telemetry noise
os.environ["CHROMA_TELEMETRY_ENABLED"] = "false"
os.environ["ANONYMIZED_TELEMETRY"] = "false"

def load_registry(path: Path):
    return json.loads(path.read_text(encoding="utf-8"))

def read_metadata_jsonl(fp: Path):
    ids, metas = [], []
    with fp.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            ids.append(obj["id"])
            metas.append(obj)
    return ids, metas

def load_npz_vectors(fp: Path, expected_dim: int):
    arr = np.load(fp, allow_pickle=False)
    if isinstance(arr, np.lib.npyio.NpzFile):
        keys = list(arr.files)
        arr = arr[keys[0]] if keys else None
    if arr is None:
        raise ValueError(f"No array found in {fp.name}")
    vecs = np.asarray(arr)
    if vecs.ndim != 2 or vecs.shape[1] != expected_dim:
        raise ValueError(f"Bad embedding shape in {fp.name}. Got {vecs.shape}, expected [N, {expected_dim}]")
    if not np.isfinite(vecs).all():
        raise ValueError(f"Non finite values in {fp.name}")
    return vecs

def ensure_chroma():
    client = chromadb.PersistentClient(path=str(CHROMA_DIR))
    topic = client.get_or_create_collection(name=COLL_TOPIC, metadata={"hnsw:space": "cosine"})
    stance = client.get_or_create_collection(name=COLL_STANCE, metadata={"hnsw:space": "cosine"})
    return client, topic, stance

def upsert_in_chunks(collection, ids, vectors, metadatas, chunk=2048):
    n = len(ids)
    for i in range(0, n, chunk):
        j = min(i + chunk, n)
        collection.upsert(
            ids=ids[i:j],
            embeddings=vectors[i:j].tolist(),
            metadatas=metadatas[i:j],
        )

def ingest_one(path_block, topic_coll, stance_coll, batch_id="unknown"):
    t_rel = path_block.get("embeddings_topic")
    s_rel = path_block.get("embeddings_stance")
    m_rel = path_block.get("metadata")
    manifest_rel = path_block.get("manifest")
    if not all([t_rel, s_rel, m_rel, manifest_rel]):
        raise ValueError(f"Incomplete paths for batch {batch_id}")

    t_local = Path(hf_hub_download(repo_id=HF_DATASET_ID, repo_type="dataset", filename=t_rel))
    s_local = Path(hf_hub_download(repo_id=HF_DATASET_ID, repo_type="dataset", filename=s_rel))
    m_local = Path(hf_hub_download(repo_id=HF_DATASET_ID, repo_type="dataset", filename=m_rel))
    _ = Path(hf_hub_download(repo_id=HF_DATASET_ID, repo_type="dataset", filename=manifest_rel))

    ids, metas = read_metadata_jsonl(m_local)
    t_vecs = load_npz_vectors(t_local, EMB_DIM)
    s_vecs = load_npz_vectors(s_local, EMB_DIM)
    if len(ids) != t_vecs.shape[0] or len(ids) != s_vecs.shape[0]:
        raise ValueError(f"Row count mismatch in batch {batch_id}")

    upsert_in_chunks(topic_coll, ids, t_vecs, metas)
    upsert_in_chunks(stance_coll, ids, s_vecs, metas)
    return len(ids)

# Load and normalize batches
REGISTRY = load_registry(REGISTRY_PATH)
raw_batches = REGISTRY.get("batches", [])
norm = []
for b in raw_batches:
    p = b.get("hf_paths") or b.get("paths") or {}
    if p:
        norm.append({"batch_id": b.get("batch_id", "unknown"), "paths": p})
    else:
        print(f"Skipping batch with no paths: {b.get('batch_id')}")

client, topic_coll, stance_coll = ensure_chroma()

total = 0
for b in norm:
    try:
        n = ingest_one(b["paths"], topic_coll, stance_coll, batch_id=b["batch_id"])
        print(f"Ingested batch {b['batch_id']}: +{n} docs")
        total += n
    except Exception as e:
        print(f"Warning: failed to ingest {b['batch_id']}: {type(e).__name__}: {e}")

print("Chroma rebuild summary:", {
    "topic_count": topic_coll.count(),
    "stance_count": stance_coll.count(),
    "docs_added": total,
    "store": str(CHROMA_DIR),
})


ValueError: An instance of Chroma already exists for /content/anti_echo/chroma_db with different settings

## Setup 5A: tunables and Guardian feeds

Use this cell to:
- Set how many total articles to scrape per run
- Set optional per feed caps
- Pick the date floor
- Control even distribution across feeds (with remainder to a preferred feed)
- Define the full Guardian feed list in one place

Notes
- The current default will scrape 30 total articles, evenly split across the feeds below, with any remainder to Comment is Free.
- If MAX_ARTICLES is smaller than the number of feeds, many feeds will get 0 for that run. Increase MAX_ARTICLES to cover more feeds per run.
- We will wire the scraper to read these values from the environment and JSON so you can change them here only.


In [None]:
# Setup 5A: tunables and Guardian feeds

import os, json

# ---- How many articles and distribution policy ----
MAX_ARTICLES = 250              # total across all feeds this run
MAX_PER_FEED = None            # None for no hard cap, or set an int (e.g., 3)
DATE_FROM = "2025-07-01"       # ISO-8601 UTC lower bound; set None to ignore
FORCE_REFETCH = False          # True to re-download even if cached
EVEN_SPLIT = True              # True to evenly split MAX_ARTICLES across feeds
QUOTA_REMAINDER_TO = "commentisfree"  # where to send the remainder from the even split

# ---- Guardian feeds (edit here to add/remove) ----
GUARDIAN_FEEDS = [
    ("world",           "https://www.theguardian.com/world/rss"),
    ("uk-news",         "https://www.theguardian.com/uk-news/rss"),
    ("us-news",         "https://www.theguardian.com/us-news/rss"),
    ("politics",        "https://www.theguardian.com/politics/rss"),
    ("europe",          "https://www.theguardian.com/world/europe/rss"),
    ("americas",        "https://www.theguardian.com/world/americas/rss"),
    ("asia",            "https://www.theguardian.com/world/asia/rss"),
    ("australia-news",  "https://www.theguardian.com/australia-news/rss"),
    ("business",        "https://www.theguardian.com/uk/business/rss"),
    ("money",           "https://www.theguardian.com/uk/money/rss"),
    ("technology",      "https://www.theguardian.com/uk/technology/rss"),
    ("science",         "https://www.theguardian.com/science/rss"),
    ("global-development","https://www.theguardian.com/global-development/rss"),
    ("environment",     "https://www.theguardian.com/uk/environment/rss"),
    ("wildlife",        "https://www.theguardian.com/environment/wildlife/rss"),
    ("pollution",       "https://www.theguardian.com/environment/pollution/rss"),
    ("climate-crisis",  "https://www.theguardian.com/environment/climate-crisis/rss"),
    ("sport",           "https://www.theguardian.com/uk/sport/rss"),
    ("football",        "https://www.theguardian.com/football/rss"),
    ("cricket",         "https://www.theguardian.com/sport/cricket/rss"),
    ("tennis",          "https://www.theguardian.com/sport/tennis/rss"),
    ("golf",            "https://www.theguardian.com/sport/golf/rss"),
    ("formulaone",      "https://www.theguardian.com/sport/formulaone/rss"),
    ("cycling",         "https://www.theguardian.com/sport/cycling/rss"),
    ("rugby-union",     "https://www.theguardian.com/sport/rugby-union/rss"),
    ("culture",         "https://www.theguardian.com/uk/culture/rss"),
    ("film",            "https://www.theguardian.com/uk/film/rss"),
    ("music",           "https://www.theguardian.com/music/rss"),
    ("artanddesign",    "https://www.theguardian.com/artanddesign/rss"),
    ("books",           "https://www.theguardian.com/books/rss"),
    ("tv-and-radio",    "https://www.theguardian.com/uk/tv-and-radio/rss"),
    ("lifestyle",       "https://www.theguardian.com/uk/lifeandstyle/rss"),
    ("family",          "https://www.theguardian.com/lifeandstyle/family/rss"),
    ("health",          "https://www.theguardian.com/lifeandstyle/health-and-wellbeing/rss"),
    ("inequality",      "https://www.theguardian.com/inequality/rss"),
    ("obituaries",      "https://www.theguardian.com/tone/obituaries/rss"),
    ("travel",          "https://www.theguardian.com/uk/travel/rss"),
    ("fashion",         "https://www.theguardian.com/fashion/rss"),
    ("games",           "https://www.theguardian.com/games/rss"),
    ("stage",           "https://www.theguardian.com/stage/rss"),
    ("crosswords",      "https://www.theguardian.com/crosswords/rss"),
    ("commentisfree",   "https://www.theguardian.com/commentisfree/rss")  # opinion
]

# ---- Export to environment so the scraper can read without edits ----
os.environ["MAX_ARTICLES"] = str(MAX_ARTICLES)
os.environ["MAX_PER_FEED"] = "" if MAX_PER_FEED is None else str(MAX_PER_FEED)
os.environ["DATE_FROM"] = "" if DATE_FROM in (None, "") else DATE_FROM
os.environ["FORCE_REFETCH"] = "true" if FORCE_REFETCH else "false"
os.environ["EVEN_SPLIT"] = "true" if EVEN_SPLIT else "false"
os.environ["QUOTA_REMAINDER_TO"] = QUOTA_REMAINDER_TO

# Serialize feeds to JSON as a list of [name, url]
os.environ["GUARDIAN_FEEDS_JSON"] = json.dumps(GUARDIAN_FEEDS)

print("Tunables and Guardian feeds set.")
print(f"Feeds configured: {len(GUARDIAN_FEEDS)}")
print(f"MAX_ARTICLES={MAX_ARTICLES}, MAX_PER_FEED={MAX_PER_FEED}, DATE_FROM={DATE_FROM}, EVEN_SPLIT={EVEN_SPLIT}, REMAINDER_TO={QUOTA_REMAINDER_TO}")


Tunables and Guardian feeds set.
Feeds configured: 42
MAX_ARTICLES=250, MAX_PER_FEED=None, DATE_FROM=2025-07-01, EVEN_SPLIT=True, REMAINDER_TO=commentisfree


## Setup 5B.0: Restore prior feed state

Purpose
- Rehydrate `feeds/feeds_state.json` and `feeds/index.json` so the scraper skips URLs already scraped in previous runs or by collaborators.

Source of truth
- Hugging Face dataset `feeds/feeds_state_latest.json` and `feeds/feed_index_latest.json`
- Fallback to GitHub `feeds/feeds_state_latest.json` and `feeds/feed_index_latest.json` if HF is missing

Run this once before 5B on a fresh runtime.


In [6]:
# Setup 5B.reconstruct: rebuild index.json and feeds_state.json from HF metadata

import os, json, datetime as dt, re, hashlib
from pathlib import Path
from typing import Dict, List
from huggingface_hub import hf_hub_download

PROJECT_ROOT = Path("/content/anti_echo").resolve()
FEEDS_DIR = PROJECT_ROOT / "feeds"
FEEDS_DIR.mkdir(parents=True, exist_ok=True)
INDEX_PATH = FEEDS_DIR / "index.json"
STATE_PATH = FEEDS_DIR / "feeds_state.json"

# Helper to match earlier hashing in your scraper
def sha256_text(txt: str) -> str:
    return hashlib.sha256(re.sub(r"\s+", " ", txt.strip().lower()).encode("utf-8")).hexdigest()

def now_utc_iso() -> str:
    return dt.datetime.now(dt.timezone.utc).isoformat()

# Load CONFIG, REGISTRY
try:
    CONFIG  # from Setup 2
except NameError:
    raise RuntimeError("CONFIG not found. Run Setup 2 first.")

HF_DATASET_ID = CONFIG["hf_dataset_id"]

# Get registry dict
reg_path_env = os.environ.get("REGISTRY_PATH", "")
if reg_path_env and Path(reg_path_env).exists():
    REGISTRY = json.loads(Path(reg_path_env).read_text(encoding="utf-8"))
else:
    raise RuntimeError("REGISTRY_PATH not set or not found. Run Setup 3 to fetch the registry.")

# Collect metadata JSONL files to read
# Prefer the registry batches. If you added a latest pointer, it will be used next time.
batches = REGISTRY.get("batches", [])
if not batches:
    raise RuntimeError("No batches found in registry. Cannot reconstruct without metadata JSONL.")

meta_files: List[Path] = []
for b in batches:
    meta_rel = b.get("hf_paths", {}).get("metadata")
    if not meta_rel:
        continue
    try:
        local = Path(hf_hub_download(repo_id=HF_DATASET_ID, repo_type="dataset", filename=meta_rel))
        meta_files.append(local)
    except Exception as e:
        print(f"Warning: could not download metadata for batch {b.get('batch_id')}: {type(e).__name__}: {e}")

if not meta_files:
    raise RuntimeError("Found no downloadable metadata files in registry.")

# Build URL set and basic meta
urls_seen = []
url_hashes = []
items = {}  # for index.json

for fp in meta_files:
    with fp.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except Exception:
                continue
            u = obj.get("url")
            if not u:
                continue
            if u in items:
                continue
            items[u] = {"status": "ok", "fetched_at": now_utc_iso()}
            urls_seen.append(u)
            url_hashes.append(sha256_text(u)[:12])

# Write index.json
index_obj = {
    "last_updated": now_utc_iso(),
    "items": items
}
INDEX_PATH.write_text(json.dumps(index_obj, indent=2), encoding="utf-8")

# Build feeds_state.json
# If you have GUARDIAN_FEEDS env from 5A, use those feed names. Otherwise make a single bucket.
try:
    import json as _json
    GUARDIAN_FEEDS = _json.loads(os.environ.get("GUARDIAN_FEEDS_JSON", "[]"))
    feed_names = [name for name, _ in GUARDIAN_FEEDS] if GUARDIAN_FEEDS else []
except Exception:
    feed_names = []

if not feed_names:
    feed_names = ["theguardian_world", "theguardian_uk", "theguardian_politics", "theguardian_environment", "theguardian_commentisfree"]

# Fill each feed ring buffer up to its max with the last N url hashes
# We do not know which feed a given URL came from, but dedupe works regardless because 5B checks index["items"]
feeds_block: Dict[str, Dict] = {}
for name in feed_names:
    maxlen = 1000 if "commentisfree" in name else 500
    feeds_block[name] = {
        "feed_url": None,
        "last_cursor_iso": None,
        "recent_url_hashes": url_hashes[-maxlen:],
        "recent_url_hashes_max": maxlen,
        "last_run_at": None,
        "last_run_by": "reconstruct",
        "notes": f"Reconstructed ring buffer for {name}"
    }

state_obj = {
    "version": 1,
    "updated_at": now_utc_iso(),
    "feeds": feeds_block
}
STATE_PATH.write_text(json.dumps(state_obj, indent=2), encoding="utf-8")

print("Reconstruction complete.")
print({
    "urls_in_index": len(items),
    "feeds_in_state": len(feeds_block),
    "example_url": urls_seen[0] if urls_seen else None,
    "index_path": str(INDEX_PATH),
    "feeds_state_path": str(STATE_PATH),
})


RuntimeError: Found no downloadable metadata files in registry.

In [5]:
# Setup 5B.0: Restore prior feed state from HF (preferred) or GitHub (fallback)

import os, json, shutil, requests
from pathlib import Path
from huggingface_hub import hf_hub_download

PROJECT_ROOT = Path("/content/anti_echo").resolve()
FEEDS_DIR = PROJECT_ROOT / "feeds"
FEEDS_DIR.mkdir(parents=True, exist_ok=True)

STATE_PATH = FEEDS_DIR / "feeds_state.json"
INDEX_PATH = FEEDS_DIR / "index.json"

HF_DATASET_ID = os.environ.get("HF_DATASET_ID", "").strip()
REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

def restore_from_hf() -> bool:
    try:
        st_local = hf_hub_download(
            repo_id=HF_DATASET_ID,
            repo_type="dataset",
            filename="feeds/feeds_state_latest.json"
        )
        ix_local = hf_hub_download(
            repo_id=HF_DATASET_ID,
            repo_type="dataset",
            filename="feeds/feed_index_latest.json"
        )
        shutil.copy(st_local, STATE_PATH)
        shutil.copy(ix_local, INDEX_PATH)
        print("Restored feed state from HF: feeds_state_latest.json, feed_index_latest.json")
        return True
    except Exception as e:
        print(f"HF restore not available: {type(e).__name__}: {e}")
        return False

def restore_from_github() -> bool:
    try:
        base = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/feeds"
        urls = {
            STATE_PATH: f"{base}/feeds_state_latest.json",
            INDEX_PATH: f"{base}/feed_index_latest.json",
        }
        ok = False
        for dst, url in urls.items():
            r = requests.get(url, timeout=20)
            if r.status_code == 200 and r.text.strip():
                dst.write_text(r.text, encoding="utf-8")
                ok = True
        if ok:
            print("Restored feed state from GitHub latest files.")
        else:
            print("GitHub latest feed state not found.")
        return ok
    except Exception as e:
        print(f"GitHub restore failed: {type(e).__name__}: {e}")
        return False

restored = False
if HF_DATASET_ID:
    restored = restore_from_hf()
if not restored:
    restored = restore_from_github()

if not restored:
    print("No prior feed state found. Starting fresh.")
else:
    try:
        fs = json.loads(STATE_PATH.read_text(encoding="utf-8"))
        ix = json.loads(INDEX_PATH.read_text(encoding="utf-8"))
        print(f"feeds_state.json feeds={len(fs.get('feeds', {}))}, updated_at={fs.get('updated_at')}")
        print(f"index.json items={len(ix.get('items', {}))}, last_updated={ix.get('last_updated')}")
    except Exception as e:
        print(f"Sanity read failed: {e}")


HF restore not available: EntryNotFoundError: 404 Client Error. (Request ID: Root=1-68eb32ef-5aa20d691554c8a514566010;5837e9de-b70c-4d5e-a8a4-714a1a262107)

Entry Not Found for url: https://huggingface.co/datasets/zanimal/anti-echo-artifacts/resolve/main/feeds/feeds_state_latest.json.
GitHub latest feed state not found.
No prior feed state found. Starting fresh.


## Setup 5B: Guardian scraper that reads tunables and evenly splits quotas

What this cell does
- Reads all tunables from the prior cell (env vars) and your `GUARDIAN_FEEDS` list
- Computes per feed quotas by even split of `MAX_ARTICLES`, sending remainder to `QUOTA_REMAINDER_TO`
- Respects `MAX_PER_FEED`, `DATE_FROM`, and `FORCE_REFETCH`
- Saves `raw/{id}.txt` and `raw/{id}.meta.json`
- Updates `feeds/index.json` and writes a local copy of `feeds_state.json` you can commit later

Tip
- Change counts, dates, or feeds only in Setup 5A. Re run this cell to apply.


In [None]:
# Setup 5B: Guardian scraper using tunables from 5A

import os, re, json, hashlib, datetime as dt
from pathlib import Path
from urllib.parse import urlparse
from email.utils import parsedate_to_datetime
import feedparser, trafilatura

PROJECT_ROOT = Path("/content/anti_echo").resolve()
RAW_DIR = PROJECT_ROOT / "raw"
FEEDS_DIR = PROJECT_ROOT / "feeds"
FEEDS_DIR.mkdir(parents=True, exist_ok=True)

# Load tunables from env set in 5A
try:
    GUARDIAN_FEEDS = json.loads(os.environ["GUARDIAN_FEEDS_JSON"])
except Exception as e:
    raise RuntimeError("GUARDIAN_FEEDS_JSON missing. Run Setup 5A first.") from e

MAX_ARTICLES = int(os.environ.get("MAX_ARTICLES", "30"))
MAX_PER_FEED = os.environ.get("MAX_PER_FEED", "")
MAX_PER_FEED = None if MAX_PER_FEED == "" else int(MAX_PER_FEED)
DATE_FROM = os.environ.get("DATE_FROM", "") or None
FORCE_REFETCH = os.environ.get("FORCE_REFETCH", "false").lower() == "true"
EVEN_SPLIT = os.environ.get("EVEN_SPLIT", "true").lower() == "true"
QUOTA_REMAINDER_TO = os.environ.get("QUOTA_REMAINDER_TO", "commentisfree")

INDEX_PATH = FEEDS_DIR / "index.json"
STATE_PATH = FEEDS_DIR / "feeds_state.json"

def now_utc():
    return dt.datetime.now(dt.timezone.utc).isoformat()

def load_index_local():
    if INDEX_PATH.exists():
        try:
            return json.loads(INDEX_PATH.read_text(encoding="utf-8"))
        except Exception:
            pass
    return {"last_updated": None, "items": {}}

def save_index_local(idx):
    idx["last_updated"] = now_utc()
    INDEX_PATH.write_text(json.dumps(idx, indent=2), encoding="utf-8")

def load_state():
    if STATE_PATH.exists():
        try:
            return json.loads(STATE_PATH.read_text(encoding="utf-8"))
        except Exception:
            pass
    return {"version": 1, "updated_at": None, "feeds": {}}

index = load_index_local()
feeds_state = load_state()

def parse_entry_date(entry):
    for attr in ("published", "updated"):
        try:
            val = getattr(entry, attr, None) or entry.get(attr)
        except Exception:
            val = None
        if val:
            try:
                return parsedate_to_datetime(val)
            except Exception:
                pass
    return None

def in_date_range(d, lower_iso):
    if not lower_iso:
        return True
    try:
        floor = dt.datetime.fromisoformat(lower_iso).replace(tzinfo=dt.timezone.utc)
    except Exception:
        return True
    if d is None:
        return True
    if d.tzinfo is None:
        d = d.replace(tzinfo=dt.timezone.utc)
    return d >= floor

def normalize_text(txt: str) -> str:
    return re.sub(r"\s+", " ", txt.strip().lower())

def sha256_text(txt: str) -> str:
    import hashlib
    return hashlib.sha256(txt.encode("utf-8")).hexdigest()

def slugify(text: str, maxlen: int = 60) -> str:
    s = re.sub(r"[^a-zA-Z0-9]+", "-", text).strip("-").lower()
    return s[:maxlen] or "untitled"

def get_title_from_html(html: str, fallback: str = "Untitled"):
    m = re.search(r"<title>(.*?)</title>", html or "", flags=re.IGNORECASE | re.DOTALL)
    if m:
        return re.sub(r"\s+", " ", m.group(1)).strip()
    return fallback

def fetch_and_extract(url: str):
    downloaded = trafilatura.fetch_url(url, no_ssl=False)
    if not downloaded:
        raise RuntimeError("failed to fetch")
    text = trafilatura.extract(downloaded, include_comments=False, include_tables=False) or ""
    title = get_title_from_html(downloaded, "Untitled")
    if not text.strip():
        raise RuntimeError("no main text extracted")
    return {"title": title, "text": text}

def save_article(url: str, title: str, text: str, source_name: str):
    domain = urlparse(url).netloc
    slug = slugify(title)
    h = sha256_text(normalize_text(text))
    art_id = f"{domain}-{slug}-{h[:12]}"
    txt_path = RAW_DIR / f"{art_id}.txt"
    meta_path = RAW_DIR / f"{art_id}.meta.json"
    txt_path.write_text(text, encoding="utf-8")
    meta = {
        "id": art_id,
        "url": url,
        "title": title,
        "source": source_name,
        "section": None,
        "domain": domain,
        "published": None,
        "sha256": h,
        "chars": len(text),
        "saved_at": now_utc()
    }
    meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")
    return {"id": art_id, "txt_path": txt_path, "meta_path": meta_path}

def already_cached(u: str) -> bool:
    return (u in index["items"]) and (index["items"][u].get("status") == "ok")

def set_seen(u: str, status: str):
    index["items"][u] = {"status": status, "fetched_at": now_utc()}
    save_index_local(index)

# Compute quotas
feed_names = [name for name, _ in GUARDIAN_FEEDS]
num_feeds = len(feed_names)
if num_feeds == 0:
    raise RuntimeError("No Guardian feeds configured. Edit Setup 5A first.")

if EVEN_SPLIT:
    base = MAX_ARTICLES // num_feeds
    rem = MAX_ARTICLES % num_feeds
    quotas = {name: base for name in feed_names}
    if QUOTA_REMAINDER_TO in quotas:
        quotas[QUOTA_REMAINDER_TO] += rem
    else:
        quotas[feed_names[0]] += rem
else:
    # If not even split, give everything to QUOTA_REMAINDER_TO
    quotas = {name: 0 for name in feed_names}
    quotas[QUOTA_REMAINDER_TO if QUOTA_REMAINDER_TO in feed_names else feed_names[0]] = MAX_ARTICLES

# Apply per feed hard cap if set
if isinstance(MAX_PER_FEED, int) and MAX_PER_FEED > 0:
    for k in quotas:
        quotas[k] = min(quotas[k], MAX_PER_FEED)

print("Per feed quotas for this run:")
print(json.dumps(quotas, indent=2))

saved_global = 0
errors_global = 0
globally_seen = set()

# Ensure feeds_state has entries for these feeds
fs = feeds_state.setdefault("feeds", {})
for name, feed_url in GUARDIAN_FEEDS:
    if name not in fs:
        fs[name] = {
            "feed_url": feed_url,
            "last_cursor_iso": None,
            "recent_url_hashes": [],
            "recent_url_hashes_max": 500 if name != "commentisfree" else 1000,
            "last_run_at": None,
            "last_run_by": "colab",
            "notes": f"Guardian {name}"
        }

for name, feed_url in GUARDIAN_FEEDS:
    if saved_global >= MAX_ARTICLES:
        break
    quota = quotas.get(name, 0)
    if quota <= 0:
        continue

    saved_this_feed = 0
    fp = feedparser.parse(feed_url)

    # Collect and filter entries
    items = []
    for e in fp.entries:
        url = getattr(e, "link", None)
        if not url:
            continue
        pub = parse_entry_date(e)
        if not in_date_range(pub, DATE_FROM):
            continue
        items.append({"url": url, "published": pub})

    # Sort newest first and de-dupe
    seen_urls = set()
    uniq = []
    for it in sorted(items, key=lambda x: (x["published"] or dt.datetime.min), reverse=True):
        if it["url"] in seen_urls:
            continue
        seen_urls.add(it["url"])
        uniq.append(it)

    for item in uniq:
        if saved_global >= MAX_ARTICLES or saved_this_feed >= quota:
            break

        url = item["url"]
        if url in globally_seen:
            continue
        globally_seen.add(url)

        if already_cached(url) and not FORCE_REFETCH:
            print(f"skip (cached) [{name}]: {url}")
            continue

        try:
            res = fetch_and_extract(url)
            out = save_article(url, res["title"], res["text"], source_name="theguardian")
            set_seen(url, "ok")
            saved_this_feed += 1
            saved_global += 1
            # Update ring buffer and timestamps
            h = sha256_text(url)[:12]
            ring = fs[name]["recent_url_hashes"]
            ring.append(h)
            maxlen = fs[name]["recent_url_hashes_max"]
            if len(ring) > maxlen:
                fs[name]["recent_url_hashes"] = ring[-maxlen:]
            fs[name]["last_run_at"] = now_utc()
            print(f"saved [{name}]: {out['txt_path'].name} | {res['title'][:90]}")
        except Exception as exc:
            print(f"error [{name}]: {url} | {type(exc).__name__}: {str(exc)[:140]}")
            set_seen(url, "error")
            errors_global += 1

# Write back feeds_state.json locally so you can commit later
feeds_state["updated_at"] = now_utc()
STATE_PATH.write_text(json.dumps(feeds_state, indent=2), encoding="utf-8")

print("\nSummary")
print(json.dumps({
    "saved_total": saved_global,
    "errors_total": errors_global,
    "max_articles": MAX_ARTICLES,
    "max_per_feed": MAX_PER_FEED,
    "date_from": DATE_FROM,
    "even_split": EVEN_SPLIT,
    "remainder_to": QUOTA_REMAINDER_TO,
    "feeds_count": len(GUARDIAN_FEEDS),
    "raw_dir": str(RAW_DIR),
    "index_path": str(INDEX_PATH),
    "feeds_state_local": str(STATE_PATH)
}, indent=2))


Per feed quotas for this run:
{
  "world": 5,
  "uk-news": 5,
  "us-news": 5,
  "politics": 5,
  "europe": 5,
  "americas": 5,
  "asia": 5,
  "australia-news": 5,
  "business": 5,
  "money": 5,
  "technology": 5,
  "science": 5,
  "global-development": 5,
  "environment": 5,
  "wildlife": 5,
  "pollution": 5,
  "climate-crisis": 5,
  "sport": 5,
  "football": 5,
  "cricket": 5,
  "tennis": 5,
  "golf": 5,
  "formulaone": 5,
  "cycling": 5,
  "rugby-union": 5,
  "culture": 5,
  "film": 5,
  "music": 5,
  "artanddesign": 5,
  "books": 5,
  "tv-and-radio": 5,
  "lifestyle": 5,
  "family": 5,
  "health": 5,
  "inequality": 5,
  "obituaries": 5,
  "travel": 5,
  "fashion": 5,
  "games": 5,
  "stage": 5,
  "crosswords": 5,
  "commentisfree": 45
}
saved [world]: www.theguardian.com-egypt-confirms-international-leaders-summit-on-monday-to-dis-0efa690f2f6e.txt | Egypt confirms international leaders’ summit on Monday to discuss Gaza ceasefire - live | 
saved [world]: www.theguardian.com-trump-sa

## Setup 5C: Persist feed state to HF and GitHub

Purpose
- Snapshot `feeds/feeds_state.json` and `feeds/index.json` after each scrape.
- Upload to HF as timestamped copies and as `*_latest.json`.
- Commit to GitHub under `feeds/` as both timestamped and `*_latest.json`.

Requires
- `HF_TOKEN` loaded earlier
- `GITHUB_TOKEN` present (fine grained or classic, with contents read/write)


In [None]:
# Setup 5C: Persist feed state to HF dataset and GitHub

import os, json, base64
from datetime import datetime, timezone
from pathlib import Path
from getpass import getpass
from huggingface_hub import upload_file
import requests

PROJECT_ROOT = Path("/content/anti_echo").resolve()
FEEDS_DIR = PROJECT_ROOT / "feeds"
STATE_PATH = FEEDS_DIR / "feeds_state.json"
INDEX_PATH = FEEDS_DIR / "index.json"

# Inputs
HF_DATASET_ID = os.environ.get("HF_DATASET_ID", "").strip()
if not HF_DATASET_ID:
    raise RuntimeError("HF_DATASET_ID not set. Run Setup 2 earlier.")

if "HF_TOKEN" not in os.environ or not os.environ["HF_TOKEN"].strip():
    os.environ["HF_TOKEN"] = getpass("Enter your Hugging Face token: ")

if "GITHUB_TOKEN" not in os.environ or not os.environ["GITHUB_TOKEN"].strip():
    os.environ["GITHUB_TOKEN"] = getpass("Enter your GitHub token: ")

GITHUB_TOKEN = os.environ["GITHUB_TOKEN"].strip()
HF_TOKEN = os.environ["HF_TOKEN"].strip()

REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

# Validate files exist
if not STATE_PATH.exists() or not INDEX_PATH.exists():
    raise FileNotFoundError("Expected feeds_state.json and index.json to exist. Run 5B first.")

# Timestamped names and latest pointers
ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
hf_files = [
    (STATE_PATH, f"feeds/feeds_state_{ts}.json"),
    (INDEX_PATH, f"feeds/feed_index_{ts}.json"),
    (STATE_PATH, "feeds/feeds_state_latest.json"),
    (INDEX_PATH, "feeds/feed_index_latest.json"),
]

print("Uploading feed state to Hugging Face dataset...")
for local, remote in hf_files:
    upload_file(
        path_or_fileobj=str(local),
        path_in_repo=remote,
        repo_id=HF_DATASET_ID,
        repo_type="dataset",
        token=HF_TOKEN,
    )
print("HF upload complete.")

# GitHub commit helper
def gh_put_file(local_path: Path, repo_path: str, message: str):
    url = f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/contents/{repo_path}"
    headers = {"Authorization": f"Bearer {GITHUB_TOKEN}", "Accept": "application/vnd.github+json"}
    content = local_path.read_bytes()
    # get existing sha if present
    r = requests.get(url, headers=headers, timeout=20)
    sha = r.json().get("sha") if r.status_code == 200 else None
    payload = {
        "message": message,
        "content": base64.b64encode(content).decode("utf-8"),
        "branch": BRANCH,
    }
    if sha:
        payload["sha"] = sha
    resp = requests.put(url, headers=headers, json=payload, timeout=30)
    if resp.status_code not in (200, 201):
        raise RuntimeError(f"GitHub push failed for {repo_path}: {resp.status_code} {resp.text[:300]}")

print("Committing feed state to GitHub...")
commit_msg = f"Update feed state and index - {ts}"
# commit both timestamped history copies and latest pointers
gh_files = [
    (STATE_PATH, f"feeds/feeds_state_{ts}.json"),
    (INDEX_PATH, f"feeds/feed_index_{ts}.json"),
    (STATE_PATH, "feeds/feeds_state_latest.json"),
    (INDEX_PATH, "feeds/feed_index_latest.json"),
]
for local, repo_path in gh_files:
    gh_put_file(local, repo_path, commit_msg)

print("GitHub commit complete.")

# Small echo summary
fs = json.loads(STATE_PATH.read_text(encoding="utf-8"))
ix = json.loads(INDEX_PATH.read_text(encoding="utf-8"))
print({
    "feeds_state_feeds": len(fs.get("feeds", {})),
    "index_items": len(ix.get("items", {})),
    "timestamp": ts,
    "hf_dataset": HF_DATASET_ID,
    "github_repo": f"{REPO_OWNER}/{REPO_NAME}",
})


## Setup 6A and 6B: Multi-topic and Multi-stance Embeddings

This section implements the **core embedding stage** for the Anti Echo Chamber system.

All prior setups (1–5B) have prepared:
- Local workspace at `/content/anti_echo`
- Raw article text files under `/content/anti_echo/raw`
- Metadata `.meta.json` sidecars for each article
- Config (`CONFIG`) loaded from the repo with model, dim, dtype, and chunk parameters
- Chroma collections (`topic_coll`, `stance_coll`) initialized by Setup 4

These two cells will:
1. **Generate topic embeddings** (`Setup 6A`) — modeling what each article is about.
2. **Generate stance embeddings** (`Setup 6B`) — modeling how each article argues.

Both results are stored in the active Chroma collections.

---

### **6A. Multi-topic Embeddings**
- **Model:** `sentence-transformers/all-MiniLM-L6-v2`
- **Dim:** 384  
- **Dtype:** `float16`
- **Pooling:** Mean pooled over 512-token chunks
- **Segmentation:** Sentence clustering (1–8 topics per article)
- **Storage:** `topic_coll` (Chroma collection for topics)
- **Output IDs:** `article_id::topic::k`
- **Metadata Includes:** article ID, title, url, source, topic index, model info

Each article’s text is segmented into multiple topical clusters.
Every cluster is chunked and embedded separately to support multi-theme representation.

---

### **6B. Multi-stance Embeddings**
- **Summarizer:** `facebook/bart-large-cnn`
- **Embedding Model:** `sentence-transformers/all-MiniLM-L6-v2`
- **Dim:** 384  
- **Dtype:** `float16`
- **Storage:** `stance_coll` (Chroma collection for stances)
- **Output IDs:** `article_id::stance::k`
- **Metadata Includes:** article ID, title, url, source, stance summary, model info

Each article’s full text is summarized to capture its dominant argument or position.
The summary is embedded to allow contrasting stance comparisons later.

---

### **After this step**
Running both 6A and 6B will populate your local Chroma database (`/content/anti_echo/chroma_db`)
with two complementary vector spaces:

| Space | Captures | Purpose |
|-------|-----------|----------|
| **Topic Space** | Core subject matter of each article | Enables topical retrieval |
| **Stance Space** | Underlying tone or argument | Enables stance contrast search |

This brings the pipeline up to the point where retrieval and RAG-style contrastive queries
can be implemented in later stages (Setup 7+).


In [None]:
# Setup 6A: Multi-topic embeddings (final robust version)
# Handles NLTK punkt_tab, long text truncation, metadata sanitization, and single-sentence cases

import json, time
import numpy as np
import torch
import nltk
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from sklearn.cluster import AgglomerativeClustering
from pathlib import Path

# --- Ensure required NLTK data (punkt_tab fix) ---
for pkg in ["punkt", "punkt_tab"]:
    try:
        nltk.data.find(f"tokenizers/{pkg}")
    except LookupError:
        nltk.download(pkg)

# --- Environment and Config ---
RAW_DIR = Path("/content/anti_echo/raw")
device = "cuda" if torch.cuda.is_available() else "cpu"

topic_model_name = CONFIG["embeddings"]["topic_model"]
topic_dim = int(CONFIG["embeddings"]["dim"])
topic_dtype = CONFIG["embeddings"]["dtype"]
chunk_tokens = int(CONFIG["embeddings"]["chunk_tokens"])
coll_topic = CONFIG["chroma_collections"]["topic"]

tokenizer = AutoTokenizer.from_pretrained(topic_model_name, use_fast=True)
embedder = SentenceTransformer(topic_model_name, device=device)

print(f"Embedding model: {topic_model_name}, dim={topic_dim}, dtype={topic_dtype}, device={device}")

# --- Helper functions ---

def sent_split(text):
    return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]

def encode_batch(texts, batch=16):
    return embedder.encode(
        texts,
        convert_to_numpy=True,
        batch_size=batch,
        show_progress_bar=False
    )

def chunk_by_tokens(text, max_tokens=512, overlap=64):
    ids = tokenizer(text, add_special_tokens=False, return_attention_mask=False)["input_ids"]
    step = max_tokens - overlap
    chunks = []
    for i in range(0, len(ids), step):
        j = min(i + max_tokens, len(ids))
        piece = tokenizer.decode(ids[i:j], skip_special_tokens=True)
        if piece.strip():
            chunks.append(piece)
    return chunks

def sanitize_metadata(meta: dict) -> dict:
    clean = {}
    for k, v in meta.items():
        if isinstance(v, (str, int, float, bool)):
            clean[k] = v
        elif v is None:
            clean[k] = ""
        else:
            try:
                clean[k] = str(v)
            except Exception:
                clean[k] = ""
    return clean

def topic_vectors_for_article(text):
    sents = sent_split(text)
    if not sents:
        return []
    # --- handle single-sentence case ---
    if len(sents) < 2:
        v = encode_batch([" ".join(sents)])[0]
        return [v.astype(np.float16) if topic_dtype == "float16" else v.astype(np.float32)]

    emb = encode_batch(sents)
    k = min(max(1, len(sents)//8), 8)
    labels = AgglomerativeClustering(n_clusters=k).fit_predict(emb)
    segs = []
    for lab in sorted(set(labels)):
        segs.append(" ".join([s for s, l in zip(sents, labels) if l == lab]))
    vecs = []
    for seg in segs:
        # Truncate overly long segments to 512 tokens to avoid model warning
        ids = tokenizer(seg, add_special_tokens=False)["input_ids"][:512]
        seg_trunc = tokenizer.decode(ids, skip_special_tokens=True)
        chunks = chunk_by_tokens(seg_trunc, chunk_tokens, 64)
        if not chunks:
            continue
        chunk_embs = encode_batch(chunks)
        pooled = chunk_embs.mean(axis=0)
        vecs.append(
            pooled.astype(np.float16) if topic_dtype == "float16" else pooled.astype(np.float32)
        )
    return vecs

# --- Execution ---
start = time.time()
count = 0

for txt_path in RAW_DIR.glob("*.txt"):
    meta_path = txt_path.with_suffix(".meta.json")
    if not meta_path.exists():
        continue
    meta = json.loads(meta_path.read_text(encoding="utf-8"))
    text = txt_path.read_text(encoding="utf-8").strip()
    if not text:
        continue

    vecs = topic_vectors_for_article(text)
    if not vecs:
        continue

    ids = [f"{meta['id']}::topic::{i}" for i in range(len(vecs))]
    metas = [sanitize_metadata({**meta, "topic_index": i, "topic_model": topic_model_name}) for i in range(len(vecs))]
    upsert_in_chunks(topic_coll, ids, np.vstack(vecs), metas)
    count += len(vecs)

print(f"Upserted {count} topic embeddings to collection {coll_topic} in {round(time.time()-start,2)}s")


Embedding model: sentence-transformers/all-MiniLM-L6-v2, dim=384, dtype=float16, device=cuda


Token indices sequence length is longer than the specified maximum sequence length for this model (2486 > 512). Running this sequence through the model will result in indexing errors


Upserted 871 topic embeddings to collection news_topic in 19.87s


In [None]:
# Setup 6B: Multi-stance embeddings (final robust version)
# Handles metadata sanitization, summarization length safety, and errors

import json, time
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
from pathlib import Path

RAW_DIR = Path("/content/anti_echo/raw")
device = "cuda" if torch.cuda.is_available() else "cpu"

stance_model_name = CONFIG["embeddings"]["stance_model"]
stance_dim = int(CONFIG["embeddings"]["dim"])
stance_dtype = CONFIG["embeddings"]["dtype"]
# Choose summarizer dynamically based on environment
if torch.cuda.is_available():
    summarizer_name = CONFIG["summarizer"]["model"]  # usually facebook/bart-large-cnn
    print("GPU detected — using full summarizer:", summarizer_name)
else:
    summarizer_name = "sshleifer/distilbart-cnn-12-6"
    print("No GPU detected — using smaller summarizer for CPU mode:", summarizer_name)
coll_stance = CONFIG["chroma_collections"]["stance"]

tok_sum = AutoTokenizer.from_pretrained(summarizer_name)
model_sum = AutoModelForSeq2SeqLM.from_pretrained(summarizer_name).to(device)
embedder = SentenceTransformer(stance_model_name, device=device)

print(f"Summarizer: {summarizer_name}\nEmbedder: {stance_model_name}, dim={stance_dim}, dtype={stance_dtype}, device={device}")

# --- Helper functions ---

def sanitize_metadata(meta: dict) -> dict:
    clean = {}
    for k, v in meta.items():
        if isinstance(v, (str, int, float, bool)):
            clean[k] = v
        elif v is None:
            clean[k] = ""
        else:
            try:
                clean[k] = str(v)
            except Exception:
                clean[k] = ""
    return clean

def summarize_text(text: str, max_in=1024, max_out=150) -> str:
    """Summarize text safely, truncating long input."""
    inputs = tok_sum(
        [text],
        return_tensors="pt",
        truncation=True,
        max_length=max_in,
    ).to(device)
    with torch.no_grad():
        out = model_sum.generate(
            **inputs,
            max_length=max_out,
            num_beams=4,
            early_stopping=True,
        )
    return tok_sum.batch_decode(out, skip_special_tokens=True)[0].strip()

# --- Execution ---

start = time.time()
count = 0
skipped = 0

for txt_path in RAW_DIR.glob("*.txt"):
    meta_path = txt_path.with_suffix(".meta.json")
    if not meta_path.exists():
        continue
    meta = json.loads(meta_path.read_text(encoding="utf-8"))
    text = txt_path.read_text(encoding="utf-8").strip()
    if not text:
        continue

    try:
        summary = summarize_text(text)
    except Exception as e:
        print(f"Warning: summarization failed for {meta.get('id')}: {e}")
        skipped += 1
        continue

    if not summary:
        skipped += 1
        continue

    try:
        vec = embedder.encode([summary], convert_to_numpy=True, normalize_embeddings=False)[0]
        vec = vec.astype(np.float16) if stance_dtype == "float16" else vec.astype(np.float32)
    except Exception as e:
        print(f"Warning: embedding failed for {meta.get('id')}: {e}")
        skipped += 1
        continue

    ids = [f"{meta['id']}::stance::0"]
    metas = [
        sanitize_metadata({
            **meta,
            "stance_summary": summary,
            "stance_model": stance_model_name,
            "summary_model": summarizer_name,
        })
    ]
    upsert_in_chunks(stance_coll, ids, np.vstack([vec]), metas)
    count += 1

elapsed = round(time.time() - start, 2)
print(f"Upserted {count} stance embeddings, skipped {skipped}, to collection {coll_stance} in {elapsed}s")


Summarizer: facebook/bart-large-cnn
Embedder: sentence-transformers/all-MiniLM-L6-v2, dim=384, dtype=float16, device=cuda
Upserted 218 stance embeddings, skipped 0, to collection news_stance in 207.16s


# **Setup 7: Batch Packaging and Checkpoint Push**

Now that all topic and stance embeddings are complete,  
we package the results into versioned batch artifacts and push them to the cloud.

**Steps:**
1. Collect embeddings and metadata from local ChromaDB.  
2. Save artifacts to `/content/anti_echo/batches/{batch_id}/`:
   - `topic_embeddings.npz`
   - `stance_embeddings.npz`
   - `metadata.jsonl`
   - `manifest.json`
3. Upload all artifacts to the Hugging Face dataset (`zanimal/anti-echo-artifacts`).  
4. Update and commit `artifacts_registry.json` on GitHub (Setup 7B).

This ensures Chroma can be fully rebuilt later from Hugging Face data,  
and the project registry remains synchronized between GitHub and HF.


In [None]:
# Setup 7A: Package embeddings + push to Hugging Face
# Fixed for Chroma >=0.5.5 (no 'ids' in include list)
# Silences Chroma telemetry warnings

import os, json, time, uuid, warnings, logging
from datetime import datetime, timezone
import numpy as np
from huggingface_hub import HfApi, upload_file
import requests
from pathlib import Path

# --- Silence Chroma telemetry noise ---
logging.getLogger("chromadb").setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=DeprecationWarning)

PROJECT_ROOT = Path("/content/anti_echo").resolve()
BATCH_DIR = PROJECT_ROOT / CONFIG["batch"]["base_dir"]
HF_DATASET_ID = CONFIG["hf_dataset_id"]
REGISTRY_URL = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/artifacts/artifacts_registry.json"

client, topic_coll, stance_coll = ensure_chroma()

timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
batch_id = f"batch_{timestamp}_{uuid.uuid4().hex[:8]}"
batch_path = BATCH_DIR / batch_id
batch_path.mkdir(parents=True, exist_ok=True)

print(f"Packaging new batch: {batch_id}")

# --- Export from Chroma ---
topic_data = topic_coll.get(include=["embeddings", "metadatas"])
stance_data = stance_coll.get(include=["embeddings", "metadatas"])

topic_vecs = np.array(topic_data["embeddings"], dtype=np.float16)
stance_vecs = np.array(stance_data["embeddings"], dtype=np.float16)
meta_records = topic_data["metadatas"]

meta_path = batch_path / CONFIG["batch"]["metadata_file"]
topic_npz = batch_path / CONFIG["batch"]["topic_file"]
stance_npz = batch_path / CONFIG["batch"]["stance_file"]
manifest_path = batch_path / CONFIG["batch"]["manifest_name"]

# --- Write local files ---
print("Saving batch artifacts...")
np.savez_compressed(topic_npz, topic_vecs)
np.savez_compressed(stance_npz, stance_vecs)

with meta_path.open("w", encoding="utf-8") as f:
    for m in meta_records:
        json.dump(m, f)
        f.write("\n")

manifest = {
    "batch_id": batch_id,
    "created_at": timestamp,
    "models": CONFIG["embeddings"],
    "counts": {
        "topic": len(topic_vecs),
        "stance": len(stance_vecs)
    },
    "hf_dataset_id": HF_DATASET_ID,
    "paths": {
        "embeddings_topic": f"batches/{batch_id}/{topic_npz.name}",
        "embeddings_stance": f"batches/{batch_id}/{stance_npz.name}",
        "metadata": f"batches/{batch_id}/{meta_path.name}",
        "manifest": f"batches/{batch_id}/{manifest_path.name}"
    }
}
manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
print(json.dumps(manifest, indent=2))

# --- Upload to Hugging Face ---
api = HfApi()
print(f"Uploading batch {batch_id} to Hugging Face dataset: {HF_DATASET_ID}")

for fpath in [topic_npz, stance_npz, meta_path, manifest_path]:
    rel = f"batches/{batch_id}/{fpath.name}"
    upload_file(
        path_or_fileobj=str(fpath),
        path_in_repo=rel,
        repo_id=HF_DATASET_ID,
        repo_type="dataset",
        token=os.environ["HF_TOKEN"]
    )
print("✅ Upload complete.")

# --- Update GitHub registry (local only for now) ---
try:
    reg_resp = requests.get(REGISTRY_URL, timeout=20)
    registry = reg_resp.json()
except Exception:
    registry = {"version": 1, "models": {}, "batches": []}

registry.setdefault("batches", []).append(manifest)
new_registry_path = PROJECT_ROOT / "artifacts_registry_updated.json"
new_registry_path.write_text(json.dumps(registry, indent=2), encoding="utf-8")

print(f"Registry updated locally: {new_registry_path}")
print("Ready for Setup 7B (auto GitHub push).")


ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given


Packaging new batch: batch_20251011T232938Z_283ca40f
Saving batch artifacts...
{
  "batch_id": "batch_20251011T232938Z_283ca40f",
  "created_at": "20251011T232938Z",
  "models": {
    "topic_model": "sentence-transformers/all-MiniLM-L6-v2",
    "stance_model": "sentence-transformers/all-MiniLM-L6-v2",
    "dim": 384,
    "dtype": "float16",
    "pooling": "mean",
    "chunk_tokens": 512
  },
  "counts": {
    "topic": 0,
    "stance": 0
  },
  "hf_dataset_id": "zanimal/anti-echo-artifacts",
  "paths": {
    "embeddings_topic": "batches/batch_20251011T232938Z_283ca40f/embeddings_topic.npz",
    "embeddings_stance": "batches/batch_20251011T232938Z_283ca40f/embeddings_stance.npz",
    "metadata": "batches/batch_20251011T232938Z_283ca40f/metadata.jsonl",
    "manifest": "batches/batch_20251011T232938Z_283ca40f/manifest.json"
  }
}
Uploading batch batch_20251011T232938Z_283ca40f to Hugging Face dataset: zanimal/anti-echo-artifacts


embeddings_topic.npz:   0%|          | 0.00/204 [00:00<?, ?B/s]

✅ Upload complete.
Registry updated locally: /content/anti_echo/artifacts_registry_updated.json
Ready for Setup 7B (auto GitHub push).


In [None]:
# Setup 7B: Auto-push updated registry JSON to GitHub
# Works for fine-grained or classic tokens
# Requires: artifacts_registry_updated.json from Setup 7A

import base64
import json
import os
import requests
from datetime import datetime
from pathlib import Path
from getpass import getpass

# --- Basic config ---
REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"
FILE_PATH = "artifacts/artifacts_registry.json"
LOCAL_REGISTRY = Path("/content/anti_echo/artifacts_registry_updated.json")

# --- Get token securely ---
if "GITHUB_TOKEN" not in os.environ or not os.environ["GITHUB_TOKEN"].strip():
    os.environ["GITHUB_TOKEN"] = getpass("Enter your GitHub Personal Access Token: ")

TOKEN = os.environ["GITHUB_TOKEN"].strip()
if not TOKEN:
    raise RuntimeError("GitHub token not provided. Please rerun and paste it when prompted.")

print(f"Pushing updated registry to {REPO_OWNER}/{REPO_NAME}:{BRANCH}/{FILE_PATH}")

# --- Load the local registry ---
if not LOCAL_REGISTRY.exists():
    raise FileNotFoundError(f"Local registry not found: {LOCAL_REGISTRY}")

registry = json.loads(LOCAL_REGISTRY.read_text(encoding="utf-8"))

# --- Version bump ---
if "version" not in registry or not isinstance(registry["version"], int):
    registry["version"] = 1
else:
    registry["version"] += 1

# --- Prepare updated JSON content ---
new_content = json.dumps(registry, indent=2)
message = f"Update artifacts registry v{registry['version']} - {datetime.utcnow().isoformat()}"

# --- Get current file SHA (for GitHub API) ---
url = f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/contents/{FILE_PATH}"
headers = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}
r = requests.get(url, headers=headers, timeout=20)

if r.status_code == 200:
    sha = r.json()["sha"]
    print(f"Existing registry found (SHA={sha[:8]}...), updating.")
else:
    sha = None
    print("No existing registry found; creating new file.")

# --- Commit payload ---
payload = {
    "message": message,
    "content": base64.b64encode(new_content.encode("utf-8")).decode("utf-8"),
    "branch": BRANCH,
}
if sha:
    payload["sha"] = sha

# --- Push to GitHub ---
resp = requests.put(url, headers=headers, json=payload, timeout=30)

if resp.status_code in (200, 201):
    commit_url = resp.json().get("commit", {}).get("html_url")
    print(f"✅ Successfully pushed updated registry to GitHub.")
    print(f"Commit URL: {commit_url}")
else:
    print(f"❌ GitHub push failed ({resp.status_code}).")
    print(resp.text[:500])


Enter your GitHub Personal Access Token: ··········
Pushing updated registry to AHMerrill/anti-echo-chamber:main/artifacts/artifacts_registry.json
Existing registry found (SHA=5dc1537a...), updating.
✅ Successfully pushed updated registry to GitHub.
Commit URL: https://github.com/AHMerrill/anti-echo-chamber/commit/9f5d1ee5c245e67f77947058fad9ecfe3d0cc517
