Note: this was built on colab pro and designed for an A100 GPU / cuda. It will still run on other runtimes, just slower

In [17]:
# !rm -rf /content/anti_echo

In [1]:
!rm -rf /content/sample_data/

Setup 1 of N - environment and workspace

Purpose
Install core dependencies, set up a clean workspace at /content/anti_echo, and print diagnostics so collaborators can debug quickly.

Why this matters
Pinned installs reduce breakage. A consistent folder layout keeps artifacts predictable. Diagnostics help when the runtime changes.

Outputs
Directories created under /content/anti_echo, environment flags set, package versions printed.

In [2]:
# Setup 1 of N: environment and workspace
# Colab safe. No Drive mount. Heavy comments for clarity.

import os
import sys
import subprocess
import textwrap
from pathlib import Path

def pip_install(pkgs):
    # Quiet installs but still show what is being installed for reproducibility
    cmd = [sys.executable, "-m", "pip", "install", "-q"] + pkgs
    print("Installing:", " ".join(pkgs))
    subprocess.check_call(cmd)

# Core dependencies with sane pins or upper bounds
pip_install([
    "feedparser==6.0.10",                    # RSS parsing
    "trafilatura>=1.6.2,<2.0",               # robust article extraction
    "sentence-transformers>=2.6.1,<3.0",     # embeddings
    "chromadb>=0.5.5,<0.6.0",                # local vector store
    "huggingface_hub>=0.24.0,<0.28.0",       # HF dataset and file ops
    "pyyaml>=6.0.1,<7.0",                    # config parsing
    "numpy>=1.26.4,<3.0",                    # arrays
    "tqdm>=4.66.0,<5.0",                     # progress
    "requests>=2.31.0,<3.0",                 # HTTP
    "rapidfuzz>=3.6.0,<4.0",                 # dedupe or fuzzy utils
    "scikit-learn>=1.4.0,<2.0",              # clustering for topics
    "transformers>=4.43.0,<5.0",             # summarization model
    "nltk>=3.8.1,<4.0"                       # sentence splitting
])

# Optional accelerator. If import fails, install a compatible torch
try:
    import torch
except Exception:
    pip_install(["torch>=2.2.0,<3.0"])
    import torch

# Define a single project root for all artifacts in this session
PROJECT_ROOT = Path("/content/anti_echo").resolve()
SUBDIRS = [
    "raw",        # scraped article text and sidecar metadata
    "batches",    # packaged embeddings and manifest before HF upload
    "chroma_db",  # local persistent Chroma store
    "logs",       # run logs
    "feeds",      # index and feed state
    "tmp"         # scratch space
]
for d in SUBDIRS:
    (PROJECT_ROOT / d).mkdir(parents=True, exist_ok=True)

# Set environment flags to reduce noise and avoid accidental multithreading bugs
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CHROMA_TELEMETRY_ENABLED"] = "false"
os.environ["ANONYMIZED_TELEMETRY"] = "false"

# Print environment diagnostics for reproducibility and easy debugging
import platform, json
from importlib.metadata import version, PackageNotFoundError

def v(name):
    try:
        return version(name)
    except PackageNotFoundError:
        return "not-installed"

info = {
    "python": sys.version.split()[0],
    "platform": platform.platform(),
    "packages": {
        "feedparser": v("feedparser"),
        "trafilatura": v("trafilatura"),
        "sentence-transformers": v("sentence-transformers"),
        "chromadb": v("chromadb"),
        "huggingface_hub": v("huggingface-hub"),
        "PyYAML": v("PyYAML"),
        "numpy": v("numpy"),
        "rapidfuzz": v("rapidfuzz"),
        "torch": v("torch"),
        "tqdm": v("tqdm"),
        "requests": v("requests"),
        "scikit-learn": v("scikit-learn"),
        "transformers": v("transformers"),
        "nltk": v("nltk"),
    },
    "paths": {
        "project_root": str(PROJECT_ROOT),
        "raw": str(PROJECT_ROOT / "raw"),
        "batches": str(PROJECT_ROOT / "batches"),
        "chroma_db": str(PROJECT_ROOT / "chroma_db"),
        "logs": str(PROJECT_ROOT / "logs"),
        "feeds": str(PROJECT_ROOT / "feeds"),
        "tmp": str(PROJECT_ROOT / "tmp"),
    }
}

# CUDA info is helpful to know if summarization can use GPU
info["cuda_available"] = bool(torch.cuda.is_available())
if info["cuda_available"]:
    info["cuda_device_name"] = torch.cuda.get_device_name(0)

print(json.dumps(info, indent=2))

# A tiny workspace readme helps collaborators quickly orient
workspace_readme = PROJECT_ROOT / "README_WORKSPACE.txt"
if not workspace_readme.exists():
    workspace_readme.write_text(textwrap.dedent("""
        anti echo chamber - Colab workspace
        This directory is ephemeral per session.
        Do not commit files from here directly.
        Subdirs:
          raw        - local scraped texts and meta for this session
          batches    - locally packaged batches before HF upload
          chroma_db  - local Chroma rebuild target
          logs       - run logs
          feeds      - runtime feed artifacts
          tmp        - scratch space
    """).strip() + "\n", encoding="utf-8")
print(f"Workspace ready at {PROJECT_ROOT}")


Installing: feedparser==6.0.10 trafilatura>=1.6.2,<2.0 sentence-transformers>=2.6.1,<3.0 chromadb>=0.5.5,<0.6.0 huggingface_hub>=0.24.0,<0.28.0 pyyaml>=6.0.1,<7.0 numpy>=1.26.4,<3.0 tqdm>=4.66.0,<5.0 requests>=2.31.0,<3.0 rapidfuzz>=3.6.0,<4.0 scikit-learn>=1.4.0,<2.0 transformers>=4.43.0,<5.0 nltk>=3.8.1,<4.0
{
  "python": "3.12.12",
  "platform": "Linux-6.6.105+-x86_64-with-glibc2.35",
  "packages": {
    "feedparser": "6.0.10",
    "trafilatura": "1.12.2",
    "sentence-transformers": "2.7.0",
    "chromadb": "0.5.23",
    "huggingface_hub": "0.27.1",
    "PyYAML": "6.0.3",
    "numpy": "2.0.2",
    "rapidfuzz": "3.14.1",
    "torch": "2.8.0+cu126",
    "tqdm": "4.67.1",
    "requests": "2.32.4",
    "scikit-learn": "1.6.1",
    "transformers": "4.46.3",
    "nltk": "3.9.1"
  },
  "paths": {
    "project_root": "/content/anti_echo",
    "raw": "/content/anti_echo/raw",
    "batches": "/content/anti_echo/batches",
    "chroma_db": "/content/anti_echo/chroma_db",
    "logs": "/content

Setup 2 of N - config and paths bootstrap

Purpose
Pull config and label maps from your GitHub repo, cache them locally in the Colab session, validate required keys, and create runtime dirs from config.

Why this matters
Single source of truth for model names, dims, batch filenames, and collection names. Keeps the notebook aligned with the repo.

Outputs
CONFIG, STANCE_AXES, TOPIC_LABELS in memory, directories created, and HF_DATASET_ID exported to env.


In [3]:
# ==============================
# Setup 2 of N: config and paths bootstrap (v5.0, config-driven)
# Loads config.yaml (local or GitHub), fetches topics.json and ideological JSONs,
# and ensures everything is cached locally for downstream setups.
# ==============================

import os, json, yaml, requests
from pathlib import Path

PROJECT_ROOT = Path("/content/anti_echo").resolve()
CONFIG_CACHE = PROJECT_ROOT / "config_cache"
CONFIG_DIR   = PROJECT_ROOT / "config"
for d in (CONFIG_CACHE, CONFIG_DIR):
    d.mkdir(parents=True, exist_ok=True)

# --- Candidate config files ---
CFG_CANDIDATES = ["config/config.yaml", "config/config.yml", "config/config.json"]

# --- Try to load local config first ---
local_cfg = PROJECT_ROOT / "config.yaml"
if local_cfg.exists():
    cfg_txt = local_cfg.read_text(encoding="utf-8")
    cfg_path, cfg_url = str(local_cfg), "(local)"
else:
    # Temporary bootstrap to old defaults just to fetch YAML
    TMP_REPO_OWNER = "AHMerrill"
    TMP_REPO_NAME  = "unstructured-project"
    TMP_BRANCH     = "main"

    def raw_url_temp(path):
        return f"https://raw.githubusercontent.com/{TMP_REPO_OWNER}/{TMP_REPO_NAME}/{TMP_BRANCH}/{path.lstrip('/')}"

    # --- Fetch remote YAML ---
    def fetch_text_first(paths):
        last_err = None
        for p in paths:
            url = raw_url_temp(p)
            try:
                r = requests.get(url, timeout=20)
                if r.status_code == 200 and r.text.strip():
                    return r.text, p, url
            except Exception as e:
                last_err = e
        raise RuntimeError(f"Could not fetch any of {paths}. Last error: {last_err}")

    cfg_txt, cfg_path, cfg_url = fetch_text_first(CFG_CANDIDATES)
    (CONFIG_CACHE / Path(cfg_path).name).write_text(cfg_txt, encoding="utf-8")

# --- Parse YAML/JSON into CONFIG ---
CONFIG = yaml.safe_load(cfg_txt) if cfg_path.endswith((".yaml", ".yml")) else json.loads(cfg_txt)

# --- Use GitHub values from YAML going forward ---
REPO_OWNER = CONFIG["github"]["owner"]
REPO_NAME  = CONFIG["github"]["repo"]
BRANCH     = CONFIG["github"].get("branch", "main")
HF_DATASET_ID = CONFIG["hf_dataset_id"]

def raw_url(path: str) -> str:
    """Compose a raw GitHub URL for a given path in the configured repo."""
    return f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/{path.lstrip('/')}"

# --- Fetch topics.json ---
TOPICS_PATH = CONFIG_DIR / "topics.json"
TOPICS_URL  = raw_url("config/topics.json")

if not TOPICS_PATH.exists():
    print(f"Fetching topics.json from {TOPICS_URL} ...")
    r = requests.get(TOPICS_URL, timeout=20)
    r.raise_for_status()
    TOPICS_PATH.write_text(r.text, encoding="utf-8")

try:
    TOPIC_LABELS = json.load(open(TOPICS_PATH, encoding="utf-8"))
    print(f"Loaded {len(TOPIC_LABELS)} topic clusters from topics.json")
except Exception as e:
    print(f"Warning: Failed to load topics.json ({e})")
    TOPIC_LABELS = {}

# --- Fetch optional topic_anchors.npz ---
ANCHORS_PATH = CONFIG_DIR / "topic_anchors.npz"
ANCHORS_URL  = raw_url("config/topic_anchors.npz")
if not ANCHORS_PATH.exists():
    print(f"Fetching topic_anchors.npz from {ANCHORS_URL} ...")
    r = requests.get(ANCHORS_URL, timeout=30)
    if r.status_code == 200:
        ANCHORS_PATH.write_bytes(r.content)
    else:
        print(f"Warning: topic_anchors.npz not found ({r.status_code})")

# --- Ensure ideological JSONs are cached locally ---
def fetch_if_missing(filename):
    local_path = CONFIG_DIR / filename
    if not local_path.exists():
        url = raw_url(f"config/{filename}")
        print(f"Fetching {filename} from {url} ...")
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        local_path.write_text(r.text, encoding="utf-8")
    return local_path

LEANINGS_PATH = fetch_if_missing("political_leanings.json")
STANCES_PATH  = fetch_if_missing("implied_stances.json")
BIAS_PATH     = fetch_if_missing("source_bias.json")

POLITICAL_LEANINGS = json.load(open(LEANINGS_PATH, encoding="utf-8"))
IMPLIED_STANCES    = json.load(open(STANCES_PATH, encoding="utf-8"))
SOURCE_BIAS        = json.load(open(BIAS_PATH, encoding="utf-8"))

# --- Mirror config.yaml into /config ---
CONFIG_PATH = CONFIG_DIR / "config.yaml"
CONFIG_PATH.write_text(cfg_txt, encoding="utf-8")

# --- Validate essentials ---
required_cfg_keys = ["hf_dataset_id", "chroma_collections", "embeddings", "batch", "ids", "chroma"]
missing = [k for k in required_cfg_keys if k not in CONFIG]
if missing:
    raise ValueError(f"Missing required config keys: {missing}")

# --- Ensure directories exist ---
(Path(PROJECT_ROOT / CONFIG["batch"]["base_dir"])).mkdir(parents=True, exist_ok=True)
(Path(PROJECT_ROOT / CONFIG["chroma"]["dir"])).mkdir(parents=True, exist_ok=True)
(Path(PROJECT_ROOT / CONFIG.get("logging", {}).get("save_dir", "logs"))).mkdir(parents=True, exist_ok=True)

# --- Runtime summary ---
print(json.dumps({
    "github_repo": f"{REPO_OWNER}/{REPO_NAME}",
    "hf_dataset_id": HF_DATASET_ID,
    "collections": CONFIG["chroma_collections"],
    "embeddings": CONFIG["embeddings"],
    "topics": {"count": len(TOPIC_LABELS), "source": str(TOPICS_PATH)},
    "ideology_files": {
        "political_leanings": str(LEANINGS_PATH),
        "implied_stances": str(STANCES_PATH),
        "source_bias": str(BIAS_PATH)
    },
    "paths": {
        "batches": str(PROJECT_ROOT / CONFIG["batch"]["base_dir"]),
        "chroma_db": str(PROJECT_ROOT / CONFIG["chroma"]["dir"]),
        "config_yaml": str(CONFIG_PATH)
    },
    "config_source": cfg_url
}, indent=2))

os.environ["HF_DATASET_ID"] = HF_DATASET_ID


Fetching topics.json from https://raw.githubusercontent.com/AHMerrill/unstructured-project/main/config/topics.json ...
Loaded 22 topic clusters from topics.json
Fetching topic_anchors.npz from https://raw.githubusercontent.com/AHMerrill/unstructured-project/main/config/topic_anchors.npz ...
Fetching political_leanings.json from https://raw.githubusercontent.com/AHMerrill/unstructured-project/main/config/political_leanings.json ...
Fetching implied_stances.json from https://raw.githubusercontent.com/AHMerrill/unstructured-project/main/config/implied_stances.json ...
Fetching source_bias.json from https://raw.githubusercontent.com/AHMerrill/unstructured-project/main/config/source_bias.json ...
{
  "github_repo": "AHMerrill/unstructured-project",
  "hf_dataset_id": "zanimal/unstructured-project",
  "collections": {
    "topic": "unstructured_topic",
    "stance": "unstructured_stance"
  },
  "embeddings": {
    "topic_model": "intfloat/e5-base-v2",
    "stance_model": "all-mpnet-base-v2",

Setup 3 of N - authentication for Hugging Face and GitHub

Purpose
Load tokens into environment and verify access. Later cells use these tokens to push to HF and update your GitHub registry.

Why this matters
Catching auth issues early prevents failing halfway through a run.

Outputs
Logged in to HF, GitHub token validated.

In [4]:
# Setup 3 of N: auth for Hugging Face and GitHub
# Prompts only if tokens are not already in the environment.

import os
import requests
from getpass import getpass
from huggingface_hub import login, whoami

def need(envvar, prompt):
    # Request once per session if missing
    if not os.environ.get(envvar, "").strip():
        os.environ[envvar] = getpass(prompt)
    print(f"{envvar} loaded:", bool(os.environ.get(envvar)))

# Gather tokens
need("HF_TOKEN", "Enter your Hugging Face token: ")
need("GITHUB_TOKEN", "Enter your GitHub Personal Access Token: ")

# Sign in to HF so upload_file works later
try:
    login(token=os.environ["HF_TOKEN"], add_to_git_credential=False)
    print("HF login OK:", whoami(token=os.environ["HF_TOKEN"]).get("name","(ok)"))
except Exception as e:
    print("HF login failed:", e)

# Quick GitHub check to confirm token scopes
try:
    r = requests.get(
        "https://api.github.com/user",
        headers={"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"},
        timeout=15
    )
    print("GitHub auth status:", r.status_code)
except Exception as e:
    print("GitHub auth check failed:", e)


Enter your Hugging Face token: ··········
HF_TOKEN loaded: True
Enter your GitHub Personal Access Token: ··········
GITHUB_TOKEN loaded: True


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


HF login OK: zanimal
GitHub auth status: 200


In [5]:
import os
tok = os.getenv("HF_TOKEN", "")
print("Length:", len(tok))
print("Starts with hf_:", tok.startswith("hf_"))
print("Visible prefix:", tok[:10])


Length: 37
Starts with hf_: True
Visible prefix: hf_vvjiawf


In [6]:
# OpenAI API login (hidden input)
import os
from getpass import getpass

if "OPENAI_API_KEY" not in os.environ or not os.environ["OPENAI_API_KEY"].strip():
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")
print("OpenAI API key loaded:", bool(os.environ.get("OPENAI_API_KEY")))


Enter your OpenAI API key: ··········
OpenAI API key loaded: True


Setup 4 of N - restore feed index and feed state

Purpose
Restore feeds/index.json and feeds/feeds_state.json from HF latest pointers, fallback to GitHub if missing, or reconstruct from HF batch metadata if neither exists.

Why this matters
Prevents re scraping the same URLs, and keeps numbering and batching consistent across runs.

Outputs
feeds/index.json and feeds/feeds_state.json present locally with a quick summary.

In [7]:
# Setup 4 of N: restore feed index and state
# Guarantees local index and state exist before scraping.
# Refactored to pull GitHub and HF configuration dynamically.

import os, json, shutil, requests, datetime as dt, re, hashlib
from pathlib import Path
from huggingface_hub import hf_hub_download

PROJECT_ROOT = Path("/content/anti_echo").resolve()
FEEDS_DIR = PROJECT_ROOT / "feeds"
FEEDS_DIR.mkdir(parents=True, exist_ok=True)

INDEX_PATH = FEEDS_DIR / "index.json"
STATE_PATH = FEEDS_DIR / "feeds_state.json"

# --- Config-driven settings (fallback to env vars if not present) ---
HF_DATASET_ID = CONFIG.get("hf_dataset_id", os.getenv("HF_DATASET_ID", ""))
github_cfg = CONFIG.get("github", {})
REPO_OWNER = github_cfg.get("owner", os.getenv("GITHUB_OWNER", ""))
REPO_NAME  = github_cfg.get("repo", os.getenv("GITHUB_REPO", ""))
BRANCH     = github_cfg.get("branch", os.getenv("GITHUB_BRANCH", "main"))

def try_hf_restore() -> bool:
    # Prefer HF because it is the single source of truth in this design
    try:
        st = hf_hub_download(HF_DATASET_ID, "feeds/feeds_state_latest.json", repo_type="dataset")
        ix = hf_hub_download(HF_DATASET_ID, "feeds/feed_index_latest.json", repo_type="dataset")
        shutil.copy(st, STATE_PATH)
        shutil.copy(ix, INDEX_PATH)
        print("Restored feed state from HF latest")
        return True
    except Exception as e:
        print("HF latest not found:", e)
        return False

def try_github_restore() -> bool:
    # Fallback if HF latest pointers are not present yet
    try:
        base = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/feeds"
        got = False
        for src, dst in [("feeds_state_latest.json", STATE_PATH), ("feed_index_latest.json", INDEX_PATH)]:
            r = requests.get(f"{base}/{src}", timeout=20)
            if r.status_code == 200 and r.text.strip():
                dst.write_text(r.text, encoding="utf-8")
                got = True
        if got:
            print("Restored feed state from GitHub latest")
        return got
    except Exception as e:
        print("GitHub restore failed:", e)
        return False

restored = try_hf_restore() or try_github_restore()

if not restored:
    # Reconstruct from HF batch metadata listed in artifacts_registry.json
    print("No latest feed state found. Attempting reconstruction from HF batches...")
    REGISTRY_URL = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/artifacts/artifacts_registry.json"
    REGISTRY = requests.get(REGISTRY_URL, timeout=20).json()

    # Collect metadata jsonl from each batch
    metas = []
    for b in REGISTRY.get("batches", []):
        meta_rel = (b.get("hf_paths") or b.get("paths") or {}).get("metadata")
        if not meta_rel:
            continue
        try:
            metas.append(Path(hf_hub_download(HF_DATASET_ID, meta_rel, repo_type="dataset")))
        except Exception as e:
            print("Skip meta fetch:", e)

    # Build a minimal index of known URLs to prevent re-scraping
    items = {}
    def norm(txt): return re.sub(r"\s+", " ", txt.strip().lower())
    def sha256_text(txt): return hashlib.sha256(norm(txt).encode()).hexdigest()

    for fp in metas:
        for line in fp.read_text(encoding="utf-8").splitlines():
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except Exception:
                continue
            u = obj.get("url")
            if u and u not in items:
                items[u] = {"status": "ok", "fetched_at": dt.datetime.now(dt.timezone.utc).isoformat()}

    INDEX_PATH.write_text(
        json.dumps(
            {"last_updated": dt.datetime.now(dt.timezone.utc).isoformat(), "items": items},
            indent=2,
        ),
        encoding="utf-8",
    )

    # Create a simple ring buffer structure for each feed
    url_hashes = [sha256_text(u)[:12] for u in items.keys()]
    feeds_block = {
        "commentisfree": {
            "feed_url": None,
            "recent_url_hashes": url_hashes[-1000:],
            "recent_url_hashes_max": 1000,
        },
        "theguardian": {
            "feed_url": None,
            "recent_url_hashes": url_hashes[-500:],
            "recent_url_hashes_max": 500,
        },
    }
    STATE_PATH.write_text(
        json.dumps(
            {"version": 1, "updated_at": dt.datetime.now(dt.timezone.utc).isoformat(), "feeds": feeds_block},
            indent=2,
        ),
        encoding="utf-8",
    )
    print("Reconstruction complete")

# Echo a small summary so you can verify
ix = json.loads(INDEX_PATH.read_text(encoding="utf-8"))
st = json.loads(STATE_PATH.read_text(encoding="utf-8"))
print({"index_items": len(ix.get("items", {})), "feeds": list(st.get("feeds", {}).keys())})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


feeds_state_latest.json: 0.00B [00:00, ?B/s]

feed_index_latest.json: 0.00B [00:00, ?B/s]

Restored feed state from HF latest
{'index_items': 13, 'feeds': ['commentisfree', 'theguardian', 'vox', 'cnn-opinion', 'guardian', 'bbc', 'cityjournal', 'dailycaller', 'theconversation', 'aljazeera', 'foxnews', 'npr', 'reason', 'thefederalist', 'msnbc', 'nypost']}


Setup 5 of N - rebuild Chroma from HF batches

Purpose
Create or refresh local Chroma collections from the HF dataset using artifacts/artifacts_registry.json as the ledger. HF is the source of truth. Local Chroma is a cache.

Why this matters
Ensures your retrieval state is consistent before adding new data. No duplicated rows. Clean numbering follows registry order.

Outputs
Two Chroma collections present with counts: topic and stance.

In [8]:
# ===============================================
# Setup 5 of N — Rebuild Chroma (Config-Driven + Idempotent + Schema-Aware + Graceful Empty-State)
# ===============================================

import os, json, numpy as np, logging, requests, hashlib
from pathlib import Path
from huggingface_hub import hf_hub_download
import chromadb

# Silence telemetry
os.environ["CHROMA_TELEMETRY_ENABLED"] = "false"
logging.getLogger("chromadb.telemetry").setLevel(logging.ERROR)

# --- Paths and Config ---
PROJECT_ROOT = Path("/content/anti_echo").resolve()
CHROMA_DIR = PROJECT_ROOT / CONFIG["chroma"]["dir"]
CHROMA_DIR.mkdir(parents=True, exist_ok=True)
STATE_PATH = CHROMA_DIR / "ingested_batches.json"

# --- Config-driven variables with safe fallbacks ---
HF_DATASET_ID = CONFIG.get("hf_dataset_id", os.getenv("HF_DATASET_ID", ""))
github_cfg = CONFIG.get("github", {})
REPO_OWNER = github_cfg.get("owner", os.getenv("GITHUB_OWNER", ""))
REPO_NAME  = github_cfg.get("repo", os.getenv("GITHUB_REPO", ""))
BRANCH     = github_cfg.get("branch", os.getenv("GITHUB_BRANCH", "main"))

COLL_TOPIC = CONFIG["chroma_collections"]["topic"]
COLL_STANCE = CONFIG["chroma_collections"]["stance"]
EMB_DIM = int(CONFIG["embeddings"]["dim"])
CURRENT_SCHEMA = 1  # expected schema version

# --- Registry source (config-driven) ---
REGISTRY_URL = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/artifacts/artifacts_registry.json"

# --- Utilities ---
def sha256_file(fp):
    h = hashlib.sha256()
    with open(fp, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()[:16]

def ensure_chroma():
    client = chromadb.PersistentClient(path=str(CHROMA_DIR))
    t = client.get_or_create_collection(name=COLL_TOPIC, metadata={"hnsw:space": "cosine"})
    s = client.get_or_create_collection(name=COLL_STANCE, metadata={"hnsw:space": "cosine"})
    return client, t, s

def load_npz(fp: Path, dim: int):
    arr = np.load(fp, allow_pickle=False)
    if isinstance(arr, np.lib.npyio.NpzFile):
        arr = arr[list(arr.files)[0]]
    vecs = np.asarray(arr)
    if vecs.ndim != 2 or vecs.shape[1] != dim:
        raise ValueError(f"{fp.name}: expected [N,{dim}] got {vecs.shape}")
    if not np.isfinite(vecs).all():
        raise ValueError(f"{fp.name}: non-finite values")
    return vecs

def read_jsonl(fp: Path):
    with fp.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

def upsert_chunks(coll, ids, vecs, metas, chunk=2048):
    n = len(ids)
    for i in range(0, n, chunk):
        j = min(i + chunk, n)
        coll.upsert(
            ids=ids[i:j],
            embeddings=vecs[i:j].tolist(),
            metadatas=metas[i:j],
        )

# --- Fetch registry (from GitHub) ---
try:
    REGISTRY = requests.get(REGISTRY_URL, timeout=20).json()
except Exception as e:
    print(f"Failed to fetch registry ({e}). Creating empty Chroma collections.")
    client, t, s = ensure_chroma()
    print(f"Initialized empty Chroma DB at {CHROMA_DIR}")
    print("No remote registry found. You may need to run Setup 9 and Setup 10 to generate your first embeddings.")
    REGISTRY = {"batches": []}

# --- Empty-registry safeguard ---
if not REGISTRY.get("batches"):
    client, t, s = ensure_chroma()
    print("No existing Chroma batches detected.")
    print(f"Created empty local collections: topic={t.name}, stance={s.name}")
    print("Next steps:")
    print(" - Run Setup 8 to embed and upsert topic vectors.")
    print(" - Run Setup 9 to embed and upsert stance vectors.")
    print(" - Run Setup 11 to push your first batch to Hugging Face.")
else:
    # --- Continue rebuild ---
    if STATE_PATH.exists():
        state = json.load(open(STATE_PATH))
    else:
        state = []

    client, topic_coll, stance_coll = ensure_chroma()
    added_rows = 0
    ingested_batches = 0
    skipped_batches = []

    for b in REGISTRY.get("batches", []):
        batch_id = b.get("batch_id")
        sv = int(b.get("schema_version", 1))

        if batch_id in state:
            print(f"Skip already ingested {batch_id}")
            continue
        if sv < CURRENT_SCHEMA:
            print(f"Skip legacy batch {batch_id} (schema {sv} < {CURRENT_SCHEMA})")
            skipped_batches.append(batch_id)
            continue

        paths = b.get("paths") or b.get("hf_paths") or {}
        need = ["embeddings_topic", "embeddings_stance", "metadata_topic", "metadata_stance"]
        if not all(k in paths for k in need):
            print(f"Skip batch {batch_id} (missing required paths)")
            skipped_batches.append(batch_id)
            continue

        try:
            t_vecs = load_npz(Path(hf_hub_download(HF_DATASET_ID, paths["embeddings_topic"], repo_type="dataset")), EMB_DIM)
            s_vecs = load_npz(Path(hf_hub_download(HF_DATASET_ID, paths["embeddings_stance"], repo_type="dataset")), EMB_DIM)
            t_meta_path = Path(hf_hub_download(HF_DATASET_ID, paths["metadata_topic"], repo_type="dataset"))
            s_meta_path = Path(hf_hub_download(HF_DATASET_ID, paths["metadata_stance"], repo_type="dataset"))

            topic_ids, topic_metas = [], []
            for rec in read_jsonl(t_meta_path):
                rid = rec.get("row_id") or f"{rec.get('id','unknown')}::topic::0"
                topic_ids.append(rid)
                topic_metas.append(rec)

            stance_ids, stance_metas = [], []
            for rec in read_jsonl(s_meta_path):
                rid = rec.get("row_id") or f"{rec.get('id','unknown')}::stance::0"
                stance_ids.append(rid)
                stance_metas.append(rec)

            if t_vecs.shape[0] != len(topic_ids):
                raise ValueError(f"{batch_id}: topic vec/meta mismatch")
            if s_vecs.shape[0] != len(stance_ids):
                raise ValueError(f"{batch_id}: stance vec/meta mismatch")

            upsert_chunks(topic_coll, topic_ids, t_vecs, topic_metas)
            upsert_chunks(stance_coll, stance_ids, s_vecs, stance_metas)

            state.append(batch_id)
            json.dump(state, open(STATE_PATH, "w"), indent=2)
            added_rows += len(topic_ids) + len(stance_ids)
            ingested_batches += 1
            print(f"Ingested {batch_id}: {len(topic_ids)} topic, {len(stance_ids)} stance")

        except Exception as e:
            print(f"Failed batch {batch_id}: {e}")
            skipped_batches.append(batch_id)
            continue

    summary = {
        "topic_count": topic_coll.count(),
        "stance_count": stance_coll.count(),
        "rows_added": added_rows,
        "ingested_batches": ingested_batches,
        "skipped_batches": skipped_batches,
        "state_file": str(STATE_PATH),
    }
    print(json.dumps(summary, indent=2))


ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


embeddings_topic.npz:   0%|          | 0.00/89.9k [00:00<?, ?B/s]

embeddings_stance.npz:   0%|          | 0.00/18.7k [00:00<?, ?B/s]

metadata_topic.jsonl: 0.00B [00:00, ?B/s]

metadata_stance.jsonl: 0.00B [00:00, ?B/s]

Ingested batch_20251025T173713Z_5dd9b9dd: 64 topic, 13 stance
{
  "topic_count": 64,
  "stance_count": 13,
  "rows_added": 77,
  "ingested_batches": 1,
  "skipped_batches": [],
  "state_file": "/content/anti_echo/chroma_db/ingested_batches.json"
}


Setup 6 of N - scraping tunables

Purpose:
Define scrape quotas, date floor, and feed list. Export to environment so the scraper reads configuration without edits.

Scope:
Attempted to have an even balance of left, right, center for this experiment.  used allsides.com and chatGPT to help cover a complete spectrum of sites considered biased in particular directions


Why this matters
Allows you to adjust scrape size and distribution per run from a single cell.

Outputs
Environment variables set and a printed summary.

In [9]:
# Setup 6 of N: feed configuration
# Defines the set of RSS/Atom feeds to pull from for balanced, full-text content scraping.

import json, os

# Verified full-text feeds (as of latest probe)
NEWS_FEEDS = [
    ("vox", "https://www.vox.com/rss/index.xml"),
    ("cnn-opinion", "http://rss.cnn.com/rss/cnn_opinion.rss"),
    ("guardian", "https://www.theguardian.com/uk/rss"),
    ("bbc", "https://feeds.bbci.co.uk/news/rss.xml"),
    ("cityjournal", "https://www.city-journal.org/rss.xml"),
    ("dailycaller", "https://dailycaller.com/feed/"),
    ("theconversation", "https://theconversation.com/us/articles.atom"),
    ("aljazeera", "https://www.aljazeera.com/xml/rss/all.xml"),
    ("foxnews", "https://moxie.foxnews.com/google-publisher/latest.xml"),
    ("npr", "https://feeds.npr.org/1001/rss.xml"),
    ("reason", "https://reason.com/feed/"),
    ("thefederalist", "https://thefederalist.com/feed/"),
    ("msnbc", "https://feeds.nbcnews.com/msnbc/public/news"),
    ("nypost", "https://nypost.com/feed/"),
]

# Export to environment for scraper
os.environ["NEWS_FEEDS_JSON"] = json.dumps(NEWS_FEEDS)

# Scraper configuration
# Adjust MAX_ARTICLES and MAX_PER_FEED for your target scale
os.environ["MAX_ARTICLES"] = "2800"       # ≈200 per feed × 14 feeds
os.environ["MAX_PER_FEED"] = "200"
os.environ["EVEN_SPLIT"] = "true"
os.environ["DATE_FROM"] = "2019-01-01"    # scrape articles newer than this date
os.environ["FORCE_REFETCH"] = "false"
os.environ["QUOTA_REMAINDER_TO"] = "theconversation"

print("Configured verified full-text feeds:")
for name, url in NEWS_FEEDS:
    print(f"- {name}: {url}")

print("\nScraper parameters:")
print(json.dumps({
    "MAX_ARTICLES": os.environ["MAX_ARTICLES"],
    "MAX_PER_FEED": os.environ["MAX_PER_FEED"],
    "DATE_FROM": os.environ["DATE_FROM"],
    "EVEN_SPLIT": os.environ["EVEN_SPLIT"],
    "FORCE_REFETCH": os.environ["FORCE_REFETCH"],
    "QUOTA_REMAINDER_TO": os.environ["QUOTA_REMAINDER_TO"]
}, indent=2))


Configured verified full-text feeds:
- vox: https://www.vox.com/rss/index.xml
- cnn-opinion: http://rss.cnn.com/rss/cnn_opinion.rss
- guardian: https://www.theguardian.com/uk/rss
- bbc: https://feeds.bbci.co.uk/news/rss.xml
- cityjournal: https://www.city-journal.org/rss.xml
- dailycaller: https://dailycaller.com/feed/
- theconversation: https://theconversation.com/us/articles.atom
- aljazeera: https://www.aljazeera.com/xml/rss/all.xml
- foxnews: https://moxie.foxnews.com/google-publisher/latest.xml
- npr: https://feeds.npr.org/1001/rss.xml
- reason: https://reason.com/feed/
- thefederalist: https://thefederalist.com/feed/
- msnbc: https://feeds.nbcnews.com/msnbc/public/news
- nypost: https://nypost.com/feed/

Scraper parameters:
{
  "MAX_ARTICLES": "2800",
  "MAX_PER_FEED": "200",
  "DATE_FROM": "2019-01-01",
  "EVEN_SPLIT": "true",
  "FORCE_REFETCH": "false",
  "QUOTA_REMAINDER_TO": "theconversation"
}


Setup 7 of N - scraper with dedupe

Purpose
Scrape the feeds, skip URLs already in feeds/index.json, save raw/{id}.txt and {id}.meta.json, and update both feeds/index.json and feeds/feeds_state.json.

Why this matters
Prevents duplicates, keeps state consistent across runs, and prepares clean inputs for embedding.

Outputs
New articles saved under raw/, updated feeds/index.json and feeds/feeds_state.json, and a summary.

In [None]:
# Setup 7 of N: scraper with dedupe and caching
# Uses feed list from Setup 6 (NEWS_FEEDS_JSON)
# Fetches full text locally; saves metadata for embeddings downstream.

import os, re, json, hashlib, datetime as dt
from pathlib import Path
from urllib.parse import urlparse
from email.utils import parsedate_to_datetime
import feedparser, trafilatura, requests

feedparser.USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"

def safe_parse_feed(url):
    """Fetch RSS/Atom with realistic browser headers to bypass blocks."""
    try:
        headers = {
            "User-Agent": feedparser.USER_AGENT,
            "Accept": "application/rss+xml, application/atom+xml, application/xml;q=0.9,*/*;q=0.8",
            "Referer": "https://www.google.com/",
            "Accept-Language": "en-US,en;q=0.9",
        }
        resp = requests.get(url, headers=headers, timeout=15)
        if resp.status_code != 200:
            print(f"feed fetch error ({resp.status_code}): {url}")
            return feedparser.FeedParserDict(entries=[])
        return feedparser.parse(resp.content)
    except Exception as e:
        print(f"feed fetch exception for {url}: {e}")
        return feedparser.FeedParserDict(entries=[])

# === Paths ===
PROJECT_ROOT = Path("/content/anti_echo").resolve()
RAW_DIR = PROJECT_ROOT / "raw"
FEEDS_DIR = PROJECT_ROOT / "feeds"
FEEDS_DIR.mkdir(parents=True, exist_ok=True)
INDEX_PATH = FEEDS_DIR / "index.json"
STATE_PATH = FEEDS_DIR / "feeds_state.json"

# === Load config ===
FEED_LIST = json.loads(os.environ["NEWS_FEEDS_JSON"])
MAX_ARTICLES = int(os.environ.get("MAX_ARTICLES", "30"))
MAX_PER_FEED = os.environ.get("MAX_PER_FEED", "")
MAX_PER_FEED = None if MAX_PER_FEED == "" else int(MAX_PER_FEED)
DATE_FROM = os.environ.get("DATE_FROM", "") or None
FORCE_REFETCH = os.environ.get("FORCE_REFETCH", "false").lower() == "true"
EVEN_SPLIT = os.environ.get("EVEN_SPLIT", "true").lower() == "true"
QUOTA_REMAINDER_TO = os.environ.get("QUOTA_REMAINDER_TO", "theconversation")

def now_iso(): return dt.datetime.now(dt.timezone.utc).isoformat()

# === State & index handling ===
def load_json(path, default):
    if path.exists():
        try: return json.loads(path.read_text(encoding="utf-8"))
        except Exception: pass
    return default

index = load_json(INDEX_PATH, {"last_updated": None, "items": {}})
feeds_state = load_json(STATE_PATH, {"version": 1, "updated_at": None, "feeds": {}})
fs = feeds_state.setdefault("feeds", {})
for name, feed_url in FEED_LIST:
    fs.setdefault(name, {"feed_url": feed_url, "recent_url_hashes": [], "recent_url_hashes_max": 1000, "last_run_at": None})

def save_json(path, obj):
    obj["last_updated"] = now_iso()
    path.write_text(json.dumps(obj, indent=2), encoding="utf-8")

# === Utility ===
def parse_date(entry):
    for k in ["published", "updated"]:
        val = getattr(entry, k, None) or entry.get(k)
        if val:
            try: return parsedate_to_datetime(val)
            except Exception: pass
    return None

def in_range(d, lower_iso):
    if not lower_iso: return True
    try: floor = dt.datetime.fromisoformat(lower_iso).replace(tzinfo=dt.timezone.utc)
    except Exception: return True
    if d is None: return True
    if d.tzinfo is None: d = d.replace(tzinfo=dt.timezone.utc)
    return d >= floor

def normalize_text(txt): return re.sub(r"\s+", " ", txt.strip().lower())
def sha256_text(txt): return hashlib.sha256(txt.encode()).hexdigest()
def slugify(text, maxlen=60):
    s = re.sub(r"[^a-zA-Z0-9]+", "-", text).strip("-").lower()
    return s[:maxlen] or "untitled"

def fetch_article(url):
    html = trafilatura.fetch_url(url, no_ssl=False)
    if not html:
        raise RuntimeError("fetch failed")
    text = trafilatura.extract(html, include_comments=False, include_tables=False) or ""
    title_match = re.search(r"<title>(.*?)</title>", html or "", flags=re.I | re.S)
    title = re.sub(r"\s+", " ", title_match.group(1)).strip() if title_match else "Untitled"
    if not text.strip():
        raise RuntimeError("extraction empty")
    return title, text

def already_cached(url):
    return url in index["items"] and index["items"][url].get("status") == "ok"

def mark(url, status):
    index["items"][url] = {"status": status, "fetched_at": now_iso()}
    save_json(INDEX_PATH, index)

# === Quotas ===
feed_names = [n for n, _ in FEED_LIST]
if EVEN_SPLIT:
    base = MAX_ARTICLES // len(feed_names)
    rem = MAX_ARTICLES % len(feed_names)
    quotas = {n: base for n in feed_names}
    quotas[QUOTA_REMAINDER_TO if QUOTA_REMAINDER_TO in quotas else feed_names[0]] += rem
else:
    quotas = {n: 0 for n in feed_names}
    quotas[QUOTA_REMAINDER_TO if QUOTA_REMAINDER_TO in quotas else feed_names[0]] = MAX_ARTICLES
if isinstance(MAX_PER_FEED, int):
    quotas = {k: min(v, MAX_PER_FEED) for k, v in quotas.items()}
print("Quotas:", quotas)

# === Main fetch loop ===
saved_global = 0
errors_global = 0
seen_global = set()

for name, feed_url in FEED_LIST:
    if saved_global >= MAX_ARTICLES: break
    quota = quotas.get(name, 0)
    if quota <= 0: continue

    fp = safe_parse_feed(feed_url)
    if not fp.entries:
        print(f"[{name}] no entries found.")
        continue

    items = []
    for e in fp.entries:
        url = getattr(e, "link", None)
        if not url: continue
        pub = parse_date(e)
        if in_range(pub, DATE_FROM):
            items.append({"url": url, "published": pub})

    uniq, seen = [], set()
    for it in sorted(items, key=lambda x: (x["published"] or dt.datetime.min), reverse=True):
        if it["url"] in seen: continue
        seen.add(it["url"])
        uniq.append(it)

    saved_this = 0
    for it in uniq:
        if saved_global >= MAX_ARTICLES or saved_this >= quota: break
        url = it["url"]
        if url in seen_global: continue
        seen_global.add(url)
        if already_cached(url) and not FORCE_REFETCH:
            print(f"skip (cached) [{name}]: {url}")
            continue
        try:
            title, text = fetch_article(url)
            domain = urlparse(url).netloc
            slug = slugify(title)
            h = sha256_text(normalize_text(text))
            art_id = f"{domain}-{slug}-{h[:12]}"
            RAW_DIR.mkdir(parents=True, exist_ok=True)
            txt_path = RAW_DIR / f"{art_id}.txt"
            meta_path = RAW_DIR / f"{art_id}.meta.json"
            txt_path.write_text(text, encoding="utf-8")
            meta = {
                "id": art_id, "url": url, "title": title, "source": name, "domain": domain,
                "published": it["published"].isoformat() if it["published"] else None,
                "sha256": h, "chars": len(text), "saved_at": now_iso(),
            }
            meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")
            mark(url, "ok")
            fs[name]["last_run_at"] = now_iso()
            saved_this += 1
            saved_global += 1
            print(f"saved [{name}]: {txt_path.name} | {title[:90]} | {len(text)} chars")
        except Exception as e:
            mark(url, "error")
            errors_global += 1
            print(f"error [{name}]: {url} | {type(e).__name__}: {str(e)[:120]}")

feeds_state["updated_at"] = now_iso()
STATE_PATH.write_text(json.dumps(feeds_state, indent=2), encoding="utf-8")

print(json.dumps({
    "saved_total": saved_global,
    "errors_total": errors_global,
    "index_items": len(index["items"]),
    "feeds_state_path": str(STATE_PATH)
}, indent=2))


Quotas: {'vox': 200, 'cnn-opinion': 200, 'guardian': 200, 'bbc': 200, 'cityjournal': 200, 'dailycaller': 200, 'theconversation': 200, 'aljazeera': 200, 'foxnews': 200, 'npr': 200, 'reason': 200, 'thefederalist': 200, 'msnbc': 200, 'nypost': 200}
skip (cached) [vox]: https://www.vox.com/policy/465969/ice-protests-chicago-broadview-pastor-pepper-spray
saved [vox]: www.vox.com-caribbean-boat-strikes-is-the-us-killing-innocent-people-vox-ef90a0158dbd.txt | Caribbean boat strikes: Is the US killing innocent people? | Vox | 2668 chars
saved [vox]: www.vox.com-the-nba-betting-scandal-exposes-america-s-sports-gambling-pr-c2abef0f7372.txt | The NBA betting scandal exposes America’s sports gambling problem | Vox | 4814 chars
saved [vox]: www.vox.com-how-the-rich-avoid-paying-taxes-vox-886e9f0df696.txt | How the rich avoid paying taxes | Vox | 11752 chars
saved [vox]: www.vox.com-east-wing-demolition-is-this-legal-and-5-other-questions-ans-b8662a87c9fb.txt | East Wing demolition: Is this legal? A

Setup 8 of N - topic embeddings to Chroma

Purpose
Generate multi topic embeddings per article and upsert into the topic collection. This models what the article is about. Each article may get multiple topic vectors.

Why this matters
Topical neighbors power the first half of contrastive retrieval.

Outputs
Vectors upserted into the topic collection with structured metadata.

In [None]:
# ==============================
# Setup 8 of N — Topic Embeddings (Anchor + OpenAI Hybrid, merged for Code 1)
# ==============================

import os, json, time, numpy as np, nltk, torch, warnings
from pathlib import Path
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import chromadb
from openai import OpenAI
warnings.filterwarnings("ignore", message="Token indices sequence length is longer")

PROJECT_ROOT = Path("/content/anti_echo").resolve()
RAW_DIR = PROJECT_ROOT / "raw"
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Config ---
topic_model_name = CONFIG["embeddings"]["topic_model"]
topic_dim = int(CONFIG["embeddings"]["dim"])
topic_dtype = CONFIG["embeddings"]["dtype"]
chunk_tokens = int(CONFIG["embeddings"]["chunk_tokens"])
MAX_TOPICS = CONFIG["topics"].get("max_topics_per_article", 5)
TOPIC_THRESHOLD = CONFIG["topics"].get("similarity_threshold", 0.4)

# --- OpenAI enrichment toggle ---
topic_cfg = CONFIG.get("topics", {})
use_openai = topic_cfg.get("use_openai_enrichment", False)
openai_model = topic_cfg.get("openai_model", "gpt-4o-mini")
openai_field = topic_cfg.get("openai_field", "openai_topic_summary")
openai_temp = float(topic_cfg.get("openai_temperature", 0.4))
openai_max = int(topic_cfg.get("openai_max_tokens", 64))
openai_prompt = topic_cfg.get("openai_prompt", "")
client_openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

print(f"Topic model: {topic_model_name}")
print(f"OpenAI enrichment: {'ON' if use_openai else 'OFF'}")

# --- NLTK sentence tokenizer ---
for pkg in ["punkt", "punkt_tab"]:
    try: nltk.data.find(f"tokenizers/{pkg}")
    except LookupError: nltk.download(pkg)

tokenizer = AutoTokenizer.from_pretrained(topic_model_name, use_fast=True)
tokenizer.model_max_length = 512
embedder = SentenceTransformer(topic_model_name, device=device)

# --- Load precomputed anchors ---
ANCHORS_PATH = PROJECT_ROOT / "config/topic_anchors.npz"
if not ANCHORS_PATH.exists():
    raise FileNotFoundError(f"Missing {ANCHORS_PATH}")
data = np.load(ANCHORS_PATH, allow_pickle=True)
topic_anchors = {k: v for k, v in data.items()}
print(f"Loaded {len(topic_anchors)} topic anchors from disk")

# --- Chroma setup ---
client = chromadb.PersistentClient(path=str(PROJECT_ROOT / CONFIG["chroma"]["dir"]))
topic_coll = client.get_collection(CONFIG["chroma_collections"]["topic"])

# --- Utilities ---
def sent_split(text):
    return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]

def encode(texts):
    if isinstance(texts, str):
        texts = [texts]
    bs = 4 if torch.cuda.is_available() else 16
    vecs = embedder.encode(texts, batch_size=bs, convert_to_numpy=True,
                           normalize_embeddings=True, show_progress_bar=False)
    return np.array(vecs)

def chunk_by_tokens(text, max_tokens=512, overlap=64):
    ids = tokenizer(text, add_special_tokens=False, return_attention_mask=False)["input_ids"]
    step = max_tokens - overlap
    chunks = []
    for i in range(0, len(ids), step):
        j = min(i + max_tokens, len(ids))
        piece = tokenizer.decode(ids[i:j], skip_special_tokens=True)
        if piece.strip(): chunks.append(piece)
    return chunks

def sanitize(meta: dict):
    out = {}
    for k, v in meta.items():
        if isinstance(v, (str, int, float, bool)) or v is None:
            out[k] = "" if v is None else v
        else:
            out[k] = str(v)
    return out

def topic_vecs(text):
    sents = sent_split(text)
    if not sents: return []
    if len(sents) < 2:
        chunks = chunk_by_tokens(" ".join(sents), chunk_tokens, 64)
        return [encode(chunks).mean(axis=0)]
    emb = encode(sents)
    k = min(max(1, len(sents)//8), 8)
    labels = AgglomerativeClustering(n_clusters=k).fit_predict(emb)
    segs = [" ".join([s for s, l in zip(sents, labels) if l == lab]) for lab in sorted(set(labels))]
    out = []
    for seg in segs:
        chunks = chunk_by_tokens(seg, chunk_tokens, 64)
        if not chunks: continue
        pooled = encode(chunks).mean(axis=0)
        out.append(pooled)
    return out

def match_topics(vec):
    scores = {label: cosine_similarity([vec], [anchor])[0][0] for label, anchor in topic_anchors.items()}
    sorted_topics = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    topics = []
    for i, (k, v) in enumerate(sorted_topics[:MAX_TOPICS]):
        if i == 0 or v >= TOPIC_THRESHOLD:
            topics.append({"topic_label": k, "similarity": float(v)})
    if not topics:
        topics = [{"topic_label": "General / Miscellaneous", "similarity": 0.0}]
    return topics

def upsert_in_chunks(collection, ids, vectors, metadatas, chunk=2048):
    for i in range(0, len(ids), chunk):
        j = min(i + chunk, len(ids))
        collection.upsert(ids=ids[i:j],
                          embeddings=vectors[i:j].tolist(),
                          metadatas=metadatas[i:j])

def generate_openai_topic_sentence(title, text):
    msg = f"{openai_prompt}\n\nTitle: {title}\n\nExcerpt:\n{text[:1500]}"
    try:
        resp = client_openai.chat.completions.create(
            model=openai_model,
            messages=[{"role": "user", "content": msg}],
            max_tokens=openai_max,
            temperature=openai_temp,
        )
        return resp.choices[0].message.content.strip()
    except Exception as e:
        print(f"[OpenAI] topic summary failed: {e}")
        return None

# --- Main Loop ---
start = time.time()
added = processed = 0
for txt_path in RAW_DIR.glob("*.txt"):
    meta_path = txt_path.with_suffix(".meta.json")
    if not meta_path.exists(): continue
    text = txt_path.read_text(encoding="utf-8").strip()
    if not text: continue
    meta = json.load(open(meta_path, encoding="utf-8"))

    vecs = topic_vecs(text)
    if not vecs: continue

    if use_openai:
        gpt_summary = generate_openai_topic_sentence(meta.get("title", ""), text)
        if gpt_summary:
            gpt_vec = encode(gpt_summary)[0]
            vecs.append(gpt_vec)
            meta[openai_field] = gpt_summary

    ids, metas = [], []
    for i, v in enumerate(vecs):
        topics_detected = match_topics(v)
        topics_json = json.dumps(topics_detected, ensure_ascii=False)
        topics_flat = [t["topic_label"] for t in topics_detected]
        top_topic = topics_detected[0]["topic_label"] if topics_detected else ""
        topic_source = (
            "openai_summary"
            if use_openai and i == len(vecs) - 1 and openai_field in meta
            else "anchor_cluster"
        )
        enriched_meta = sanitize({
            **meta,
            "topic_index": i,
            "topic_model": topic_model_name,
            "topic_source": topic_source,
            "topic_labels_json": topics_json,
            "topics_flat": ";".join(topics_flat),
            "top_topic": top_topic
        })
        ids.append(f"{meta['id']}::topic::{i}")
        metas.append(enriched_meta)

    upsert_in_chunks(topic_coll, ids, np.vstack(vecs), metas)
    added += len(vecs)
    processed += 1

print(f"Processed {processed} articles.")
print(f"Topic upserts: {added} vectors ({'with' if use_openai else 'without'} OpenAI enrichment) "
      f"in {round(time.time()-start,2)} s.")


In [None]:
# ==============================
# Topic Embeddings Checker (GPT summaries + anchor clusters)
# ==============================

import numpy as np, json, collections
import chromadb
from pathlib import Path

PROJECT_ROOT = Path("/content/anti_echo").resolve()
client = chromadb.PersistentClient(path=str(PROJECT_ROOT / CONFIG["chroma"]["dir"]))
topic_coll = client.get_collection(CONFIG["chroma_collections"]["topic"])

ANCHORS_PATH = PROJECT_ROOT / "config/topic_anchors.npz"
if ANCHORS_PATH.exists():
    data = np.load(ANCHORS_PATH, allow_pickle=True)
    print(f"Loaded {len(data.files)} topic anchors from {ANCHORS_PATH}")
else:
    print("Warning: topic_anchors.npz not found")

count = topic_coll.count()
print(f"Topic collection contains {count} total vectors\n")
if count == 0:
    print("No topic vectors found."); raise SystemExit()

dump = topic_coll.get(include=["metadatas"], limit=min(5000, count))
metas = dump.get("metadatas", [])

grouped = collections.defaultdict(lambda: {
    "anchor_cluster": 0, "openai_summary": 0,
    "title": None, "source": None, "summary_text": None
})
for m in metas:
    if not isinstance(m, dict): continue
    base_id = m.get("id", "").split("::topic::")[0]
    src = m.get("topic_source", "anchor_cluster")
    grouped[base_id][src] += 1
    grouped[base_id]["title"] = m.get("title", "?")
    grouped[base_id]["source"] = m.get("source", "?")
    if src == "openai_summary" and m.get("openai_topic_summary"):
        grouped[base_id]["summary_text"] = m["openai_topic_summary"]

print("=== Per-Article Topic Summary (with GPT Output) ===\n")
for base_id, info in grouped.items():
    print(f"{info['source']} | {info['title']}")
    print(f"  • anchor_cluster: {info['anchor_cluster']:>3}")
    print(f"  • openai_summary: {info['openai_summary']:>3}")
    print(f"  • total_vectors:  {info['anchor_cluster']+info['openai_summary']:>3}")
    if info["summary_text"]:
        print(f"  • OpenAI topic summary:\n   {info['summary_text']}\n")
    print("-"*100)

anchor_total = sum(v["anchor_cluster"] for v in grouped.values())
openai_total = sum(v["openai_summary"] for v in grouped.values())
print(f"\n=== Aggregate Counts ===")
print(f"Articles processed: {len(grouped)}")
print(f"Anchor vectors:    {anchor_total}")
print(f"OpenAI summaries:  {openai_total}")
print(f"Grand total:      {anchor_total+openai_total}")
print("\n--- Topic Embedding Integrity Check Complete ---")


# Setup 9 of N — Source Bias + Author Tone Alignment (v5.0)

### Purpose
This step analyzes each scraped article to determine **how the author’s ideological tone aligns or diverges** from the **typical bias of the source outlet**.

It adds three new analytical fields to your `stance` collection in Chroma:
- `source_bias`: typical ideological family of the outlet (e.g. "center left", "libertarian right")
- `bias_score`: numeric position on a simplified political spectrum (-1.0 = strong left, +1.0 = strong right)
- `author_tone_match`: boolean indicating whether the detected author tone aligns with the outlet’s known bias

---

### Why this matters
In modern media ecosystems, an outlet’s *institutional bias* doesn’t always reflect the *tone of individual articles*.  
For example:
- A **right-leaning outlet** might publish an article with a **centrist or neutral tone**
- A **left-leaning outlet** might host a **contrarian** op-ed that diverges from its norm

Capturing that divergence lets your system distinguish between:
> “A left outlet being left” versus “A left outlet publishing something surprising.”

That difference is key to building your anti-echo retrieval logic — you can surface articles that:
- Cover **the same topic**
- Come from **opposite ideological directions**
- And either **reinforce** or **contradict** their outlet’s typical stance

---

### How `bias_score` works
Each outlet in `source_bias.json` has a **bias family** (like `"center left"`)  
and a **bias score** (float between -1.0 and +1.0).

| Range | Meaning | Example Outlets |
|--------|----------|----------------|
| -1.0 → -0.6 | Progressive / Left | Vox, The Guardian, MSNBC |
| -0.6 → -0.2 | Center Left | NPR, BBC, NYT |
| -0.2 → +0.2 | Center / Neutral | Reuters, AP |
| +0.2 → +0.6 | Center Right | WSJ, The Economist |
| +0.6 → +1.0 | Conservative / Right | Fox News, Daily Caller, Breitbart |

The article’s **author tone** (detected by GPT-4o-mini) is mapped to the same numeric scale.  
If the two scores are close, the author is “in tune” with the source.

`author_tone_match = abs(source_bias_score - author_tone_score) <= 0.3`

Otherwise, the article is flagged as a **divergent piece** — a sign of internal tension or surprising framing.

---

### Output Fields (stored in Chroma)
| Field | Description |
|--------|-------------|
| `political_leaning` | Article tone / author ideology |
| `source_bias` | Typical outlet bias family |
| `bias_score` | Numeric bias of outlet |
| `author_tone_match` | Whether article tone aligns with outlet bias |
| `implied_stance` | Summary of article’s worldview or policy argument |
| `stance_variant` | “label” or “summary” variant |
| `stance_summary_text` | One-sentence summary of rhetorical tone |
| `stance_embedding` | Vector representation used for retrieval |

---

### Why this design
This allows retrieval experiments such as:
- “Show me articles on the same topic, but from **different bias families**.”
- “Find articles from outlets whose **authors disagree** with the typical tone.”
- “Find the **most contrasting stance** to the one I uploaded.”

These new metadata fields make your dense retrieval space **bias-aware** and **author-sensitive**, giving the comparison tool a richer basis for anti-echo matching.


In [None]:
# ===============================================
# Setup 9 of N — Source Bias + Author Tone Alignment (v5 merged for Code 1)
# ===============================================

import os, json, re, time, torch, chromadb, numpy as np
from pathlib import Path
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from rapidfuzz import fuzz
from collections import Counter

PROJECT_ROOT = Path("/content/anti_echo").resolve()
RAW_DIR = PROJECT_ROOT / "raw"
CONFIG_DIR = PROJECT_ROOT / "config"

SOURCE_BIAS_PATH = CONFIG_DIR / "source_bias.json"
LEANINGS_PATH = CONFIG_DIR / "political_leanings.json"
STANCES_PATH = CONFIG_DIR / "implied_stances.json"

source_bias = json.load(open(SOURCE_BIAS_PATH, encoding="utf-8"))
leanings_map = json.load(open(LEANINGS_PATH, encoding="utf-8"))
stances_map = json.load(open(STANCES_PATH, encoding="utf-8"))

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
device = "cuda" if torch.cuda.is_available() else "cpu"
stance_embedder = SentenceTransformer("all-mpnet-base-v2", device=device)

client_chroma = chromadb.PersistentClient(path=str(PROJECT_ROOT / CONFIG["chroma"]["dir"]))
stance_coll = client_chroma.get_or_create_collection(CONFIG["chroma_collections"]["stance"])

# --- Bias utilities ---
def bias_to_score(name: str) -> float:
    n = (name or "").lower().strip()
    if "progressive" in n or ("left" in n and "center" not in n): return -0.8
    if "center left" in n:  return -0.4
    if n == "center":       return 0.0
    if "center right" in n: return 0.4
    if "conservative" in n or "right" in n: return 0.8
    if "libertarian" in n:  return 0.6
    return 0.0

def tone_to_score(leaning: str) -> float:
    return bias_to_score(leaning)

def tone_match(bias_score, tone_score, threshold=0.3) -> bool:
    return abs(bias_score - tone_score) <= threshold

# --- GPT classification helper ---
def classify_article_tone_and_stance(title: str, text: str) -> dict:
    prompt = f"""
You are a political analyst.
Based on the article below, classify its overall political leaning (tone) and implied stance.

Leaning options: {', '.join(leanings_map.keys())}
Stance examples: {', '.join([s for cat in stances_map.values() for s in cat['families'].keys()])}

Return strict JSON with fields:
- political_leaning
- implied_stance
- summary (one sentence)

Article title: {title}
Excerpt: {text[:2000]}
"""
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=256,
        temperature=0.4
    )
    raw = resp.choices[0].message.content.strip()
    clean = re.sub(r"^```(?:json)?|```$", "", raw, flags=re.MULTILINE).strip()

    try:
        data = json.loads(clean)
        if not isinstance(data, dict): raise ValueError
    except Exception:
        data = {}
        for line in clean.splitlines():
            m = re.match(r'"?([^":]+)"?\s*:\s*"([^"]+)"', line.strip())
            if m: data[m.group(1).strip()] = m.group(2).strip()

    return {
        "political_leaning": data.get("political_leaning", "unknown").lower().strip(),
        "implied_stance": data.get("implied_stance", "unknown").lower().strip(),
        "summary": data.get("summary", "").strip()
    }

# --- Metadata sanitizer ---
def sanitize_metadata(meta: dict) -> dict:
    out = {}
    for k, v in meta.items():
        if v is None: out[k] = ""
        elif isinstance(v, (list, dict)): out[k] = json.dumps(v, ensure_ascii=False)
        elif isinstance(v, (str, int, float, bool)): out[k] = v
        else: out[k] = str(v)
    return out

# --- Main processing loop ---
start = time.time()
added = processed = 0
results_summary = []

for txt_path in RAW_DIR.glob("*.txt"):
    meta_path = txt_path.with_suffix(".meta.json")
    if not meta_path.exists(): continue
    meta = json.load(open(meta_path, encoding="utf-8"))
    title = meta.get("title") or Path(txt_path).stem
    source = (meta.get("source") or "unknown").lower()
    text = txt_path.read_text(encoding="utf-8").strip()

    bias_info = source_bias.get(source, {"bias_family": "unknown", "bias_score": 0.0})
    bias_family = bias_info.get("bias_family", "unknown")
    bias_score = float(bias_info.get("bias_score", 0.0))

    data = classify_article_tone_and_stance(title, text)
    leaning, stance, summary = data["political_leaning"], data["implied_stance"], data["summary"]

    tone_score = tone_to_score(leaning)
    author_match = tone_match(bias_score, tone_score)

    meta_enriched = {
        **meta,
        "political_leaning": leaning,
        "implied_stance": stance,
        "source_bias": bias_family,
        "bias_score": bias_score,
        "author_tone_match": bool(author_match),
        "stance_summary_text": summary
    }
    meta_enriched = sanitize_metadata(meta_enriched)

    embed_text = f"{leaning}\n{stance}\n{summary}"
    vec = stance_embedder.encode(embed_text, normalize_embeddings=True)
    stance_coll.upsert(
        ids=[f"{meta.get('id', Path(txt_path).stem)}::stance::summary"],
        embeddings=[vec.tolist()],
        metadatas=[meta_enriched]
    )

    results_summary.append((source, leaning, stance, author_match))
    added += 1; processed += 1
    print(f"[{source}] {title[:60]}... → leaning={leaning}, stance={stance}")

print(f"\nProcessed {processed} articles, added {added} stance vectors in {round(time.time()-start,1)}s.")


In [None]:
# ===============================================
# Stance / Source Bias Checker (with GPT Stance Summary Output)
# ===============================================

import chromadb, numpy as np, collections
from collections import Counter
from pathlib import Path

PROJECT_ROOT = Path("/content/anti_echo").resolve()
client = chromadb.PersistentClient(path=str(PROJECT_ROOT / CONFIG["chroma"]["dir"]))
stance_coll = client.get_collection(CONFIG["chroma_collections"]["stance"])

count = stance_coll.count()
print(f"Stance collection has {count} total records\n")
if count == 0:
    print("No stance vectors found.")
    raise SystemExit()

dump = stance_coll.get(include=["metadatas"], limit=min(5000, count))
metas = dump.get("metadatas", [])

# Aggregate stats per source
stats = collections.defaultdict(lambda: {"leanings": Counter(), "biases": Counter(), "matches": 0, "total": 0})
examples = collections.defaultdict(list)

for m in metas:
    if not isinstance(m, dict):
        continue
    src = (m.get("source") or "unknown").lower()
    lean = (m.get("political_leaning") or "unknown").lower()
    bias = (m.get("source_bias") or "unknown").lower()
    match = bool(m.get("author_tone_match"))
    summary = m.get("stance_summary_text", "").strip()
    title = m.get("title", "").strip()[:120]

    stats[src]["leanings"][lean] += 1
    stats[src]["biases"][bias] += 1
    stats[src]["matches"] += int(match)
    stats[src]["total"] += 1

    # Keep a few representative summaries per source
    if summary and len(examples[src]) < 3:
        examples[src].append((title, lean, bias, match, summary))

print("=== Source-level Stance Summary ===\n")
for src, d in stats.items():
    total = d["total"]
    match_rate = d["matches"] / total if total else 0
    top_lean = d["leanings"].most_common(1)[0] if d["leanings"] else ("-", 0)
    top_bias = d["biases"].most_common(1)[0] if d["biases"] else ("-", 0)

    print(f"{src.upper()}: {total} articles")
    print(f"  - top leaning: {top_lean[0]} ({top_lean[1]})")
    print(f"  - top bias:    {top_bias[0]} ({top_bias[1]})")
    print(f"  - tone match:  {match_rate:.1%}")

    if examples[src]:
        print("  - Example summaries:")
        for t, lean, bias, match, summ in examples[src]:
            match_str = "match" if match else "divergent"
            print(f"     * {t}")
            print(f"       leaning={lean}, bias={bias}, {match_str}")
            print(f"       summary: {summ[:200]}{'...' if len(summ) > 200 else ''}")
        print()
    print("-" * 90)

total_all = sum(d["total"] for d in stats.values())
overall_match = (sum(d["matches"] for d in stats.values()) / total_all) if total_all else 0

print("=== Overall ===")
print(f"Sources analyzed: {len(stats)}")
print(f"Articles total:   {total_all}")
print(f"Tone alignment:   {overall_match:.1%}")
print("\n--- Stance / Bias Checker Complete ---")


Setup 10 of N - persist feed state to HF and GitHub

Purpose
Snapshot feeds/feeds_state.json and feeds/index.json to HF as timestamped copies and latest pointers, and commit the same to GitHub. This keeps HF as the single source of truth while providing Git history.

Why this matters
Future runs and collaborators can always restore state. The UI you build later can also read the latest pointers.

Outputs
Four files on HF and two files in GitHub updated, with a short summary.

In [None]:
# ===============================================
# Setup 10 of N — persist feed state to HF and GitHub (config-driven)
# ===============================================

import os, json, base64
from datetime import datetime, timezone
from pathlib import Path
from huggingface_hub import upload_file
import requests
from getpass import getpass

PROJECT_ROOT = Path("/content/anti_echo").resolve()
FEEDS_DIR = PROJECT_ROOT / "feeds"
STATE_PATH = FEEDS_DIR / "feeds_state.json"
INDEX_PATH = FEEDS_DIR / "index.json"

HF_DATASET_ID = CONFIG["hf_dataset_id"]
HF_TOKEN = os.environ.get("HF_TOKEN","").strip() or getpass("Enter HF token: ")

GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN","").strip() or getpass("Enter GitHub token: ")
REPO_OWNER = CONFIG["github"]["owner"]
REPO_NAME  = CONFIG["github"]["repo"]
BRANCH     = CONFIG["github"].get("branch", "main")

ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
uploads = [
    (STATE_PATH, f"feeds/feeds_state_{ts}.json"),
    (INDEX_PATH, f"feeds/feed_index_{ts}.json"),
    (STATE_PATH, "feeds/feeds_state_latest.json"),
    (INDEX_PATH, "feeds/feed_index_latest.json"),
]

print("Uploading feed state to HF...")
for local, remote in uploads:
    upload_file(
        path_or_fileobj=str(local),
        path_in_repo=remote,
        repo_id=HF_DATASET_ID,
        repo_type="dataset",
        token=HF_TOKEN
    )
print("HF upload complete")

def gh_put(local_path: Path, repo_path: str, message: str):
    url = f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/contents/{repo_path}"
    headers = {"Authorization": f"Bearer {GITHUB_TOKEN}", "Accept": "application/vnd.github+json"}
    content = local_path.read_bytes()
    r = requests.get(url, headers=headers, timeout=20)
    sha = r.json().get("sha") if r.status_code == 200 else None
    payload = {"message": message, "content": base64.b64encode(content).decode(), "branch": BRANCH}
    if sha: payload["sha"] = sha
    resp = requests.put(url, headers=headers, json=payload, timeout=30)
    if resp.status_code not in (200,201):
        raise RuntimeError(f"GitHub push failed for {repo_path}: {resp.status_code} {resp.text[:200]}")

print("Committing feed state to GitHub...")
commit_msg = f"Update feed state and index {ts}"
for local, repo_path in [
    (STATE_PATH, f"feeds/feeds_state_{ts}.json"),
    (INDEX_PATH, f"feeds/feed_index_{ts}.json"),
    (STATE_PATH, "feeds/feeds_state_latest.json"),
    (INDEX_PATH, "feeds/feed_index_latest.json"),
]:
    gh_put(local, repo_path, commit_msg)
print("GitHub commit complete")


Setup 11 of N - package and push batch to HF, update registry on Git

Purpose
Package current Chroma collections into batch files, upload to HF under batches/{batch_id}/, then update artifacts/artifacts_registry.json in your GitHub repo. HF is the single source of truth. The registry keeps a chronological ledger for rebuilds.

Why this matters
Gives you versioned, reconstructable artifacts and a single source of truth for future runs and for a UI.

Outputs
topic_embeddings.npz, stance_embeddings.npz, metadata.jsonl, manifest.json uploaded to HF and registry updated on GitHub.

In [None]:
# ===============================================
# Setup 11 of N — package and push batch, update registry (config-driven)
# ===============================================

import os, json, time, uuid, warnings, logging, requests, base64
from datetime import datetime, timezone
from pathlib import Path
import numpy as np
from huggingface_hub import upload_file
import chromadb

warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.getLogger("chromadb").setLevel(logging.ERROR)

PROJECT_ROOT = Path("/content/anti_echo").resolve()
BATCH_DIR = PROJECT_ROOT / CONFIG["batch"]["base_dir"]
HF_DATASET_ID = CONFIG["hf_dataset_id"]

REPO_OWNER = CONFIG["github"]["owner"]
REPO_NAME  = CONFIG["github"]["repo"]
BRANCH     = CONFIG["github"].get("branch", "main")

client = chromadb.PersistentClient(path=str(PROJECT_ROOT / CONFIG["chroma"]["dir"]))
topic_coll  = client.get_collection(CONFIG["chroma_collections"]["topic"])
stance_coll = client.get_collection(CONFIG["chroma_collections"]["stance"])

timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
batch_id = f"batch_{timestamp}_{uuid.uuid4().hex[:8]}"
batch_path = BATCH_DIR / batch_id
batch_path.mkdir(parents=True, exist_ok=True)

topic_dump = topic_coll.get(include=["embeddings","metadatas"])
stance_dump = stance_coll.get(include=["embeddings","metadatas"])
topic_vecs = np.array(topic_dump["embeddings"], dtype=np.float16)
stance_vecs = np.array(stance_dump["embeddings"], dtype=np.float16)

topic_npz   = batch_path / CONFIG["batch"]["topic_file"]
stance_npz  = batch_path / CONFIG["batch"]["stance_file"]
meta_topic  = batch_path / "metadata_topic.jsonl"
meta_stance = batch_path / "metadata_stance.jsonl"
manifest_path = batch_path / CONFIG["batch"]["manifest_name"]

def write_meta_jsonl(path, ids, metas):
    with path.open("w", encoding="utf-8") as f:
        for rid, m in zip(ids, metas):
            rec = dict(m) if isinstance(m, dict) else {}
            rec["row_id"] = rid
            rec.setdefault("id", rec.get("id",""))
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

write_meta_jsonl(meta_topic,  topic_dump["ids"],  topic_dump["metadatas"])
write_meta_jsonl(meta_stance, stance_dump["ids"], stance_dump["metadatas"])
np.savez_compressed(topic_npz, topic_vecs)
np.savez_compressed(stance_npz, stance_vecs)

manifest = {
    "schema_version": 2,
    "batch_id": batch_id,
    "created_at": timestamp,
    "models": CONFIG["embeddings"],
    "counts": {"topic": int(topic_vecs.shape[0]), "stance": int(stance_vecs.shape[0])},
    "hf_dataset_id": HF_DATASET_ID,
    "paths": {
        "embeddings_topic": f"batches/{batch_id}/{topic_npz.name}",
        "embeddings_stance": f"batches/{batch_id}/{stance_npz.name}",
        "metadata_topic": f"batches/{batch_id}/{meta_topic.name}",
        "metadata_stance": f"batches/{batch_id}/{meta_stance.name}",
        "manifest": f"batches/{batch_id}/{manifest_path.name}",
    }
}
manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")

print("Uploading batch to HF...")
HF_TOKEN = os.environ["HF_TOKEN"]
for fpath in [topic_npz, stance_npz, meta_topic, meta_stance, manifest_path]:
    upload_file(
        path_or_fileobj=str(fpath),
        path_in_repo=f"batches/{batch_id}/{fpath.name}",
        repo_id=HF_DATASET_ID,
        repo_type="dataset",
        token=HF_TOKEN
    )
print("HF batch upload complete")

REGISTRY_URL = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/artifacts/artifacts_registry.json"
try:
    registry = requests.get(REGISTRY_URL, timeout=20).json()
except Exception:
    registry = {"version": 1, "batches": []}

registry.setdefault("batches", []).append(manifest)
registry["version"] = int(registry.get("version", 0)) + 1

new_registry_bytes = json.dumps(registry, indent=2).encode("utf-8")
url = f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/contents/artifacts/artifacts_registry.json"
headers = {"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}", "Accept": "application/vnd.github+json"}
r = requests.get(url, headers=headers, timeout=20)
sha = r.json().get("sha") if r.status_code == 200 else None
payload = {"message": f"Update artifacts registry {timestamp}",
           "content": base64.b64encode(new_registry_bytes).decode(),
           "branch": BRANCH}
if sha: payload["sha"] = sha
resp = requests.put(url, headers=headers, json=payload, timeout=30)
if resp.status_code not in (200,201):
    raise RuntimeError(f"GitHub registry push failed: {resp.status_code} {resp.text[:200]}")
print("Registry updated on GitHub")
