# Anti Echo Chamber

This notebook does the following things:
- Load topic and stance embeddings from Hugging Face
- Load transformer models for topic, stance, and summarization
- Upload and analyze a news article
- Retrieve similar topics with opposing viewpoints


In [1]:
# ====================================================
# Setup
# ====================================================
!pip install -q chromadb sentence-transformers transformers huggingface-hub pymupdf beautifulsoup4 scikit-learn

import os, json, gc, requests
import numpy as np
from pathlib import Path
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import fitz
import chromadb
from google.colab import files
from IPython.display import display, Markdown

# --- Disable telemetry noise ---
os.environ["CHROMA_TELEMETRY_ENABLED"] = "false"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

# ====================================================
# Configuration
# ====================================================
HF_DATASET_ID = "zanimal/anti-echo-artifacts"
REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

TOPIC_MODEL_NAME  = "intfloat/e5-base-v2"
STANCE_MODEL_NAME = "Snowflake/snowflake-arctic-embed-l"
SUMMARIZER_MODEL_NAME = "facebook/bart-large-cnn"

CHROMA_DIR = Path("chroma_db")
TOPIC_COLL_NAME = "news_topic"
STANCE_COLL_NAME = "news_stance"


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.9/19.9 MB[0m [31m129.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m115.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m100.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m12.0 MB/s[0m eta [36m0

## Load or Rebuild Chroma from Hugging Face Dataset
This step fetches topic and stance embeddings from your Hugging Face dataset and constructs local Chroma collections.


In [2]:
# ====================================================
# Load or rebuild Chroma collections (auto-compatible across Chroma versions)
# ====================================================

import os, json, requests, numpy as np, chromadb
from huggingface_hub import hf_hub_download
from pathlib import Path

# --- Set environment variables for persistent path (works in 0.5 transitional builds) ---
CHROMA_DIR = Path("/content/chroma_data")
!rm -rf {CHROMA_DIR}
CHROMA_DIR.mkdir(parents=True, exist_ok=True)

os.environ["CHROMA_DB_IMPL"] = "duckdb+parquet"       # ✅ safe for all new versions
os.environ["PERSIST_DIRECTORY"] = str(CHROMA_DIR)
os.environ["ANONYMIZED_TELEMETRY"] = "False"

# --- Initialize client with whatever backend is available ---
client = chromadb.Client()

def is_chroma_empty(client):
    try:
        cols = client.list_collections()
        if not cols:
            return True
        counts = [client.get_collection(c.name).count() for c in cols]
        return all(c == 0 for c in counts)
    except Exception:
        return True

need_rebuild = is_chroma_empty(client)

if not need_rebuild:
    print("Using existing non-empty Chroma database.")
    topic_coll = client.get_or_create_collection(TOPIC_COLL_NAME, metadata={"hnsw:space": "cosine"})
    stance_coll = client.get_or_create_collection(STANCE_COLL_NAME, metadata={"hnsw:space": "cosine"})
else:
    print("Rebuilding Chroma collections from Hugging Face dataset…")
    topic_coll = client.get_or_create_collection(TOPIC_COLL_NAME, metadata={"hnsw:space": "cosine"})
    stance_coll = client.get_or_create_collection(STANCE_COLL_NAME, metadata={"hnsw:space": "cosine"})

    REGISTRY_URL = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/artifacts/artifacts_registry.json"
    REGISTRY = requests.get(REGISTRY_URL, timeout=30).json()

    for b in REGISTRY.get("batches", []):
        paths = b.get("paths") or {}
        if not all(k in paths for k in ["embeddings_topic", "embeddings_stance", "metadata_topic", "metadata_stance"]):
            continue

        batch_id = b.get("batch_id", f"batch_{len(paths)}")
        print(f"Ingesting {batch_id} …")

        # --- Load embeddings and metadata ---
        t_vecs = np.load(hf_hub_download(HF_DATASET_ID, paths["embeddings_topic"], repo_type="dataset"))["arr_0"]
        s_vecs = np.load(hf_hub_download(HF_DATASET_ID, paths["embeddings_stance"], repo_type="dataset"))["arr_0"]

        t_meta = [json.loads(l) for l in open(hf_hub_download(HF_DATASET_ID, paths["metadata_topic"], repo_type="dataset"), encoding="utf-8")]
        s_meta = [json.loads(l) for l in open(hf_hub_download(HF_DATASET_ID, paths["metadata_stance"], repo_type="dataset"), encoding="utf-8")]

        # --- Unique IDs per vector ---
        topic_ids = [m.get("row_id") or f"{m.get('id', f'topic::{i}')}::topic::{i}::{batch_id}" for i, m in enumerate(t_meta)]
        stance_ids = [m.get("row_id") or f"{m.get('id', f'stance::{i}')}::stance::{i}::{batch_id}" for i, m in enumerate(s_meta)]

        # --- Deduplicate within batch ---
        def dedupe(ids, vecs, metas):
            seen = set(); out_ids, out_vecs, out_metas = [], [], []
            for uid, v, m in zip(ids, vecs, metas):
                if uid not in seen:
                    seen.add(uid); out_ids.append(uid); out_vecs.append(v); out_metas.append(m)
            return out_ids, out_vecs, out_metas

        topic_ids, t_vecs_list, t_meta = dedupe(topic_ids, t_vecs.tolist(), t_meta)
        stance_ids, s_vecs_list, s_meta = dedupe(stance_ids, s_vecs.tolist(), s_meta)

        # --- Upsert ---
        topic_coll.upsert(ids=topic_ids, embeddings=t_vecs_list, metadatas=t_meta)
        stance_coll.upsert(ids=stance_ids, embeddings=s_vecs_list, metadatas=s_meta)

# --- Final check ---
topic_coll = client.get_collection(TOPIC_COLL_NAME)
stance_coll = client.get_collection(STANCE_COLL_NAME)
print(f"Chroma ready with {topic_coll.count()} topic and {stance_coll.count()} stance vectors.")
print(f"Chroma path: {CHROMA_DIR}")


Rebuilding Chroma collections from Hugging Face dataset…
Ingesting batch_20251014T190038Z_9cc10549 …


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


batches/batch_20251014T190038Z_9cc10549/(…):   0%|          | 0.00/2.35M [00:00<?, ?B/s]

batches/batch_20251014T190038Z_9cc10549/(…):   0%|          | 0.00/936k [00:00<?, ?B/s]

metadata_topic.jsonl: 0.00B [00:00, ?B/s]

metadata_stance.jsonl: 0.00B [00:00, ?B/s]

Chroma ready with 1675 topic and 868 stance vectors.
Chroma path: /content/chroma_data


Creates the local config/ folder if missing.

Downloads your political_leanings.json and implied_stances.json directly from GitHub (AHMerrill/anti-echo-chamber/main/config/...).

Ensures both files exist before your classifier loads them.

Prints “Fetched …” the first time, and “already exists” afterward.

In [14]:
# ====================================================
# Ensure config files exist locally (pull from GitHub)
# ====================================================
import os, requests
from pathlib import Path
import yaml

config_dir = Path("config")
config_dir.mkdir(exist_ok=True)

repo_owner = "AHMerrill"
repo_name  = "anti-echo-chamber"
branch     = "main"

# Files to fetch
config_files = [
    "config/config.yaml",
    "config/political_leanings.json",
    "config/implied_stances.json"
]

for f in config_files:
    url = f"https://raw.githubusercontent.com/{repo_owner}/{repo_name}/{branch}/{f}"
    local_path = Path(f)
    if not local_path.exists():
        print(f"Fetching {f} from GitHub…")
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        local_path.write_text(r.text, encoding="utf-8")
    else:
        print(f"{f} already exists locally.")

# --- Load config.yaml ---
CONFIG_PATH = Path("config/config.yaml")
if CONFIG_PATH.exists():
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        CONFIG = yaml.safe_load(f)
    print("Loaded CONFIG from config.yaml")
else:
    raise FileNotFoundError("config/config.yaml not found or failed to fetch.")


Fetching config/config.yaml from GitHub…
config/political_leanings.json already exists locally.
config/implied_stances.json already exists locally.
Loaded CONFIG from config.yaml


## Load Embedding and Summarization Models
We will use the following:
- intfloat/e5-base-v2 for topic and stance embeddings
- facebook/bart-large-cnn for summarization


In [4]:
# ====================================================
# Load embedding + summarization models
# ====================================================
print("Loading models...")
topic_model  = SentenceTransformer(TOPIC_MODEL_NAME)
stance_model = SentenceTransformer(STANCE_MODEL_NAME)
tok_sum  = AutoTokenizer.from_pretrained(SUMMARIZER_MODEL_NAME)
model_sum = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_MODEL_NAME)
print("Models loaded successfully.")

Loading models...


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Models loaded successfully.


## Upload an Article
Upload a .txt, .pdf, or .html file for analysis.


In [5]:
# ====================================================
# Upload and extract article
# ====================================================
uploaded = files.upload()
filename = list(uploaded.keys())[0]
ext = Path(filename).suffix.lower()

def extract_text(file_path):
    if ext == ".txt":
        return open(file_path, encoding="utf-8", errors="ignore").read()
    elif ext == ".pdf":
        text = ""
        with fitz.open(file_path) as doc:
            for page in doc:
                text += page.get_text()
        return text
    elif ext == ".html":
        html = open(file_path, encoding="utf-8", errors="ignore").read()
        soup = BeautifulSoup(html, "html.parser")
        return soup.get_text(separator="\n")
    else:
        raise ValueError("Unsupported file type")

text = extract_text(filename)
display(Markdown(f"**Extracted first 2,000 characters:**\n\n{text[:2000]}..."))

Saving test article.txt to test article.txt


**Extracted first 2,000 characters:**

June 03, 2020
Send in the Troops
Send in the Troops
New York Times
Senator Tom Cotton

This week, rioters have plunged many American cities into anarchy, recalling the widespread violence of the 1960s.

New York City suffered the worst of the riots Monday night, as Mayor Bill de Blasio stood by while Midtown Manhattan descended into lawlessness. Bands of looters roved the streets, smashing and emptying hundreds of businesses. Some even drove exotic cars; the riots were carnivals for the thrill-seeking rich as well as other criminal elements.

Outnumbered police officers, encumbered by feckless politicians, bore the brunt of the violence. In New York State, rioters ran over officers with cars on at least three occasions. In Las Vegas, an officer is in "grave" condition after being shot in the head by a rioter. In St. Louis, four police officers were shot as they attempted to disperse a mob throwing bricks and dumping gasoline; in a separate incident, a 77-year-old retired police captain was shot to death as he tried to stop looters from ransacking a pawnshop. This is "somebody's granddaddy," a bystander screamed at the scene.

Some elites have excused this orgy of violence in the spirit of radical chic, calling it an understandable response to the wrongful death of George Floyd. Those excuses are built on a revolting moral equivalence of rioters and looters to peaceful, law-abiding protesters. A majority who seek to protest peacefully shouldn't be confused with bands of miscreants.

But the rioting has nothing to do with George Floyd, whose bereaved relatives have condemned violence. On the contrary, nihilist criminals are simply out for loot and the thrill of destruction, with cadres of left-wing radicals like antifa infiltrating protest marches to exploit Floyd's death for their own anarchic purposes.

These rioters, if not subdued, not only will destroy the livelihoods of law-abiding citizens but will also take more innocent lives. Many poor communities that still...

## Summarize and Compute Embeddings
This step summarizes the article for stance analysis, then embeds both the summary and full text for topic analysis.


In [15]:
# ====================================================
# Retrieval-time replication of Setup 9 + 10
# ====================================================
import json, numpy as np, nltk, torch, time, requests
from pathlib import Path
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
from IPython.display import Markdown, display

device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Ensure sentence tokenizer ---
for pkg in ["punkt", "punkt_tab"]:
    try:
        nltk.data.find(f"tokenizers/{pkg}")
    except LookupError:
        nltk.download(pkg)

# --- Ensure topics.json and stance/leaning configs ---
config_dir = Path("config"); config_dir.mkdir(exist_ok=True)
repo_owner = "AHMerrill"; repo_name = "anti-echo-chamber"; branch = "main"
for f in ["config/topics.json", "config/political_leanings.json", "config/implied_stances.json"]:
    url = f"https://raw.githubusercontent.com/{repo_owner}/{repo_name}/{branch}/{f}"
    local_path = Path(f)
    if not local_path.exists():
        r = requests.get(url, timeout=30); r.raise_for_status()
        local_path.write_text(r.text, encoding="utf-8")

TOPIC_LABELS = json.load(open("config/topics.json"))
POLITICAL_LEANINGS = json.load(open("config/political_leanings.json"))
IMPLIED_STANCES    = json.load(open("config/implied_stances.json"))
leaning_labels = list(POLITICAL_LEANINGS.keys())
stance_labels  = list(IMPLIED_STANCES.keys())

# --- Topic embedding parameters (same as Setup 9) ---
topic_model_name = CONFIG["embeddings"]["topic_model"]
chunk_tokens = int(CONFIG["embeddings"]["chunk_tokens"])
MAX_TOPICS = CONFIG["topics"].get("max_topics_per_article", 5)
TOPIC_THRESHOLD = CONFIG["topics"].get("similarity_threshold", 0.4)

tokenizer = AutoTokenizer.from_pretrained(topic_model_name, use_fast=True)
tokenizer.model_max_length = 512

embedder = SentenceTransformer(topic_model_name, device=device)

# --- Build topic centroids (same as scraper) ---
print(f"Encoding {len(TOPIC_LABELS)} topic anchors...")
topic_anchors = {}
for label, phrases in TOPIC_LABELS.items():
    embs = embedder.encode(
        phrases,
        normalize_embeddings=True,
        batch_size=8 if device == "cuda" else 16,
        show_progress_bar=False,
    )
    topic_anchors[label] = np.mean(np.asarray(embs), axis=0)
print(f"Encoded {len(topic_anchors)} topic centroids.")

# --- Helper functions (identical to scraper) ---
def sent_split(text):
    return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]

def encode(texts):
    if isinstance(texts, str): texts = [texts]
    bs = 4 if torch.cuda.is_available() else 16
    return np.array(embedder.encode(
        texts,
        batch_size=bs,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    ))

def chunk_by_tokens(text, max_tokens=512, overlap=64):
    ids = tokenizer(text, add_special_tokens=False, return_attention_mask=False)["input_ids"]
    step = max_tokens - overlap
    chunks = []
    for i in range(0, len(ids), step):
        j = min(i + max_tokens, len(ids))
        piece = tokenizer.decode(ids[i:j], skip_special_tokens=True)
        if piece.strip(): chunks.append(piece)
    return chunks

def topic_vecs(text):
    sents = sent_split(text)
    if not sents: return []
    if len(sents) < 2:
        chunks = chunk_by_tokens(" ".join(sents), chunk_tokens, 64)
        vecs = encode(chunks)
        return [vecs.mean(axis=0)]
    emb = encode(sents)
    k = min(max(1, len(sents)//8), 8)
    labels = AgglomerativeClustering(n_clusters=k).fit_predict(emb)
    segs = [" ".join([s for s, l in zip(sents, labels) if l == lab]) for lab in sorted(set(labels))]
    out = []
    for seg in segs:
        chunks = chunk_by_tokens(seg, chunk_tokens, 64)
        if not chunks: continue
        pooled = encode(chunks).mean(axis=0)
        out.append(pooled)
    return out

def match_topics(vec):
    scores = {label: cosine_similarity([vec], [anchor])[0][0] for label, anchor in topic_anchors.items()}
    sorted_topics = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    topics = []
    for i, (k, v) in enumerate(sorted_topics[:MAX_TOPICS]):
        if i == 0 or v >= TOPIC_THRESHOLD:
            topics.append({"topic_label": k, "similarity": float(v)})
    if not topics:
        topics = [{"topic_label": "General / Miscellaneous", "similarity": 0.0}]
    return topics

def sanitize(meta: dict):
    out = {}
    for k, v in meta.items():
        if isinstance(v, (str, int, float, bool)) or v is None:
            out[k] = "" if v is None else v
        else:
            out[k] = str(v)
    return out

# --- Compute topic vectors + matches (Setup 9 parity) ---
vecs = topic_vecs(text)
topic_results = []
for i, v in enumerate(vecs):
    topics_detected = match_topics(v)
    topics_json = json.dumps(topics_detected, ensure_ascii=False)
    topics_flat = [t["topic_label"] for t in topics_detected]
    top_topic = topics_detected[0]["topic_label"] if topics_detected else ""
    topic_results.append({
        "topic_index": i,
        "topic_labels_json": topics_json,
        "topics_flat": ";".join(topics_flat),
        "top_topic": top_topic,
    })

# --- Setup 10 parity: stance and summary embeddings ---
flan_tok = AutoTokenizer.from_pretrained("google/flan-t5-large")
flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large").to(device)
summarizer = pipeline("summarization", model=model_sum, tokenizer=tok_sum, device=0 if device == "cuda" else -1)

allowed_leanings = ", ".join(leaning_labels)
allowed_stances = ", ".join(stance_labels)
prompt = (
    "You are a classification model. Decide the author's implied political leaning and implied stance.\n"
    f"Political leaning (choose exactly one): {allowed_leanings}\n"
    f"Implied stance (choose exactly one): {allowed_stances}\n"
    "Respond with ONLY the two chosen labels, each on its own line.\n\n"
    f"Article:\n{text[:2000]}"
)
inputs = flan_tok(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
out_ids = flan_model.generate(**inputs, max_new_tokens=32, num_beams=4)
output = flan_tok.decode(out_ids[0], skip_special_tokens=True).lower().splitlines()
leaning = output[0].strip() if len(output) > 0 else "apolitical_or_unknown"
stance  = output[1].strip() if len(output) > 1 else "unknown"

summary = summarizer(text[:3000], max_length=40, min_length=15, do_sample=False)[0]["summary_text"].strip()
hybrid_text = f"{leaning}\n{stance}\n{summary}".strip()

stance_vec = stance_model.encode([hybrid_text], normalize_embeddings=True)[0]
topic_vecs_all = topic_model.encode([summary, text[:3000]], normalize_embeddings=True)
topic_vec_mean = topic_vecs_all.mean(axis=0)

# --- Display (human readable) ---
display(Markdown(f"### Summary\n> {summary}"))
display(Markdown(f"**Detected Leaning:** {leaning}  **Implied Stance:** {stance}"))
display(Markdown(f"**Top Topics:** {topic_results[0]['topics_flat']}"))


Encoding 1400 topic anchors...
Encoded 1400 topic centroids.


Device set to use cuda:0


### Summary
> Rioters have plunged many American cities into anarchy, recalling the widespread violence of the 1960s. Outnumbered police officers, encumbered by feckless politicians, bore the brunt of

**Detected Leaning:** populist_left  **Implied Stance:** unknown

**Top Topics:** Politics / US / Federal / Subdomain 1;Politics / US / Federal / Subdomain 2;Politics / US / Federal / Subdomain 3;Politics / US / Federal / Subdomain 4;Politics / US / Federal / Subdomain 5

## Query for Similar Topics
Retrieve the 100 most similar articles by topic embedding.


## Rank by Opposing Stance
We compute cosine similarity between stance embeddings and display opposing viewpoints first.


In [26]:
# ====================================================
# Retrieve and rank by topic overlap + stance opposition
# ====================================================

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- Normalize the averaged topic vector ---
topic_vec_mean = topic_vec_mean / np.linalg.norm(topic_vec_mean)

# --- Query for many potential candidates ---
results = topic_coll.query(
    query_embeddings=[topic_vec_mean.tolist()],
    n_results=300,
    include=["metadatas"]
)

# --- Flatten results ---
flat_results = []
if results.get("metadatas"):
    for batch in results["metadatas"]:
        if batch:
            flat_results.extend(batch)

print(f"Fetched {len(flat_results)} topic candidates from Chroma.")

if not flat_results:
    display(Markdown("No topic matches found."))
else:
    # --- Topics detected for the uploaded article ---
    uploaded_topics = set([t["topic_label"] for t in topic_results[0:5] for t in json.loads(topic_results[0]["topic_labels_json"])])

    def compute_topic_overlap(meta):
        chroma_topics = set(str(meta.get("topics_flat", "")).split(";"))
        return len(uploaded_topics.intersection(chroma_topics))

    # --- Compute overlap score (0–5) ---
    for m in flat_results:
        m["topic_overlap"] = compute_topic_overlap(m)

    # --- Filter: require at least 1 overlapping topic ---
    filtered = [m for m in flat_results if m["topic_overlap"] > 0]
    print(f"{len(filtered)} articles share at least one topic with the uploaded article.")

    # ====================================================
    # Optional stance ranking (if stance_coll available)
    # ====================================================
    stance_coll = client.get_or_create_collection("news_stance")
    enriched_results = []

    for meta in filtered:
        base_id = meta.get("id") or meta.get("row_id", "").split("::")[0]
        stance_label = stance_coll.get(ids=[f"{base_id}::stance::label"])
        stance_summary = stance_coll.get(ids=[f"{base_id}::stance::summary"])
        label_meta = stance_label["metadatas"][0] if stance_label and stance_label["metadatas"] else {}
        summary_meta = stance_summary["metadatas"][0] if stance_summary and stance_summary["metadatas"] else {}
        merged = {
            **meta,
            "political_leaning": label_meta.get("political_leaning", "unknown"),
            "implied_stance": label_meta.get("implied_stance", "unknown"),
            "stance_summary_text": summary_meta.get("stance_summary_text", ""),
        }
        enriched_results.append(merged)

    stance_texts = [
        f"{m.get('political_leaning','')}\n{m.get('implied_stance','')}\n{m.get('stance_summary_text','')}"
        for m in enriched_results
    ]
    stance_embeddings = stance_model.encode(stance_texts, normalize_embeddings=True)
    stance_sims = cosine_similarity([stance_vec], stance_embeddings)[0]

    # --- Rank: highest topic overlap first, then lowest stance similarity (most opposing) ---
    ranked = sorted(
        zip(enriched_results, stance_sims),
        key=lambda x: (-x[0]["topic_overlap"], x[1])
    )

    def sim_label(s):
        if s < 0.2: return "Very Dissimilar"
        elif s < 0.4: return "Dissimilar"
        elif s < 0.6: return "Somewhat Similar"
        elif s < 0.8: return "Similar"
        else: return "Very Similar"

    display(Markdown("### Results: Shared Topics, Opposing Perspectives"))

    for meta, sim in ranked[:10]:
        topic_display = meta.get("top_topic") or "(topic unknown)"
        leaning = meta.get("political_leaning", "unknown")
        stance = meta.get("implied_stance", "unknown")
        summary = meta.get("stance_summary_text", "(no summary available)")
        url = meta.get("url", "#")
        title = meta.get("title", "(untitled)")
        domain = meta.get("domain", "unknown")
        overlap = meta.get("topic_overlap", 0)

        md = f"""
**{title}**
Source: {domain}
Topic Overlap: `{overlap}/5`
Topic: *{topic_display}*
Political Leaning: `{leaning}`
Implied Stance: `{stance}`
Stance Similarity: {sim:.2f} ({sim_label(sim)})

{summary}

[Read original article]({url})
"""
        display(Markdown(md))


Fetched 300 topic candidates from Chroma.
91 articles share at least one topic with the uploaded article.


### Results: Shared Topics, Opposing Perspectives


**Johnson announces global effort to nominate Trump for Nobel Peace Prize | Fox News**  
Source: www.foxnews.com  
Topic Overlap: `5/5`  
Topic: *Politics / US / Federal / Subdomain 1*  
Political Leaning: `apolitical_or_unknown`  
Implied Stance: `unknown`  
Stance Similarity: 0.48 (Somewhat Similar)  

House Speaker Mike Johnson, R-La., announced a global effort to nominate President Donald Trump for the Nobel Peace Prize. Johnson made the announcement during his daily government shutdown news conference on the

[Read original article](https://www.foxnews.com/politics/mike-johnson-world-leaders-nominate-trump-nobel-peace-prize-after-israel-hamas-deal)



**Man jailed for five years for threatening to kill Nigel Farage on TikTok**  
Source: www.bbc.com  
Topic Overlap: `5/5`  
Topic: *Politics / US / Federal / Subdomain 1*  
Political Leaning: `apolitical_or_unknown`  
Implied Stance: `unknown`  
Stance Similarity: 0.48 (Somewhat Similar)  

Fayaz Khan, 26, made a gun gesture with his hand and named the Reform UK leader in a TikTok post in October 2024. Khan was one of 65 migrants on board

[Read original article](https://www.bbc.com/news/articles/cj97lkmd23po?at_medium=RSS&at_campaign=rss)



**Brennan Makes Jab At Biden Over Israel Peace Deal**  
Source: thefederalist.com  
Topic Overlap: `5/5`  
Topic: *Politics / US / Federal / Subdomain 1*  
Political Leaning: `apolitical_or_unknown`  
Implied Stance: `unknown`  
Stance Similarity: 0.50 (Somewhat Similar)  

Face The Nation Anchor Margaret Brennan gives President Trump credit for his role in the Israel peace agreement. Brennan also throws former President Joe Biden under the bus. Vice President J.D.

[Read original article](https://thefederalist.com/2025/10/13/brennan-makes-jab-at-biden-over-israel-peace-deal-all-the-question-from-face-the-nation/?utm_source=rss&utm_medium=rss&utm_campaign=brennan-makes-jab-at-biden-over-israel-peace-deal-all-the-question-from-face-the-nation)



**Brennan Makes Jab At Biden Over Israel Peace Deal**  
Source: thefederalist.com  
Topic Overlap: `5/5`  
Topic: *Politics / US / Federal / Subdomain 1*  
Political Leaning: `apolitical_or_unknown`  
Implied Stance: `unknown`  
Stance Similarity: 0.50 (Somewhat Similar)  

Face The Nation Anchor Margaret Brennan gives President Trump credit for his role in the Israel peace agreement. Brennan also throws former President Joe Biden under the bus. Vice President J.D.

[Read original article](https://thefederalist.com/2025/10/13/brennan-makes-jab-at-biden-over-israel-peace-deal-all-the-question-from-face-the-nation/?utm_source=rss&utm_medium=rss&utm_campaign=brennan-makes-jab-at-biden-over-israel-peace-deal-all-the-question-from-face-the-nation)



**play**  
Source: www.aljazeera.com  
Topic Overlap: `5/5`  
Topic: *Politics / US / Federal / Subdomain 1*  
Political Leaning: `apolitical_or_unknown`  
Implied Stance: `unknown`  
Stance Similarity: 0.52 (Somewhat Similar)  

Madagascar’s parliament has voted to impeach embattled President Andry Rajoelina. The vote came hours after he confirmed he had fled the country in the wake of an

[Read original article](https://www.aljazeera.com/news/2025/10/14/who-is-in-charge-of-madagascar-after-president-rajoelina-flees?traffic_source=rss)



**For Trump’s perceived enemies, the process may be the punishment**  
Source: theconversation.com  
Topic Overlap: `5/5`  
Topic: *Politics / US / Federal / Subdomain 1*  
Political Leaning: `apolitical_or_unknown`  
Implied Stance: `unknown`  
Stance Similarity: 0.52 (Somewhat Similar)  

James Comey pleaded not guilty to two criminal charges in a federal court in Alexandria, Virginia, on Oct. 8, 2025. The charges allege that Comey lied to Congress in September 2020 when

[Read original article](https://theconversation.com/for-trumps-perceived-enemies-the-process-may-be-the-punishment-266747)



**Punishing Propagandists Starts With Starving Them Of GOP Guests**  
Source: thefederalist.com  
Topic Overlap: `5/5`  
Topic: *Politics / US / Federal / Subdomain 1*  
Political Leaning: `apolitical_or_unknown`  
Implied Stance: `unknown`  
Stance Similarity: 0.53 (Somewhat Similar)  

J.D. Vance was nearly 13 minutes into a masterful media takedown of ABC News’ George Stephanopoulos when he was muted and the show abruptly cut to a commercial break.

[Read original article](https://thefederalist.com/2025/10/13/punishing-propagandists-like-stephanopoulos-for-playing-dirty-starts-with-starving-them-of-gop-guests/?utm_source=rss&utm_medium=rss&utm_campaign=punishing-propagandists-like-stephanopoulos-for-playing-dirty-starts-with-starving-them-of-gop-guests)



**Welker Asks If Trump Or Biden Better Negotiator: Meet The Press**  
Source: thefederalist.com  
Topic Overlap: `5/5`  
Topic: *Politics / US / Federal / Subdomain 1*  
Political Leaning: `apolitical_or_unknown`  
Implied Stance: `unknown`  
Stance Similarity: 0.53 (Somewhat Similar)  

Meet the Press Anchor Kristen Welker interviews Republicans and Democrats. Welker asks Vice President J.D. Vance few questions about the newly minted peace deal in Israel.

[Read original article](https://thefederalist.com/2025/10/13/welker-suggests-trump-weaponized-doj-to-go-after-comey-letitia-james-all-the-questions-from-meet-the-press/?utm_source=rss&utm_medium=rss&utm_campaign=welker-suggests-trump-weaponized-doj-to-go-after-comey-letitia-james-all-the-questions-from-meet-the-press)



**Welker Asks If Trump Or Biden Better Negotiator: Meet The Press**  
Source: thefederalist.com  
Topic Overlap: `5/5`  
Topic: *Politics / US / Federal / Subdomain 1*  
Political Leaning: `apolitical_or_unknown`  
Implied Stance: `unknown`  
Stance Similarity: 0.53 (Somewhat Similar)  

Meet the Press Anchor Kristen Welker interviews Republicans and Democrats. Welker asks Vice President J.D. Vance few questions about the newly minted peace deal in Israel.

[Read original article](https://thefederalist.com/2025/10/13/welker-suggests-trump-weaponized-doj-to-go-after-comey-letitia-james-all-the-questions-from-meet-the-press/?utm_source=rss&utm_medium=rss&utm_campaign=welker-suggests-trump-weaponized-doj-to-go-after-comey-letitia-james-all-the-questions-from-meet-the-press)



**Up First briefing: Palestinian detainees return to Gaza; ICE tactics : NPR**  
Source: www.npr.org  
Topic Overlap: `5/5`  
Topic: *Politics / US / Federal / Subdomain 1*  
Political Leaning: `apolitical_or_unknown`  
Implied Stance: `unknown`  
Stance Similarity: 0.54 (Somewhat Similar)  

Israel released nearly 2,000 prisoners yesterday as part of the initial phase of a ceasefire agreement with Hamas. Some of the Palestinians who returned to Gaza were journalists, doctors and first responders.

[Read original article](https://www.npr.org/2025/10/14/g-s1-93331/up-first-newsletter-gaza-israel-trump-middle-east-ice)
