# Anti Echo Chamber

This notebook does the following things:
- Load topic and stance embeddings from Hugging Face
- Load transformer models for topic, stance, and summarization
- Upload and analyze a news article
- Retrieve similar topics with opposing viewpoints


In [1]:
# ====================================================
# Setup
# ====================================================
!pip install -q chromadb sentence-transformers transformers huggingface-hub pymupdf beautifulsoup4 scikit-learn

import os, json, gc, requests
import numpy as np
from pathlib import Path
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import fitz
import chromadb
from google.colab import files
from IPython.display import display, Markdown

# --- Disable telemetry noise ---
os.environ["CHROMA_TELEMETRY_ENABLED"] = "false"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

# ====================================================
# Configuration
# ====================================================
# Hugging Face dataset repo that hosts the embedding and metadata artifact files.
HF_DATASET_ID = "zanimal/anti-echo-artifacts"
# GitHub repository coordinates; combined into the raw URL of the artifacts registry JSON.
REPO_OWNER = "AHMerrill"
REPO_NAME = "anti-echo-chamber"
BRANCH = "main"

# Model identifiers: one sentence encoder for topic vectors, a different one for
# stance vectors, and a seq2seq model used to summarize the uploaded article.
TOPIC_MODEL_NAME  = "intfloat/e5-base-v2"
STANCE_MODEL_NAME = "Snowflake/snowflake-arctic-embed-l"
SUMMARIZER_MODEL_NAME = "facebook/bart-large-cnn"

# Local Chroma persistence directory and the names of the two collections kept in it.
CHROMA_DIR = Path("chroma_db")
TOPIC_COLL_NAME = "news_topic"
STANCE_COLL_NAME = "news_stance"


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m61.4/67.3 kB[0m [31m133.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.9/19.9 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m28.0 MB/s

## Load or Rebuild Chroma from Hugging Face Dataset
This step fetches topic and stance embeddings from your Hugging Face dataset and constructs local Chroma collections.


In [None]:
# ====================================================
# Load or rebuild Chroma collections
# ====================================================
# Rows sent per upsert call; Chroma rejects very large single batches, so each
# registry batch is written in slices of this size.
UPSERT_CHUNK_SIZE = 1000
# A registry batch is only usable when all four artifact paths are present.
REQUIRED_PATH_KEYS = ("embeddings_topic", "embeddings_stance", "metadata_topic", "metadata_stance")


def _fetch_artifact(rel_path):
    """Download one file of the Hugging Face dataset and return its local path."""
    return hf_hub_download(HF_DATASET_ID, rel_path, repo_type="dataset")


def _load_vectors(path):
    """Return the ``arr_0`` array stored in an ``.npz`` archive, closing the archive afterwards."""
    with np.load(path) as archive:
        return archive["arr_0"]


def _read_jsonl(path):
    """Parse a JSON-lines file into a list of objects, ignoring blank lines."""
    with open(path, encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


def _upsert_in_chunks(coll, vecs, metas, id_prefix):
    """Upsert ``vecs``/``metas`` into ``coll`` in slices of ``UPSERT_CHUNK_SIZE`` rows.

    Rows without an ``"id"`` in their metadata get ``f"{id_prefix}::{row_index}"``.
    The prefix must be unique per registry batch, otherwise fallback ids from
    different batches collide and silently overwrite each other.

    Raises:
        ValueError: if the number of vectors and metadata rows differ.
    """
    if len(vecs) != len(metas):
        raise ValueError(
            f"{id_prefix}: {len(vecs)} embeddings but {len(metas)} metadata rows"
        )
    ids = [m.get("id", f"{id_prefix}::{i}") for i, m in enumerate(metas)]
    for start in range(0, len(ids), UPSERT_CHUNK_SIZE):
        stop = start + UPSERT_CHUNK_SIZE
        coll.upsert(
            ids=ids[start:stop],
            embeddings=vecs[start:stop].tolist(),
            metadatas=metas[start:stop],
        )


CHROMA_DIR.mkdir(parents=True, exist_ok=True)
client = chromadb.PersistentClient(path=str(CHROMA_DIR))
topic_coll = client.get_or_create_collection(TOPIC_COLL_NAME, metadata={"hnsw:space": "cosine"})
stance_coll = client.get_or_create_collection(STANCE_COLL_NAME, metadata={"hnsw:space": "cosine"})

# Decide on collection contents rather than on the directory merely existing: an
# interrupted earlier run can leave the directory behind with empty collections.
# (A partially filled database cannot be detected here; delete CHROMA_DIR to force
# a full rebuild.)
if topic_coll.count() > 0 and stance_coll.count() > 0:
    print("Using existing local Chroma database.")
else:
    print("Rebuilding Chroma collections from Hugging Face dataset...")

    REGISTRY_URL = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{BRANCH}/artifacts/artifacts_registry.json"
    registry_resp = requests.get(REGISTRY_URL, timeout=30)
    # Fail on HTTP errors here instead of on a confusing JSON decode error.
    registry_resp.raise_for_status()
    REGISTRY = registry_resp.json()

    for batch_idx, b in enumerate(REGISTRY.get("batches", [])):
        paths = b.get("paths") or {}
        if not all(k in paths for k in REQUIRED_PATH_KEYS):
            continue

        t_vecs = _load_vectors(_fetch_artifact(paths["embeddings_topic"]))
        s_vecs = _load_vectors(_fetch_artifact(paths["embeddings_stance"]))

        t_meta = _read_jsonl(_fetch_artifact(paths["metadata_topic"]))
        s_meta = _read_jsonl(_fetch_artifact(paths["metadata_stance"]))

        # upsert is keyed by id, so re-running over a partly built database is safe.
        _upsert_in_chunks(topic_coll, t_vecs, t_meta, f"topic::{batch_idx}")
        _upsert_in_chunks(stance_coll, s_vecs, s_meta, f"stance::{batch_idx}")

print(f"Chroma ready with {topic_coll.count()} topic and {stance_coll.count()} stance vectors.")

## Load Embedding and Summarization Models
We will use the following:
- intfloat/e5-base-v2 for topic embeddings
- Snowflake/snowflake-arctic-embed-l for stance embeddings
- facebook/bart-large-cnn for summarization


In [None]:
# ====================================================
# Load embedding + summarization models
# ====================================================
# Every model is looked up by the name set in the configuration cell.
print("Loading models...")
# Two separate sentence encoders: one produces topic vectors, the other stance vectors.
topic_model  = SentenceTransformer(TOPIC_MODEL_NAME)
stance_model = SentenceTransformer(STANCE_MODEL_NAME)
# Tokenizer and seq2seq model for summarization, both loaded from the same checkpoint name.
tok_sum  = AutoTokenizer.from_pretrained(SUMMARIZER_MODEL_NAME)
model_sum = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_MODEL_NAME)
print("Models loaded successfully.")

## Upload an Article
Upload a .txt, .pdf, or .html file for analysis.


In [None]:
# ====================================================
# Upload and extract article
# ====================================================
uploaded = files.upload()
# Cancelling the upload dialog leaves nothing to analyze; say so explicitly
# instead of failing with an IndexError on an empty list.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and choose a .txt, .pdf, or .html file.")
# Only the first uploaded file is analyzed.
filename = next(iter(uploaded))
ext = Path(filename).suffix.lower()

def extract_text(file_path):
    """Return the plain text of a ``.txt``, ``.pdf``, ``.html`` or ``.htm`` file.

    The format is chosen from ``file_path``'s own extension (case-insensitive),
    so the function works for any path and does not depend on notebook globals.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: if the extension is not one of the supported types.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == ".txt":
        # Undecodable bytes are dropped rather than aborting the read.
        with open(file_path, encoding="utf-8", errors="ignore") as fh:
            return fh.read()

    if suffix == ".pdf":
        with fitz.open(file_path) as doc:
            # join() avoids repeated string concatenation on long documents.
            return "".join(page.get_text() for page in doc)

    if suffix in (".html", ".htm"):
        with open(file_path, encoding="utf-8", errors="ignore") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")
        return soup.get_text(separator="\n")

    raise ValueError(f"Unsupported file type: {suffix or '(no extension)'}")

text = extract_text(filename)
# An empty file or an image-only PDF yields no text; stop here rather than
# summarizing and embedding an empty string in the following cells.
if not text.strip():
    raise ValueError(f"No text could be extracted from {filename!r}.")
display(Markdown(f"**Extracted first 2,000 characters:**\n\n{text[:2000]}..."))

## Summarize and Compute Embeddings
This step summarizes the article and embeds that summary for stance analysis. For topic analysis, it embeds both the summary and the first 3,000 characters of the article text, then averages the two vectors.


In [None]:
# ====================================================
# Summarize and compute embeddings
# ====================================================
# Tunable limits, named here so they are visible instead of buried in the calls below.
SUMMARY_MAX_INPUT_TOKENS = 1024   # the article is truncated to this many tokens before summarizing
SUMMARY_MAX_OUTPUT_TOKENS = 150   # upper bound on the length of the generated summary
TOPIC_TEXT_CHARS = 3000           # leading characters of the article used for the topic vector

inputs = tok_sum([text], return_tensors="pt", truncation=True, max_length=SUMMARY_MAX_INPUT_TOKENS)
summary_ids = model_sum.generate(
    **inputs,
    max_length=SUMMARY_MAX_OUTPUT_TOKENS,
    num_beams=4,
    early_stopping=True,
)
summary = tok_sum.batch_decode(summary_ids, skip_special_tokens=True)[0].strip()

# Stance vector from summary (same encoder used for stance collection)
stance_vec = stance_model.encode([summary], normalize_embeddings=True)[0]

# Topic embedding uses both summary + main text for robustness.
# The mean of two unit vectors is not unit length; the topic collection is
# created with cosine distance, for which vector length does not matter.
topic_vecs = topic_model.encode([summary, text[:TOPIC_TEXT_CHARS]], normalize_embeddings=True)
topic_vec_mean = topic_vecs.mean(axis=0)

display(Markdown(f"### One-sentence Summary\n> {summary}"))

## Query for Similar Topics
Retrieve the 100 most similar articles by topic embedding.


In [None]:
# ====================================================
# Retrieve similar topics
# ====================================================
N_TOPIC_CANDIDATES = 100  # how many topic neighbours to pass on to stance ranking

# Never ask for more neighbours than the collection holds, and skip the query
# entirely when the collection is empty (a request for 0 results is not valid).
n_available = topic_coll.count()
if n_available == 0:
    results = {"metadatas": []}
else:
    results = topic_coll.query(
        query_embeddings=[topic_vec_mean.tolist()],
        n_results=min(N_TOPIC_CANDIDATES, n_available),
        include=["metadatas"],
    )

# query() returns one list of metadatas per query embedding; flatten them.
flat_results = [m for batch in (results["metadatas"] or []) for m in batch]
print(f"Found {len(flat_results)} potential topic matches.")

## Rank by Opposing Stance
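We compute the cosine similarity between the uploaded article's stance embedding and a stance embedding of each retrieved article, then display the least similar articles first, treating low similarity as a proxy for an opposing viewpoint.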


In [None]:
# ====================================================
# Rank by opposing stance
# ====================================================
def sim_label(s):
    """Map a cosine similarity to a coarse human-readable label.

    Anything below 0.2 (including negative values) is "Very Dissimilar";
    0.8 and above is "Very Similar".
    """
    if s < 0.2: return "Very Dissimilar"
    elif s < 0.4: return "Dissimilar"
    elif s < 0.6: return "Somewhat Similar"
    elif s < 0.8: return "Similar"
    else: return "Very Similar"


if not flat_results:
    display(Markdown("No topic matches found."))
else:
    # Use stance summary text for stance embeddings: each candidate is re-encoded
    # from its leaning / stance / summary metadata with the stance encoder.
    stance_texts = [
        f"{m.get('political_leaning','')}\n{m.get('implied_stance','')}\n{m.get('stance_summary_text','')}"
        for m in flat_results
    ]
    stance_embeddings = stance_model.encode(stance_texts, normalize_embeddings=True)
    stance_sims = cosine_similarity([stance_vec], stance_embeddings)[0]

    # Lower similarity = more opposing
    ranked = sorted(zip(flat_results, stance_sims), key=lambda x: x[1])

    display(Markdown("### Results: Similar Topics, Contrasting Perspectives"))
    for meta, sim in ranked[:10]:
        topic_display = meta.get("topic_label") or meta.get("inferred_topic") or "(topic unknown)"
        leaning = meta.get("political_leaning", "")
        stance  = meta.get("implied_stance", "")
        # Named stance_summary, not summary, so the uploaded article's own
        # `summary` from the summarization cell is not overwritten.
        stance_summary = meta.get("stance_summary_text", "")
        header_lines = [
            f"**{meta.get('title','(untitled)')}**",
            f"Source: {meta.get('domain','unknown')}",
            f"Topic: *{topic_display}*",
            f"Political Leaning: `{leaning}`",
            f"Implied Stance: `{stance}`",
            f"Stance Similarity: {sim:.2f} ({sim_label(sim)})",
        ]
        # Markdown collapses single newlines into spaces; two trailing spaces
        # force a line break so each field is rendered on its own line.
        md = (
            "\n"
            + "  \n".join(header_lines)
            + f"\n\n{stance_summary}\n\n[Read original article]({meta.get('url','#')})\n"
        )
        display(Markdown(md))


## Cleanup
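Delete the large per-article variables (extracted text, summary, and embeddings) and run the garbage collector to release their memory after analysis. The loaded models and Chroma collections are kept so another article can be analyzed.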


In [None]:
# ====================================================
# Cleanup
# ====================================================
# Remove the per-article objects by name. pop() with a default keeps this cell
# safe to re-run, or to run after an upstream cell was skipped; a plain `del`
# raises NameError as soon as one of the names is missing.
for _name in ("text", "summary", "stance_vec", "topic_vec_mean", "topic_vecs"):
    globals().pop(_name, None)
del _name
gc.collect()
print("Memory cleared.")