# Anti-Echo Chamber - News Scraper & Processor

This notebook scrapes news articles from RSS feeds, processes them through the anti-echo chamber pipeline, and uploads only metadata + embeddings to Hugging Face.

**Key Features:**
- Scrapes from diverse news sources
- Processes articles through topic modeling and stance classification
- Stores only embeddings and metadata (no full text)
- Uploads processed data to Hugging Face for sharing

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AHMerrill/anti-echo-2/blob/main/notebooks/scraper_artifacts.ipynb)


## Setup and Installation


In [1]:
# Install dependencies
!pip install -q chromadb sentence-transformers transformers huggingface-hub pymupdf beautifulsoup4 scikit-learn nltk pyyaml feedparser trafilatura

# Set environment variables
import os
import torch
os.environ["CHROMA_TELEMETRY_ENABLED"] = "false"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# GPU/CPU Detection and Configuration
def setup_device():
    """Detect and configure device for optimal performance."""
    if torch.cuda.is_available():
        device = "cuda"
        print(f"🚀 GPU detected: {torch.cuda.get_device_name(0)}")
        print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    else:
        device = "cpu"
        print("💻 Using CPU (GPU not available)")
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

    print(f"✅ Device configured: {device}")
    return device

# Setup device
device = setup_device()

# Manual device override (uncomment if needed)
# device = "cpu"  # Force CPU usage
# device = "cuda"  # Force GPU usage (if available)


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.9/19.9 MB[0m [31m127.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m116.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

## Download Core Library and Configuration


In [2]:
# Download the core library and configs from GitHub
!git clone https://github.com/AHMerrill/anti-echo-2.git temp_repo
!cp -r temp_repo/* ./
!rm -rf temp_repo

# Verify files are downloaded
!ls -la

# Optional: Set up Git for pushing results back (if you want to save results to GitHub)
# Uncomment the lines below and add your PAT if you want to push results back
# !git config --global user.email "your-email@example.com"
# !git config --global user.name "Your Name"
# !git remote set-url origin https://YOUR_PAT@github.com/AHMerrill/anti-echo-2.git


Cloning into 'temp_repo'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 77 (delta 44), reused 58 (delta 25), pack-reused 0 (from 0)[K
Receiving objects: 100% (77/77), 49.65 KiB | 891.00 KiB/s, done.
Resolving deltas: 100% (44/44), done.
total 56
drwxr-xr-x 1 root root  4096 Oct 15 17:33 .
drwxr-xr-x 1 root root  4096 Oct 15 17:28 ..
-rw-r--r-- 1 root root 16722 Oct 15 17:33 anti_echo_core.py
drwxr-xr-x 4 root root  4096 Oct 14 13:41 .config
drwxr-xr-x 2 root root  4096 Oct 15 17:33 config
drwxr-xr-x 2 root root  4096 Oct 15 17:33 notebooks
-rw-r--r-- 1 root root  5569 Oct 15 17:33 README.md
-rw-r--r-- 1 root root   325 Oct 15 17:33 requirements.txt
drwxr-xr-x 1 root root  4096 Oct 14 13:41 sample_data


In [3]:
from anti_echo_core import AntiEchoCore
import json
import time
from datetime import datetime

# Initialize the core system with detected device
print("Initializing Anti-Echo Chamber system...")
core = AntiEchoCore("config/config.yaml", device=device)
print(f"✓ System initialized successfully on {core.device}")


Initializing Anti-Echo Chamber system...
✓ System initialized successfully on cuda


## Hugging Face Authentication


In [4]:
# Hugging Face Authentication
from huggingface_hub import HfApi, create_repo, login
import getpass

def get_hf_token():
    """Get Hugging Face token from user input or local file."""
    # Try to load from local file first (for local development)
    try:
        with open("hf_token.txt", "r") as f:
            token = f.read().strip()
        print("✅ Hugging Face token loaded from local file")
        return token
    except FileNotFoundError:
        pass

    # If not found locally, ask user to input it
    print("🔑 Hugging Face Authentication Required")
    print("   Get your token from: https://huggingface.co/settings/tokens")
    print("   (Make sure it has 'Write' permissions)")

    # Use getpass for secure input (hides the token as you type)
    token = getpass.getpass("Enter your Hugging Face token: ").strip()

    if token:
        print("✅ Hugging Face token received")
        return token
    else:
        print("❌ No token provided")
        return None

# Get the token
hf_token = get_hf_token()

if not hf_token:
    print("⚠️ No HF token available. Upload functionality will be disabled.")
    print("   You can still run the scraping and processing locally.")

print("Hugging Face authentication ready")


🔑 Hugging Face Authentication Required
   Get your token from: https://huggingface.co/settings/tokens
   (Make sure it has 'Write' permissions)
Enter your Hugging Face token: ··········
✅ Hugging Face token received
Hugging Face authentication ready


## Configuration - Easy Customization


In [5]:
# =============================================================================
# CONFIGURATION - EASILY CUSTOMIZE SCRAPING PARAMETERS
# =============================================================================

# Number of articles to scrape per RSS feed (adjust as needed)
MAX_ARTICLES_PER_FEED = 5  # Change this number to scrape more/fewer articles

# Balance distribution across sources (optional)
BALANCE_DISTRIBUTION = False  # Set to True to ensure even distribution

print(f"📊 Configuration:")
print(f"   Max articles per feed: {MAX_ARTICLES_PER_FEED}")
print(f"   Balance distribution: {BALANCE_DISTRIBUTION}")
print(f"   (Adjust MAX_ARTICLES_PER_FEED above to change this)")
print(f"   (Set BALANCE_DISTRIBUTION=True for even distribution)")


📊 Configuration:
   Max articles per feed: 5
   Balance distribution: False
   (Adjust MAX_ARTICLES_PER_FEED above to change this)
   (Set BALANCE_DISTRIBUTION=True for even distribution)


## Complete Pipeline


In [6]:
# RSS Feed sources for diverse political perspectives
RSS_FEEDS = {
    # Conservative sources (using more reliable feeds)
    "fox_news": "https://feeds.foxnews.com/foxnews/politics",
    "daily_wire": "https://www.dailywire.com/feeds/rss.xml",
    "national_review": "https://www.nationalreview.com/feed/",
    "reason": "https://reason.com/feed/",

    # Liberal sources (using more reliable feeds)
    "npr": "https://feeds.npr.org/1001/rss.xml",
    "vox": "https://www.vox.com/rss/index.xml",
    "huffpost": "https://www.huffpost.com/section/politics/feed",
    "propublica": "https://www.propublica.org/feeds/propublica/main",

    # International sources (using more reliable feeds)
    "bbc": "http://feeds.bbci.co.uk/news/rss.xml",
    "guardian": "https://www.theguardian.com/world/rss",
    "reuters": "https://feeds.reuters.com/reuters/politicsNews",
    "ap_news": "https://feeds.apnews.com/rss/ap/topnews",

    # Academic/Think tank (using more reliable feeds)
    "conversation": "https://theconversation.com/global/rss",
    "brookings": "https://www.brookings.edu/feed/",
    "cato": "https://www.cato.org/feed"
}

print(f"Configured {len(RSS_FEEDS)} RSS feeds for scraping")


Configured 15 RSS feeds for scraping


In [7]:
import feedparser
import trafilatura
import pandas as pd
from urllib.parse import urlparse
import hashlib

def scrape_and_process_articles(max_articles_per_feed=None, dataset_name="anti-echo-chamber-data"):
    """Complete pipeline: scrape, process, and upload to Hugging Face."""

    # Use global config if not specified
    if max_articles_per_feed is None:
        max_articles_per_feed = MAX_ARTICLES_PER_FEED

    print("🚀 Starting Anti-Echo Chamber Pipeline")
    print("=" * 50)
    print(f"🖥️ Running on: {core.device}")
    print(f"📊 Max articles per feed: {max_articles_per_feed}")
    if core.device == "cuda":
        print(f"⚡ GPU acceleration enabled - expect faster processing!")
    else:
        print(f"🐌 CPU mode - processing will be slower but still functional")

    # Check for existing articles in HF dataset to avoid duplicates
    existing_article_ids = set()
    try:
        from datasets import load_dataset
        dataset = load_dataset(dataset_name, split="train")
        existing_article_ids = set(dataset["id"])
        print(f"📋 Found {len(existing_article_ids)} existing articles in dataset")
    except:
        print("📋 No existing dataset found, starting fresh")

    # Step 1: Scrape articles
    print("\n📰 Step 1: Scraping news articles...")
    all_articles = []
    source_stats = {}

    for source_name, feed_url in RSS_FEEDS.items():
        print(f"Scraping {source_name}...")
        try:
            feed = feedparser.parse(feed_url)
            articles = []

            for i, entry in enumerate(feed.entries[:max_articles_per_feed]):
                try:
                    print(f"      📰 Article {i+1}: {entry.get('title', 'No title')[:50]}...")

                    # Try multiple extraction methods
                    article_text = None

                    # Method 1: Try trafilatura first
                    try:
                        article_text = trafilatura.extract(entry.link)
                        if article_text and len(article_text) > 200:
                            print(f"        ✓ Trafilatura extracted {len(article_text)} characters")
                    except Exception as e:
                        print(f"        ⚠️ Trafilatura failed: {e}")

                    # Method 2: If trafilatura fails, try using feed content
                    if not article_text or len(article_text) < 200:
                        try:
                            # Use feed summary/description as fallback
                            article_text = entry.get('summary', '') or entry.get('description', '')
                            if article_text:
                                # Clean up HTML tags
                                import re
                                article_text = re.sub(r'<[^>]+>', '', article_text)
                                article_text = re.sub(r'\s+', ' ', article_text).strip()
                                print(f"        ✓ Feed content extracted {len(article_text)} characters")
                        except Exception as e:
                            print(f"        ⚠️ Feed content failed: {e}")

                    # Method 3: If still no content, try title + summary
                    if not article_text or len(article_text) < 200:
                        try:
                            title = entry.get('title', '')
                            summary = entry.get('summary', '') or entry.get('description', '')
                            if title and summary:
                                # Clean HTML and combine
                                import re
                                clean_summary = re.sub(r'<[^>]+>', '', summary)
                                clean_summary = re.sub(r'\s+', ' ', clean_summary).strip()
                                article_text = f"{title}\n\n{clean_summary}"
                                print(f"        ✓ Title+summary extracted {len(article_text)} characters")
                        except Exception as e:
                            print(f"        ⚠️ Title+summary failed: {e}")

                    # Check if we got usable content
                    if article_text and len(article_text) > 200:
                        article = {
                            "title": entry.get("title", ""),
                            "url": entry.link,
                            "source": source_name,
                            "published": entry.get("published", ""),
                            "text": article_text,
                            "id": hashlib.md5(article_text.encode()).hexdigest()
                        }
                        articles.append(article)
                        print(f"        ✅ Successfully processed article")
                    else:
                        print(f"        ⏭️ Skipped (insufficient content: {len(article_text) if article_text else 0} chars)")

                except Exception as e:
                    print(f"        ❌ Error processing article: {e}")
                    continue

            source_stats[source_name] = len(articles)
            print(f"  ✓ Scraped {len(articles)} articles from {source_name}")
            all_articles.extend(articles)
            time.sleep(1)  # Be respectful to servers

        except Exception as e:
            source_stats[source_name] = 0
            print(f"  ✗ Error scraping {source_name}: {e}")

    # Show distribution statistics
    print(f"\n📊 Scraping Distribution:")
    for source, count in source_stats.items():
        print(f"   {source}: {count} articles")

    if BALANCE_DISTRIBUTION:
        print(f"\n⚖️ Balancing distribution...")
        # Find the minimum number of articles across all sources
        min_articles = min(count for count in source_stats.values() if count > 0)
        if min_articles > 0:
            # Keep only the first min_articles from each source
            balanced_articles = []
            for source_name in RSS_FEEDS.keys():
                source_articles = [a for a in all_articles if a['source'] == source_name]
                balanced_articles.extend(source_articles[:min_articles])
            all_articles = balanced_articles
            print(f"   Balanced to {min_articles} articles per source")
        else:
            print(f"   No articles found to balance")

    # Remove duplicates (both URL and existing article IDs)
    seen_urls = set()
    unique_articles = []
    for article in all_articles:
        article_id = article["id"]
        if (article["url"] not in seen_urls and
            article_id not in existing_article_ids):
            seen_urls.add(article["url"])
            unique_articles.append(article)
        elif article_id in existing_article_ids:
            print(f"  ⏭️ Skipping existing article: {article['title'][:50]}...")

    print(f"\n✓ Total new articles to process: {len(unique_articles)}")
    print(f"⏭️ Skipped {len(all_articles) - len(unique_articles)} duplicate/existing articles")

    print(f"\n✓ Total unique articles scraped: {len(unique_articles)}")

    if not unique_articles:
        print("❌ No articles were scraped")
        return

    # Step 2: Process articles
    print("\n🔍 Step 2: Processing articles through pipeline...")
    processed_articles = []

    for i, article in enumerate(unique_articles):
        try:
            print(f"  Processing {i+1}/{len(unique_articles)}: {article['title'][:50]}...")

            processed = core.process_article(article)
            if processed:
                core.upsert_to_chroma(processed)
                metadata_only = core.export_metadata_only(processed)
                processed_articles.append(metadata_only)
                print(f"    ✓ Processed: {processed['political_leaning']} / {processed['implied_stance']}")
            else:
                print(f"    ✗ Failed to process")

        except Exception as e:
            print(f"    ✗ Error: {e}")
            continue

    print(f"\n✓ Successfully processed {len(processed_articles)} articles")

    if not processed_articles:
        print("❌ No articles were successfully processed")
        return

    # Step 3: Upload to Hugging Face (if token available)
    if hf_token:
        print("\n☁️ Step 3: Uploading to Hugging Face...")

        try:
            # Create dataset repository
            create_repo(dataset_name, repo_type="dataset", exist_ok=True, token=hf_token)
            print(f"✓ Created/verified dataset: {dataset_name}")

            # Convert to DataFrame and upload
            df = pd.DataFrame(processed_articles)
            df.to_csv("processed_articles.csv", index=False)

            api = HfApi(token=hf_token)
            api.upload_file(
                path_or_fileobj="processed_articles.csv",
                path_in_repo="processed_articles.csv",
                repo_id=dataset_name,
                repo_type="dataset"
            )

            print(f"✓ Uploaded {len(processed_articles)} articles to Hugging Face")
            print(f"  Dataset: https://huggingface.co/datasets/{dataset_name}")

            # Also save embeddings separately
            embeddings_data = {
                "topic_embeddings": [article["topic_vectors"] for article in processed_articles],
                "stance_embeddings": [article["stance_embedding"] for article in processed_articles],
                "metadata": [{k: v for k, v in article.items() if k not in ["topic_vectors", "stance_embedding"]} for article in processed_articles]
            }

            with open("embeddings_data.json", "w") as f:
                json.dump(embeddings_data, f, indent=2)

            api.upload_file(
                path_or_fileobj="embeddings_data.json",
                path_in_repo="embeddings_data.json",
                repo_id=dataset_name,
                repo_type="dataset"
            )

            print(f"✓ Uploaded embeddings data to Hugging Face")

        except Exception as e:
            print(f"Upload error: {e}")
    else:
        print("\n⚠️ Step 3: Skipping Hugging Face upload (no token provided)")
        print("   Articles processed and stored locally in ChromaDB")
        print("   To upload later, run this cell again with a valid HF token")

    print("\n✅ Pipeline completed successfully!")
    print(f"   - Scraped: {len(unique_articles)} articles")
    print(f"   - Processed: {len(processed_articles)} articles")
    print(f"   - Uploaded: Metadata + embeddings only (no full text)")

    return processed_articles


In [8]:
# =============================================================================
# RUN THE PIPELINE - ACTUALLY EXECUTE THE SCRAPING AND PROCESSING
# =============================================================================

# Configure dataset name
dataset_name = "zanimal/anti-echo-chamber-data"  # Default dataset
# Uncomment the line below to input a different dataset name
# dataset_name = input("Enter dataset name (or press Enter for default): ").strip() or "zanimal/anti-echo-chamber-data"

print(f"🔧 Debug Info:")
print(f"   Dataset: {dataset_name}")
print(f"   HF Token: {'✅ Available' if hf_token else '❌ Not provided'}")
print(f"   Device: {core.device}")
print(f"   RSS Feeds: {len(RSS_FEEDS)} configured")

# Run the complete pipeline
# This will scrape articles, process them, and upload to Hugging Face
try:
    processed_articles = scrape_and_process_articles(
        max_articles_per_feed=MAX_ARTICLES_PER_FEED,
        dataset_name=dataset_name
    )
    print(f"\n🎉 Pipeline completed! Processed {len(processed_articles) if processed_articles else 0} articles")
except Exception as e:
    print(f"\n❌ Pipeline failed with error: {e}")
    import traceback
    traceback.print_exc()


🔧 Debug Info:
   Dataset: zanimal/anti-echo-chamber-data
   HF Token: ✅ Available
   Device: cuda
   RSS Feeds: 15 configured
🚀 Starting Anti-Echo Chamber Pipeline
🖥️ Running on: cuda
📊 Max articles per feed: 5
⚡ GPU acceleration enabled - expect faster processing!


README.md: 0.00B [00:00, ?B/s]

initial_data.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1 [00:00<?, ? examples/s]

📋 No existing dataset found, starting fresh

📰 Step 1: Scraping news articles...
Scraping fox_news...


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


      📰 Article 1: Atlanta’s city-run grocery sees early success, spa...
        ✓ Feed content extracted 167 characters
        ✓ Title+summary extracted 254 characters
        ✅ Successfully processed article
      📰 Article 2: Rep Marjorie Taylor Greene says there are many 'we...
        ✓ Feed content extracted 164 characters
        ✓ Title+summary extracted 234 characters
        ✅ Successfully processed article
      📰 Article 3: White House may 'run out' of funds to pay military...
        ✓ Feed content extracted 140 characters
        ✓ Title+summary extracted 229 characters
        ✅ Successfully processed article
      📰 Article 4: Israel to deploy first combat-ready laser weapon s...
        ✓ Feed content extracted 161 characters
        ✓ Title+summary extracted 246 characters
        ✅ Successfully processed article
      📰 Article 5: Republicans eye fresh funding fight as shutdown st...
        ✓ Feed content extracted 158 characters
        ✓ Title+summary extracted 2

ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


      📰 Article 1: Spanberger Still Sells Merchandise With Jay Jones’...
        ✓ Feed content extracted 400 characters
        ✅ Successfully processed article
      📰 Article 2: Los Angeles County Declares Emergency Over ICE Rai...
        ✓ Feed content extracted 432 characters
        ✅ Successfully processed article
      📰 Article 3: ‘I Can’t Make Fun of Him Anymore’: Michael Rapapor...
        ✓ Feed content extracted 374 characters
        ✅ Successfully processed article
      📰 Article 4: John Rich Reflects On Charlie Kirk’s Legacy: ‘Say ...
        ✓ Feed content extracted 346 characters
        ✅ Successfully processed article
      📰 Article 5: U.S. Military Warns Hamas To End Gaza Violence Aft...
        ✓ Feed content extracted 348 characters
        ✅ Successfully processed article
  ✓ Scraped 5 articles from daily_wire


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


Scraping national_review...
      📰 Article 1: The Supreme Court Can Choose to Clarify Election L...
        ✓ Feed content extracted 88 characters
        ✓ Title+summary extracted 154 characters
        ⏭️ Skipped (insufficient content: 154 chars)
      📰 Article 2: Brightline Touches the Third Rail...
        ✓ Feed content extracted 53 characters
        ✓ Title+summary extracted 88 characters
        ⏭️ Skipped (insufficient content: 88 chars)
      📰 Article 3: A Preposterous Legal Assault Against Fossil Fuels...
        ✓ Feed content extracted 42 characters
        ✓ Title+summary extracted 93 characters
        ⏭️ Skipped (insufficient content: 93 chars)
      📰 Article 4: What Thatcher’s Partnership with Reagan Can Teach ...
        ✓ Feed content extracted 91 characters
        ✓ Title+summary extracted 145 characters
        ⏭️ Skipped (insufficient content: 145 chars)
      📰 Article 5: Ethan Coen’s Pulped-Fiction Politics...
        ✓ Feed content extracted 49 characters


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


      📰 Article 1: Boerne, RFRA, and the VRA...
        ✓ Feed content extracted 100 characters
        ✓ Title+summary extracted 127 characters
        ⏭️ Skipped (insufficient content: 127 chars)
      📰 Article 2: Sarbanes-Oxley Promised To Protect Investors. It E...
        ✓ Feed content extracted 35 characters
        ✓ Title+summary extracted 113 characters
        ⏭️ Skipped (insufficient content: 113 chars)
      📰 Article 3: Trump Says Tariffs Are About National Security. Pe...
        ✓ Feed content extracted 49 characters
        ✓ Title+summary extracted 151 characters
        ⏭️ Skipped (insufficient content: 151 chars)
      📰 Article 4: Sex Robots Are Here… and It's OK...
        ✓ Feed content extracted 49 characters
        ✓ Title+summary extracted 83 characters
        ⏭️ Skipped (insufficient content: 83 chars)
      📰 Article 5: Can the ACLU Serve Progressives and Conservatives?...
        ✓ Feed content extracted 298 characters
        ✅ Successfully processed ar

ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


Scraping npr...
      📰 Article 1: A celebration of the South's rich — and messy — he...
        ✓ Feed content extracted 196 characters
        ✓ Title+summary extracted 276 characters
        ✅ Successfully processed article
      📰 Article 2: How Charlie Chaplin used his uncanny resemblance t...
        ✓ Feed content extracted 182 characters
        ✓ Title+summary extracted 259 characters
        ✅ Successfully processed article
      📰 Article 3: Photos: Ceasefire in Gaza brings reunions amid dev...
        ✓ Feed content extracted 182 characters
        ✓ Title+summary extracted 242 characters
        ✅ Successfully processed article
      📰 Article 4: A veteran state department negotiator unpacks the ...
        ✓ Feed content extracted 200 characters
        ⏭️ Skipped (insufficient content: 200 chars)
      📰 Article 5: Pakistan and Afghanistan agree to a temporary ceas...
        ✓ Feed content extracted 202 characters
        ✅ Successfully processed article
  ✓ Scraped 4 a

ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


Scraping vox...
      📰 Article 1: There are many ways the world could end. Don’t for...
        ✓ Feed content extracted 350 characters
        ✅ Successfully processed article
      📰 Article 2: Leading the way for El Salvador’s digital future...
        ✓ Feed content extracted 344 characters
        ✅ Successfully processed article
      📰 Article 3: The sadistic appeal of ICE’s recruitment ads...
        ✓ Feed content extracted 356 characters
        ✅ Successfully processed article
      📰 Article 4: Conservatives should oppose Trump on immigration...
        ✓ Feed content extracted 396 characters
        ✅ Successfully processed article
      📰 Article 5: What Gaza has been like since the ceasefire...
        ✓ Feed content extracted 380 characters
        ✅ Successfully processed article
  ✓ Scraped 5 articles from vox


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


Scraping huffpost...
      📰 Article 1: Donald Trump's Treatment Of Female Journalist Slam...
        ✓ Feed content extracted 43 characters
        ✓ Title+summary extracted 114 characters
        ⏭️ Skipped (insufficient content: 114 chars)
      📰 Article 2: Eric Trump’s 'Offensive’ Boast About What His Dad’...
        ✓ Feed content extracted 85 characters
        ✓ Title+summary extracted 172 characters
        ⏭️ Skipped (insufficient content: 172 chars)
      📰 Article 3: 'This Bulls**t Has To Stop!': Kathy Hochul Takes O...
        ✓ Feed content extracted 112 characters
        ✓ Title+summary extracted 199 characters
        ⏭️ Skipped (insufficient content: 199 chars)
      📰 Article 4: Podcaster Calls Out Cory Booker On Democrat Infigh...
        ✓ Feed content extracted 133 characters
        ✓ Title+summary extracted 213 characters
        ✅ Successfully processed article
      📰 Article 5: CNN's Harry Enten Spots How Health Care Is 'Actual...
        ✓ Feed content extra

ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


      📰 Article 1: Disabled Idaho Students Lack Access to Playgrounds...
        ✓ Feed content extracted 17431 characters
        ✅ Successfully processed article
      📰 Article 2: A Year Before Trump’s Crime Rhetoric, Dallas Voted...
        ✓ Feed content extracted 18770 characters
        ✅ Successfully processed article
      📰 Article 3: ProPublica Names Kenneth Morales as David Burnham-...
        ✓ Feed content extracted 2305 characters
        ✅ Successfully processed article
      📰 Article 4: Students With Hearing and Vision Loss Get Funding ...
        ✓ Feed content extracted 4227 characters
        ✅ Successfully processed article
      📰 Article 5: On the Front Line of the Fluoride Wars, Debate Ove...
        ✓ Feed content extracted 14002 characters
        ✅ Successfully processed article
  ✓ Scraped 5 articles from propublica
Scraping bbc...


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


      📰 Article 1: Funeral director admits fraud charges after inquir...
        ✓ Feed content extracted 85 characters
        ✓ Title+summary extracted 181 characters
        ⏭️ Skipped (insufficient content: 181 chars)
      📰 Article 2: Government will publish evidence in collapsed Chin...
        ✓ Feed content extracted 97 characters
        ✓ Title+summary extracted 168 characters
        ⏭️ Skipped (insufficient content: 168 chars)
      📰 Article 3: Reeves says she is looking at tax rises ahead of B...
        ✓ Feed content extracted 136 characters
        ✓ Title+summary extracted 193 characters
        ⏭️ Skipped (insufficient content: 193 chars)
      📰 Article 4: 'Worse than starting from scratch': how big is the...
        ✓ Feed content extracted 106 characters
        ✓ Title+summary extracted 183 characters
        ⏭️ Skipped (insufficient content: 183 chars)
      📰 Article 5: Student arrested after chanting about putting Zion...
        ✓ Feed content extracted 103 

ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


Scraping guardian...
      📰 Article 1: Raila Odinga, towering Kenyan opposition figure, d...
        ✓ Feed content extracted 450 characters
        ✅ Successfully processed article
      📰 Article 2: Thousands trapped in El Fasher siege on ‘edge of s...
        ✓ Feed content extracted 504 characters
        ✅ Successfully processed article
      📰 Article 3: Elite military unit says it has seized power in Ma...
        ✓ Feed content extracted 535 characters
        ✅ Successfully processed article
      📰 Article 4: Cameroon opposition leader declares victory in pre...
        ✓ Feed content extracted 780 characters
        ✅ Successfully processed article
      📰 Article 5: Madagascar president says he fled country in fear ...
        ✓ Feed content extracted 723 characters
        ✅ Successfully processed article
  ✓ Scraped 5 articles from guardian
Scraping reuters...
  ✓ Scraped 0 articles from reuters
Scraping ap_news...
  ✓ Scraped 0 articles from ap_news
Scraping conversatio

ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


      📰 Article 1: Should Governments Prosecute Fraud?...
        ✓ Feed content extracted 196 characters
        ✓ Title+summary extracted 233 characters
        ✅ Successfully processed article
      📰 Article 2: Election Policy Roundup...
        ✓ Feed content extracted 234 characters
        ✅ Successfully processed article
      📰 Article 3: Tenure Isn’t Safe: Professor’s Case Warns Academic...
        ✓ Feed content extracted 299 characters
        ✅ Successfully processed article
      📰 Article 4: Argentina: Don’t Waste Yet Another Opportunity to ...
        ✓ Feed content extracted 205 characters
        ✅ Successfully processed article
      📰 Article 5: AI Policy Already Exists, We Just Don’t Call It Th...
        ✓ Feed content extracted 295 characters
        ✅ Successfully processed article
  ✓ Scraped 5 articles from cato

📊 Scraping Distribution:
   fox_news: 5 articles
   daily_wire: 5 articles
   national_review: 0 articles
   reason: 1 articles
   npr: 4 articles
  

In [9]:
# =============================================================================
# TEST SCRAPING WITH A SIMPLE FEED FIRST
# =============================================================================

# Test with a simple, reliable RSS feed to verify the system works
print("🧪 Testing scraping with a simple RSS feed...")

test_feed = "https://feeds.bbci.co.uk/news/rss.xml"
print(f"Testing with: {test_feed}")

try:
    import feedparser
    feed = feedparser.parse(test_feed)
    print(f"✓ Feed parsed successfully: {len(feed.entries)} entries found")

    # Test extraction on first article
    if feed.entries:
        test_entry = feed.entries[0]
        print(f"📰 Test article: {test_entry.get('title', 'No title')[:50]}...")

        # Try trafilatura
        try:
            article_text = trafilatura.extract(test_entry.link)
            if article_text:
                print(f"✓ Trafilatura worked: {len(article_text)} characters")
            else:
                print("⚠️ Trafilatura returned empty content")
        except Exception as e:
            print(f"❌ Trafilatura failed: {e}")

        # Try feed content fallback
        summary = test_entry.get('summary', '') or test_entry.get('description', '')
        if summary:
            import re
            clean_summary = re.sub(r'<[^>]+>', '', summary)
            clean_summary = re.sub(r'\s+', ' ', clean_summary).strip()
            print(f"✓ Feed content available: {len(clean_summary)} characters")
        else:
            print("⚠️ No feed content available")

except Exception as e:
    print(f"❌ Test failed: {e}")

print("\n" + "="*50)


ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


🧪 Testing scraping with a simple RSS feed...
Testing with: https://feeds.bbci.co.uk/news/rss.xml
✓ Feed parsed successfully: 39 entries found
📰 Test article: Funeral director admits fraud charges after inquir...
⚠️ Trafilatura returned empty content
✓ Feed content available: 85 characters



## Verification - Check Processing Results


In [10]:
# Verify that articles were processed correctly
if 'processed_articles' in locals() and processed_articles:
    print("🔍 VERIFICATION RESULTS")
    print("=" * 50)

    # Check processing statistics
    total_processed = len(processed_articles)
    print(f"✅ Total articles processed: {total_processed}")

    # Check political leanings distribution
    leanings = [article.get('political_leaning', 'unknown') for article in processed_articles]
    leaning_counts = {}
    for leaning in leanings:
        leaning_counts[leaning] = leaning_counts.get(leaning, 0) + 1

    print(f"\n📊 Political Leanings Distribution:")
    for leaning, count in sorted(leaning_counts.items()):
        percentage = (count / total_processed) * 100
        print(f"   {leaning}: {count} ({percentage:.1f}%)")

    # Check implied stances distribution
    stances = [article.get('implied_stance', 'unknown') for article in processed_articles]
    stance_counts = {}
    for stance in stances:
        stance_counts[stance] = stance_counts.get(stance, 0) + 1

    print(f"\n📊 Implied Stances Distribution:")
    for stance, count in sorted(stance_counts.items()):
        percentage = (count / total_processed) * 100
        print(f"   {stance}: {count} ({percentage:.1f}%)")

    # Check topics distribution
    all_topics = []
    for article in processed_articles:
        all_topics.extend(article.get('topics', []))

    topic_counts = {}
    for topic in all_topics:
        topic_counts[topic] = topic_counts.get(topic, 0) + 1

    print(f"\n📊 Top Topics Detected:")
    for topic, count in sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"   {topic}: {count}")

    # Check vectorization
    articles_with_vectors = sum(1 for article in processed_articles
                              if article.get('topic_vectors') and article.get('stance_embedding'))
    print(f"\n🔢 Vectorization Status:")
    print(f"   Articles with topic vectors: {articles_with_vectors}/{total_processed}")
    print(f"   Articles with stance embeddings: {articles_with_vectors}/{total_processed}")

    # Check ChromaDB storage
    try:
        topic_count = core.topic_coll.count()
        stance_count = core.stance_coll.count()
        print(f"\n💾 ChromaDB Storage:")
        print(f"   Topic collection: {topic_count} vectors")
        print(f"   Stance collection: {stance_count} vectors")
    except Exception as e:
        print(f"\n⚠️ ChromaDB check failed: {e}")

    print(f"\n✅ Verification complete! All systems working properly.")

else:
    print("❌ No processed articles found. Run the pipeline first.")


❌ No processed articles found. Run the pipeline first.


In [11]:
# Inspect a sample processed article to verify quality
if 'processed_articles' in locals() and processed_articles:
    print("🔍 SAMPLE ARTICLE INSPECTION")
    print("=" * 50)

    # Show first processed article
    sample = processed_articles[0]
    print(f"📰 Sample Article:")
    print(f"   Title: {sample.get('title', 'N/A')[:80]}...")
    print(f"   Source: {sample.get('source', 'N/A')}")
    print(f"   Political Leaning: {sample.get('political_leaning', 'N/A')}")
    print(f"   Implied Stance: {sample.get('implied_stance', 'N/A')}")
    print(f"   Topics: {', '.join(sample.get('topics', []))}")
    print(f"   Summary: {sample.get('summary', 'N/A')[:100]}...")
    print(f"   Text Length: {sample.get('text_length', 0)} characters")

    # Check vector dimensions
    topic_vectors = sample.get('topic_vectors', [])
    stance_embedding = sample.get('stance_embedding', [])

    print(f"\n🔢 Vector Analysis:")
    print(f"   Topic vectors: {len(topic_vectors)} clusters")
    if topic_vectors:
        print(f"   Topic vector dimension: {len(topic_vectors[0]) if topic_vectors[0] else 'N/A'}")
    print(f"   Stance embedding dimension: {len(stance_embedding) if stance_embedding else 'N/A'}")

    # Show a few more samples
    print(f"\n📋 Quick Sample of Other Articles:")
    for i, article in enumerate(processed_articles[1:4], 1):
        print(f"   {i}. {article.get('title', 'N/A')[:50]}... | {article.get('political_leaning', 'N/A')} | {article.get('implied_stance', 'N/A')}")

    print(f"\n✅ Sample inspection complete!")

else:
    print("❌ No processed articles found. Run the pipeline first.")


❌ No processed articles found. Run the pipeline first.


In [12]:
# Check final distribution across sources
if 'processed_articles' in locals() and processed_articles:
    print("📊 FINAL DISTRIBUTION ANALYSIS")
    print("=" * 50)

    # Count articles by source
    source_counts = {}
    for article in processed_articles:
        source = article.get('source', 'unknown')
        source_counts[source] = source_counts.get(source, 0) + 1

    print(f"📰 Articles per source:")
    for source, count in sorted(source_counts.items()):
        print(f"   {source}: {count}")

    # Calculate distribution statistics
    counts = list(source_counts.values())
    if counts:
        min_count = min(counts)
        max_count = max(counts)
        avg_count = sum(counts) / len(counts)

        print(f"\n📈 Distribution Statistics:")
        print(f"   Min articles per source: {min_count}")
        print(f"   Max articles per source: {max_count}")
        print(f"   Average articles per source: {avg_count:.1f}")
        print(f"   Distribution range: {max_count - min_count}")

        if max_count - min_count <= 1:
            print(f"   ✅ Very balanced distribution!")
        elif max_count - min_count <= 3:
            print(f"   ⚖️ Reasonably balanced distribution")
        else:
            print(f"   ⚠️ Uneven distribution - consider enabling BALANCE_DISTRIBUTION")

    print(f"\n✅ Distribution analysis complete!")

else:
    print("❌ No processed articles found. Run the pipeline first.")


❌ No processed articles found. Run the pipeline first.
