# Anti-Echo Chamber - News Scraper & Processor

This notebook scrapes news articles from RSS feeds, processes them through the anti-echo chamber pipeline, and uploads only metadata + embeddings to Hugging Face.

**Key Features:**
- Scrapes from diverse news sources
- Processes articles through topic modeling and stance classification
- Stores only embeddings and metadata (no full text)
- Uploads processed data to Hugging Face for sharing

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AHMerrill/anti-echo-2/blob/main/notebooks/scraper_artifacts.ipynb)


## Setup and Installation


In [None]:
# Install dependencies
!pip install -q chromadb sentence-transformers transformers huggingface-hub pymupdf beautifulsoup4 scikit-learn nltk pyyaml feedparser trafilatura

# Set environment variables
import os
import torch
# Environment switches applied in one step: Chroma telemetry flag, tokenizer
# parallelism flag, and log-verbosity levels for transformers / TensorFlow.
os.environ.update(
    {
        "CHROMA_TELEMETRY_ENABLED": "false",
        "TOKENIZERS_PARALLELISM": "false",
        "TRANSFORMERS_VERBOSITY": "error",
        "TF_CPP_MIN_LOG_LEVEL": "3",
    }
)

# GPU/CPU Detection and Configuration
def setup_device():
    """Choose the compute device, report it, and return its name.

    Returns:
        "cuda" when ``torch.cuda.is_available()`` is true, otherwise "cpu".

    Side effects:
        Prints the selection and sets ``CUDA_VISIBLE_DEVICES`` to "0" on the
        GPU path or to an empty string on the CPU path.
    """
    if not torch.cuda.is_available():
        device = "cpu"
        print("💻 Using CPU (GPU not available)")
        visible_devices = ""
    else:
        device = "cuda"
        gpu_name = torch.cuda.get_device_name(0)
        print(f"🚀 GPU detected: {gpu_name}")
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"   Memory: {total_gb:.1f} GB")
        visible_devices = "0"

    os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices

    print(f"✅ Device configured: {device}")
    return device

# Detect the device once; `device` is later handed to AntiEchoCore(..., device=device).
device = setup_device()

# Manual device override (uncomment exactly one line if needed).
# Reassigning `device` here only changes the value passed on to later cells;
# it does not re-run setup_device() or touch CUDA_VISIBLE_DEVICES.
# device = "cpu"  # Force CPU usage
# device = "cuda"  # Force GPU usage (if available)


## Download Core Library and Configuration


In [None]:
# Download the core library and configs from GitHub.
# The repository is cloned into a scratch directory, its contents are copied
# into the current working directory, and the scratch directory is removed.
!git clone https://github.com/AHMerrill/anti-echo-2.git temp_repo
# NOTE(review): with default shell globbing `temp_repo/*` does not match
# dotfiles, so `.git` (and any other hidden files) are presumably not copied
# into the working directory — confirm this is intended.
!cp -r temp_repo/* ./
!rm -rf temp_repo

# Verify files are downloaded
!ls -la

# Optional: Set up Git for pushing results back (if you want to save results to GitHub)
# Uncomment the lines below and add your PAT if you want to push results back
# NOTE(review): `git remote set-url origin` needs a git repository in the
# working directory; since the clone above is deleted, confirm one exists
# before using these lines. Also note that a PAT embedded in the remote URL is
# stored in plain text in the git config — avoid committing or sharing it.
# !git config --global user.email "your-email@example.com"
# !git config --global user.name "Your Name"
# !git remote set-url origin https://YOUR_PAT@github.com/AHMerrill/anti-echo-2.git


In [None]:
from anti_echo_core import AntiEchoCore
import json
import time
from datetime import datetime

# Initialize the core system with the detected device.
# `core` is the shared pipeline object used by the later cells
# (process_article, upsert_to_chroma, export_metadata_only, topic_coll, stance_coll).
# NOTE(review): "config/config.yaml" is a relative path — presumably provided by
# the repository checkout in the previous cell; confirm the working directory.
print("Initializing Anti-Echo Chamber system...")
core = AntiEchoCore("config/config.yaml", device=device)
print(f"✓ System initialized successfully on {core.device}")


## Hugging Face Authentication


In [None]:
# Hugging Face Authentication
from huggingface_hub import HfApi, create_repo, login
import getpass

def get_hf_token():
    """Return a Hugging Face token from ``hf_token.txt`` or an interactive prompt.

    The local file ``hf_token.txt`` (relative to the working directory) is
    tried first. A missing file, or a file that is empty / whitespace-only,
    falls through to a hidden ``getpass`` prompt.

    Returns:
        The stripped token string, or ``None`` when no token was provided.
    """
    # Try to load from local file first (for local development)
    try:
        with open("hf_token.txt", "r", encoding="utf-8") as f:
            token = f.read().strip()
    except FileNotFoundError:
        token = ""

    # Only report success for a non-empty token; an empty file is treated
    # like a missing one so the user still gets the chance to type a token.
    if token:
        print("✅ Hugging Face token loaded from local file")
        return token

    # If not found locally, ask user to input it
    print("🔑 Hugging Face Authentication Required")
    print("   Get your token from: https://huggingface.co/settings/tokens")
    print("   (Make sure it has 'Write' permissions)")

    # Use getpass for secure input (hides the token as you type)
    token = getpass.getpass("Enter your Hugging Face token: ").strip()

    if not token:
        print("❌ No token provided")
        return None

    print("✅ Hugging Face token received")
    return token

# Get the token (None or empty means uploads are skipped later on).
hf_token = get_hf_token()

# Report exactly one outcome: previously the "ready" line was printed even
# right after the "no token" warning, which was contradictory.
if hf_token:
    print("Hugging Face authentication ready")
else:
    print("⚠️ No HF token available. Upload functionality will be disabled.")
    print("   You can still run the scraping and processing locally.")


## Configuration - Easy Customization


In [None]:
# =============================================================================
# CONFIGURATION - EASILY CUSTOMIZE SCRAPING PARAMETERS
# =============================================================================

# Upper bound on the number of entries taken from each RSS feed.
# Edit this value to scrape more or fewer articles.
MAX_ARTICLES_PER_FEED = 5

# Echo the active setting so the notebook output records it.
print(
    "📊 Configuration:",
    f"   Max articles per feed: {MAX_ARTICLES_PER_FEED}",
    "   (Adjust MAX_ARTICLES_PER_FEED above to change this)",
    sep="\n",
)


## Complete Pipeline


In [None]:
# RSS feed sources for diverse political perspectives, declared per group and
# then merged. Group order and key order are kept, so RSS_FEEDS iterates the
# feeds in the same sequence as before.

# Conservative sources
_CONSERVATIVE_FEEDS = {
    "fox_news": "https://feeds.foxnews.com/foxnews/politics",
    "daily_caller": "https://dailycaller.com/feed/",
    "federalist": "https://thefederalist.com/feed/",
    "reason": "https://reason.com/feed/",
}

# Liberal sources
_LIBERAL_FEEDS = {
    "npr": "https://feeds.npr.org/1001/rss.xml",
    "vox": "https://www.vox.com/rss/index.xml",
    "msnbc": "https://www.msnbc.com/feeds/latest",
    "propublica": "https://www.propublica.org/feeds/propublica/main",
}

# International sources
_INTERNATIONAL_FEEDS = {
    "bbc": "http://feeds.bbci.co.uk/news/rss.xml",
    "guardian": "https://www.theguardian.com/world/rss",
    "al_jazeera": "https://www.aljazeera.com/xml/rss/all.xml",
    "france24": "https://www.france24.com/en/rss",
}

# Academic/Think tank
# NOTE(review): "dw" is listed in this group — confirm that is the intended category.
_ACADEMIC_FEEDS = {
    "conversation": "https://theconversation.com/global/rss",
    "city_journal": "https://www.city-journal.org/feed",
    "dw": "https://rss.dw.com/rdf/rss-en-all",
}

# Mapping of source name -> feed URL consumed by the scraping pipeline.
RSS_FEEDS = {
    **_CONSERVATIVE_FEEDS,
    **_LIBERAL_FEEDS,
    **_INTERNATIONAL_FEEDS,
    **_ACADEMIC_FEEDS,
}

print(f"Configured {len(RSS_FEEDS)} RSS feeds for scraping")


In [None]:
import feedparser
import trafilatura
import pandas as pd
from urllib.parse import urlparse
import hashlib

def _load_existing_article_ids(dataset_name):
    """Return the article ids already in the Hugging Face dataset, or an empty set.

    Best effort: any failure (for example ``datasets`` not importable, or the
    dataset not loadable) is reported and treated as "no existing articles".
    """
    try:
        from datasets import load_dataset

        dataset = load_dataset(dataset_name, split="train")
        existing_ids = set(dataset["id"])
    except Exception as e:
        print("📋 No existing dataset found, starting fresh")
        print(f"   (reason: {type(e).__name__}: {e})")
        return set()

    print(f"📋 Found {len(existing_ids)} existing articles in dataset")
    return existing_ids


def _scrape_feed(source_name, feed_url, max_articles):
    """Fetch one RSS feed and return article dicts for up to ``max_articles`` entries."""
    feed = feedparser.parse(feed_url)
    articles = []

    for entry in feed.entries[:max_articles]:
        url = entry.get("link")
        if not url:
            continue

        try:
            # trafilatura.extract() works on downloaded HTML, not on a URL,
            # so the page has to be fetched first.
            downloaded = trafilatura.fetch_url(url)
            article_text = trafilatura.extract(downloaded) if downloaded else None
        except Exception as e:
            # One bad page must not abort the whole feed, but say why it was skipped.
            print(f"  ⚠️ Skipping {url}: {e}")
            continue

        # Very short extractions are discarded (same 200-character threshold as before).
        if article_text and len(article_text) > 200:
            articles.append({
                "title": entry.get("title", ""),
                "url": url,
                "source": source_name,
                "published": entry.get("published", ""),
                "text": article_text,
                # Content hash used as a stable id; it must stay MD5 so ids remain
                # comparable with ids already stored in the dataset.
                "id": hashlib.md5(article_text.encode()).hexdigest(),
            })

    return articles


def _select_new_articles(all_articles, existing_article_ids):
    """Drop articles already in the dataset and in-batch duplicates (by URL or id)."""
    seen_urls = set()
    seen_ids = set()
    unique_articles = []

    for article in all_articles:
        article_id = article["id"]
        if article_id in existing_article_ids:
            print(f"  ⏭️ Skipping existing article: {article['title'][:50]}...")
            continue
        if article["url"] in seen_urls or article_id in seen_ids:
            continue
        seen_urls.add(article["url"])
        seen_ids.add(article_id)
        unique_articles.append(article)

    return unique_articles


def _upload_to_hub(processed_articles, dataset_name):
    """Upload metadata CSV and embeddings JSON to the dataset repo; return True on success."""
    try:
        # Create dataset repository
        create_repo(dataset_name, repo_type="dataset", exist_ok=True, token=hf_token)
        print(f"✓ Created/verified dataset: {dataset_name}")

        # NOTE(review): both files are written to fixed paths in the repo and only
        # contain this run's articles — presumably this replaces what an earlier
        # run uploaded. Confirm whether results should be merged instead.
        df = pd.DataFrame(processed_articles)
        df.to_csv("processed_articles.csv", index=False)

        api = HfApi(token=hf_token)
        api.upload_file(
            path_or_fileobj="processed_articles.csv",
            path_in_repo="processed_articles.csv",
            repo_id=dataset_name,
            repo_type="dataset"
        )

        print(f"✓ Uploaded {len(processed_articles)} articles to Hugging Face")
        print(f"  Dataset: https://huggingface.co/datasets/{dataset_name}")

        # Also save embeddings separately
        vector_keys = ("topic_vectors", "stance_embedding")
        embeddings_data = {
            "topic_embeddings": [article["topic_vectors"] for article in processed_articles],
            "stance_embeddings": [article["stance_embedding"] for article in processed_articles],
            "metadata": [
                {k: v for k, v in article.items() if k not in vector_keys}
                for article in processed_articles
            ],
        }

        with open("embeddings_data.json", "w") as f:
            json.dump(embeddings_data, f, indent=2)

        api.upload_file(
            path_or_fileobj="embeddings_data.json",
            path_in_repo="embeddings_data.json",
            repo_id=dataset_name,
            repo_type="dataset"
        )

        print("✓ Uploaded embeddings data to Hugging Face")
    except Exception as e:
        print(f"Upload error: {e}")
        return False

    return True


def scrape_and_process_articles(max_articles_per_feed=None, dataset_name="anti-echo-chamber-data"):
    """Complete pipeline: scrape, process, and upload to Hugging Face.

    Args:
        max_articles_per_feed: Maximum number of entries taken from each feed in
            ``RSS_FEEDS``. ``None`` falls back to ``MAX_ARTICLES_PER_FEED``.
        dataset_name: Hugging Face dataset repo id used both for the duplicate
            check and for the upload.

    Returns:
        The list of metadata dicts produced by ``core.export_metadata_only`` for
        every successfully processed article, or ``None`` when there was nothing
        new to process or nothing could be processed.

    Relies on the notebook globals ``core``, ``RSS_FEEDS``, ``hf_token`` and
    ``MAX_ARTICLES_PER_FEED``. The upload step is skipped when ``hf_token`` is falsy.
    """
    # Use global config if not specified
    if max_articles_per_feed is None:
        max_articles_per_feed = MAX_ARTICLES_PER_FEED

    print("🚀 Starting Anti-Echo Chamber Pipeline")
    print("=" * 50)
    print(f"🖥️ Running on: {core.device}")
    print(f"📊 Max articles per feed: {max_articles_per_feed}")
    if core.device == "cuda":
        print("⚡ GPU acceleration enabled - expect faster processing!")
    else:
        print("🐌 CPU mode - processing will be slower but still functional")

    # Check for existing articles in HF dataset to avoid duplicates
    existing_article_ids = _load_existing_article_ids(dataset_name)

    # Step 1: Scrape articles
    print("\n📰 Step 1: Scraping news articles...")
    all_articles = []

    for source_name, feed_url in RSS_FEEDS.items():
        print(f"Scraping {source_name}...")
        try:
            articles = _scrape_feed(source_name, feed_url, max_articles_per_feed)
        except Exception as e:
            print(f"  ✗ Error scraping {source_name}: {e}")
            continue

        print(f"  ✓ Scraped {len(articles)} articles from {source_name}")
        all_articles.extend(articles)
        time.sleep(1)  # Be respectful to servers

    # Remove duplicates (URL, content id, and ids already in the dataset)
    unique_articles = _select_new_articles(all_articles, existing_article_ids)

    print(f"\n✓ Total new articles to process: {len(unique_articles)}")
    print(f"⏭️ Skipped {len(all_articles) - len(unique_articles)} duplicate/existing articles")

    if not unique_articles:
        print("❌ No new articles to process")
        return None

    # Step 2: Process articles
    print("\n🔍 Step 2: Processing articles through pipeline...")
    processed_articles = []

    for i, article in enumerate(unique_articles, start=1):
        try:
            print(f"  Processing {i}/{len(unique_articles)}: {article['title'][:50]}...")

            processed = core.process_article(article)
            if not processed:
                print("    ✗ Failed to process")
                continue

            core.upsert_to_chroma(processed)
            processed_articles.append(core.export_metadata_only(processed))
            print(f"    ✓ Processed: {processed['political_leaning']} / {processed['implied_stance']}")
        except Exception as e:
            # Keep going: one failing article should not stop the batch.
            print(f"    ✗ Error: {e}")

    print(f"\n✓ Successfully processed {len(processed_articles)} articles")

    if not processed_articles:
        print("❌ No articles were successfully processed")
        return None

    # Step 3: Upload to Hugging Face (if token available)
    uploaded = False
    if hf_token:
        print("\n☁️ Step 3: Uploading to Hugging Face...")
        uploaded = _upload_to_hub(processed_articles, dataset_name)
    else:
        print("\n⚠️ Step 3: Skipping Hugging Face upload (no token provided)")
        print("   Articles processed and stored locally in ChromaDB")
        print("   To upload later, run this cell again with a valid HF token")

    # Only claim an upload when one actually succeeded.
    if uploaded:
        print("\n✅ Pipeline completed successfully!")
    else:
        print("\n✅ Pipeline completed (nothing was uploaded)")
    print(f"   - Scraped: {len(unique_articles)} articles")
    print(f"   - Processed: {len(processed_articles)} articles")
    if uploaded:
        print("   - Uploaded: Metadata + embeddings only (no full text)")

    return processed_articles


In [None]:
# Target Hugging Face dataset repo for the duplicate check and the upload.
dataset_name = "zanimal/anti-echo-chamber-data"  # Default dataset
# To pick another dataset interactively, uncomment the next line.
# dataset_name = input("Enter dataset name (or press Enter for default): ").strip() or "zanimal/anti-echo-chamber-data"

# Summarise the run parameters before starting.
print("🔧 Debug Info:")
print(f"   Dataset: {dataset_name}")
token_status = "✅ Available" if hf_token else "❌ Not provided"
print(f"   HF Token: {token_status}")
print(f"   Device: {core.device}")
feed_total = len(RSS_FEEDS)
print(f"   RSS Feeds: {feed_total} configured")

# Execute scrape -> process -> upload; any exception is reported with a
# traceback instead of propagating out of the cell.
try:
    processed_articles = scrape_and_process_articles(
        max_articles_per_feed=MAX_ARTICLES_PER_FEED,
        dataset_name=dataset_name,
    )
    # The pipeline returns None (or an empty list) when nothing was processed.
    if processed_articles:
        article_total = len(processed_articles)
    else:
        article_total = 0
    print(f"\n🎉 Pipeline completed! Processed {article_total} articles")
except Exception as e:
    print(f"\n❌ Pipeline failed with error: {e}")
    import traceback
    traceback.print_exc()


## Verification - Check Processing Results


In [None]:
# Verify that articles were processed correctly.
# Prints label distributions, the most frequent topics, how many articles carry
# each kind of vector, and the ChromaDB collection sizes.
from collections import Counter

if 'processed_articles' in locals() and processed_articles:
    print("🔍 VERIFICATION RESULTS")
    print("=" * 50)

    # Check processing statistics
    total_processed = len(processed_articles)
    print(f"✅ Total articles processed: {total_processed}")

    # Label distributions (political leaning, then implied stance)
    for heading, field in (
        ("Political Leanings Distribution", "political_leaning"),
        ("Implied Stances Distribution", "implied_stance"),
    ):
        label_counts = Counter(article.get(field, 'unknown') for article in processed_articles)
        print(f"\n📊 {heading}:")
        for label, count in sorted(label_counts.items()):
            percentage = (count / total_processed) * 100
            print(f"   {label}: {count} ({percentage:.1f}%)")

    # Check topics distribution; a missing or None 'topics' value counts as no topics
    topic_counts = Counter(
        topic
        for article in processed_articles
        for topic in (article.get('topics') or [])
    )

    print("\n📊 Top Topics Detected:")
    for topic, count in topic_counts.most_common(10):
        print(f"   {topic}: {count}")

    # Check vectorization: count each vector type separately so the two
    # lines below can actually differ.
    with_topic_vectors = sum(1 for article in processed_articles if article.get('topic_vectors'))
    with_stance_embedding = sum(1 for article in processed_articles if article.get('stance_embedding'))
    print("\n🔢 Vectorization Status:")
    print(f"   Articles with topic vectors: {with_topic_vectors}/{total_processed}")
    print(f"   Articles with stance embeddings: {with_stance_embedding}/{total_processed}")

    # Check ChromaDB storage
    chroma_ok = True
    try:
        topic_count = core.topic_coll.count()
        stance_count = core.stance_coll.count()
        print("\n💾 ChromaDB Storage:")
        print(f"   Topic collection: {topic_count} vectors")
        print(f"   Stance collection: {stance_count} vectors")
    except Exception as e:
        chroma_ok = False
        print(f"\n⚠️ ChromaDB check failed: {e}")

    # Only report a clean bill of health when every check above passed.
    all_vectorized = with_topic_vectors == total_processed == with_stance_embedding
    if chroma_ok and all_vectorized:
        print("\n✅ Verification complete! All systems working properly.")
    else:
        print("\n⚠️ Verification complete with warnings - see details above.")

else:
    print("❌ No processed articles found. Run the pipeline first.")


In [None]:
# Spot-check the first processed article (labels, topics, vector sizes) and
# list up to three further articles for a quick sanity check.
if not ('processed_articles' in locals() and processed_articles):
    print("❌ No processed articles found. Run the pipeline first.")
else:
    print("🔍 SAMPLE ARTICLE INSPECTION")
    print("=" * 50)

    # First processed article serves as the sample
    sample = processed_articles[0]
    print("📰 Sample Article:")
    title_preview = sample.get('title', 'N/A')[:80]
    print(f"   Title: {title_preview}...")
    print(f"   Source: {sample.get('source', 'N/A')}")
    print(f"   Political Leaning: {sample.get('political_leaning', 'N/A')}")
    print(f"   Implied Stance: {sample.get('implied_stance', 'N/A')}")
    topic_list = ', '.join(sample.get('topics', []))
    print(f"   Topics: {topic_list}")
    summary_preview = sample.get('summary', 'N/A')[:100]
    print(f"   Summary: {summary_preview}...")
    print(f"   Text Length: {sample.get('text_length', 0)} characters")

    # Vector sizes of the sample
    topic_vectors = sample.get('topic_vectors', [])
    stance_embedding = sample.get('stance_embedding', [])

    print("\n🔢 Vector Analysis:")
    print(f"   Topic vectors: {len(topic_vectors)} clusters")
    if topic_vectors:
        first_vector = topic_vectors[0]
        if first_vector:
            topic_dim = len(first_vector)
        else:
            topic_dim = 'N/A'
        print(f"   Topic vector dimension: {topic_dim}")
    if stance_embedding:
        stance_dim = len(stance_embedding)
    else:
        stance_dim = 'N/A'
    print(f"   Stance embedding dimension: {stance_dim}")

    # Articles at positions 1..3 (when present), numbered 1..3
    print("\n📋 Quick Sample of Other Articles:")
    for i in range(1, min(4, len(processed_articles))):
        article = processed_articles[i]
        short_title = article.get('title', 'N/A')[:50]
        leaning = article.get('political_leaning', 'N/A')
        stance = article.get('implied_stance', 'N/A')
        print(f"   {i}. {short_title}... | {leaning} | {stance}")

    print("\n✅ Sample inspection complete!")
