# Anti-Echo Chamber - News Scraper & Processor

This notebook scrapes news articles from RSS feeds, runs each one through the anti-echo chamber pipeline, and uploads only the resulting metadata and embeddings (never the article text) to Hugging Face.

**Key Features:**
- Scrapes from diverse news sources
- Processes articles through topic modeling and stance classification
- Stores only embeddings and metadata (no full text)
- Uploads processed data to Hugging Face for sharing

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AHMerrill/anti-echo-2/blob/main/notebooks/scraper_artifacts.ipynb)


## Setup and Installation


In [None]:
# Install dependencies
!pip install -q chromadb sentence-transformers transformers huggingface-hub pymupdf beautifulsoup4 scikit-learn nltk pyyaml feedparser trafilatura

# Set environment variables
import os
os.environ["CHROMA_TELEMETRY_ENABLED"] = "false"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"


## Download Core Library and Configuration


In [None]:
# Download the core library and configs from GitHub
!git clone https://github.com/AHMerrill/anti-echo-2.git temp_repo
!cp -r temp_repo/* ./
!rm -rf temp_repo

# Verify files are downloaded
!ls -la


In [None]:
from anti_echo_core import AntiEchoCore
import json
import time
from datetime import datetime

# Initialize the core system from its YAML configuration file
# (presumably one of the files copied out of the cloned repository — confirm).
# `core` is a notebook-level object: the pipeline function defined further down
# calls core.process_article, core.upsert_to_chroma and core.export_metadata_only.
print("Initializing Anti-Echo Chamber system...")
core = AntiEchoCore("config/config.yaml")
print("✓ System initialized successfully")


## Hugging Face Authentication


In [None]:
# Hugging Face authentication
# Create an access token at https://huggingface.co/settings/tokens. The pipeline
# below creates a dataset repository and uploads files to it, so the token needs
# write access. Then authenticate with ONE of the options in this cell, or run
# `huggingface-cli login` from a terminal.

from huggingface_hub import HfApi, create_repo, login

# Option 1: interactive login — uncomment, run this cell, and follow the prompts.
# login()

# Option 2: supply the token through the environment (not recommended: a token
# pasted into a notebook is easy to commit or share by accident).
# huggingface_hub reads HF_TOKEN (legacy spelling: HUGGING_FACE_HUB_TOKEN).
# os.environ["HF_TOKEN"] = "your_token_here"

# NOTE: this message is printed unconditionally; nothing in this cell verifies
# that a login actually happened.
print("Hugging Face authentication ready")


## Complete Pipeline


In [None]:
# RSS feed sources, grouped by the perspective each group is meant to represent.
# The groups are flattened, in order, into the single name -> URL mapping that
# the scraper iterates over.
_FEEDS_BY_GROUP = {
    "conservative": {
        "fox_news": "https://feeds.foxnews.com/foxnews/politics",
        "daily_caller": "https://dailycaller.com/feed/",
        "federalist": "https://thefederalist.com/feed/",
        "reason": "https://reason.com/feed/",
    },
    "liberal": {
        "npr": "https://feeds.npr.org/1001/rss.xml",
        "vox": "https://www.vox.com/rss/index.xml",
        "msnbc": "https://www.msnbc.com/feeds/latest",
        "propublica": "https://www.propublica.org/feeds/propublica/main",
    },
    "international": {
        "bbc": "http://feeds.bbci.co.uk/news/rss.xml",
        "guardian": "https://www.theguardian.com/world/rss",
        "al_jazeera": "https://www.aljazeera.com/xml/rss/all.xml",
        "france24": "https://www.france24.com/en/rss",
    },
    "academic_think_tank": {
        "conversation": "https://theconversation.com/global/rss",
        "city_journal": "https://www.city-journal.org/feed",
        "dw": "https://rss.dw.com/rdf/rss-en-all",
    },
}

# Source name -> feed URL, in the same order as the groups above.
RSS_FEEDS = {
    source: url
    for group in _FEEDS_BY_GROUP.values()
    for source, url in group.items()
}

print(f"Configured {len(RSS_FEEDS)} RSS feeds for scraping")


In [None]:
import feedparser
import trafilatura
import pandas as pd
from urllib.parse import urlparse
import hashlib

# Extracted text of this length or shorter is treated as a failed extraction.
_MIN_ARTICLE_CHARS = 200


def _fetch_article_text(url):
    """Download *url* and return its extracted main text, or None on failure."""
    # trafilatura.extract() parses HTML content, not URLs, so the page has to
    # be downloaded first; fetch_url() returns None when the download fails.
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        return None
    return trafilatura.extract(downloaded)


def _scrape_feed(source_name, feed_url, max_articles):
    """Return article dicts for up to *max_articles* entries of one RSS feed."""
    feed = feedparser.parse(feed_url)
    # feedparser reports malformed or unreachable feeds through the `bozo`
    # flag instead of raising, so surface that when nothing could be read.
    if feed.get("bozo") and not feed.entries:
        print(f"  ⚠ Could not parse feed for {source_name}: {feed.get('bozo_exception')}")

    articles = []
    for entry in feed.entries[:max_articles]:
        url = entry.get("link")
        if not url:
            continue
        try:
            text = _fetch_article_text(url)
        except Exception as e:
            # Best effort: one bad page must not abort the feed, but say so.
            print(f"  ⚠ Skipping {url}: {e}")
            continue
        if not text or len(text) <= _MIN_ARTICLE_CHARS:
            continue
        articles.append({
            "title": entry.get("title", ""),
            "url": url,
            "source": source_name,
            "published": entry.get("published", ""),
            "text": text,
            # Content hash used as a stable identifier (not for security).
            "id": hashlib.md5(text.encode()).hexdigest(),
        })
    return articles


def _dedupe_by_url(articles):
    """Return *articles* with only the first occurrence of each URL, order kept."""
    first_by_url = {}
    for article in articles:
        first_by_url.setdefault(article["url"], article)
    return list(first_by_url.values())


def _json_default(obj):
    """Convert array-like objects (anything exposing ``tolist``) for json.dump."""
    if hasattr(obj, "tolist"):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def _upload_to_hub(processed_articles, dataset_name):
    """Write the CSV and embeddings JSON locally, then upload both to the dataset repo."""
    embedding_keys = ("topic_vectors", "stance_embedding")

    # Build both files before touching the network so a serialization problem
    # cannot leave the dataset half-updated.
    pd.DataFrame(processed_articles).to_csv("processed_articles.csv", index=False)

    embeddings_data = {
        "topic_embeddings": [article["topic_vectors"] for article in processed_articles],
        "stance_embeddings": [article["stance_embedding"] for article in processed_articles],
        "metadata": [
            {k: v for k, v in article.items() if k not in embedding_keys}
            for article in processed_articles
        ],
    }
    with open("embeddings_data.json", "w", encoding="utf-8") as f:
        # default= lets array-like embeddings be written as plain lists.
        json.dump(embeddings_data, f, indent=2, default=_json_default)

    # create_repo returns the resolved repository (including the user or org
    # namespace); later calls need that full id, not the bare name.
    repo_url = create_repo(dataset_name, repo_type="dataset", exist_ok=True)
    repo_id = getattr(repo_url, "repo_id", None) or dataset_name
    print(f"✓ Created/verified dataset: {repo_id}")

    api = HfApi()
    api.upload_file(
        path_or_fileobj="processed_articles.csv",
        path_in_repo="processed_articles.csv",
        repo_id=repo_id,
        repo_type="dataset",
    )
    print(f"✓ Uploaded {len(processed_articles)} articles to Hugging Face")
    print(f"  Dataset: https://huggingface.co/datasets/{repo_id}")

    api.upload_file(
        path_or_fileobj="embeddings_data.json",
        path_in_repo="embeddings_data.json",
        repo_id=repo_id,
        repo_type="dataset",
    )
    print("✓ Uploaded embeddings data to Hugging Face")


def scrape_and_process_articles(max_articles_per_feed=5, dataset_name="anti-echo-chamber-data"):
    """Complete pipeline: scrape, process, and upload to Hugging Face.

    Scrapes every feed in ``RSS_FEEDS``, de-duplicates the articles by URL,
    runs each one through the module-level ``core`` object, and uploads the
    exported records as ``processed_articles.csv`` and
    ``embeddings_data.json`` to a Hugging Face dataset repository.

    Args:
        max_articles_per_feed: Maximum number of entries read from each feed.
        dataset_name: Name of the Hugging Face dataset repository to create
            or reuse.

    Returns:
        The list of records returned by ``core.export_metadata_only``, or
        ``None`` when no article was scraped or none could be processed.
        An upload failure is reported but does not discard the records.
    """
    print("🚀 Starting Anti-Echo Chamber Pipeline")
    print("=" * 50)

    # Step 1: Scrape articles
    print("\n📰 Step 1: Scraping news articles...")
    all_articles = []

    for source_name, feed_url in RSS_FEEDS.items():
        print(f"Scraping {source_name}...")
        try:
            articles = _scrape_feed(source_name, feed_url, max_articles_per_feed)
        except Exception as e:
            print(f"  ✗ Error scraping {source_name}: {e}")
            continue

        print(f"  ✓ Scraped {len(articles)} articles from {source_name}")
        all_articles.extend(articles)
        time.sleep(1)  # Be respectful to servers

    unique_articles = _dedupe_by_url(all_articles)
    print(f"\n✓ Total unique articles scraped: {len(unique_articles)}")

    if not unique_articles:
        print("❌ No articles were scraped")
        return None

    # Step 2: Process articles
    print("\n🔍 Step 2: Processing articles through pipeline...")
    processed_articles = []

    for i, article in enumerate(unique_articles, start=1):
        try:
            print(f"  Processing {i}/{len(unique_articles)}: {article['title'][:50]}...")

            processed = core.process_article(article)
            if processed:
                core.upsert_to_chroma(processed)
                processed_articles.append(core.export_metadata_only(processed))
                print(f"    ✓ Processed: {processed['political_leaning']} / {processed['implied_stance']}")
            else:
                print("    ✗ Failed to process")
        except Exception as e:
            # One failing article must not stop the remaining ones.
            print(f"    ✗ Error: {e}")

    print(f"\n✓ Successfully processed {len(processed_articles)} articles")

    if not processed_articles:
        print("❌ No articles were successfully processed")
        return None

    # Step 3: Upload to Hugging Face
    print("\n☁️ Step 3: Uploading to Hugging Face...")
    uploaded = False
    try:
        _upload_to_hub(processed_articles, dataset_name)
        uploaded = True
    except Exception as e:
        # Keep the processed records available to the caller even if the
        # upload fails (missing login, network error, ...).
        print(f"Upload error: {e}")

    # Only claim success when the upload really went through.
    if uploaded:
        print("\n✅ Pipeline completed successfully!")
    else:
        print("\n⚠️ Pipeline finished, but the Hugging Face upload failed")
    print(f"   - Scraped: {len(unique_articles)} articles")
    print(f"   - Processed: {len(processed_articles)} articles")
    if uploaded:
        print("   - Uploaded: Metadata + embeddings only (no full text)")
    else:
        print("   - Uploaded: nothing (see upload error above)")

    return processed_articles


In [None]:
# Run the complete pipeline.
# This scrapes up to 3 entries per configured feed, processes them, and uploads
# the results to the "anti-echo-chamber-data" dataset on Hugging Face, so it
# needs network access and a prior Hugging Face login.
# `processed_articles` is the list of exported records, or None when nothing
# was scraped or nothing could be processed.
processed_articles = scrape_and_process_articles(max_articles_per_feed=3, dataset_name="anti-echo-chamber-data")
