In [None]:
# Legislative Bill Mentions Data Collection
# Erdos Institute Project: Predicting Congressional Bill Outcomes

import pandas as pd
import requests
import time
import json
import re
from datetime import datetime, timedelta
import os
from typing import List, Dict, Optional
import warnings
warnings.filterwarnings('ignore')

# Install required packages first
import subprocess
import sys

def install_packages():
    """Install required packages if not available"""
    packages = ['praw', 'pmaw', 'requests', 'pandas', 'numpy', 'matplotlib', 'seaborn']
    
    for package in packages:
        try:
            __import__(package)
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

# Install packages
install_packages()

# Import packages
try:
    import praw
    from pmaw import PushshiftAPI
    REDDIT_AVAILABLE = True
    print("✅ Reddit packages loaded successfully")
except ImportError as e:
    print(f"⚠️  Reddit packages not available: {e}")
    print("Note: You can still use Guardian and Common Crawl collectors")
    REDDIT_AVAILABLE = False

# For data processing and analysis
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

print("📊 Legislative Bill Mentions Data Collector")
print("=" * 50)

✅ Reddit packages loaded successfully
📊 Legislative Bill Mentions Data Collector


In [None]:
## CONFIGURATION
START_DATE = "2014-01-01"  # 10 years of data
END_DATE = "2024-12-31"
OUTPUT_DIR = "legislative_data"

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Common bill patterns for identification
BILL_PATTERNS = [
    r'\b[HS]\.?\s*(?:R\.?\s*|J\.?\s*Res\.?\s*|Con\.?\s*Res\.?\s*)?(\d+)\b',  # H.R. 1234, S. 567
    r'\b(?:House|Senate)\s+(?:Bill|Resolution)\s+(\d+)\b',  # House Bill 1234
    r'\b(?:HR|SR)\s*-?\s*(\d+)\b',  # HR-1234, SR 567
]

def extract_bill_numbers(text: str) -> List[str]:
    """Extract bill numbers from text using regex patterns"""
    bills = []
    for pattern in BILL_PATTERNS:
        matches = re.finditer(pattern, text, re.IGNORECASE)
        for match in matches:
            bills.append(match.group(0))
    return list(set(bills))  # Remove duplicates

def save_data(data: List[Dict], filename: str):
    """Save data to JSON and CSV formats"""
    # Save as JSON
    json_path = os.path.join(OUTPUT_DIR, f"{filename}.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    
    # Save as CSV
    if data:
        df = pd.DataFrame(data)
        csv_path = os.path.join(OUTPUT_DIR, f"{filename}.csv")
        df.to_csv(csv_path, index=False, encoding='utf-8')
        print(f"✅ Saved {len(data)} records to {json_path} and {csv_path}")
    return len(data)

In [None]:
# =============================================================================
# PART 1: REDDIT DATA COLLECTION
# =============================================================================

print("\n🔴 REDDIT DATA COLLECTION")
print("-" * 30)

class RedditBillScraper:
    def __init__(self):
        if not REDDIT_AVAILABLE:
            raise ImportError("Reddit packages (praw, pmaw) not available")
        
        # Using Pushshift for historical data
        self.api = PushshiftAPI()
        
        # Political subreddits to search
        self.subreddits = [
            'politics', 'Conservative', 'NeutralPolitics', 'Ask_Politics',
            'PoliticalDiscussion', 'moderatepolitics', 'AmericanPolitics',
            'congress', 'law', 'Legal'
        ]
    
    def search_submissions(self, query: str, subreddit: str, start_date: str, end_date: str) -> List[Dict]:
        """Search Reddit submissions for bill mentions"""
        print(f"  Searching r/{subreddit} for: {query}")
        
        try:
            submissions = self.api.search_submissions(
                q=query,
                subreddit=subreddit,
                after=start_date,
                before=end_date,
                limit=1000
            )
            
            results = []
            for submission in submissions:
                if hasattr(submission, 'title') and hasattr(submission, 'selftext'):
                    # Extract bill mentions from title and text
                    title_bills = extract_bill_numbers(submission.title)
                    text_bills = extract_bill_numbers(submission.selftext or "")
                    
                    if title_bills or text_bills:
                        results.append({
                            'id': submission.id,
                            'title': submission.title,
                            'text': submission.selftext,
                            'author': str(submission.author),
                            'subreddit': submission.subreddit,
                            'created_utc': submission.created_utc,
                            'score': submission.score,
                            'num_comments': submission.num_comments,
                            'url': f"https://reddit.com{submission.permalink}",
                            'bills_mentioned': list(set(title_bills + text_bills)),
                            'source': 'reddit'
                        })
            
            return results
            
        except Exception as e:
            print(f"  ⚠️  Error searching r/{subreddit}: {str(e)}")
            return []
    
    def collect_bill_mentions(self) -> List[Dict]:
        """Collect bill mentions from multiple subreddits"""
        all_results = []
        
        # Search terms related to legislation
        search_terms = [
            "H.R.", "S.", "House Bill", "Senate Bill", "legislation",
            "Congress", "bill passed", "vote", "amendment"
        ]
        
        for term in search_terms:
            for subreddit in self.subreddits:
                try:
                    results = self.search_submissions(term, subreddit, START_DATE, END_DATE)
                    all_results.extend(results)
                    time.sleep(1)  # Rate limiting
                except Exception as e:
                    print(f"  ⚠️  Error with {subreddit}: {str(e)}")
                    continue
        
        # Remove duplicates based on post ID
        seen_ids = set()
        unique_results = []
        for result in all_results:
            if result['id'] not in seen_ids:
                seen_ids.add(result['id'])
                unique_results.append(result)
        
        return unique_results

class ImprovedRedditScraper:
    """Improved Reddit scraper with better error handling and diagnostics"""
    
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        self.subreddits = ['politics', 'Conservative', 'NeutralPolitics', 'congress']
        self.base_delay = 2
    
    def search_reddit_json(self, subreddit: str, query: str, time_filter: str = 'all', limit: int = 100) -> List[Dict]:
        """Search Reddit using public JSON API"""
        url = f"https://www.reddit.com/r/{subreddit}/search.json"
        params = {
            'q': query,
            'restrict_sr': 'on',
            'sort': 'relevance',
            't': time_filter,
            'limit': limit
        }
        
        print(f"  Searching r/{subreddit} for '{query}'...", end=' ')
        
        try:
            response = requests.get(url, headers=self.headers, params=params, timeout=15)
            
            if response.status_code == 429:
                print("⚠️  Rate limited, waiting...")
                time.sleep(60)
                return []
            
            response.raise_for_status()
            data = response.json()
            
            posts = data.get('data', {}).get('children', [])
            results = []
            
            for post in posts:
                post_data = post.get('data', {})
                
                title = post_data.get('title', '')
                selftext = post_data.get('selftext', '')
                
                # Extract bill mentions
                title_bills = extract_bill_numbers(title)
                text_bills = extract_bill_numbers(selftext)
                
                # More flexible matching
                combined_text = (title + ' ' + selftext).lower()
                has_legislative = any(term in combined_text for term in 
                                    ['h.r.', 's.', 'h.r ', 's. ', 'hr ', 'senate bill', 
                                     'house bill', 'congress', 'legislation', 'congressional'])
                
                if title_bills or text_bills or has_legislative:
                    results.append({
                        'id': post_data.get('id', ''),
                        'title': title,
                        'text': selftext,
                        'author': post_data.get('author', ''),
                        'subreddit': subreddit,
                        'created_utc': post_data.get('created_utc', 0),
                        'score': post_data.get('score', 0),
                        'num_comments': post_data.get('num_comments', 0),
                        'url': f"https://reddit.com{post_data.get('permalink', '')}",
                        'bills_mentioned': list(set(title_bills + text_bills)),
                        'source': 'reddit_improved'
                    })
            
            print(f"✓ Found {len(results)} relevant posts")
            return results
            
        except requests.exceptions.RequestException as e:
            print(f"✗ Error: {str(e)}")
            return []
        except Exception as e:
            print(f"✗ Unexpected error: {str(e)}")
            return []
    
    def get_subreddit_hot(self, subreddit: str, limit: int = 100) -> List[Dict]:
        """Get hot posts from subreddit (no search query needed)"""
        url = f"https://www.reddit.com/r/{subreddit}/hot.json"
        params = {'limit': limit}
        
        print(f"  Getting hot posts from r/{subreddit}...", end=' ')
        
        try:
            response = requests.get(url, headers=self.headers, params=params, timeout=15)
            response.raise_for_status()
            data = response.json()
            
            posts = data.get('data', {}).get('children', [])
            results = []
            
            for post in posts:
                post_data = post.get('data', {})
                title = post_data.get('title', '')
                selftext = post_data.get('selftext', '')
                
                # Check for legislative content
                combined = (title + ' ' + selftext).lower()
                if any(term in combined for term in ['h.r.', 's.', 'bill', 'congress', 'senate', 'house']):
                    title_bills = extract_bill_numbers(title)
                    text_bills = extract_bill_numbers(selftext)
                    
                    results.append({
                        'id': post_data.get('id', ''),
                        'title': title,
                        'text': selftext,
                        'author': post_data.get('author', ''),
                        'subreddit': subreddit,
                        'created_utc': post_data.get('created_utc', 0),
                        'score': post_data.get('score', 0),
                        'num_comments': post_data.get('num_comments', 0),
                        'url': f"https://reddit.com{post_data.get('permalink', '')}",
                        'bills_mentioned': list(set(title_bills + text_bills)),
                        'source': 'reddit_hot'
                    })
            
            print(f"✓ Found {len(results)} legislative posts")
            return results
            
        except Exception as e:
            print(f"✗ Error: {str(e)}")
            return []
    
    def collect_bill_mentions(self) -> List[Dict]:
        """Collect bill mentions using multiple strategies"""
        all_results = []
        
        # Strategy 1: Search specific terms
        search_terms = [
            "H.R.",
            "Senate bill", 
            "House bill",
            "Congress passed",
            "legislation"
        ]
        
        print("\n📍 Strategy 1: Searching with specific terms")
        for term in search_terms:
            for subreddit in self.subreddits:
                try:
                    results = self.search_reddit_json(subreddit, term, time_filter='year', limit=100)
                    all_results.extend(results)
                    time.sleep(self.base_delay)
                except Exception as e:
                    print(f"  ⚠️  Error with r/{subreddit}: {str(e)}")
                    continue
        
        # Strategy 2: Get hot posts and filter
        print("\n📍 Strategy 2: Filtering hot/recent posts")
        for subreddit in self.subreddits:
            try:
                results = self.get_subreddit_hot(subreddit, limit=100)
                all_results.extend(results)
                time.sleep(self.base_delay)
            except Exception as e:
                print(f"  ⚠️  Error with r/{subreddit}: {str(e)}")
                continue
        
        # Remove duplicates
        seen_ids = set()
        unique_results = []
        for result in all_results:
            if result['id'] not in seen_ids:
                seen_ids.add(result['id'])
                unique_results.append(result)
        
        print(f"\n📊 Total unique Reddit posts found: {len(unique_results)}")
        return unique_results


🔴 REDDIT DATA COLLECTION
------------------------------


In [None]:
# Initialize Reddit scraper and collect data
reddit_data = []
reddit_count = 0

print("🔴 REDDIT DATA COLLECTION")
print("-" * 30)

try:
    improved_scraper = ImprovedRedditScraper()
    reddit_data = improved_scraper.collect_bill_mentions()
    
    if reddit_data:
        reddit_count = save_data(reddit_data, "reddit_bill_mentions")
    else:
        print("\n⚠️  No Reddit data collected. This could be due to:")
        print("  - Rate limiting (try again in a few minutes)")
        print("  - Search terms not matching recent content")
        print("  - Reddit API issues")
        print("\n💡 Tip: You can still proceed with Guardian data (26k articles is excellent!)")
        
except Exception as e:
    print(f"⚠️  Error collecting Reddit data: {e}")
    print("Continuing with Guardian data...")

🔴 REDDIT DATA COLLECTION
------------------------------

📍 Strategy 1: Searching with specific terms
  Searching r/politics for 'H.R.'... ✓ Found 99 relevant posts
  Searching r/Conservative for 'H.R.'... ✓ Found 9 relevant posts
  Searching r/NeutralPolitics for 'H.R.'... ✓ Found 1 relevant posts
  Searching r/congress for 'H.R.'... ✓ Found 13 relevant posts
  Searching r/politics for 'Senate bill'... ✓ Found 19 relevant posts
  Searching r/Conservative for 'Senate bill'... ✓ Found 13 relevant posts
  Searching r/NeutralPolitics for 'Senate bill'... ✓ Found 9 relevant posts
  Searching r/congress for 'Senate bill'... ✓ Found 20 relevant posts
  Searching r/politics for 'House bill'... ✓ Found 13 relevant posts
  Searching r/Conservative for 'House bill'... ✓ Found 15 relevant posts
  Searching r/NeutralPolitics for 'House bill'... ✓ Found 7 relevant posts
  Searching r/congress for 'House bill'... ✓ Found 23 relevant posts
  Searching r/politics for 'Congress passed'... ✓ Found 35 rel

In [None]:
# =============================================================================
# PART 2: GUARDIAN API DATA COLLECTION
# =============================================================================

print("\n📰 GUARDIAN API DATA COLLECTION")
print("-" * 35)

class GuardianScraper:
    def __init__(self, api_key: Optional[str] = None):
        # Guardian API is free - get key at: https://open-platform.theguardian.com/access/
        self.api_key = api_key or "test"
        self.base_url = "https://content.guardianapis.com/search"
    
    def search_articles(self, query: str, from_date: str, to_date: str, page_size: int = 50) -> List[Dict]:
        """Search Guardian articles for legislative content"""
        print(f"  Searching Guardian for: {query}")
        
        all_articles = []
        page = 1
        
        while True:
            params = {
                'q': query,
                'from-date': from_date,
                'to-date': to_date,
                'page-size': page_size,
                'page': page,
                'show-fields': 'headline,bodyText,byline,firstPublicationDate',
                'section': 'us-news|world/usa',
                'api-key': self.api_key
            }
            
            try:
                response = requests.get(self.base_url, params=params)
                response.raise_for_status()
                data = response.json()
                
                if data['response']['status'] != 'ok':
                    break
                
                articles = data['response']['results']
                if not articles:
                    break
                
                for article in articles:
                    # Extract bill mentions from headline and body
                    headline = article.get('fields', {}).get('headline', '')
                    body = article.get('fields', {}).get('bodyText', '')
                    
                    headline_bills = extract_bill_numbers(headline)
                    body_bills = extract_bill_numbers(body)
                    
                    if headline_bills or body_bills or any(term in (headline + body).lower() 
                                                         for term in ['congress', 'senate', 'house', 'bill', 'legislation']):
                        all_articles.append({
                            'id': article['id'],
                            'headline': headline,
                            'body': body,
                            'byline': article.get('fields', {}).get('byline', ''),
                            'publication_date': article.get('fields', {}).get('firstPublicationDate', ''),
                            'web_url': article['webUrl'],
                            'section': article.get('sectionName', ''),
                            'bills_mentioned': list(set(headline_bills + body_bills)),
                            'source': 'guardian'
                        })
                
                page += 1
                time.sleep(0.1)  # Rate limiting
                
                if page > data['response']['pages']:
                    break
                    
            except requests.exceptions.RequestException as e:
                print(f"  ⚠️  Error fetching page {page}: {str(e)}")
                break
        
        return all_articles
    
    def collect_legislative_coverage(self) -> List[Dict]:
        """Collect legislative coverage from Guardian"""
        all_articles = []
        
        # Search terms for legislative content
        search_queries = [
            'Congress bill', 'Senate vote', 'House Representatives',
            'legislation passed', 'congressional vote', 'federal law',
            'H.R.', 'Senate bill', 'House bill'
        ]
        
        for query in search_queries:
            try:
                articles = self.search_articles(query, START_DATE, END_DATE)
                all_articles.extend(articles)
                time.sleep(1)  # Rate limiting between queries
            except Exception as e:
                print(f"  ⚠️  Error with query '{query}': {str(e)}")
                continue
        
        # Remove duplicates based on article ID
        seen_ids = set()
        unique_articles = []
        for article in all_articles:
            if article['id'] not in seen_ids:
                seen_ids.add(article['id'])
                unique_articles.append(article)
        
        return unique_articles


📰 GUARDIAN API DATA COLLECTION
-----------------------------------


In [7]:
# Initialize Guardian scraper
guardian_scraper = GuardianScraper()

# Collect Guardian data
print("Collecting Guardian data...")
guardian_data = guardian_scraper.collect_legislative_coverage()
guardian_count = save_data(guardian_data, "guardian_bill_mentions")

Collecting Guardian data...
  Searching Guardian for: Congress bill
  Searching Guardian for: Senate vote
  Searching Guardian for: House Representatives
  Searching Guardian for: legislation passed
  Searching Guardian for: congressional vote
  Searching Guardian for: federal law
  ⚠️  Error fetching page 128: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
  Searching Guardian for: H.R.
  Searching Guardian for: Senate bill
  Searching Guardian for: House bill
✅ Saved 25097 records to legislative_data/guardian_bill_mentions.json and legislative_data/guardian_bill_mentions.csv


In [8]:
# =============================================================================
# PART 3: COMMON CRAWL DATA COLLECTION
# =============================================================================

print("\n🌐 COMMON CRAWL DATA COLLECTION")
print("-" * 35)

class CommonCrawlScraper:
    def __init__(self):
        self.index_url = "https://index.commoncrawl.org/CC-MAIN-{}-index"
        self.available_indexes = self._get_available_indexes()
    
    def _get_available_indexes(self) -> List[str]:
        """Get available Common Crawl indexes for the date range"""
        # Common Crawl releases monthly archives
        # Format: CC-MAIN-YYYY-WW (where WW is week number)
        recent_indexes = [
            "2024-51", "2024-46", "2024-42", "2024-38", "2024-33",
            "2023-50", "2023-40", "2023-23", "2023-14", "2023-06",
            "2022-49", "2022-40", "2022-33", "2022-27", "2022-21",
            "2021-49", "2021-43", "2021-39", "2021-31", "2021-25"
        ]
        
        return recent_indexes
    
    def search_crawl_index(self, domain: str, index: str, max_results: int = 50) -> List[Dict]:
        """Search Common Crawl INDEX only (metadata, not full content)"""
        print(f"  Searching {domain} in index {index}")
        
        url = self.index_url.format(index)
        params = {
            'url': f"{domain}/*",
            'output': 'json',
            'limit': max_results,
            'filter': '=status:200'  # Only successful pages
        }
        
        try:
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            
            results = []
            for line in response.text.strip().split('\n'):
                if line:
                    try:
                        data = json.loads(line)
                        # Filter for legislative content in URL
                        url_lower = data.get('url', '').lower()
                        if any(term in url_lower for term in ['congress', 'bill', 'senate', 'house', 'legislation', 'law', 'vote']):
                            results.append(data)
                    except json.JSONDecodeError:
                        continue
            
            print(f"    Found {len(results)} legislative URLs")
            return results
            
        except requests.exceptions.RequestException as e:
            print(f"  ⚠️  Error searching {domain} in {index}: {str(e)}")
            return []
    
    def collect_metadata_only(self) -> List[Dict]:
        """Collect URL metadata only (faster, no content fetching)"""
        # Target news domains known for political coverage
        news_domains = [
            'politico.com', 'thehill.com', 'rollcall.com', 
            'congress.gov', 'npr.org'
        ]
        
        all_mentions = []
        
        for domain in news_domains:
            for index in self.available_indexes[:3]:  # Limit to 3 most recent
                try:
                    results = self.search_crawl_index(domain, index, max_results=30)
                    
                    for result in results:
                        url = result.get('url', '')
                        timestamp = result.get('timestamp', '')
                        
                        # Extract bill mentions from URL itself
                        bills_in_url = extract_bill_numbers(url)
                        
                        all_mentions.append({
                            'url': url,
                            'domain': domain,
                            'timestamp': timestamp,
                            'bills_mentioned': bills_in_url,
                            'crawl_index': index,
                            'mime_type': result.get('mime', ''),
                            'source': 'common_crawl_metadata'
                        })
                    
                    time.sleep(1)  # Rate limiting
                    
                except Exception as e:
                    print(f"  ⚠️  Error processing {domain} in {index}: {str(e)}")
                    continue
        
        return all_mentions


class AlternativeNewsScraper:
    """Alternative scraper using direct RSS feeds and sitemaps"""
    
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; AcademicResearch/1.0)'
        }
    
    def scrape_politico_rss(self) -> List[Dict]:
        """Scrape Politico RSS feed"""
        print("  Fetching Politico RSS feed...")
        
        rss_urls = [
            'https://www.politico.com/rss/congress.xml',
            'https://www.politico.com/rss/politics08.xml'
        ]
        
        all_articles = []
        
        for rss_url in rss_urls:
            try:
                response = requests.get(rss_url, headers=self.headers, timeout=10)
                response.raise_for_status()
                
                # Simple XML parsing without external libraries
                content = response.text
                
                # Extract titles and links (basic regex parsing)
                import re
                items = re.findall(r'<item>(.*?)</item>', content, re.DOTALL)
                
                for item in items:
                    title_match = re.search(r'<title>(.*?)</title>', item)
                    link_match = re.search(r'<link>(.*?)</link>', item)
                    pub_date_match = re.search(r'<pubDate>(.*?)</pubDate>', item)
                    
                    if title_match and link_match:
                        title = title_match.group(1)
                        link = link_match.group(1)
                        pub_date = pub_date_match.group(1) if pub_date_match else ''
                        
                        # Extract bill mentions
                        bills = extract_bill_numbers(title)
                        
                        if bills or any(term in title.lower() for term in ['congress', 'senate', 'house', 'bill']):
                            all_articles.append({
                                'title': title,
                                'url': link,
                                'publication_date': pub_date,
                                'bills_mentioned': bills,
                                'source': 'politico_rss'
                            })
                
                print(f"    Found {len(all_articles)} articles")
                time.sleep(1)
                
            except Exception as e:
                print(f"  ⚠️  Error fetching {rss_url}: {str(e)}")
                continue
        
        return all_articles
    
    def scrape_thehill_recent(self) -> List[Dict]:
        """Scrape The Hill recent articles"""
        print("  Fetching The Hill recent articles...")
        
        try:
            url = "https://thehill.com/policy/national-security/"
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            
            content = response.text
            articles = []
            
            # Extract article titles and links
            import re
            article_pattern = r'<a[^>]*href="([^"]*)"[^>]*>([^<]*(?:H\.R\.|S\.|bill|Congress|Senate|House)[^<]*)</a>'
            matches = re.findall(article_pattern, content, re.IGNORECASE)
            
            for link, title in matches:
                bills = extract_bill_numbers(title)
                
                if bills or any(term in title.lower() for term in ['congress', 'bill', 'legislation']):
                    articles.append({
                        'title': title,
                        'url': link if link.startswith('http') else f"https://thehill.com{link}",
                        'bills_mentioned': bills,
                        'source': 'thehill_scrape'
                    })
            
            print(f"    Found {len(articles)} articles")
            return articles
            
        except Exception as e:
            print(f"  ⚠️  Error fetching The Hill: {str(e)}")
            return []
    
    def collect_alternative_sources(self) -> List[Dict]:
        """Collect from alternative sources"""
        all_data = []
        
        # Politico RSS
        all_data.extend(self.scrape_politico_rss())
        
        # The Hill
        all_data.extend(self.scrape_thehill_recent())
        
        return all_data


🌐 COMMON CRAWL DATA COLLECTION
-----------------------------------


In [None]:
# Initialize Common Crawl scraper
print("⚠️  Note: Common Crawl with full content fetching often has connection issues.")
print("Using metadata-only approach + alternative RSS/direct scraping.")

crawl_data = []
crawl_count = 0

try:
    print("\nCollecting Common Crawl metadata...")
    crawl_scraper = CommonCrawlScraper()
    crawl_data = crawl_scraper.collect_metadata_only()
    
    print("\nCollecting from alternative news sources (RSS feeds)...")
    alt_scraper = AlternativeNewsScraper()
    alt_data = alt_scraper.collect_alternative_sources()
    crawl_data.extend(alt_data)
    
    crawl_count = save_data(crawl_data, "news_sources_bill_mentions")
    
except Exception as e:
    print(f"⚠️  Error with news source collection: {e}")
    print("Continuing with Reddit and Guardian data...")

⚠️  Note: Common Crawl with full content fetching often has connection issues.
Using metadata-only approach + alternative RSS/direct scraping.

Collecting Common Crawl metadata...
  Searching politico.com in index 2024-51
    Found 0 legislative URLs
  Searching politico.com in index 2024-46
    Found 0 legislative URLs
  Searching politico.com in index 2024-42
    Found 0 legislative URLs
  Searching thehill.com in index 2024-51
  ⚠️  Error searching thehill.com in 2024-51: 404 Client Error: Not Found for url: https://index.commoncrawl.org/CC-MAIN-2024-51-index?url=thehill.com%2F%2A&output=json&limit=30&filter=%3Dstatus%3A200
  Searching thehill.com in index 2024-46
  ⚠️  Error searching thehill.com in 2024-46: 404 Client Error: Not Found for url: https://index.commoncrawl.org/CC-MAIN-2024-46-index?url=thehill.com%2F%2A&output=json&limit=30&filter=%3Dstatus%3A200
  Searching thehill.com in index 2024-42
  ⚠️  Error searching thehill.com in 2024-42: 404 Client Error: Not Found for url:

In [None]:
# =============================================================================
# DATA ANALYSIS AND SUMMARY
# =============================================================================

print(f"\n📊 DATA COLLECTION SUMMARY")
print("=" * 40)
print(f"Reddit mentions: {reddit_count}")
print(f"Guardian articles: {guardian_count}")
print(f"Common Crawl mentions: {crawl_count}")
print(f"Total records: {reddit_count + guardian_count + crawl_count}")
print(f"\n📁 Data saved in '{OUTPUT_DIR}' directory")

# Combine all data for analysis
all_data = reddit_data + guardian_data + crawl_data

if all_data:
    # Create summary DataFrame
    df_all = pd.DataFrame(all_data)
    
    # Basic analysis
    print(f"\n🔍 PRELIMINARY ANALYSIS")
    print("-" * 25)
    
    # Source distribution
    source_counts = df_all['source'].value_counts()
    print(f"Records by source:\n{source_counts}")
    
    # Bill mention frequency
    all_bills = []
    for record in all_data:
        all_bills.extend(record.get('bills_mentioned', []))
    
    if all_bills:
        bill_counts = Counter(all_bills)
        print(f"\nMost mentioned bills:")
        for bill, count in bill_counts.most_common(10):
            print(f"  {bill}: {count} mentions")
    
    # Save combined dataset
    combined_path = os.path.join(OUTPUT_DIR, "combined_bill_mentions.csv")
    df_all.to_csv(combined_path, index=False, encoding='utf-8')
    print(f"\n✅ Combined dataset saved to {combined_path}")



📊 DATA COLLECTION SUMMARY
Reddit mentions: 562
Guardian articles: 25097
Common Crawl mentions: 15
Total records: 25674

📁 Data saved in 'legislative_data' directory

🔍 PRELIMINARY ANALYSIS
-------------------------
Records by source:
source
guardian                 25097
reddit_improved            395
reddit_hot                 167
common_crawl_metadata       15
Name: count, dtype: int64

Most mentioned bills:
  s 2020: 445 mentions
  s 2016: 435 mentions
  s 2024: 270 mentions
  s 2008: 111 mentions
  s 10: 104 mentions
  s 2012: 99 mentions
  s 100: 92 mentions
  s 2018: 84 mentions
  s 7: 82 mentions
  s 2: 79 mentions

✅ Combined dataset saved to legislative_data/combined_bill_mentions.csv

🎯 NEXT STEPS:
1. Clean and deduplicate the collected data
2. Enhance bill number extraction with more sophisticated NLP
3. Match mentions to actual bills using Congress.gov API
4. Add sentiment analysis to the mentions
5. Temporal analysis of mention patterns
6. Integrate with voting records an