In [1]:
# Standard library
import json
import os
import re
import sys
import time
from datetime import datetime, date, timedelta
from urllib.parse import urlparse

# Make the parent of the current working directory importable (needed for `src`)
sys.path.append(os.path.dirname(os.path.abspath('.')))

# Essential third-party imports
import pandas as pd
import requests
import sqlalchemy
from tqdm import tqdm

# Local imports
from src.database import get_database_connection, get_api_key

# Startup banner: only reached when every import above succeeded.
print("🚀 Phase 1E: Historical News Collection - Ready!")
print("📊 Mission: Build real historical news pipeline for strategy validation")


🚀 Phase 1E: Historical News Collection - Ready!
📊 Mission: Build real historical news pipeline for strategy validation


In [2]:
# Create decoupled database schema
def create_decoupled_schema():
    """Ensure the raw-news and processed-sentiment tables and their indexes exist.

    The whole DDL script is sent as a single statement and then committed.
    Every statement uses ``IF NOT EXISTS``, so calling this again is harmless.

    Returns:
        bool: True when the script ran and was committed, False when any
        exception occurred (the error is printed, not raised).
    """

    ddl_script = """
    -- Raw news articles storage
    CREATE TABLE IF NOT EXISTS raw_news_articles (
        id SERIAL PRIMARY KEY,
        symbol_id INTEGER REFERENCES symbols(id),
        article_date DATE NOT NULL,
        title TEXT NOT NULL,
        content TEXT,
        summary TEXT,
        source VARCHAR(100),
        url TEXT,
        published_at TIMESTAMP,
        relevance_score DECIMAL(3,2),
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        UNIQUE(url, symbol_id)
    );
    
    -- Processed sentiment results
    CREATE TABLE IF NOT EXISTS processed_sentiment (
        id SERIAL PRIMARY KEY,
        symbol_id INTEGER REFERENCES symbols(id),
        analysis_date DATE NOT NULL,
        smo_score DECIMAL(3,2),
        smd_score DECIMAL(3,2),
        smc_score DECIMAL(3,2),
        sms_score DECIMAL(3,2),
        sdc_score DECIMAL(3,2),
        articles_analyzed INTEGER,
        confidence_score DECIMAL(3,2),
        analysis_summary TEXT,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        UNIQUE(symbol_id, analysis_date)
    );
    
    -- Create indexes
    CREATE INDEX IF NOT EXISTS idx_raw_news_symbol_date ON raw_news_articles(symbol_id, article_date);
    CREATE INDEX IF NOT EXISTS idx_processed_sentiment_date ON processed_sentiment(symbol_id, analysis_date);
    """

    try:
        # Everything that can fail (connecting, executing, committing) stays
        # inside the try so the caller only ever sees a boolean.
        with get_database_connection().connect() as connection:
            connection.execute(sqlalchemy.text(ddl_script))
            connection.commit()

        print("✅ Decoupled database schema created successfully")
        return True
    except Exception as exc:
        print(f"❌ Error creating schema: {exc}")
        return False

# Create the new schema. Safe to re-run: every DDL statement uses IF NOT EXISTS.
# As the last expression of the cell, the True/False result is displayed.
create_decoupled_schema()


✅ Decoupled database schema created successfully


True

In [3]:
class EODHistoricalNewsCollector:
    """Collect news articles and daily sentiment scores from the EOD Historical Data API.

    News is read from ``/api/news`` and sentiment from ``/api/sentiments``.
    Each article is tagged with the sentiment entry whose date equals the
    article's publication date.  Network and parsing failures are reported
    with ``print`` and converted into empty results instead of being raised.
    """

    def __init__(self):
        """Set the endpoints and symbol mappings and load the API key via ``get_api_key``."""
        self.base_url_news = "https://eodhd.com/api/news"
        self.base_url_sentiment = "https://eodhd.com/api/sentiments"
        self.symbols = ['INTC', 'AMD', 'NVDA']

        # Get API key; a missing key is reported here and checked again before fetching.
        self.api_key = get_api_key('eod_historical')
        if not self.api_key:
            print("⚠️  EOD API key not found. Get one at: https://eodhd.com/")
            print("💡 Add it to your .env file as: EOD_HISTORICAL_API_KEY=your_key_here")

        # Symbol mappings for EOD format (unknown symbols fall back to "<symbol>.US")
        self.eod_symbols = {
            'INTC': 'INTC.US',
            'AMD': 'AMD.US',
            'NVDA': 'NVDA.US'
        }

    def _redact(self, message):
        """Return ``message`` as text with the API token masked.

        The token is sent as the ``api_token`` query parameter, and exception
        text can embed the request URL, so raw errors must not be printed into
        notebook output.
        """
        text = re.sub(r"(api_token=)[^&\s'\"]+", r"\1***", str(message))
        api_key = getattr(self, 'api_key', None)
        if api_key:
            text = text.replace(str(api_key), '***')
        return text

    def fetch_historical_news_and_sentiment(self, symbol, start_date, end_date, max_records=50):
        """Fetch news for ``symbol`` and attach the matching daily sentiment.

        Args:
            symbol: Plain ticker (e.g. ``'AMD'``); mapped to the EOD form internally.
            start_date, end_date: ``date`` or ``datetime`` bounds of the query.
            max_records: Value sent as the news request's ``limit``.

        Returns:
            list[dict]: Enriched article dicts (see ``_enrich_with_sentiment``);
            empty when the API key is missing or nothing could be fetched.
        """

        if not self.api_key:
            print("❌ Cannot proceed without EOD API key")
            return []

        eod_symbol = self.eod_symbols.get(symbol, f"{symbol}.US")

        # strftime works for both date and datetime, unlike the former .date() call.
        print(f"🔍 Collecting news + sentiment for {symbol} ({eod_symbol})")
        print(f"📅 Date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

        # Fetch news articles
        news_articles = self._fetch_news(eod_symbol, start_date, end_date, max_records)

        # Without articles there is nothing to enrich, so skip the second API call.
        if not news_articles:
            print(f"✅ Collected 0 articles with sentiment for {symbol}")
            return []

        # Fetch sentiment data
        sentiment_data = self._fetch_sentiment(eod_symbol, start_date, end_date)

        # Combine news with sentiment
        enriched_articles = self._enrich_with_sentiment(news_articles, sentiment_data, symbol)

        print(f"✅ Collected {len(enriched_articles)} articles with sentiment for {symbol}")
        return enriched_articles

    def _fetch_news(self, eod_symbol, start_date, end_date, max_records):
        """Request news articles for ``eod_symbol`` and return them as a list.

        Returns an empty list on a non-200 status, an unexpected payload shape,
        or any exception.

        NOTE(review): only one request with ``limit=max_records`` is made, so a
        busy symbol may be cut off before the whole date range is covered.
        """

        params = {
            's': eod_symbol,
            'from': start_date.strftime('%Y-%m-%d'),
            'to': end_date.strftime('%Y-%m-%d'),
            'limit': max_records,
            'api_token': self.api_key,
            'fmt': 'json'
        }

        try:
            print("📡 Fetching news from EOD API...")
            response = requests.get(self.base_url_news, params=params, timeout=30)

            if response.status_code != 200:
                print(f"⚠️  EOD News API error: {response.status_code}")
                return []

            articles = response.json()
            # Anything but a list (e.g. an error object) cannot be iterated as articles.
            if not isinstance(articles, list):
                print(f"⚠️  Unexpected EOD news payload ({type(articles).__name__}); ignoring it")
                return []

            print(f"📰 EOD returned {len(articles)} news articles")
            return articles

        except Exception as e:
            print(f"❌ Error fetching news: {self._redact(e)}")
            return []

    def _fetch_sentiment(self, eod_symbol, start_date, end_date):
        """Request daily sentiment and return it keyed by date string.

        The payload is read as a mapping whose values are lists of entries
        carrying ``date``, ``normalized`` and ``count``.

        Returns:
            dict: ``{date: {'normalized': ..., 'count': ...}}``; empty on a
            non-200 status, an unexpected payload shape, or any exception.
        """

        params = {
            's': eod_symbol,
            'from': start_date.strftime('%Y-%m-%d'),
            'to': end_date.strftime('%Y-%m-%d'),
            'api_token': self.api_key
        }

        try:
            print("🧠 Fetching sentiment scores from EOD API...")
            response = requests.get(self.base_url_sentiment, params=params, timeout=30)

            if response.status_code != 200:
                print(f"⚠️  EOD Sentiment API error: {response.status_code}")
                return {}

            sentiment_data = response.json()

            # Extract sentiment by date; payloads that are not a mapping hold no usable entries.
            sentiment_by_date = {}
            symbol_series = sentiment_data.values() if isinstance(sentiment_data, dict) else []
            for symbol_data in symbol_series:
                if not isinstance(symbol_data, list):
                    continue
                for entry in symbol_data:
                    if not isinstance(entry, dict):
                        continue
                    date_key = entry.get('date')
                    if date_key:
                        sentiment_by_date[date_key] = {
                            'normalized': entry.get('normalized', 0.0),
                            'count': entry.get('count', 0)
                        }

            print(f"📊 EOD returned sentiment for {len(sentiment_by_date)} dates")
            return sentiment_by_date

        except Exception as e:
            print(f"❌ Error fetching sentiment: {self._redact(e)}")
            return {}

    def _enrich_with_sentiment(self, news_articles, sentiment_data, symbol):
        """Combine raw news articles with the sentiment of their publication date.

        Articles whose ``date`` cannot be parsed (or that are not dict-like)
        are skipped with a printed warning.  Null ``title``/``content``/``link``
        values are normalised to ``''`` so later slicing and storage cannot fail.

        Returns:
            list[dict]: One dict per usable article with keys ``symbol``,
            ``title``, ``content``, ``url``, ``published_at``, ``source``,
            ``relevance_score``, ``eod_sentiment_score``, ``eod_sentiment_count``.
        """

        enriched = []

        for article in news_articles:
            try:
                # Parse article date ('Z' suffix is rewritten for fromisoformat)
                raw_date = article.get('date') or ''
                article_date = datetime.fromisoformat(raw_date.replace('Z', '+00:00'))
                date_key = article_date.strftime('%Y-%m-%d')

                # Get sentiment for this date (neutral placeholder when absent)
                day_sentiment = sentiment_data.get(date_key, {'normalized': 0.0, 'count': 0})

                title = article.get('title') or ''
                link = article.get('link') or ''

                enriched.append({
                    'symbol': symbol,
                    'title': title,
                    'content': article.get('content') or '',
                    'url': link,
                    'published_at': article_date,
                    'source': self._extract_source(link),
                    'relevance_score': self._calculate_relevance(title, symbol),

                    # Built-in sentiment from EOD (no OpenAI needed!)
                    'eod_sentiment_score': day_sentiment['normalized'],
                    'eod_sentiment_count': day_sentiment['count']
                })

            except Exception as e:
                print(f"⚠️  Skipping malformed article: {e}")
                continue

        return enriched

    def _extract_source(self, url):
        """Return the host part of ``url`` without a leading ``www.``.

        Returns ``'unknown'`` when the URL is empty, has no host, or cannot be parsed.
        """
        try:
            host = urlparse(url or '').netloc
        except (ValueError, AttributeError, TypeError):
            return 'unknown'

        # Strip only a leading "www."; str.replace would also eat inner occurrences.
        if host.startswith('www.'):
            host = host[len('www.'):]
        return host or 'unknown'

    def _calculate_relevance(self, title, symbol):
        """Score an article title between 0.5 and 1.0 by keyword heuristics.

        +0.3 when the ticker appears as a standalone token, +0.2 for
        'semiconductor'/'chip', +0.3 for 'earnings'/'revenue'; capped at 1.0.
        """
        title_lower = (title or '').lower()
        ticker = (symbol or '').lower()
        score = 0.5  # Base score

        # Boost for direct mentions.  The ticker must stand alone: a plain
        # substring test would let a short ticker such as "F" match any title.
        if ticker and re.search(rf"(?<![a-z0-9]){re.escape(ticker)}(?![a-z0-9])", title_lower):
            score += 0.3
        if 'semiconductor' in title_lower or 'chip' in title_lower:
            score += 0.2
        if 'earnings' in title_lower or 'revenue' in title_lower:
            score += 0.3

        return min(1.0, score)

# Initialize EOD collector.
# `news_collector` is a notebook-level global that collect_eod_news_batch()
# reads directly, so this cell must run before any collection cell.
news_collector = EODHistoricalNewsCollector()
print("✅ EOD Historical Data News Collector initialized!")
print("🏛️  Using institutional-grade infrastructure")
print("🧠 Built-in sentiment analysis (no OpenAI costs!)")
print("📊 Ready for reliable historical news + sentiment collection")


✅ EOD Historical Data News Collector initialized!
🏛️  Using institutional-grade infrastructure
🧠 Built-in sentiment analysis (no OpenAI costs!)
📊 Ready for reliable historical news + sentiment collection


In [4]:
def store_eod_news_with_sentiment(articles):
    """Store enriched EOD articles and their per-day sentiment in the database.

    Articles go into ``raw_news_articles`` (duplicates by ``(url, symbol_id)``
    are ignored).  Sentiment is upserted into ``processed_sentiment`` once per
    ``(symbol, publication date)``; the single EOD score fills all five score
    columns.  Everything runs on one connection and is committed at the end,
    so a failure stores nothing.

    Args:
        articles: Iterable of dicts with keys ``symbol``, ``title``, ``url``,
            ``published_at`` (datetime), ``source``, ``relevance_score`` and
            optionally ``content``, ``eod_sentiment_score``,
            ``eod_sentiment_count``.

    Returns:
        tuple[int, int]: ``(newly inserted articles, sentiment rows upserted)``;
        ``(0, 0)`` for empty input or when any exception occurs (it is printed).
    """
    if not articles:
        return 0, 0

    symbol_query = "SELECT id FROM symbols WHERE symbol = :symbol"

    article_query = """
    INSERT INTO raw_news_articles
    (symbol_id, article_date, title, content, url, published_at, source, relevance_score)
    VALUES (:symbol_id, :article_date, :title, :content, :url, :published_at, :source, :relevance_score)
    ON CONFLICT (url, symbol_id) DO NOTHING
    """

    # All five score columns are refreshed on conflict; updating only smo_score
    # would leave the other four holding values from an earlier run.
    sentiment_query = """
    INSERT INTO processed_sentiment
    (symbol_id, analysis_date, smo_score, smd_score, smc_score, sms_score, sdc_score,
     articles_analyzed, confidence_score, analysis_summary)
    VALUES (:symbol_id, :analysis_date, :smo, :smd, :smc, :sms, :sdc, :articles, :confidence, :summary)
    ON CONFLICT (symbol_id, analysis_date) DO UPDATE SET
        smo_score = EXCLUDED.smo_score,
        smd_score = EXCLUDED.smd_score,
        smc_score = EXCLUDED.smc_score,
        sms_score = EXCLUDED.sms_score,
        sdc_score = EXCLUDED.sdc_score,
        articles_analyzed = EXCLUDED.articles_analyzed,
        confidence_score = EXCLUDED.confidence_score,
        analysis_summary = EXCLUDED.analysis_summary
    """

    try:
        engine = get_database_connection()
        stored_articles = 0
        stored_sentiment = 0

        symbol_ids = {}        # symbol -> id (or None when unknown); one lookup per symbol
        daily_sentiment = {}   # (symbol_id, date) -> (score, count); last article of a day wins

        with engine.connect() as conn:
            for article in articles:
                # Get symbol_id (cached)
                symbol = article['symbol']
                if symbol not in symbol_ids:
                    symbol_row = conn.execute(
                        sqlalchemy.text(symbol_query), {'symbol': symbol}
                    ).fetchone()
                    symbol_ids[symbol] = symbol_row[0] if symbol_row else None

                symbol_id = symbol_ids[symbol]
                if symbol_id is None:
                    continue

                published_at = article['published_at']
                article_date = published_at.date()

                # Store raw news article; `or ''` keeps a null title/content
                # from raising on the slice and aborting the whole batch.
                result = conn.execute(sqlalchemy.text(article_query), {
                    'symbol_id': symbol_id,
                    'article_date': article_date,
                    'title': (article['title'] or '')[:500],
                    'content': (article.get('content') or '')[:2000],
                    'url': article['url'],
                    'published_at': published_at,
                    'source': article['source'],
                    'relevance_score': article['relevance_score']
                })

                if result.rowcount > 0:
                    stored_articles += 1

                # Remember pre-computed sentiment (from EOD) if available; it is
                # a per-day value, so it is written once per day after the loop.
                score = article.get('eod_sentiment_score')
                count = article.get('eod_sentiment_count') or 0
                if score is not None and count > 0:
                    daily_sentiment[(symbol_id, article_date)] = (float(score), count)

            for (symbol_id, analysis_date), (eod_score, count) in daily_sentiment.items():
                # Use EOD sentiment as unified score (market open, mid-day, close all same)
                result = conn.execute(sqlalchemy.text(sentiment_query), {
                    'symbol_id': symbol_id,
                    'analysis_date': analysis_date,
                    'smo': eod_score,  # Market open sentiment
                    'smd': eod_score,  # Mid-day sentiment
                    'smc': eod_score,  # Market close sentiment
                    'sms': eod_score,  # Sector sentiment
                    'sdc': eod_score,  # Direct competitor sentiment
                    'articles': count,
                    'confidence': 0.85,  # fixed confidence assigned to EOD-provided sentiment
                    'summary': f"EOD Historical Data sentiment: {eod_score:.2f} (based on {count} articles)"
                })

                if result.rowcount > 0:
                    stored_sentiment += 1

            conn.commit()

        print(f"✅ Stored {stored_articles} articles + {stored_sentiment} sentiment records")
        return stored_articles, stored_sentiment

    except Exception as e:
        print(f"❌ Error storing EOD data: {e}")
        return 0, 0

def collect_eod_news_batch(symbol, days_back=30, max_records=200):
    """Collect and store recent news + sentiment for one symbol.

    Uses the notebook-level ``news_collector`` to fetch everything published in
    the last ``days_back`` days (ending now) and passes the result to
    ``store_eod_news_with_sentiment``.

    Args:
        symbol: Ticker to collect, e.g. ``'AMD'``.
        days_back: Length of the look-back window in days.
        max_records: Upper bound on articles requested from the API.  The
            request is not chunked, so raise this when a busy symbol fills the
            cap before the whole window is covered.

    Returns:
        int: Number of newly stored articles (0 when nothing was fetched).
    """
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)

    print(f"🔍 Collecting {days_back} days of news + sentiment for {symbol}...")

    # Single request for the whole window; the cap is caller-controlled.
    articles = news_collector.fetch_historical_news_and_sentiment(
        symbol, start_date, end_date, max_records=max_records
    )

    if not articles:
        print("📦 Result: No articles found")
        return 0

    # Store articles with built-in sentiment
    stored_articles, stored_sentiment = store_eod_news_with_sentiment(articles)
    print(f"📦 Result: {stored_articles} articles + {stored_sentiment} sentiment records")
    return stored_articles

def collect_eod_news_robust(symbols=None, days_back=14, pause_seconds=2):
    """Collect and store news + sentiment for several symbols, one after another.

    A failure for one symbol is printed and recorded as 0 so the remaining
    symbols are still processed.

    Args:
        symbols: Iterable of tickers; defaults to ``['INTC', 'AMD', 'NVDA']``.
        days_back: Look-back window passed to ``collect_eod_news_batch``.
        pause_seconds: Delay between consecutive symbols (no delay after the
            last one); ``0`` disables pausing.

    Returns:
        dict: ``{symbol: number of newly stored articles}``.
    """
    if symbols is None:
        symbols = ['INTC', 'AMD', 'NVDA']
    symbols = list(symbols)

    print("🚀 Starting EOD Historical Data collection...")
    print("=" * 60)

    collection_results = {}

    for position, symbol in enumerate(symbols):
        print(f"\n🎯 Processing {symbol}...")

        try:
            count = collect_eod_news_batch(symbol, days_back=days_back)
            collection_results[symbol] = count

            print(f"✅ {symbol} collection complete: {count} articles")

        except Exception as e:
            print(f"❌ {symbol} collection failed: {e}")
            collection_results[symbol] = 0

        # Brief pause between symbols; pointless once the last one is done.
        has_next = position < len(symbols) - 1
        if has_next and pause_seconds > 0:
            print(f"⏳ Pausing {pause_seconds} seconds between symbols...")
            time.sleep(pause_seconds)

    print(f"\n🎉 EOD collection summary: {collection_results}")
    print(f"📊 Total articles collected: {sum(collection_results.values())}")

    return collection_results

# Confirms the storage/collection functions above were defined; nothing is fetched yet.
print("✅ News collection and storage functions ready!")


✅ News collection and storage functions ready!


In [None]:
# Execute the EOD news collection.
# This cell calls the live API and writes to the database through
# collect_eod_news_robust(); it needs the cells above to have run first.
print("🚀 EXECUTING NEWS COLLECTION")
print("=" * 50)

# Run the collection for all three symbols; the result maps each symbol to
# the number of newly stored articles.
collection_results = collect_eod_news_robust()

print(f"\n✅ COLLECTION COMPLETE!")
print(f"📊 Final Results: {collection_results}")


In [6]:
# Verify what we actually collected and stored.
# Read-only check: three SELECTs against the tables written by the collection step.
print("🔍 FINAL VERIFICATION OF COLLECTED DATA")
print("=" * 50)

try:
    engine = get_database_connection()
    
    # Check total articles collected (all symbols, not only the three below)
    with engine.connect() as conn:
        result = conn.execute(sqlalchemy.text('SELECT COUNT(*) FROM raw_news_articles'))
        total_articles = result.fetchone()[0]
        
        # Check by symbol: article count plus earliest/latest article_date,
        # restricted to the three tracked tickers
        result = conn.execute(sqlalchemy.text('''
            SELECT s.symbol, COUNT(*) as article_count,
                   MIN(rna.article_date) as earliest_date,
                   MAX(rna.article_date) as latest_date
            FROM raw_news_articles rna 
            JOIN symbols s ON rna.symbol_id = s.id 
            WHERE s.symbol IN ('INTC', 'AMD', 'NVDA')
            GROUP BY s.symbol 
            ORDER BY s.symbol
        '''))
        
        print(f"📰 TOTAL ARTICLES COLLECTED: {total_articles}")
        print("\n📊 BREAKDOWN BY SYMBOL:")
        
        for row in result:
            symbol, count, earliest, latest = row
            print(f"   📈 {symbol}: {count} articles")
            print(f"      📅 Date range: {earliest} to {latest}")
        
        # Count every row in processed_sentiment. The query does not filter by
        # origin, so the "EOD" label in the messages below is an assumption.
        result = conn.execute(sqlalchemy.text('SELECT COUNT(*) FROM processed_sentiment'))
        eod_sentiment_count = result.fetchone()[0]
        
        if eod_sentiment_count > 0:
            print(f"\n🧠 EOD Sentiment Records: {eod_sentiment_count} (will be replaced by OpenAI analysis)")
        else:
            print(f"\n🧠 EOD Sentiment Records: 0 (ready for OpenAI processing)")

except Exception as e:
    print(f"❌ Verification error: {e}")

# NOTE(review): these banners sit outside the try block, so they are printed
# even when the verification above failed.
print(f"\n✅ NEWS COLLECTION PHASE COMPLETE!")
print(f"🎯 Ready for sentiment processing in notebook 05_sentiment_processing.ipynb")


🔍 FINAL VERIFICATION OF COLLECTED DATA
📰 TOTAL ARTICLES COLLECTED: 552

📊 BREAKDOWN BY SYMBOL:
   📈 AMD: 200 articles
      📅 Date range: 2025-06-20 to 2025-06-28
   📈 INTC: 152 articles
      📅 Date range: 2025-06-14 to 2025-06-28
   📈 NVDA: 200 articles
      📅 Date range: 2025-06-26 to 2025-06-28

🧠 EOD Sentiment Records: 27 (will be replaced by OpenAI analysis)

✅ NEWS COLLECTION PHASE COMPLETE!
🎯 Ready for sentiment processing in notebook 05_sentiment_processing.ipynb
