In [1]:
# Core imports
import os
import sys
import pandas as pd
import numpy as np
import json
import time
from datetime import datetime, timedelta
from tqdm import tqdm

# Database and API imports
import sqlalchemy
from openai import OpenAI

# Add src to path for database functions
sys.path.append('../src')
from database import get_database_connection, get_api_key

print("üì¶ Imports completed successfully")
print(f"üêç Python version: {sys.version.split()[0]}")
print(f"üìä Pandas version: {pd.__version__}")
print(f"üóÑÔ∏è  SQLAlchemy version: {sqlalchemy.__version__}")


üì¶ Imports completed successfully
üêç Python version: 3.11.13
üìä Pandas version: 2.3.0
üóÑÔ∏è  SQLAlchemy version: 2.0.41


In [2]:
# Verify the news data collected from notebook 04_historical_news_collection.ipynb
print("üîç VERIFYING HANDOFF FROM NOTEBOOK 04")
print("üìã Confirming news articles are ready for sentiment processing")
print("=" * 60)

engine = get_database_connection()

# Per-symbol overview of the raw articles: volume, date coverage and mean relevance
news_summary_query = """
SELECT 
    s.symbol, 
    COUNT(*) as article_count,
    MIN(rna.article_date) as earliest_date,
    MAX(rna.article_date) as latest_date,
    AVG(rna.relevance_score) as avg_relevance
FROM raw_news_articles rna
JOIN symbols s ON rna.symbol_id = s.id
WHERE s.symbol IN ('INTC', 'AMD', 'NVDA')
GROUP BY s.symbol
ORDER BY s.symbol
"""

news_df = pd.read_sql(news_summary_query, engine)

print("üì∞ RAW NEWS ARTICLES SUMMARY:")
total_articles = 0
for _, symbol_summary in news_df.iterrows():
    total_articles = total_articles + symbol_summary['article_count']
    print(f"üìà {symbol_summary['symbol']}: {symbol_summary['article_count']} articles")
    print(f"   üìÖ Date range: {symbol_summary['earliest_date']} to {symbol_summary['latest_date']}")
    print(f"   üéØ Avg relevance: {symbol_summary['avg_relevance']:.2f}")
    print()

print(f"üéâ TOTAL: {total_articles} articles ready for sentiment processing")

# A symbol/date pair only qualifies for processing when it has at least two articles
processing_candidates_query = """
SELECT 
    s.symbol, 
    rna.article_date, 
    COUNT(*) as article_count
FROM raw_news_articles rna
JOIN symbols s ON rna.symbol_id = s.id
WHERE s.symbol IN ('INTC', 'AMD', 'NVDA')
GROUP BY s.symbol, rna.article_date
HAVING COUNT(*) >= 2
ORDER BY rna.article_date DESC, s.symbol
"""

candidates_df = pd.read_sql(processing_candidates_query, engine)

print(f"\nüìä PROCESSING CANDIDATES: {len(candidates_df)} symbol-date combinations with 2+ articles")
print("\nüìã Sample processing queue:")
# Preview only the ten most recent entries; the remainder is summarised as a count
for position, (_, candidate) in enumerate(candidates_df.head(10).iterrows(), start=1):
    print(f"   {position}. {candidate['symbol']} on {candidate['article_date']} ({candidate['article_count']} articles)")

hidden_candidates = len(candidates_df) - 10
if hidden_candidates > 0:
    print(f"   ... and {hidden_candidates} more")

print(f"\n‚úÖ Ready to process {len(candidates_df)} sentiment analyses")


üîç VERIFYING HANDOFF FROM NOTEBOOK 04
üìã Confirming news articles are ready for sentiment processing
üì∞ RAW NEWS ARTICLES SUMMARY:
üìà AMD: 200 articles
   üìÖ Date range: 2025-06-20 to 2025-06-28
   üéØ Avg relevance: 0.64

üìà INTC: 152 articles
   üìÖ Date range: 2025-06-14 to 2025-06-28
   üéØ Avg relevance: 0.58

üìà NVDA: 200 articles
   üìÖ Date range: 2025-06-26 to 2025-06-28
   üéØ Avg relevance: 0.55

üéâ TOTAL: 552 articles ready for sentiment processing

üìä PROCESSING CANDIDATES: 26 symbol-date combinations with 2+ articles

üìã Sample processing queue:
   1. AMD on 2025-06-28 (11 articles)
   2. INTC on 2025-06-28 (4 articles)
   3. NVDA on 2025-06-28 (49 articles)
   4. AMD on 2025-06-27 (28 articles)
   5. INTC on 2025-06-27 (10 articles)
   6. NVDA on 2025-06-27 (119 articles)
   7. AMD on 2025-06-26 (32 articles)
   8. INTC on 2025-06-26 (15 articles)
   9. NVDA on 2025-06-26 (32 articles)
   10. AMD on 2025-06-25 (40 articles)
   ... and 16 more

‚ú

In [4]:
# Setup and test OpenAI client
#
# Outcome of this cell is carried by two notebook-level names that later cells read:
#   client          - the OpenAI client, or None when setup raised
#   openai_working  - True only if the round-trip test call returned parseable JSON
print("üîë SETTING UP OPENAI CLIENT")
print("=" * 40)

try:
    # get_api_key is a project helper imported from ../src/database
    # (presumably reads OPENAI_API_KEY from .env, per the troubleshooting text below - confirm)
    openai_key = get_api_key('openai')
    if not openai_key:
        raise ValueError("No OpenAI API key found")
    
    client = OpenAI(api_key=openai_key)
    # Only the first 8 and last 4 characters of the key are echoed
    print(f"‚úÖ OpenAI API key loaded: {openai_key[:8]}...{openai_key[-4:]}")
    
    # Test the client with a simple call
    print("\nüß™ Testing OpenAI API connection...")
    
    test_response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that responds with valid JSON only."},
            {"role": "user", "content": 'Return this exact JSON: {"status": "success", "test": true}'}
        ],
        max_tokens=50,
        temperature=0.1
    )
    
    # NOTE(review): if message.content is None, .strip() raises AttributeError, which the
    # outer handler below reports as a generic setup failure
    test_content = test_response.choices[0].message.content.strip()
    print(f"üì§ API Response: {test_content}")
    
    # Clean and parse the JSON (handle markdown formatting)
    try:
        # Strip common markdown formatting from OpenAI responses.
        # str.replace removes every occurrence of the fence markers, not just the wrapper.
        clean_content = test_content
        if clean_content.startswith('```json'):
            clean_content = clean_content.replace('```json', '').replace('```', '').strip()
        elif clean_content.startswith('```'):
            clean_content = clean_content.replace('```', '').strip()
        
        print(f"üßπ Cleaned content: {clean_content}")
        
        test_json = json.loads(clean_content)
        print(f"‚úÖ JSON parsing successful: {test_json}")
        print(f"üí∞ Tokens used: {test_response.usage.total_tokens}")
        openai_working = True
    except json.JSONDecodeError as e:
        # The API answered but not with valid JSON: client stays set, flag goes False
        print(f"‚ùå JSON parsing failed: {e}")
        print(f"üìÑ Raw content: '{test_content}'")
        # clean_content is always assigned before json.loads runs, so the locals() guard
        # is purely defensive
        print(f"üìÑ Cleaned content: '{clean_content if 'clean_content' in locals() else 'N/A'}'")
        openai_working = False
        
except Exception as e:
    # Any other failure (missing key, client construction, the API call itself):
    # reset both names so downstream cells refuse to run
    print(f"‚ùå OpenAI setup failed: {e}")
    print("\nüîß Troubleshooting:")
    print("   1. Check OPENAI_API_KEY in .env file")
    print("   2. Verify API key is valid at https://platform.openai.com/api-keys")
    print("   3. Check account has sufficient credits")
    print("   4. Ensure internet connectivity")
    client = None
    openai_working = False

if openai_working:
    print("\nüöÄ OpenAI client ready for sentiment processing!")
else:
    print("\n‚ö†Ô∏è  OpenAI client issues detected - check troubleshooting steps above")


üîë SETTING UP OPENAI CLIENT
‚úÖ OpenAI API key loaded: sk-proj-...zqgA

üß™ Testing OpenAI API connection...
üì§ API Response: ```json
{"status": "success", "test": true}
```
üßπ Cleaned content: {"status": "success", "test": true}
‚úÖ JSON parsing successful: {'status': 'success', 'test': True}
üí∞ Tokens used: 54

üöÄ OpenAI client ready for sentiment processing!


In [5]:
def _strip_markdown_fence(text):
    """Return ``text`` without a Markdown code-fence wrapper (with or without a ``json`` tag).

    Text that does not start with a fence is returned unchanged.
    """
    if text.startswith('```json'):
        return text.replace('```json', '').replace('```', '').strip()
    if text.startswith('```'):
        return text.replace('```', '').strip()
    return text


def _neutral_fallback(symbol, date, articles_analyzed, reason):
    """Build the all-zero, confidence-0.1 result returned when no usable model answer was obtained."""
    return {
        'symbol': symbol,
        'date': date,
        'smo': 0.0, 'smd': 0.0, 'smc': 0.0, 'sms': 0.0, 'sdc': 0.0,
        'confidence': 0.1,
        'summary': f'Neutral fallback ({reason})',
        'articles_analyzed': articles_analyzed
    }


def process_sentiment_for_date(symbol, date, max_articles=5):
    """Process sentiment for a specific symbol and date.

    Loads up to ``max_articles`` articles (highest ``relevance_score`` first) for the
    symbol/date from ``raw_news_articles``, asks ``gpt-4o-mini`` for a JSON sentiment
    object, and retries up to three times on empty replies, unparseable replies or
    API errors.

    Relies on the notebook-level names ``openai_working`` and ``client``.

    Args:
        symbol: Ticker symbol matched against ``symbols.symbol``.
        date: Article date matched against ``raw_news_articles.article_date``.
        max_articles: Maximum number of articles included in the prompt.

    Returns:
        - The parsed model JSON (a dict) extended with ``symbol``, ``date``,
          ``articles_analyzed`` and ``tokens_used`` on success.
        - A neutral fallback dict (all scores 0.0, confidence 0.1, no ``tokens_used``)
          when all attempts fail.
        - ``None`` when the client is unavailable, no articles exist, or an unexpected
          error occurs outside the retry loop.
    """

    if not openai_working or not client:
        print("‚ùå OpenAI client not working")
        return None

    try:
        # Get articles for the date
        engine = get_database_connection()

        query = """
        SELECT rna.title, rna.source, rna.relevance_score
        FROM raw_news_articles rna
        JOIN symbols s ON rna.symbol_id = s.id
        WHERE s.symbol = :symbol AND rna.article_date = :date
        ORDER BY rna.relevance_score DESC
        LIMIT :max_articles
        """

        with engine.connect() as conn:
            result = conn.execute(sqlalchemy.text(query), {
                'symbol': symbol,
                'date': date,
                'max_articles': max_articles
            })
            articles = [
                {'title': row[0], 'source': row[1], 'relevance': row[2]}
                for row in result
            ]

        if not articles:
            print(f"‚ö†Ô∏è  No articles found for {symbol} on {date}")
            return None

        print(f"üì∞ Processing {len(articles)} articles for {symbol} on {date}")

        # Create prompt
        article_lines = []
        for article in articles:
            relevance = article['relevance']
            # A NULL relevance_score must not abort the whole symbol/date: the ':.1f'
            # format spec raises TypeError on None, so render it as "n/a" instead.
            relevance_text = f"{relevance:.1f}" if relevance is not None else "n/a"
            article_lines.append(
                f"‚Ä¢ {article['title']} (Source: {article['source']}, Relevance: {relevance_text})"
            )
        articles_text = "\n".join(article_lines)

        prompt = f"""Analyze sentiment impact for {symbol} stock based on these news articles from {date}:

NEWS ARTICLES:
{articles_text}

Provide sentiment analysis as JSON with scores from -1.0 (very negative) to 1.0 (very positive):

{{
    "smo": 0.0,
    "smd": 0.0,
    "smc": 0.0,
    "sms": 0.0,
    "sdc": 0.0,
    "confidence": 0.8,
    "summary": "Brief analysis summary"
}}

Score meanings:
- smo: Market open impact
- smd: Mid-day trading impact
- smc: Market close impact
- sms: Semiconductor sector impact
- sdc: Direct competitor impact

Return ONLY the JSON object, no other text."""

        # Process with retries. Each failed attempt records why it failed and how long
        # to pause; the neutral fallback is produced once, after the final attempt.
        max_attempts = 3
        failure_reason = 'no attempts made'
        for attempt in range(max_attempts):
            is_last_attempt = attempt == max_attempts - 1
            retry_delay = 0
            try:
                print(f"üîÑ API call attempt {attempt + 1}/{max_attempts}")

                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {"role": "system", "content": "You are a financial analyst. Return only valid JSON."},
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=300,
                    temperature=0.2
                )

                # message.content can be None; treat that the same as an empty reply
                # instead of letting .strip() raise and be reported as an API failure.
                content = (response.choices[0].message.content or '').strip()

                if not content:
                    print("‚ö†Ô∏è  Empty response from OpenAI")
                    failure_reason = 'empty API response'
                    retry_delay = 5
                else:
                    # Clean up response
                    clean_content = _strip_markdown_fence(content)

                    # Parse JSON
                    try:
                        sentiment_data = json.loads(clean_content)
                        if not isinstance(sentiment_data, dict):
                            # Valid JSON but not an object (e.g. a list or bare number)
                            raise ValueError(
                                f"expected a JSON object, got {type(sentiment_data).__name__}"
                            )
                    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
                        print(f"‚ùå JSON parsing failed: {e}")
                        print(f"üìÑ Content: {clean_content[:100]}...")
                        failure_reason = 'JSON parse failed'
                        retry_delay = 3
                    else:
                        # Add metadata
                        sentiment_data['symbol'] = symbol
                        sentiment_data['date'] = date
                        sentiment_data['articles_analyzed'] = len(articles)
                        sentiment_data['tokens_used'] = response.usage.total_tokens

                        print(f"‚úÖ Sentiment analysis successful")
                        print(f"üìä SMO: {sentiment_data.get('smo', 0):.2f}, SMS: {sentiment_data.get('sms', 0):.2f}")

                        return sentiment_data

            except Exception as e:
                print(f"‚ùå API call failed: {e}")
                failure_reason = f'API error: {e}'
                # Linear backoff: 5s after the first failure, 10s after the second
                retry_delay = 5 * (attempt + 1)
                if not is_last_attempt:
                    print(f"‚è≥ Waiting {retry_delay} seconds before retry...")

            if not is_last_attempt:
                time.sleep(retry_delay)

        # Return neutral sentiment as fallback
        return _neutral_fallback(symbol, date, len(articles), failure_reason)

    except Exception as e:
        print(f"‚ùå Overall error: {e}")
        return None

def _bounded_float(value, default, low, high):
    """Coerce ``value`` to a float clamped to [low, high].

    Returns ``default`` when the value cannot be read as a number or is NaN.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return default
    if number != number:  # NaN is the only float unequal to itself
        return default
    return max(low, min(high, number))


def store_sentiment_result(sentiment_data):
    """Store sentiment analysis result in database.

    Upserts one row into ``processed_sentiment`` keyed on (symbol_id, analysis_date);
    an existing row for the same symbol and date is overwritten.

    The values come from model-generated JSON, so they are normalised first:
    - Scores that are missing, ``null`` or non-numeric become 0.0.
    - Scores are clamped to the -1.0..1.0 range requested in the prompt.
    - A missing or ``null`` summary becomes a default string.

    Args:
        sentiment_data: dict with at least ``symbol`` and ``date``; optionally the
            five score keys, ``confidence``, ``summary`` and ``articles_analyzed``.

    Returns:
        True when the row was written, False when the symbol is unknown or any
        error occurred (the error is printed, not raised).
    """
    try:
        score_params = {
            key: _bounded_float(sentiment_data.get(key), 0.0, -1.0, 1.0)
            for key in ('smo', 'smd', 'smc', 'sms', 'sdc')
        }
        # NOTE(review): confidence is assumed to be on a 0..1 scale (prompt example is
        # 0.8, fallback is 0.1) - confirm before relying on the clamp.
        confidence = _bounded_float(sentiment_data.get('confidence'), 0.5, 0.0, 1.0)

        # A JSON null summary would make the [:500] slice raise and lose the whole row
        summary = sentiment_data.get('summary')
        if summary is None:
            summary = 'Sentiment analysis'
        summary = str(summary)[:500]

        engine = get_database_connection()

        with engine.connect() as conn:
            # Get symbol_id
            symbol_result = conn.execute(
                sqlalchemy.text("SELECT id FROM symbols WHERE symbol = :symbol"),
                {'symbol': sentiment_data['symbol']}
            )
            symbol_row = symbol_result.fetchone()

            if not symbol_row:
                print(f"‚ùå Symbol {sentiment_data['symbol']} not found")
                return False

            symbol_id = symbol_row[0]

            # Insert sentiment data
            insert_query = """
            INSERT INTO processed_sentiment 
            (symbol_id, analysis_date, smo_score, smd_score, smc_score, sms_score, sdc_score,
             articles_analyzed, confidence_score, analysis_summary)
            VALUES (:symbol_id, :analysis_date, :smo, :smd, :smc, :sms, :sdc, :articles, :confidence, :summary)
            ON CONFLICT (symbol_id, analysis_date) DO UPDATE SET
                smo_score = EXCLUDED.smo_score,
                smd_score = EXCLUDED.smd_score,
                smc_score = EXCLUDED.smc_score,
                sms_score = EXCLUDED.sms_score,
                sdc_score = EXCLUDED.sdc_score,
                articles_analyzed = EXCLUDED.articles_analyzed,
                confidence_score = EXCLUDED.confidence_score,
                analysis_summary = EXCLUDED.analysis_summary
            """

            conn.execute(sqlalchemy.text(insert_query), {
                'symbol_id': symbol_id,
                'analysis_date': sentiment_data['date'],
                **score_params,
                'articles': sentiment_data.get('articles_analyzed') or 0,
                'confidence': confidence,
                'summary': summary
            })

            # Connections from engine.connect() do not autocommit
            conn.commit()

        return True

    except Exception as e:
        print(f"‚ùå Storage error: {e}")
        return False

print("‚úÖ Sentiment processing functions defined")


‚úÖ Sentiment processing functions defined


In [6]:
# Execute the complete sentiment processing pipeline
#
# For every symbol/date with at least two raw articles: wait (rate limit), score the
# day with process_sentiment_for_date, persist it with store_sentiment_result, and
# tally the outcome. Relies on openai_working from the client setup cell.
#
# NOTE(review): a neutral fallback returned by process_sentiment_for_date is truthy,
# so it is stored and counted as successful, and the upsert overwrites any earlier
# score for that symbol/date.
if not openai_working:
    print("‚ùå Cannot proceed - OpenAI client not working")
    print("üîß Fix OpenAI setup in the previous cell first")
else:
    print("üöÄ STARTING SENTIMENT PROCESSING PIPELINE")
    print("=" * 60)
    
    # Get processing queue
    engine = get_database_connection()
    
    queue_query = """
    SELECT 
        s.symbol, 
        rna.article_date, 
        COUNT(*) as article_count
    FROM raw_news_articles rna
    JOIN symbols s ON rna.symbol_id = s.id
    WHERE s.symbol IN ('INTC', 'AMD', 'NVDA')
    GROUP BY s.symbol, rna.article_date
    HAVING COUNT(*) >= 2
    ORDER BY rna.article_date DESC, s.symbol
    """
    
    processing_queue = pd.read_sql(queue_query, engine)
    
    print(f"üìã Processing queue: {len(processing_queue)} symbol-date combinations")
    # Rough budget of 20 seconds per item: the 15 second pause plus the API round trip
    print(f"‚è±Ô∏è  Estimated time: {len(processing_queue) * 20 / 60:.1f} minutes (with rate limiting)")
    
    # Track results
    successful = 0
    failed = 0
    results = []
    
    # Process each combination
    for i, (_, row) in enumerate(processing_queue.iterrows()):
        symbol = row['symbol']
        date = row['article_date']
        article_count = row['article_count']
        
        print(f"\nüìà [{i+1}/{len(processing_queue)}] {symbol} on {date} ({article_count} articles)")
        print("-" * 50)
        
        try:
            # Rate limiting (skip for first item)
            if i > 0:
                print("‚è≥ Rate limiting: waiting 15 seconds...")
                time.sleep(15)
            
            # Process sentiment
            sentiment_result = process_sentiment_for_date(symbol, date)
            
            if sentiment_result:
                # Store result
                if store_sentiment_result(sentiment_result):
                    successful += 1
                    results.append(sentiment_result)
                    print(f"‚úÖ SUCCESS: {symbol} on {date}")
                    print(f"üìä Scores: SMO={sentiment_result.get('smo', 0):.2f}, SMS={sentiment_result.get('sms', 0):.2f}, Confidence={sentiment_result.get('confidence', 0):.2f}")
                else:
                    failed += 1
                    print(f"‚ùå STORAGE FAILED: {symbol} on {date}")
            else:
                failed += 1
                print(f"‚ùå PROCESSING FAILED: {symbol} on {date}")
                
        except KeyboardInterrupt:
            # Stop cleanly but still print the summary for whatever was completed
            print("\n‚ö†Ô∏è  Processing interrupted by user")
            break
            
        except Exception as e:
            failed += 1
            print(f"‚ùå UNEXPECTED ERROR: {symbol} on {date} - {e}")
    
    # Final summary
    total_processed = successful + failed
    
    print("\n" + "=" * 60)
    print("üéâ SENTIMENT PROCESSING COMPLETE!")
    print("=" * 60)
    print(f"‚úÖ Successful: {successful}")
    print(f"‚ùå Failed: {failed}")
    print(f"üìä Total processed: {total_processed}")
    # Nothing is tallied when the queue is empty or the run is interrupted before the
    # first item finishes; guard the division so the summary itself cannot crash.
    if total_processed:
        print(f"üìà Success rate: {(successful / total_processed * 100):.1f}%")
    else:
        print("üìà Success rate: n/a (nothing was processed)")
    
    if successful > 0:
        print(f"\nüí∞ Estimated cost: ~${successful * 0.02:.2f} (approximate)")
        print("\nüéØ Ready for trading strategy development!")
    else:
        print("\n‚ö†Ô∏è  No successful processing - check error messages above")


üöÄ STARTING SENTIMENT PROCESSING PIPELINE
üìã Processing queue: 26 symbol-date combinations
‚è±Ô∏è  Estimated time: 8.7 minutes (with rate limiting)

üìà [1/26] AMD on 2025-06-28 (11 articles)
--------------------------------------------------
üì∞ Processing 5 articles for AMD on 2025-06-28
üîÑ API call attempt 1/3
‚úÖ Sentiment analysis successful
üìä SMO: 0.70, SMS: 0.50
‚úÖ SUCCESS: AMD on 2025-06-28
üìä Scores: SMO=0.70, SMS=0.50, Confidence=0.90

üìà [2/26] INTC on 2025-06-28 (4 articles)
--------------------------------------------------
‚è≥ Rate limiting: waiting 15 seconds...
üì∞ Processing 4 articles for INTC on 2025-06-28
üîÑ API call attempt 1/3
‚úÖ Sentiment analysis successful
üìä SMO: -0.60, SMS: -0.30
‚úÖ SUCCESS: INTC on 2025-06-28
üìä Scores: SMO=-0.60, SMS=-0.30, Confidence=0.75

üìà [3/26] NVDA on 2025-06-28 (49 articles)
--------------------------------------------------
‚è≥ Rate limiting: waiting 15 seconds...
üì∞ Processing 5 articles for NVDA on 20

In [8]:
# Validate the sentiment processing results
# Three independent read-only checks; each has its own try/except so that a failure in
# one does not prevent the others from running.
print("üìä SENTIMENT PROCESSING VALIDATION")
print("=" * 50)

engine = get_database_connection()

# 1) Per-symbol aggregate of everything stored in processed_sentiment
try:
    sentiment_summary_query = """
    SELECT 
        s.symbol,
        COUNT(*) as days_processed,
        AVG(ps.confidence_score) as avg_confidence,
        AVG(ps.articles_analyzed) as avg_articles_per_day,
        AVG(ps.smo_score) as avg_smo,
        AVG(ps.sms_score) as avg_sms,
        MIN(ps.analysis_date) as earliest_date,
        MAX(ps.analysis_date) as latest_date
    FROM processed_sentiment ps
    JOIN symbols s ON ps.symbol_id = s.id
    WHERE s.symbol IN ('INTC', 'AMD', 'NVDA')
    GROUP BY s.symbol
    ORDER BY s.symbol
    """
    
    sentiment_df = pd.read_sql(sentiment_summary_query, engine)
    
    if sentiment_df.empty:
        print("‚ö†Ô∏è  No processed sentiment data found")
    else:
        print("‚úÖ PROCESSED SENTIMENT SUMMARY:")
        for _, symbol_stats in sentiment_df.iterrows():
            print(f"\nüìä {symbol_stats['symbol']}:")
            print(f"   üìÖ {symbol_stats['days_processed']} days processed ({symbol_stats['earliest_date']} to {symbol_stats['latest_date']})")
            print(f"   üì∞ Avg {symbol_stats['avg_articles_per_day']:.1f} articles per day")
            print(f"   üéØ Avg confidence: {symbol_stats['avg_confidence']:.2f}")
            print(f"   üìà Avg SMO (market open): {symbol_stats['avg_smo']:.2f}")
            print(f"   üè≠ Avg SMS (sector): {symbol_stats['avg_sms']:.2f}")
        
        total_days = sentiment_df['days_processed'].sum()
        print(f"\nüéâ TOTAL: {total_days} sentiment analyses completed!")
        
except Exception as e:
    print(f"‚ùå Error checking sentiment data: {e}")

# 2) The five most recent stored rows, with every score column
try:
    recent_query = """
    SELECT 
        s.symbol,
        ps.analysis_date,
        ps.smo_score,
        ps.smd_score,
        ps.smc_score,
        ps.sms_score,  
        ps.sdc_score,
        ps.confidence_score,
        ps.articles_analyzed,
        ps.analysis_summary
    FROM processed_sentiment ps
    JOIN symbols s ON ps.symbol_id = s.id
    WHERE s.symbol IN ('INTC', 'AMD', 'NVDA')
    ORDER BY ps.analysis_date DESC, s.symbol
    LIMIT 5
    """
    
    recent_df = pd.read_sql(recent_query, engine)
    
    if not recent_df.empty:
        print("\nüìà RECENT SENTIMENT SAMPLES:")
        for _, sample in recent_df.iterrows():
            print(f"\nüîπ {sample['symbol']} on {sample['analysis_date']} ({sample['articles_analyzed']} articles):")
            print(f"   SMO: {sample['smo_score']:.2f} | SMD: {sample['smd_score']:.2f} | SMC: {sample['smc_score']:.2f}")
            print(f"   SMS: {sample['sms_score']:.2f} | SDC: {sample['sdc_score']:.2f} | Confidence: {sample['confidence_score']:.2f}")
            print(f"   Summary: {sample['analysis_summary'][:80]}...")
            
except Exception as e:
    print(f"‚ùå Error getting recent samples: {e}")

# 3) Row counts of raw input versus processed output
try:
    completeness_query = """
    SELECT 
        'Raw Articles' as data_type,
        COUNT(*) as count
    FROM raw_news_articles rna
    JOIN symbols s ON rna.symbol_id = s.id
    WHERE s.symbol IN ('INTC', 'AMD', 'NVDA')
    
    UNION ALL
    
    SELECT 
        'Processed Sentiment' as data_type,
        COUNT(*) as count
    FROM processed_sentiment ps
    JOIN symbols s ON ps.symbol_id = s.id
    WHERE s.symbol IN ('INTC', 'AMD', 'NVDA')
    """
    
    completeness_df = pd.read_sql(completeness_query, engine)
    
    print("\nüìã DATA COMPLETENESS:")
    for _, tally in completeness_df.iterrows():
        print(f"   {tally['data_type']}: {tally['count']}")
        
except Exception as e:
    print(f"‚ùå Error checking data completeness: {e}")

print("\n‚úÖ Validation complete!")
print("üéØ If sentiment data is available, ready for 06_trading_strategy_development.ipynb")


üìä SENTIMENT PROCESSING VALIDATION
‚úÖ PROCESSED SENTIMENT SUMMARY:

üìä AMD:
   üìÖ 9 days processed (2025-06-20 to 2025-06-28)
   üì∞ Avg 5.0 articles per day
   üéØ Avg confidence: 0.86
   üìà Avg SMO (market open): 0.66
   üè≠ Avg SMS (sector): 0.54

üìä INTC:
   üìÖ 15 days processed (2025-06-14 to 2025-06-28)
   üì∞ Avg 4.1 articles per day
   üéØ Avg confidence: 0.80
   üìà Avg SMO (market open): 0.10
   üè≠ Avg SMS (sector): 0.15

üìä NVDA:
   üìÖ 3 days processed (2025-06-26 to 2025-06-28)
   üì∞ Avg 5.0 articles per day
   üéØ Avg confidence: 0.85
   üìà Avg SMO (market open): 0.70
   üè≠ Avg SMS (sector): 0.70

üéâ TOTAL: 27 sentiment analyses completed!

üìà RECENT SENTIMENT SAMPLES:

üîπ AMD on 2025-06-28 (5 articles):
   SMO: 0.70 | SMD: 0.60 | SMC: 0.80
   SMS: 0.50 | SDC: 0.70 | Confidence: 0.90
   Summary: Positive sentiment driven by analyst upgrades and comparisons to competitors, in...

üîπ INTC on 2025-06-28 (4 articles):
   SMO: -0.60 | SMD

In [None]:
# Enhanced Article Selection with Source Quality Filtering

# Source name -> multiplier applied to an article's title-based relevance score.
# NOTE(review): keys are bare domains; whether raw_news_articles.source is stored in
# this form (rather than e.g. "Reuters" or "www.reuters.com") is not visible here - confirm.
TRUSTED_SOURCES = {
    # Tier 1: Premium financial sources (2x weight)
    'reuters.com': 2.0,
    'bloomberg.com': 2.0,
    'marketwatch.com': 2.0,
    'seekingalpha.com': 2.0,
    'finance.yahoo.com': 2.0,

    # Tier 2: Standard financial sources (1x weight)
    'cnbc.com': 1.0,
    'forbes.com': 1.0,
    'barrons.com': 1.0,
    'fool.com': 1.0,
    'benzinga.com': 1.0,

    # Tier 3: General sources (0.5x weight)
    'zacks.com': 0.5,
    'investorplace.com': 0.5,
    'nasdaq.com': 0.5
}

def calculate_enhanced_relevance(article_title, article_source, symbol):
    """Calculate relevance with source quality weighting.

    A base score of 0.5 is increased for title signals and then multiplied by the
    source weight from ``TRUSTED_SOURCES`` (0.3 for unknown sources):
    - +0.3 if the ticker appears in the title
    - +0.2 for a semiconductor / chip / AI mention
    - +0.3 for earnings / revenue / guidance
    - +0.2 for upgrade / downgrade / rating

    Args:
        article_title: Headline text; ``None`` is treated as an empty title.
        article_source: Source name looked up case-insensitively in
            ``TRUSTED_SOURCES``; ``None`` is treated as an unknown source.
        symbol: Ticker symbol searched for in the title (case-insensitive).

    Returns:
        float score capped at 1.0.
    """

    title_lower = (article_title or '').lower()

    # "ai" is too short for substring matching: it is contained in "said", "gains",
    # "raises", "chain", ... so it is matched against whole words only. Longer terms
    # keep substring matching so that "chips" or "upgraded" still count.
    title_words = set(
        ''.join(ch if ch.isalnum() else ' ' for ch in title_lower).split()
    )

    # Base relevance
    base_score = 0.5

    if symbol.lower() in title_lower:
        base_score += 0.3
    if 'ai' in title_words or any(term in title_lower for term in ('semiconductor', 'chip')):
        base_score += 0.2
    if any(term in title_lower for term in ('earnings', 'revenue', 'guidance')):
        base_score += 0.3
    if any(term in title_lower for term in ('upgrade', 'downgrade', 'rating')):
        base_score += 0.2

    # Source quality multiplier (default 0.3 for unknown or missing sources)
    source_key = (article_source or '').strip().lower()
    source_weight = TRUSTED_SOURCES.get(source_key, 0.3)

    # NOTE(review): with a 2.0 weight the 0.5 base alone already reaches the 1.0 cap,
    # so every Tier 1 article scores exactly 1.0 regardless of its title.
    enhanced_score = min(1.0, base_score * source_weight)
    return enhanced_score

def get_optimal_articles_for_date(symbol, date, max_articles=3):
    """Pick the best-scoring articles for one symbol and date.

    Every article stored for the symbol/date is re-scored with
    ``calculate_enhanced_relevance`` and the ``max_articles`` highest scores are
    returned (ties keep the database's relevance ordering).

    Returns:
        A list of dicts with keys ``title``, ``source``, ``relevance``, ``content``,
        ``enhanced_relevance`` and ``source_tier``; an empty list when nothing is
        stored for that day or when any error occurs (the error is printed).
    """

    try:
        engine = get_database_connection()

        # Get ALL articles for the date first
        query = """
        SELECT rna.title, rna.source, rna.relevance_score, rna.content
        FROM raw_news_articles rna
        JOIN symbols s ON rna.symbol_id = s.id
        WHERE s.symbol = :symbol AND rna.article_date = :date
        ORDER BY rna.relevance_score DESC
        """

        with engine.connect() as conn:
            rows = conn.execute(sqlalchemy.text(query), {'symbol': symbol, 'date': date})
            raw_articles = [
                {
                    'title': record[0],
                    'source': record[1],
                    'relevance': record[2],
                    'content': record[3] or ''
                }
                for record in rows
            ]

        if not raw_articles:
            return []

        print(f"üì∞ Found {len(raw_articles)} raw articles for {symbol} on {date}")

        # Attach the source-weighted score and the raw source weight to each article
        enhanced_articles = []
        for article in raw_articles:
            score = calculate_enhanced_relevance(article['title'], article['source'], symbol)
            weight = TRUSTED_SOURCES.get(article['source'].lower(), 0.3)
            enhanced_articles.append(
                dict(article, enhanced_relevance=score, source_tier=weight)
            )

        # Highest enhanced score first (stable sort), then keep the top slice
        ranked = sorted(
            enhanced_articles,
            key=lambda item: item['enhanced_relevance'],
            reverse=True,
        )
        selected_articles = ranked[:max_articles]

        print(f"üìä Selected {len(selected_articles)} high-quality articles:")
        for rank, article in enumerate(selected_articles, 1):
            # Sources missing from TRUSTED_SOURCES carry weight 0.3 and are labelled Tier 3
            if article['source_tier'] >= 2.0:
                tier = "Tier 1"
            elif article['source_tier'] >= 1.0:
                tier = "Tier 2"
            else:
                tier = "Tier 3"
            print(f"   {rank}. {tier} | {article['source']} | Score: {article['enhanced_relevance']:.2f}")
            print(f"      {article['title'][:80]}...")

        return selected_articles

    except Exception as e:
        print(f"‚ùå Error selecting optimal articles: {e}")
        return []

# Test the enhanced selection
print("üß™ TESTING ENHANCED ARTICLE SELECTION")
print("=" * 50)

# Smoke-test against a single hard-coded day that had heavy news flow
test_symbol = 'NVDA'
test_date = '2025-06-27'

optimal_articles = get_optimal_articles_for_date(test_symbol, test_date, max_articles=3)

# An empty list means either no rows for that day or an error inside the selector
if not optimal_articles:
    print("‚ö†Ô∏è  No articles found for test")
else:
    print(f"\n‚úÖ Enhanced selection reduces processing costs significantly")
    print("üí∞ Estimated cost reduction: ~70-80% vs processing all articles")

print("\nüéØ Ready to implement enhanced processing pipeline!")
