In [1]:
# Essential imports
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
import sqlalchemy
from datetime import datetime, timedelta
import warnings

# Custom modules
from src.database import get_database_connection

# Configure display settings
# Silences every Python warning for the rest of the session, which also hides
# deprecation notices raised by library calls in later cells.
warnings.filterwarnings('ignore')
# Show all columns and up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# Default themes: 'seaborn-v0_8' for matplotlib, 'plotly_dark' for Plotly figures.
plt.style.use('seaborn-v0_8')
pio.templates.default = "plotly_dark"

# Report the library versions in use for this run.
print("✅ All imports loaded successfully!")
print(f"📊 Pandas version: {pd.__version__}")
print(f"📈 Plotly version: {plotly.__version__}")
print("🚀 Ready for analysis!")


✅ All imports loaded successfully!
📊 Pandas version: 2.3.0
📈 Plotly version: 6.2.0
🚀 Ready for analysis!


In [2]:
def load_foundation_dataset():
    """Read market rows joined with sentiment scores into a DataFrame.

    The query selects rows from ``market_data`` dated 2024-01-01 or later,
    left-joined to ``symbols`` and ``processed_sentiment``. Missing sentiment
    and confidence values are replaced by 0 in SQL, so a score of 0 is what
    the coverage figure below counts as "no sentiment".

    Returns:
        pandas.DataFrame ordered by symbol and trade date, with ``date``
        converted to datetime; ``None`` if any step raised an exception
        (the error is printed, not re-raised).
    """

    try:
        print("🔄 Connecting to database...")
        db_engine = get_database_connection()

        # Market data with sentiment scores, following the database schema
        query = """
        SELECT 
            s.symbol,
            md.trade_date as date,
            md.open_price,
            md.high_price,
            md.low_price,
            md.close_price,
            md.volume,
            md.close_price as adjusted_close,
            COALESCE(ps.smo_score, 0) as sentiment_score,
            COALESCE(ps.confidence_score, 0) as confidence_score,
            ps.articles_analyzed as articles_count
        FROM market_data md
        LEFT JOIN symbols s ON md.symbol_id = s.id
        LEFT JOIN processed_sentiment ps ON md.symbol_id = ps.symbol_id 
            AND md.trade_date = ps.analysis_date
        WHERE md.trade_date >= '2024-01-01'
        ORDER BY s.symbol, md.trade_date
        """

        print("📊 Loading market data with sentiment scores...")
        frame = pd.read_sql(query, db_engine)
        frame['date'] = pd.to_datetime(frame['date'])

        row_count = len(frame)
        print(f"✅ Loaded {row_count:,} records across {frame['symbol'].nunique()} symbols")
        print(f"📅 Date range: {frame['date'].min().date()} to {frame['date'].max().date()}")

        # Sentiment coverage: share of rows whose score is not the 0 placeholder
        scored_rows = frame[frame['sentiment_score'] != 0].shape[0]
        if row_count > 0:
            coverage_pct = (scored_rows / row_count) * 100
        else:
            coverage_pct = 0
        print(f"🎭 Sentiment coverage: {scored_rows:,} records ({coverage_pct:.1f}%)")

        return frame

    except Exception as e:
        print(f"❌ Error loading data: {e}")
        return None

def create_combined_dataset(market_data, sentiment_data):
    """Add per-symbol return, moving-average, volatility and sentiment features.

    Args:
        market_data: DataFrame with at least ``symbol``, ``date``,
            ``close_price`` and ``sentiment_score`` columns, or ``None``.
        sentiment_data: Accepted but not used by this function.

    Returns:
        A new DataFrame sorted by symbol and date with the columns
        ``daily_return``, ``prev_close``, ``ma_5``, ``ma_20``,
        ``volatility_5d``, ``sentiment_lag1`` and ``sentiment_ma3`` appended,
        or ``None`` when ``market_data`` is ``None``.
    """

    if market_data is None:
        print("❌ No market data provided")
        return None

    print("🔄 Processing combined dataset...")

    # sort_values returns a new frame, so the caller's object is left untouched.
    combined = market_data.sort_values(['symbol', 'date'])

    def per_symbol(column):
        # Fresh group-by each time so newly added columns are visible.
        return combined.groupby('symbol')[column]

    def rolling_stat(column, window, stat):
        # Drop the symbol level so the result aligns with the frame's own index.
        rolled = getattr(per_symbol(column).rolling(window), stat)()
        return rolled.reset_index(0, drop=True)

    # Daily returns
    combined['daily_return'] = per_symbol('close_price').pct_change()
    combined['prev_close'] = per_symbol('close_price').shift(1)

    # Moving averages
    combined['ma_5'] = rolling_stat('close_price', 5, 'mean')
    combined['ma_20'] = rolling_stat('close_price', 20, 'mean')

    # Volatility (rolling standard deviation of the daily returns)
    combined['volatility_5d'] = rolling_stat('daily_return', 5, 'std')

    # Sentiment metrics
    combined['sentiment_lag1'] = per_symbol('sentiment_score').shift(1)
    combined['sentiment_ma3'] = rolling_stat('sentiment_score', 3, 'mean')

    print(f"✅ Combined dataset processed with {len(combined):,} records")

    return combined

print("✅ Data loading functions ready!")


✅ Data loading functions ready!


In [3]:
def create_market_overview(df):
    """Build, display and return a 2x2 Plotly market dashboard.

    Panels:
        (1, 1) closing price, with volume on the secondary y-axis
        (1, 2) histogram of daily returns
        (2, 1) volume over time
        (2, 2) 5-day volatility

    Only the first five symbols found in ``df`` are drawn.

    Args:
        df: DataFrame with ``symbol``, ``date``, ``close_price`` and ``volume``
            columns. ``daily_return`` and ``volatility_5d`` are plotted only
            when those columns are present.

    Returns:
        The figure (after calling ``fig.show()``), or ``None`` when ``df`` is
        ``None`` or empty.
    """

    if df is None or df.empty:
        print("❌ No data available for market overview")
        return None

    print("🔄 Creating market overview visualization...")

    # Create subplots
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=['Price Trends by Symbol', 'Daily Returns Distribution',
                       'Volume Patterns', 'Volatility Analysis'],
        specs=[[{"secondary_y": True}, {"type": "histogram"}],
               [{"secondary_y": True}, {"type": "scatter"}]]
    )

    symbols = df['symbol'].unique()[:5]  # First 5 symbols for clarity
    colors = ['blue', 'red', 'green', 'orange', 'purple']

    for i, symbol in enumerate(symbols):
        symbol_data = df[df['symbol'] == symbol].sort_values('date')
        color = colors[i % len(colors)]

        # 1. Price trends (primary y-axis)
        fig.add_trace(
            go.Scatter(
                x=symbol_data['date'],
                y=symbol_data['close_price'],
                name=f'{symbol} Price',
                line=dict(color=color),
                legendgroup=symbol
            ),
            row=1, col=1, secondary_y=False
        )

        # Volume on the secondary y-axis. The axis has to be requested through
        # add_trace(secondary_y=True): when row/col are given, plotly binds the
        # trace to the subplot's axes itself, so a `yaxis=` set on the trace
        # would be overwritten and volume would share the price axis.
        fig.add_trace(
            go.Scatter(
                x=symbol_data['date'],
                y=symbol_data['volume'],
                name=f'{symbol} Volume',
                line=dict(color=color, dash='dash'),
                opacity=0.6,
                legendgroup=symbol,
                showlegend=False
            ),
            row=1, col=1, secondary_y=True
        )

        # 2. Returns distribution (skipped when returns were not computed,
        #    mirroring the volatility guard below)
        if 'daily_return' in symbol_data.columns:
            fig.add_trace(
                go.Histogram(
                    x=symbol_data['daily_return'].dropna(),
                    name=f'{symbol} Returns',
                    opacity=0.7,
                    nbinsx=30,
                    legendgroup=symbol,
                    showlegend=False
                ),
                row=1, col=2
            )

        # 3. Volume patterns
        fig.add_trace(
            go.Scatter(
                x=symbol_data['date'],
                y=symbol_data['volume'],
                name=f'{symbol} Volume',
                line=dict(color=color),
                legendgroup=symbol,
                showlegend=False
            ),
            row=2, col=1
        )

        # 4. Volatility analysis
        if 'volatility_5d' in symbol_data.columns:
            fig.add_trace(
                go.Scatter(
                    x=symbol_data['date'],
                    y=symbol_data['volatility_5d'],
                    name=f'{symbol} Volatility',
                    mode='lines',
                    line=dict(color=color),
                    legendgroup=symbol,
                    showlegend=False
                ),
                row=2, col=2
            )

    # Update layout
    fig.update_layout(
        height=800,
        title="Market Overview Dashboard",
        showlegend=True
    )

    # Update axes
    fig.update_xaxes(title_text="Date", row=1, col=1)
    fig.update_yaxes(title_text="Price ($)", secondary_y=False, row=1, col=1)
    fig.update_yaxes(title_text="Volume", secondary_y=True, row=1, col=1)

    fig.update_xaxes(title_text="Daily Return", row=1, col=2)
    fig.update_yaxes(title_text="Frequency", row=1, col=2)

    fig.update_xaxes(title_text="Date", row=2, col=1)
    fig.update_yaxes(title_text="Volume", row=2, col=1)

    fig.update_xaxes(title_text="Date", row=2, col=2)
    fig.update_yaxes(title_text="5-day Volatility", row=2, col=2)

    fig.show()
    return fig

def print_market_insights(df):
    """Print a text summary of the market dataset.

    Reports record and symbol counts, the covered date span, pooled return
    statistics, volume statistics, and the five symbols with the highest mean
    daily return.

    Args:
        df: DataFrame with ``symbol``, ``date``, ``daily_return`` and
            ``volume`` columns, or ``None``.

    Returns:
        None. Nothing but an error line is printed for ``None``/empty input.
    """

    if df is None or df.empty:
        print("❌ No data available for insights")
        return

    rule = "=" * 60
    print("\n" + rule)
    print("📊 MARKET INSIGHTS SUMMARY")
    print(rule)

    # Dataset overview
    first_day = df['date'].min()
    last_day = df['date'].max()
    span_days = (last_day - first_day).days

    print("📈 Dataset Overview:")
    print(f"   • Total records: {len(df):,}")
    print(f"   • Number of symbols: {df['symbol'].nunique()}")
    print(f"   • Date range: {span_days} days")
    print(f"   • Period: {first_day.date()} to {last_day.date()}")

    # Return statistics are pooled over all symbols and expressed in percent.
    print("\n💰 Price & Returns Analysis:")
    returns = df['daily_return']
    mean_pct = returns.mean() * 100
    std_pct = returns.std() * 100

    print(f"   • Average daily return: {mean_pct:.3f}%")
    print(f"   • Daily volatility: {std_pct:.3f}%")
    print(f"   • Best single day return: {returns.max()*100:.2f}%")
    print(f"   • Worst single day return: {returns.min()*100:.2f}%")

    # Volume statistics
    print("\n📊 Volume Analysis:")
    volumes = df['volume']
    print(f"   • Average daily volume: {volumes.mean():,.0f}")
    print(f"   • Peak volume: {volumes.max():,.0f}")

    # Ranking by mean daily return, best first
    print("\n🏆 Top Performing Symbols (by avg return):")
    mean_by_symbol = df.groupby('symbol')['daily_return'].mean()
    ranked = mean_by_symbol.sort_values(ascending=False)
    for ticker, mean_ret in ranked.head(5).items():
        print(f"   • {ticker}: {mean_ret*100:.3f}% daily avg")

    print(rule)

print("✅ Market analysis functions ready!")


✅ Market analysis functions ready!


In [4]:
def create_sentiment_analysis(df):
    """Build, display and return a 2x2 Plotly sentiment dashboard.

    Panels:
        (1, 1) sentiment score over time
        (1, 2) histogram of sentiment scores
        (2, 1) sentiment score against daily return
        (2, 2) sentiment score against confidence score

    Rows whose ``sentiment_score`` equals 0 are dropped first, and only the
    first five remaining symbols are drawn.

    Args:
        df: DataFrame with ``symbol``, ``date``, ``sentiment_score`` and
            ``daily_return`` columns; ``confidence_score`` is optional.

    Returns:
        The figure (after ``fig.show()``), or ``None`` when there is no input
        or no row with a non-zero sentiment score.
    """

    if df is None or df.empty:
        print("❌ No data available for sentiment analysis")
        return None

    print("🔄 Creating sentiment analysis visualization...")

    # Keep only rows that carry a non-zero sentiment score
    scored = df[df['sentiment_score'] != 0].copy()
    if scored.empty:
        print("⚠️ No sentiment data available")
        return None

    panel_titles = ['Sentiment Over Time', 'Sentiment Distribution',
                    'Sentiment vs Returns', 'Sentiment Confidence']
    panel_specs = [[{"secondary_y": False}, {"type": "histogram"}],
                   [{"type": "scatter"}, {"type": "scatter"}]]
    fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles, specs=panel_specs)

    palette = ['blue', 'red', 'green', 'orange', 'purple']
    has_confidence = 'confidence_score' in scored.columns

    for idx, symbol in enumerate(scored['symbol'].unique()[:5]):
        per_symbol = scored[scored['symbol'] == symbol].sort_values('date')
        tint = palette[idx % len(palette)]
        label = f'{symbol}'
        point_style = dict(color=tint, size=6, opacity=0.6)

        # 1. Sentiment over time (the only trace that gets a legend entry)
        timeline = go.Scatter(
            x=per_symbol['date'],
            y=per_symbol['sentiment_score'],
            name=label,
            line=dict(color=tint),
            legendgroup=symbol
        )
        fig.add_trace(timeline, row=1, col=1)

        # 2. Sentiment distribution
        distribution = go.Histogram(
            x=per_symbol['sentiment_score'],
            name=label,
            opacity=0.7,
            nbinsx=20,
            legendgroup=symbol,
            showlegend=False
        )
        fig.add_trace(distribution, row=1, col=2)

        # 3. Sentiment vs returns, using only rows where both values exist
        paired = per_symbol.dropna(subset=['sentiment_score', 'daily_return'])
        if not paired.empty:
            versus_return = go.Scatter(
                x=paired['sentiment_score'],
                y=paired['daily_return'],
                mode='markers',
                name=label,
                marker=point_style,
                legendgroup=symbol,
                showlegend=False
            )
            fig.add_trace(versus_return, row=2, col=1)

        # 4. Sentiment vs confidence
        if has_confidence:
            versus_confidence = go.Scatter(
                x=per_symbol['sentiment_score'],
                y=per_symbol['confidence_score'],
                mode='markers',
                name=label,
                marker=point_style,
                legendgroup=symbol,
                showlegend=False
            )
            fig.add_trace(versus_confidence, row=2, col=2)

    fig.update_layout(
        height=800,
        title="Sentiment Analysis Dashboard",
        showlegend=True
    )

    # (row, col, x-axis title, y-axis title) for each panel
    axis_titles = [
        (1, 1, "Date", "Sentiment Score"),
        (1, 2, "Sentiment Score", "Frequency"),
        (2, 1, "Sentiment Score", "Daily Return"),
        (2, 2, "Sentiment Score", "Confidence Score"),
    ]
    for row, col, x_title, y_title in axis_titles:
        fig.update_xaxes(title_text=x_title, row=row, col=col)
        fig.update_yaxes(title_text=y_title, row=row, col=col)

    fig.show()
    return fig

def calculate_sentiment_correlations(df):
    """Compute per-symbol correlations between sentiment and market columns.

    A symbol is included only when it has more than 10 non-null
    ``sentiment_score`` values.

    Args:
        df: DataFrame with ``symbol``, ``sentiment_score``, ``daily_return``
            and ``volume`` columns; ``sentiment_lag1`` is optional.

    Returns:
        dict mapping symbol to a dict with the keys ``sentiment_vs_return``,
        ``sentiment_vs_volume`` and, when the lag column exists,
        ``sentiment_lag1_vs_return``. ``None`` for ``None``/empty input.
    """

    if df is None or df.empty:
        print("❌ No data available for correlation analysis")
        return None

    print("🔄 Calculating sentiment correlations...")

    per_symbol_results = {}

    for ticker in df['symbol'].unique():
        rows = df[df['symbol'] == ticker].copy()

        enough_scores = (
            'sentiment_score' in rows.columns
            and rows['sentiment_score'].notna().sum() > 10
        )
        if not enough_scores:
            continue

        score = rows['sentiment_score']
        daily = rows['daily_return']
        metrics = {
            'sentiment_vs_return': score.corr(daily),
            'sentiment_vs_volume': score.corr(rows['volume']),
        }

        # Previous-row sentiment against the current row's return
        if 'sentiment_lag1' in rows.columns:
            metrics['sentiment_lag1_vs_return'] = rows['sentiment_lag1'].corr(daily)

        per_symbol_results[ticker] = metrics

    return per_symbol_results

def print_sentiment_insights(df, correlations=None):
    """Print a text summary of sentiment coverage, distribution and correlations.

    Rows with a ``sentiment_score`` of exactly 0 are excluded from all
    statistics. Scores above 0.1 count as positive, scores below -0.1 as
    negative, and the remainder as neutral.

    Args:
        df: DataFrame with a ``sentiment_score`` column, or ``None``.
        correlations: Optional dict of symbol -> dict; the value stored under
            ``sentiment_vs_return`` is printed for every non-empty entry.

    Returns:
        None.
    """

    if df is None or df.empty:
        print("❌ No data available for sentiment insights")
        return

    rule = "=" * 60
    print("\n" + rule)
    print("🎭 SENTIMENT ANALYSIS INSIGHTS")
    print(rule)

    # Only rows with a non-zero score take part in the statistics
    scored = df[df['sentiment_score'] != 0]
    if scored.empty:
        print("⚠️ No sentiment data available")
        return

    scores = scored['sentiment_score']
    n_scored = len(scored)

    print("📊 Sentiment Data Overview:")
    print(f"   • Records with sentiment: {n_scored:,}")
    print(f"   • Coverage: {n_scored/len(df)*100:.1f}% of total data")
    print(f"   • Average sentiment: {scores.mean():.3f}")
    print(f"   • Sentiment range: {scores.min():.3f} to {scores.max():.3f}")

    # Bucket the scores into positive / negative / neutral
    n_positive = (scores > 0.1).sum()
    n_negative = (scores < -0.1).sum()
    n_neutral = n_scored - n_positive - n_negative

    print("\n📈 Sentiment Distribution:")
    print(f"   • Positive sentiment: {n_positive:,} ({n_positive/n_scored*100:.1f}%)")
    print(f"   • Negative sentiment: {n_negative:,} ({n_negative/n_scored*100:.1f}%)")
    print(f"   • Neutral sentiment: {n_neutral:,} ({n_neutral/n_scored*100:.1f}%)")

    if correlations:
        print("\n🔗 Sentiment-Market Correlations:")
        for ticker, metrics in correlations.items():
            if not metrics:
                continue
            value = metrics.get('sentiment_vs_return', 0)
            print(f"   • {ticker}: Sentiment-Return correlation = {value:.3f}")

    print(rule)

print("✅ Sentiment analysis functions ready!")


✅ Sentiment analysis functions ready!


In [5]:
def analyze_basic_strategy(df, sentiment_threshold=0.1):
    """Back-test a simple sentiment-threshold long/short strategy per symbol.

    For each symbol with at least 30 rows, walking the rows in date order:
    go long when the sentiment score exceeds ``sentiment_threshold``, go short
    when it drops below ``-sentiment_threshold``, otherwise keep the previous
    position. The first row never opens a position. Each day's strategy return
    is the previous row's position times that day's return, so a signal only
    affects returns from the following row onward.

    Missing daily returns are treated as 0 for both the strategy and the
    buy-and-hold curve, so a gap (including the leading NaN produced by
    ``pct_change``) cannot turn the final cumulative figures into NaN.

    Args:
        df: DataFrame with ``symbol``, ``date``, ``close_price``,
            ``daily_return`` and ``sentiment_score`` columns, or ``None``.
        sentiment_threshold: Absolute score that triggers a position change.

    Returns:
        list of dicts, one per analysed symbol, with the keys ``symbol``,
        ``total_return`` (buy-and-hold), ``strategy_return``,
        ``signals_count``, ``signals`` (list of dicts with ``date``,
        ``signal_type``, ``price``, ``sentiment_score``) and ``price_data``
        (the per-symbol frame with the computed columns). ``None`` for
        ``None``/empty input.
    """

    if df is None or df.empty:
        print("❌ No data available for strategy analysis")
        return None

    print(f"🔄 Running basic strategy analysis with threshold: {sentiment_threshold}")

    results = []

    for symbol in df['symbol'].unique():
        symbol_data = (
            df[df['symbol'] == symbol]
            .sort_values('date')
            .reset_index(drop=True)
        )

        n_rows = len(symbol_data)
        if n_rows < 30:  # Need sufficient data
            continue

        # Walk the rows once over plain arrays; writing single cells through
        # .loc inside the loop is far slower and gives the same result.
        sentiment = symbol_data['sentiment_score'].to_numpy()
        position = np.zeros(n_rows, dtype=np.int64)  # 0 = flat, 1 = long, -1 = short
        signal = np.zeros(n_rows, dtype=np.int64)    # 1 = buy, -1 = sell, 0 = hold

        for i in range(1, n_rows):
            score = sentiment[i]
            prev_position = position[i - 1]

            if score > sentiment_threshold and prev_position != 1:
                signal[i] = 1
                position[i] = 1
            elif score < -sentiment_threshold and prev_position != -1:
                signal[i] = -1
                position[i] = -1
            else:
                position[i] = prev_position  # Hold position

        symbol_data['position'] = position
        symbol_data['signal'] = signal

        # Yesterday's position earns today's return (no look-ahead).
        daily_return = symbol_data['daily_return']
        symbol_data['strategy_return'] = (
            symbol_data['position'].shift(1) * daily_return
        ).fillna(0)

        # Cumulative returns; NaN returns count as 0 in both curves.
        symbol_data['cum_return_buy_hold'] = (1 + daily_return.fillna(0)).cumprod() - 1
        symbol_data['cum_return_strategy'] = (1 + symbol_data['strategy_return']).cumprod() - 1

        total_return = symbol_data['cum_return_buy_hold'].iloc[-1]
        strategy_return = symbol_data['cum_return_strategy'].iloc[-1]

        # Collect the rows on which a position was opened or flipped.
        signal_rows = symbol_data.loc[
            symbol_data['signal'] != 0,
            ['date', 'signal', 'close_price', 'sentiment_score']
        ]
        signals_list = [
            {
                'date': when,
                'signal_type': 'BUY' if direction == 1 else 'SELL',
                'price': price,
                'sentiment_score': score
            }
            for when, direction, price, score in signal_rows.itertuples(index=False, name=None)
        ]

        results.append({
            'symbol': symbol,
            'total_return': total_return,
            'strategy_return': strategy_return,
            'signals_count': len(signals_list),
            'signals': signals_list,
            'price_data': symbol_data
        })

    print(f"✅ Strategy analysis completed for {len(results)} symbols")
    return results

def create_strategy_visualization(strategy_results):
    """Build, display and return a 2x2 Plotly dashboard of strategy results.

    Panels:
        (1, 1) cumulative buy-and-hold vs strategy return per symbol
        (1, 2) histogram of daily returns per symbol
        (2, 1) summary table (symbol, buy-and-hold return, strategy return,
               number of signals)
        (2, 2) number of signals per sentiment-score bin (10 equal-width bins)

    Args:
        strategy_results: List of result dicts as produced by
            ``analyze_basic_strategy`` (keys used: ``symbol``,
            ``total_return``, ``strategy_return``, ``signals_count``,
            ``signals``, ``price_data``).

    Returns:
        The figure (after ``fig.show()``), or ``None`` when
        ``strategy_results`` is empty or ``None``.
    """

    if not strategy_results:
        print("⚠️ No strategy results to visualize")
        return

    print("🔄 Creating strategy visualization...")

    # The (2, 1) cell must be declared as a table subplot, otherwise
    # add_trace(go.Table, row=2, col=1) is rejected.
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=['Portfolio Performance', 'Returns Distribution', 'Performance Summary', 'Signal Distribution'],
        specs=[[{"secondary_y": True}, {"type": "histogram"}],
               [{"type": "table"}, {"type": "scatter"}]],
        vertical_spacing=0.12,
        horizontal_spacing=0.1
    )

    # Colors for different symbols
    colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown', 'pink', 'gray']

    # 1. Portfolio performance comparison
    for i, result in enumerate(strategy_results):
        symbol = result['symbol']
        color = colors[i % len(colors)]
        price_data = result['price_data']

        # Buy and hold performance
        fig.add_trace(
            go.Scatter(
                x=price_data['date'],
                y=price_data['cum_return_buy_hold'],
                name=f'{symbol} Buy & Hold',
                line=dict(color=color, dash='dash'),
                legendgroup=symbol
            ),
            row=1, col=1
        )

        # Strategy performance
        fig.add_trace(
            go.Scatter(
                x=price_data['date'],
                y=price_data['cum_return_strategy'],
                name=f'{symbol} Strategy',
                line=dict(color=color),
                legendgroup=symbol
            ),
            row=1, col=1
        )

        # Daily returns distribution
        fig.add_trace(
            go.Histogram(
                x=price_data['daily_return'],
                name=f'{symbol} Returns',
                opacity=0.7,
                nbinsx=30,
                legendgroup=symbol,
                showlegend=False
            ),
            row=1, col=2
        )

    # 2. Performance summary table, one column of values per header
    summary_columns = [
        [result['symbol'] for result in strategy_results],
        [f"{result['total_return']*100:.2f}%" for result in strategy_results],
        [f"{result['strategy_return']*100:.2f}%" for result in strategy_results],
        [result['signals_count'] for result in strategy_results],
    ]

    # The fills below are light, so the text colour is set explicitly: the
    # notebook selects the "plotly_dark" template, whose light default font
    # would otherwise be drawn on a white / light-blue background.
    fig.add_trace(
        go.Table(
            header=dict(values=['Symbol', 'Buy-Hold Return', 'Strategy Return', 'Signals'],
                       fill_color='lightblue',
                       font=dict(color='black'),
                       align='center'),
            cells=dict(values=summary_columns,
                      fill_color='white',
                      font=dict(color='black'),
                      align='center')
        ),
        row=2, col=1
    )

    # 3. Signal distribution
    all_signals = []
    for result in strategy_results:
        all_signals.extend(result['signals'])

    if all_signals:
        signal_df = pd.DataFrame(all_signals)

        # Count signals per sentiment-score bin and plot against bin midpoints
        sentiment_bins = pd.cut(signal_df['sentiment_score'], bins=10)
        sentiment_counts = sentiment_bins.value_counts().sort_index()

        fig.add_trace(
            go.Scatter(
                x=[interval.mid for interval in sentiment_counts.index],
                y=sentiment_counts.values,
                mode='lines+markers',
                name='Signal Frequency',
                line=dict(color='red'),
                showlegend=False
            ),
            row=2, col=2
        )

    # Update layout
    fig.update_layout(
        height=800,
        title="Trading Strategy Analysis Dashboard",
        showlegend=True,
        legend=dict(x=1.05, y=1)
    )

    # Update axes
    fig.update_xaxes(title_text="Date", row=1, col=1)
    fig.update_yaxes(title_text="Cumulative Return", row=1, col=1)
    fig.update_xaxes(title_text="Daily Return", row=1, col=2)
    fig.update_yaxes(title_text="Frequency", row=1, col=2)
    fig.update_xaxes(title_text="Sentiment Score", row=2, col=2)
    fig.update_yaxes(title_text="Frequency", row=2, col=2)

    fig.show()
    return fig

def print_strategy_insights(strategy_results):
    """Print a text comparison of strategy returns against buy-and-hold.

    Args:
        strategy_results: List of dicts with the keys ``symbol``,
            ``total_return`` (buy-and-hold), ``strategy_return`` and
            ``signals_count``.

    Returns:
        None. Only an error line is printed for empty/``None`` input.
    """

    if not strategy_results:
        print("❌ No strategy results to analyze")
        return

    rule = "=" * 60
    print("\n" + rule)
    print("📈 TRADING STRATEGY INSIGHTS")
    print(rule)

    n_symbols = len(strategy_results)
    n_signals = sum(entry['signals_count'] for entry in strategy_results)

    print("🎯 Strategy Overview:")
    print(f"   • Symbols analyzed: {n_symbols}")
    print(f"   • Total trading signals: {n_signals}")
    print(f"   • Average signals per symbol: {n_signals/n_symbols:.1f}")

    # Per-symbol comparison: (symbol, strategy return, buy-and-hold return)
    print("\n📊 Performance Comparison:")
    pairs = [
        (entry['symbol'], entry['strategy_return'], entry['total_return'])
        for entry in strategy_results
    ]
    for ticker, strat, hold in pairs:
        excess_pct = (strat - hold) * 100
        print(f"   • {ticker}: Strategy {strat*100:.2f}% vs Buy-Hold {hold*100:.2f}% (Excess: {excess_pct:.2f}%)")

    wins = sum(1 for _, strat, hold in pairs if strat > hold)
    mean_strategy = sum(strat for _, strat, _ in pairs) / n_symbols
    mean_hold = sum(hold for _, _, hold in pairs) / n_symbols

    print("\n🏆 Summary:")
    print(f"   • Strategy outperformed buy-and-hold in {wins}/{n_symbols} cases ({wins/n_symbols*100:.1f}%)")
    print(f"   • Average strategy return: {mean_strategy*100:.2f}%")
    print(f"   • Average buy-and-hold return: {mean_hold*100:.2f}%")
    print(f"   • Average excess return: {(mean_strategy-mean_hold)*100:.2f}%")

    print(rule)

print("✅ Trading strategy analysis functions ready!")


✅ Trading strategy analysis functions ready!


In [6]:
def analyze_dataset_readiness(df):
    """Score a dataset against five weighted readiness criteria (0-100 each).

    Criteria and weights:
        Data Completeness (30%)  - share of rows with both ``close_price`` and
                                   ``volume`` present
        Sentiment Coverage (25%) - share of rows with a non-zero
                                   ``sentiment_score``
        Date Consistency (20%)   - distinct dates relative to a rough 5/7
                                   trading-day estimate of the covered span,
                                   capped at 100
        Symbol Diversity (15%)   - distinct symbols relative to a target of 10,
                                   capped at 100
        Data Quality (10%)       - 100 minus the percentage of rows whose
                                   close price moved more than 50% versus the
                                   previous date of the same symbol

    Args:
        df: DataFrame with ``symbol``, ``date``, ``close_price``, ``volume``
            and ``sentiment_score`` columns, or ``None``.

    Returns:
        dict with ``overall_score``, ``criteria_scores``, ``weights`` and
        ``metrics``; ``None`` for ``None``/empty input.
    """

    if df is None or df.empty:
        print("❌ No data available for readiness analysis")
        return None

    print("🔄 Analyzing dataset readiness...")

    criteria_scores = {}

    # 1. Data Completeness (30%)
    total_records = len(df)
    complete_records = df.dropna(subset=['close_price', 'volume']).shape[0]
    criteria_scores['Data Completeness'] = (complete_records / total_records) * 100

    # 2. Sentiment Coverage (25%)
    sentiment_records = df[df['sentiment_score'] != 0].shape[0]
    criteria_scores['Sentiment Coverage'] = (sentiment_records / total_records) * 100

    # 3. Date Range & Consistency (20%)
    date_range_days = (df['date'].max() - df['date'].min()).days
    # Rough estimate of trading days in the span. Floored at 1 so a dataset
    # covering a single day does not divide by zero; for longer spans the
    # floor never changes the capped score.
    expected_trading_days = max(date_range_days * 5 / 7, 1)
    actual_trading_days = df['date'].nunique()
    criteria_scores['Date Consistency'] = min(
        (actual_trading_days / expected_trading_days) * 100, 100
    )

    # 4. Symbol Diversity (15%)
    unique_symbols = df['symbol'].nunique()
    criteria_scores['Symbol Diversity'] = min((unique_symbols / 10) * 100, 100)  # Target: 10+ symbols

    # 5. Data Quality (10%)
    # Count extreme price jumps (>50% between consecutive dates of a symbol).
    # Rows are put in date order first; otherwise the "previous" price is just
    # whichever row happens to precede it in the input.
    ordered = df.sort_values('date', kind='stable')
    returns = ordered.groupby('symbol')['close_price'].pct_change()
    price_anomalies = int((returns.abs() > 0.5).sum())

    quality_score = max(0, (1 - price_anomalies / total_records)) * 100
    criteria_scores['Data Quality'] = quality_score

    # Calculate overall weighted score
    weights = {
        'Data Completeness': 0.30,
        'Sentiment Coverage': 0.25,
        'Date Consistency': 0.20,
        'Symbol Diversity': 0.15,
        'Data Quality': 0.10
    }

    overall_score = sum(
        score * weights[criterion] for criterion, score in criteria_scores.items()
    )

    return {
        'overall_score': overall_score,
        'criteria_scores': criteria_scores,
        'weights': weights,
        'metrics': {
            'total_records': total_records,
            'complete_records': complete_records,
            'sentiment_records': sentiment_records,
            'date_range_days': date_range_days,
            'unique_symbols': unique_symbols,
            'price_anomalies': price_anomalies
        }
    }

def generate_recommendations(overall_score, criteria_scores):
    """Turn readiness scores into a list of recommendation strings.

    The first entry is a headline chosen from the overall score (>= 80,
    >= 65, otherwise). It is followed by one bullet for every known criterion
    scoring below 70, in the iteration order of ``criteria_scores``.
    Criteria with unrecognised names add nothing.

    Args:
        overall_score: Weighted overall readiness score.
        criteria_scores: Mapping of criterion name to its score.

    Returns:
        list of str.
    """

    if overall_score >= 80:
        headline = "🎉 Dataset is production-ready! Consider launching live trading."
    elif overall_score >= 65:
        headline = "⚠️ Dataset is mostly ready but needs minor improvements."
    else:
        headline = "🔧 Dataset needs significant improvements before production use."

    # Advice shown when the named criterion falls below the 70-point bar
    advice_by_criterion = {
        'Data Completeness': "• Improve data collection to reduce missing values",
        'Sentiment Coverage': "• Enhance news collection and sentiment analysis coverage",
        'Date Consistency': "• Fill gaps in historical data collection",
        'Symbol Diversity': "• Add more symbols to diversify the portfolio",
        'Data Quality': "• Review and clean data anomalies",
    }

    weak_spots = [
        advice_by_criterion[name]
        for name, value in criteria_scores.items()
        if value < 70 and name in advice_by_criterion
    ]

    return [headline] + weak_spots

def create_readiness_summary(readiness_analysis):
    """Draw the per-criterion readiness scores as a radar (polar) chart.

    Args:
        readiness_analysis: Mapping that provides 'criteria_scores'
            (criterion name -> score) and 'overall_score' (number shown in
            the chart title). A falsy value skips plotting.

    Returns:
        The plotly figure after it has been shown, or None when no analysis
        was supplied.
    """

    # Guard: nothing to plot.
    if not readiness_analysis:
        print("❌ No readiness analysis available")
        return None

    print("🔄 Creating readiness summary visualization...")

    scores_by_criterion = readiness_analysis['criteria_scores']
    total_score = readiness_analysis['overall_score']

    # One spoke per criterion; the radial distance is that criterion's score.
    radar_trace = go.Scatterpolar(
        r=list(scores_by_criterion.values()),
        theta=list(scores_by_criterion.keys()),
        fill='toself',
        name='Readiness Scores',
        line_color='blue',
    )

    fig = go.Figure()
    fig.add_trace(radar_trace)

    chart_title = f"Dataset Readiness Assessment (Overall Score: {total_score:.1f}/100)"
    fig.update_layout(
        # Radial axis pinned to 0-100 rather than auto-scaled to the data.
        polar={'radialaxis': {'visible': True, 'range': [0, 100]}},
        showlegend=True,
        title=chart_title,
        height=500,
    )

    fig.show()
    return fig

# Printed when this cell finishes executing, i.e. after the readiness helper
# definitions above have been run.
print("✅ Dataset readiness assessment functions ready!")


✅ Dataset readiness assessment functions ready!


In [7]:
# End-to-end driver cell: load -> combine -> market -> sentiment -> strategy -> readiness.
# Every stage relies on functions defined in earlier cells of this notebook.
print("🚀 Starting Comprehensive Analysis Pipeline...")
print("=" * 60)

# Stage 1: fetch the foundation dataset
print("\n📊 STEP 1: LOADING DATA")
print("-" * 30)
foundation_data = load_foundation_dataset()

if foundation_data is None:
    # Without the base data none of the later stages can run.
    print("❌ Failed to load foundation dataset - check database connection")
else:
    # Stage 2: build the combined dataset from the foundation data
    print("\n🔄 STEP 2: PROCESSING COMBINED DATASET")
    print("-" * 40)
    processed_data = create_combined_dataset(foundation_data, None)

    if processed_data is None:
        print("❌ Failed to process combined dataset")
    else:
        # Stage 3: market overview figure plus printed insights
        print("\n📈 STEP 3: MARKET OVERVIEW ANALYSIS")
        print("-" * 40)
        print("🔄 Creating market overview...")
        market_fig = create_market_overview(processed_data)
        print_market_insights(processed_data)

        # Stage 4: sentiment figure, correlations, printed insights
        print("\n🎭 STEP 4: SENTIMENT ANALYSIS")
        print("-" * 35)
        print("🔄 Creating sentiment analysis...")
        sentiment_fig = create_sentiment_analysis(processed_data)
        correlations = calculate_sentiment_correlations(processed_data)
        print_sentiment_insights(processed_data, correlations)

        # Stage 5: basic trading strategy evaluation
        print("\n📈 STEP 5: TRADING STRATEGY ANALYSIS")
        print("-" * 40)
        print("🔄 Running basic strategy analysis...")
        strategy_results = analyze_basic_strategy(processed_data)

        if not strategy_results:
            print("⚠️ No strategy results available")
        else:
            create_strategy_visualization(strategy_results)
            print_strategy_insights(strategy_results)

        # Stage 6: readiness scoring, radar chart and recommendations
        print("\n🎯 STEP 6: DATASET READINESS ASSESSMENT")
        print("-" * 45)
        readiness_analysis = analyze_dataset_readiness(processed_data)

        if readiness_analysis:
            readiness_fig = create_readiness_summary(readiness_analysis)

            # Turn the scores into printable recommendation lines.
            recommendations = generate_recommendations(
                readiness_analysis['overall_score'],
                readiness_analysis['criteria_scores'],
            )

            print("\n🎯 FINAL RECOMMENDATIONS")
            print("=" * 40)
            for rec in recommendations:
                print(rec)
            print("=" * 40)

        # Closing banner: shown whenever stages 1 and 2 succeeded.
        print("\n✅ ANALYSIS COMPLETE!")
        print("=" * 60)
        print("📊 All visualizations have been generated above.")
        print("📈 Review the insights and recommendations for next steps.")
        print("🚀 Ready for production decision making!")


🚀 Starting Comprehensive Analysis Pipeline...

📊 STEP 1: LOADING DATA
------------------------------
🔄 Connecting to database...
📊 Loading market data with sentiment scores...
✅ Loaded 819 records across 3 symbols
📅 Date range: 2024-05-24 to 2025-06-27
🎭 Sentiment coverage: 35 records (4.3%)

🔄 STEP 2: PROCESSING COMBINED DATASET
----------------------------------------
🔄 Processing combined dataset...
✅ Combined dataset processed with 819 records

📈 STEP 3: MARKET OVERVIEW ANALYSIS
----------------------------------------
🔄 Creating market overview...
🔄 Creating market overview visualization...



📊 MARKET INSIGHTS SUMMARY
📈 Dataset Overview:
   • Total records: 819
   • Number of symbols: 3
   • Date range: 399 days
   • Period: 2024-05-24 to 2025-06-27

💰 Price & Returns Analysis:
   • Average daily return: 0.060%
   • Daily volatility: 3.622%
   • Best single day return: 23.82%
   • Worst single day return: -26.06%

📊 Volume Analysis:
   • Average daily volume: 135,915,602
   • Peak volume: 818,830,900

🏆 Top Performing Symbols (by avg return):
   • NVDA: 0.210% daily avg
   • AMD: 0.001% daily avg
   • INTC: -0.032% daily avg

🎭 STEP 4: SENTIMENT ANALYSIS
-----------------------------------
🔄 Creating sentiment analysis...
🔄 Creating sentiment analysis visualization...


🔄 Calculating sentiment correlations...

🎭 SENTIMENT ANALYSIS INSIGHTS
📊 Sentiment Data Overview:
   • Records with sentiment: 35
   • Coverage: 4.3% of total data
   • Average sentiment: 0.297
   • Sentiment range: -0.700 to 0.900

📈 Sentiment Distribution:
   • Positive sentiment: 25 (71.4%)
   • Negative sentiment: 8 (22.9%)
   • Neutral sentiment: 2 (5.7%)

🔗 Sentiment-Market Correlations:
   • AMD: Sentiment-Return correlation = 0.046
   • INTC: Sentiment-Return correlation = 0.075
   • NVDA: Sentiment-Return correlation = 0.011

📈 STEP 5: TRADING STRATEGY ANALYSIS
----------------------------------------
🔄 Running basic strategy analysis...
🔄 Running basic strategy analysis with threshold: 0.1
✅ Strategy analysis completed for 3 symbols
🔄 Creating strategy visualization...



📈 TRADING STRATEGY INSIGHTS
🎯 Strategy Overview:
   • Symbols analyzed: 3
   • Total trading signals: 15
   • Average signals per symbol: 5.0

📊 Performance Comparison:
   • AMD: Strategy 28.11% vs Buy-Hold -13.55% (Excess: 41.67%)
   • INTC: Strategy -2.03% vs Buy-Hold -25.67% (Excess: 23.64%)
   • NVDA: Strategy -16.16% vs Buy-Hold 48.22% (Excess: -64.38%)

🏆 Summary:
   • Strategy outperformed buy-and-hold in 2/3 cases (66.7%)
   • Average strategy return: 3.31%
   • Average buy-and-hold return: 3.00%
   • Average excess return: 0.31%

🎯 STEP 6: DATASET READINESS ASSESSMENT
---------------------------------------------
🔄 Analyzing dataset readiness...
🔄 Creating readiness summary visualization...



🎯 FINAL RECOMMENDATIONS
🔧 Dataset needs significant improvements before production use.
• Enhance news collection and sentiment analysis coverage
• Add more symbols to diversify the portfolio

✅ ANALYSIS COMPLETE!
📊 All visualizations have been generated above.
📈 Review the insights and recommendations for next steps.
🚀 Ready for production decision making!
