In [2]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Statistical libraries
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Time series libraries
import pandas_ta as ta
from datetime import datetime, timedelta

# Configuration
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

# Set figure parameters
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 11

print("📚 Libraries loaded successfully!")

ModuleNotFoundError: No module named 'pandas_ta'

In [None]:
# Load the dataset created by Step 1
try:
    df = pd.read_parquet('data/processed/combined_dataset.parquet')
    print(f"✅ Dataset loaded successfully!")
    print(f"Shape: {df.shape}")
    print(f"Date range: {df.index.min()} to {df.index.max()}")
    print(f"Symbols: {df['symbol'].unique()}")
except FileNotFoundError:
    print("❌ Dataset not found. Please run Step 1 first:")
    print("python run_experiment.py --step 1")
    raise

In [None]:
def dataset_overview(df):
    """Comprehensive dataset overview"""
    
    print("=" * 60)
    print("📊 DATASET OVERVIEW")
    print("=" * 60)
    
    # Basic info
    print(f"📅 Date Range: {df.index.min().date()} to {df.index.max().date()}")
    print(f"📊 Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    print(f"📈 Symbols: {len(df['symbol'].unique())} ({', '.join(df['symbol'].unique())})")
    
    if 'sector' in df.columns:
        print(f"🏢 Sectors: {len(df['sector'].unique())} ({', '.join(df['sector'].unique())})")
    
    # Memory usage
    memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
    print(f"💾 Memory Usage: {memory_mb:.1f} MB")
    
    print("\n" + "=" * 40)
    print("📋 COLUMN BREAKDOWN")
    print("=" * 40)
    
    # Categorize columns
    feature_groups = {
        'Price Features': [col for col in df.columns if col in ['Open', 'High', 'Low', 'Close', 'Volume']],
        'Technical Features': [col for col in df.columns if any(tech in col for tech in ['SMA', 'EMA', 'RSI', 'MACD', 'BB', 'volatility'])],
        'News Features': [col for col in df.columns if 'news' in col.lower()],
        'Lag Features': [col for col in df.columns if 'lag' in col.lower()],
        'Target Features': [col for col in df.columns if col.startswith(('target_', 'return_', 'direction_'))],
        'Meta Features': [col for col in df.columns if col in ['symbol', 'sector']]
    }
    
    for group, features in feature_groups.items():
        if features:
            print(f"{group}: {len(features)}")
            print(f"  └─ {', '.join(features[:5])}{'...' if len(features) > 5 else ''}")
    
    print(f"\nTotal Features: {sum(len(features) for features in feature_groups.values())}")
    
    return feature_groups

# Run overview
feature_groups = dataset_overview(df)

In [None]:
# Data quality assessment
def assess_data_quality(df):
    """Assess data quality across all features"""
    
    print("\n" + "=" * 60)
    print("🔍 DATA QUALITY ASSESSMENT")
    print("=" * 60)
    
    # Missing values
    missing_summary = df.isnull().sum()
    missing_pct = (missing_summary / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing_Count': missing_summary,
        'Missing_Pct': missing_pct
    }).sort_values('Missing_Pct', ascending=False)
    
    # Show columns with missing values
    has_missing = missing_df[missing_df['Missing_Count'] > 0]
    if not has_missing.empty:
        print("⚠️ COLUMNS WITH MISSING VALUES:")
        print(has_missing.head(10))
    else:
        print("✅ No missing values found!")
    
    # Data types
    print(f"\n📊 DATA TYPES:")
    dtype_counts = df.dtypes.value_counts()
    for dtype, count in dtype_counts.items():
        print(f"  {dtype}: {count} columns")
    
    # Basic statistics for numerical columns
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    print(f"\n📈 NUMERICAL FEATURES: {len(numerical_cols)}")
    
    # Identify potential outliers (using IQR method)
    outlier_summary = {}
    for col in numerical_cols[:10]:  # Check first 10 numerical columns
        if col not in ['symbol']:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            outliers = df[(df[col] < Q1 - 1.5*IQR) | (df[col] > Q3 + 1.5*IQR)][col]
            outlier_summary[col] = len(outliers)
    
    outlier_df = pd.DataFrame(list(outlier_summary.items()), 
                             columns=['Feature', 'Outlier_Count']).sort_values('Outlier_Count', ascending=False)
    print(f"\n🎯 OUTLIERS (Top 5):")
    print(outlier_df.head())

assess_data_quality(df)

In [None]:
def analyze_price_movements(df):
    """Analyze price movements and volatility patterns"""
    
    print("\n" + "=" * 60)
    print("💰 PRICE MOVEMENT ANALYSIS")
    print("=" * 60)
    
    # Calculate daily returns for each symbol
    symbols = df['symbol'].unique()
    
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # Plot 1: Price Evolution
    ax1 = axes[0, 0]
    for symbol in symbols:
        symbol_data = df[df['symbol'] == symbol]['Close']
        # Normalize to starting price for comparison
        normalized_price = (symbol_data / symbol_data.iloc[0]) * 100
        ax1.plot(normalized_price.index, normalized_price.values, label=symbol, alpha=0.8, linewidth=2)
    
    ax1.set_title('📈 Normalized Price Evolution (Base=100)', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Normalized Price')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    
    # Plot 2: Volume Patterns
    ax2 = axes[0, 1]
    for symbol in symbols:
        symbol_data = df[df['symbol'] == symbol]['Volume']
        # Rolling 30-day average volume
        volume_ma = symbol_data.rolling(30).mean()
        ax2.plot(volume_ma.index, volume_ma.values, label=symbol, alpha=0.8, linewidth=2)
    
    ax2.set_title('📊 Volume Trends (30-day MA)', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Volume (30-day MA)')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    
    # Plot 3: Volatility Analysis
    ax3 = axes[1, 0]
    for symbol in symbols:
        symbol_data = df[df['symbol'] == symbol]
        if 'volatility_20d' in symbol_data.columns:
            volatility = symbol_data['volatility_20d'] * 100  # Convert to percentage
            ax3.plot(volatility.index, volatility.values, label=symbol, alpha=0.8, linewidth=2)
    
    ax3.set_title('📉 20-Day Volatility (%)', fontsize=14, fontweight='bold')
    ax3.set_ylabel('Volatility (%)')
    ax3.legend()
    ax3.grid(True, alpha=0.3)
    
    # Plot 4: Return Distribution
    ax4 = axes[1, 1]
    all_returns = []
    for symbol in symbols:
        symbol_data = df[df['symbol'] == symbol]['Close']
        returns = symbol_data.pct_change().dropna() * 100
        all_returns.extend(returns.values)
        ax4.hist(returns.values, bins=50, alpha=0.6, label=symbol, density=True)
    
    ax4.set_title('📊 Daily Return Distributions (%)', fontsize=14, fontweight='bold')
    ax4.set_xlabel('Daily Return (%)')
    ax4.set_ylabel('Density')
    ax4.legend()
    ax4.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Print summary statistics
    print("\n📊 SUMMARY STATISTICS:")
    for symbol in symbols:
        symbol_data = df[df['symbol'] == symbol]
        returns = symbol_data['Close'].pct_change().dropna() * 100
        
        print(f"\n{symbol}:")
        print(f"  📈 Total Return: {((symbol_data['Close'].iloc[-1] / symbol_data['Close'].iloc[0]) - 1) * 100:.1f}%")
        print(f"  📊 Daily Return (Mean): {returns.mean():.3f}%")
        print(f"  📉 Daily Volatility: {returns.std():.3f}%")
        print(f"  📋 Sharpe Ratio*: {returns.mean() / returns.std():.3f}")
        print(f"  🎯 Max Daily Gain: {returns.max():.2f}%")
        print(f"  ⬇️ Max Daily Loss: {returns.min():.2f}%")

analyze_price_movements(df)

In [None]:
# Interactive price chart with volume
def create_interactive_price_chart(df):
    """Create interactive price and volume charts"""
    
    symbols = df['symbol'].unique()
    
    # Create dropdown for symbol selection
    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.05,
        subplot_titles=('Price (OHLC)', 'Volume'),
        row_heights=[0.7, 0.3]
    )
    
    # Use first symbol as default
    symbol = symbols[0]
    symbol_data = df[df['symbol'] == symbol].copy()
    
    # OHLC Candlestick
    fig.add_trace(
        go.Candlestick(
            x=symbol_data.index,
            open=symbol_data['Open'],
            high=symbol_data['High'],
            low=symbol_data['Low'],
            close=symbol_data['Close'],
            name='OHLC'
        ),
        row=1, col=1
    )
    
    # Add moving averages if available
    if 'SMA_20' in symbol_data.columns:
        fig.add_trace(
            go.Scatter(
                x=symbol_data.index,
                y=symbol_data['SMA_20'],
                name='SMA 20',
                line=dict(color='orange', width=1)
            ),
            row=1, col=1
        )
    
    if 'SMA_50' in symbol_data.columns:
        fig.add_trace(
            go.Scatter(
                x=symbol_data.index,
                y=symbol_data['SMA_50'],
                name='SMA 50',
                line=dict(color='red', width=1)
            ),
            row=1, col=1
        )
    
    # Volume
    fig.add_trace(
        go.Bar(
            x=symbol_data.index,
            y=symbol_data['Volume'],
            name='Volume',
            marker_color='rgba(0,100,200,0.5)'
        ),
        row=2, col=1
    )
    
    fig.update_layout(
        title=f'📈 {symbol} Price and Volume Analysis',
        xaxis_rangeslider_visible=False,
        height=600,
        showlegend=True
    )
    
    fig.show()
    
    print(f"📊 Interactive chart shown for {symbol}")
    print(f"💡 TIP: Modify the symbol variable to explore other stocks")

# Create interactive chart
create_interactive_price_chart(df)

In [None]:
def analyze_technical_indicators(df):
    """Analyze technical indicators and their effectiveness"""
    
    print("\n" + "=" * 60)
    print("🔧 TECHNICAL INDICATORS ANALYSIS")
    print("=" * 60)
    
    # Get technical indicator columns
    tech_indicators = [col for col in df.columns if any(tech in col for tech in 
                      ['RSI', 'MACD', 'BB', 'SMA', 'EMA', 'volatility'])]
    
    print(f"📊 Found {len(tech_indicators)} technical indicators")
    
    # Focus on key indicators for analysis
    key_indicators = ['RSI_14', 'MACD', 'BB_upper_20', 'BB_lower_20', 'SMA_20', 'SMA_50', 'volatility_20d']
    available_indicators = [ind for ind in key_indicators if ind in df.columns]
    
    if not available_indicators:
        print("⚠️ No key technical indicators found in dataset")
        return
    
    symbols = df['symbol'].unique()
    
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # Plot 1: RSI Distribution and Signals
    if 'RSI_14' in df.columns:
        ax1 = axes[0, 0]
        for symbol in symbols:
            symbol_data = df[df['symbol'] == symbol]['RSI_14'].dropna()
            ax1.hist(symbol_data, bins=30, alpha=0.6, label=symbol, density=True)
        
        ax1.axvline(30, color='red', linestyle='--', label='Oversold (30)')
        ax1.axvline(70, color='green', linestyle='--', label='Overbought (70)')
        ax1.set_title('📊 RSI Distribution Analysis', fontsize=14, fontweight='bold')
        ax1.set_xlabel('RSI Value')
        ax1.set_ylabel('Density')
        ax1.legend()
        ax1.grid(True, alpha=0.3)
    
    # Plot 2: MACD Histogram
    if 'MACD' in df.columns:
        ax2 = axes[0, 1]
        symbol = symbols[0]  # Use first symbol for detailed view
        symbol_data = df[df['symbol'] == symbol]
        macd_data = symbol_data['MACD'].dropna()
        ax2.plot(macd_data.index, macd_data.values, label='MACD', linewidth=1.5)
        ax2.axhline(0, color='black', linestyle='-', alpha=0.5)
        ax2.fill_between(macd_data.index, macd_data.values, 0, 
                        where=(macd_data.values >= 0), color='green', alpha=0.3, label='Positive')
        ax2.fill_between(macd_data.index, macd_data.values, 0, 
                        where=(macd_data.values < 0), color='red', alpha=0.3, label='Negative')
        ax2.set_title(f'📈 MACD Analysis - {symbol}', fontsize=14, fontweight='bold')
        ax2.set_ylabel('MACD Value')
        ax2.legend()
        ax2.grid(True, alpha=0.3)
    
    # Plot 3: Bollinger Bands Analysis
    if all(col in df.columns for col in ['BB_upper_20', 'BB_lower_20', 'Close']):
        ax3 = axes[1, 0]
        symbol = symbols[0]  # Use first symbol
        symbol_data = df[df['symbol'] == symbol].tail(252)  # Last year
        
        ax3.plot(symbol_data.index, symbol_data['Close'], label='Close Price', linewidth=2)
        ax3.plot(symbol_data.index, symbol_data['BB_upper_20'], label='BB Upper', alpha=0.7)
        ax3.plot(symbol_data.index, symbol_data['BB_lower_20'], label='BB Lower', alpha=0.7)
        ax3.fill_between(symbol_data.index, symbol_data['BB_upper_20'], symbol_data['BB_lower_20'], 
                        alpha=0.2, label='BB Range')
        ax3.set_title(f'📊 Bollinger Bands - {symbol} (Last Year)', fontsize=14, fontweight='bold')
        ax3.set_ylabel('Price')
        ax3.legend()
        ax3.grid(True, alpha=0.3)
    
    # Plot 4: Volatility Patterns
    if 'volatility_20d' in df.columns:
        ax4 = axes[1, 1]
        for symbol in symbols:
            symbol_data = df[df['symbol'] == symbol]['volatility_20d'].dropna() * 100
            ax4.plot(symbol_data.index, symbol_data.values, label=symbol, alpha=0.8, linewidth=2)
        
        ax4.set_title('📉 Volatility Patterns (20-day)', fontsize=14, fontweight='bold')
        ax4.set_ylabel('Volatility (%)')
        ax4.legend()
        ax4.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Technical indicator statistics
    print("\n📊 TECHNICAL INDICATOR STATISTICS:")
    for indicator in available_indicators:
        values = df[indicator].dropna()
        print(f"\n{indicator}:")
        print(f"  Mean: {values.mean():.4f}")
        print(f"  Std: {values.std():.4f}")
        print(f"  Min: {values.min():.4f}")
        print(f"  Max: {values.max():.4f}")
        
        # Specific insights for each indicator
        if 'RSI' in indicator:
            oversold = (values < 30).sum()
            overbought = (values > 70).sum()
            print(f"  🔴 Oversold signals: {oversold} ({oversold/len(values)*100:.1f}%)")
            print(f"  🟢 Overbought signals: {overbought} ({overbought/len(values)*100:.1f}%)")

analyze_technical_indicators(df)

In [1]:
# Correlation analysis of technical indicators
def analyze_indicator_correlations(df):
    """Analyze correlations between technical indicators"""
    
    print("\n" + "=" * 60)
    print("🔗 TECHNICAL INDICATOR CORRELATIONS")
    print("=" * 60)
    
    # Get technical indicators
    tech_cols = [col for col in df.columns if any(tech in col for tech in 
                ['RSI', 'MACD', 'SMA', 'EMA', 'BB', 'volatility'])]
    
    if len(tech_cols) < 2:
        print("⚠️ Not enough technical indicators for correlation analysis")
        return
    
    # Calculate correlation matrix
    corr_matrix = df[tech_cols].corr()
    
    # Create heatmap
    plt.figure(figsize=(14, 10))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='RdYlBu_r', center=0,
                square=True, linewidths=0.5, cbar_kws={"shrink": .8}, fmt='.2f')
    plt.title('🔗 Technical Indicators Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()
    
    # Find highly correlated pairs
    print("\n🔍 HIGHLY CORRELATED INDICATORS (|r| > 0.7):")
    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            corr_val = corr_matrix.iloc[i, j]
            if abs(corr_val) > 0.7:
                high_corr_pairs.append({
                    'Indicator 1': corr_matrix.columns[i],
                    'Indicator 2': corr_matrix.columns[j],
                    'Correlation': corr_val
                })
    
    if high_corr_pairs:
        high_corr_df = pd.DataFrame(high_corr_pairs).sort_values('Correlation', key=abs, ascending=False)
        for _, row in high_corr_df.iterrows():
            print(f"  {row['Indicator 1']} ↔ {row['Indicator 2']}: {row['Correlation']:.3f}")
    else:
        print("  No highly correlated indicators found (all |r| ≤ 0.7)")

analyze_indicator_correlations(df)

NameError: name 'df' is not defined

In [None]:
# Target variable correlations across horizons
def analyze_target_correlations(df):
    """Analyze correlations between different prediction horizons"""
    
    horizons = [5, 30, 90]
    target_cols = [f'target_{h}d' for h in horizons if f'target_{h}d' in df.columns]
    
    if len(target_cols) < 2:
        print("⚠️ Not enough target variables for correlation analysis")
        return
    
    print("\n" + "=" * 40)
    print("🔗 TARGET HORIZON CORRELATIONS")
    print("=" * 40)
    
    # Calculate correlations
    target_corr = df[target_cols].corr()
    
    # Create heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(target_corr, annot=True, cmap='RdYlBu_r', center=0,
                square=True, linewidths=0.5, fmt='.3f', 
                cbar_kws={"shrink": .8})
    plt.title('🔗 Target Variable Correlations Across Horizons', fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()
    
    # Print correlation insights
    print("\n🔍 CORRELATION INSIGHTS:")
    for i in range(len(target_cols)):
        for j in range(i+1, len(target_cols)):
            corr_val = target_corr.iloc[i, j]
            col1 = target_cols[i].replace('target_', '').replace('d', '-day')
            col2 = target_cols[j].replace('target_', '').replace('d', '-day')
            print(f"  {col1} ↔ {col2}: {corr_val:.3f}")

analyze_target_correlations(df)

In [None]:
def analyze_cross_sectional_patterns(df):
    """Analyze patterns across different stocks and sectors"""
    
    print("\n" + "=" * 60)
    print("🔄 CROSS-SECTIONAL ANALYSIS")
    print("=" * 60)
    
    symbols = df['symbol'].unique()
    
    # 1. Price level comparison
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # Plot 1: Current price levels
    ax1 = axes[0, 0]
    current_prices = []
    for symbol in symbols:
        current_price = df[df['symbol'] == symbol]['Close'].iloc[-1]
        current_prices.append(current_price)
    
    bars = ax1.bar(symbols, current_prices, color=plt.cm.viridis(np.linspace(0, 1, len(symbols))))
    ax1.set_title('💰 Current Price Levels', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Price ($)')
    ax1.tick_params(axis='x', rotation=45)
    
    # Add value labels on bars
    for bar, price in zip(bars, current_prices):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(current_prices)*0.01,
                f'${price:.2f}', ha='center', va='bottom', fontweight='bold')
    
    # Plot 2: Volatility comparison
    ax2 = axes[0, 1]
    avg_volatilities = []
    for symbol in symbols:
        if 'volatility_20d' in df.columns:
            avg_vol = df[df['symbol'] == symbol]['volatility_20d'].mean() * 100
        else:
            # Calculate simple volatility
            returns = df[df['symbol'] == symbol]['Close'].pct_change()
            avg_vol = returns.std() * np.sqrt(252) * 100  # Annualized
        avg_volatilities.append(avg_vol)
    
    bars = ax2.bar(symbols, avg_volatilities, color=plt.cm.plasma(np.linspace(0, 1, len(symbols))))
    ax2.set_title('📉 Average Volatility', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Volatility (%)')
    ax2.tick_params(axis='x', rotation=45)
    
    # Add value labels
    for bar, vol in zip(bars, avg_volatilities):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(avg_volatilities)*0.01,
                f'{vol:.1f}%', ha='center', va='bottom', fontweight='bold')
    
    # Plot 3: Trading volume comparison
    ax3 = axes[1, 0]
    avg_volumes = []
    for symbol in symbols:
        avg_vol = df[df['symbol'] == symbol]['Volume'].mean() / 1_000_000  # In millions
        avg_volumes.append(avg_vol)
    
    bars = ax3.bar(symbols, avg_volumes, color=plt.cm.coolwarm(np.linspace(0, 1, len(symbols))))
    ax3.set_title('📊 Average Daily Volume', fontsize=14, fontweight='bold')
    ax3.set_ylabel('Volume (Millions)')
    ax3.tick_params(axis='x', rotation=45)
    
    # Add value labels
    for bar, vol in zip(bars, avg_volumes):
        ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(avg_volumes)*0.01,
                f'{vol:.1f}M', ha='center', va='bottom', fontweight='bold')
    
    # Plot 4: News coverage comparison
    ax4 = axes[1, 1]
    if 'news_count' in df.columns:
        avg_news = []
        for symbol in symbols:
            avg_news_count = df[df['symbol'] == symbol]['news_count'].mean()
            avg_news.append(avg_news_count)
        
        bars = ax4.bar(symbols, avg_news, color=plt.cm.spring(np.linspace(0, 1, len(symbols))))
        ax4.set_title('📰 Average Daily News Coverage', fontsize=14, fontweight='bold')
        ax4.set_ylabel('Articles per Day')
        ax4.tick_params(axis='x', rotation=45)
        
        # Add value labels
        for bar, news in zip(bars, avg_news):
            ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(avg_news)*0.01,
                    f'{news:.1f}', ha='center', va='bottom', fontweight='bold')
    else:
        ax4.text(0.5, 0.5, 'No news data\navailable', ha='center', va='center', 
                transform=ax4.transAxes, fontsize=12)
        ax4.set_title('📰 News Coverage (Not Available)', fontsize=14)
    
    plt.tight_layout()
    plt.show()
    
    # Cross-sectional statistics table
    print("\n📊 CROSS-SECTIONAL STATISTICS SUMMARY:")
    print("=" * 80)
    
    stats_data = []
    for symbol in symbols:
        symbol_data = df[df['symbol'] == symbol]
        
        # Calculate key metrics
        total_return = ((symbol_data['Close'].iloc[-1] / symbol_data['Close'].iloc[0]) - 1) * 100
        daily_returns = symbol_data['Close'].pct_change().dropna()
        volatility = daily_returns.std() * np.sqrt(252) * 100  # Annualized
        sharpe = (daily_returns.mean() * 252) / (daily_returns.std() * np.sqrt(252))
        max_drawdown = ((symbol_data['Close'] / symbol_data['Close'].cummax()) - 1).min() * 100
        avg_volume = symbol_data['Volume'].mean() / 1_000_000
        
        news_coverage = symbol_data['news_count'].mean() if 'news_count' in df.columns else 0
        
        stats_data.append({
            'Symbol': symbol,
            'Total Return (%)': f"{total_return:.1f}",
            'Volatility (%)': f"{volatility:.1f}",
            'Sharpe Ratio': f"{sharpe:.2f}",
            'Max Drawdown (%)': f"{max_drawdown:.1f}",
            'Avg Volume (M)': f"{avg_volume:.1f}",
            'News/Day': f"{news_coverage:.1f}"
        })
    
    stats_df = pd.DataFrame(stats_data)
    print(stats_df.to_string(index=False))

analyze_cross_sectional_patterns(df)

In [None]:
def comprehensive_correlation_analysis(df):
    """Comprehensive correlation analysis across all feature types"""
    
    print("\n" + "=" * 60)
    print("📊 COMPREHENSIVE CORRELATION ANALYSIS")
    print("=" * 60)
    
    # Exclude non-numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    
    # Remove identifier columns
    analysis_cols = [col for col in numeric_cols if col not in ['symbol']]
    
    if len(analysis_cols) < 2:
        print("⚠️ Not enough numeric features for correlation analysis")
        return
    
    print(f"📊 Analyzing correlations for {len(analysis_cols)} features")
    
    # Calculate correlation matrix (use subset for visualization)
    sample_data = df[analysis_cols].sample(min(10000, len(df)), random_state=42)
    corr_matrix = sample_data.corr()
    
    # 1. Full correlation heatmap (top correlations only)
    plt.figure(figsize=(20, 16))
    
    # Get top correlated features with targets
    target_cols = [col for col in analysis_cols if col.startswith('target_')]
    if target_cols:
        # Find features most correlated with targets
        target_corrs = []
        for target in target_cols:
            for feature in analysis_cols:
                if feature != target:
                    corr_val = abs(corr_matrix.loc[target, feature])
                    target_corrs.append((feature, target, corr_val))
        
        # Sort by correlation strength
        target_corrs.sort(key=lambda x: x[2], reverse=True)
        
        # Take top 30 features most correlated with any target
        top_features = list(set([item[0] for item in target_corrs[:30]]))
        top_features.extend(target_cols)
        
        # Create heatmap for top features
        top_corr_matrix = corr_matrix.loc[top_features, top_features]
        
        sns.heatmap(top_corr_matrix, annot=False, cmap='RdYlBu_r', center=0,
                    square=True, linewidths=0.1, cbar_kws={"shrink": .8})
        plt.title('🔗 Feature Correlation Matrix (Top 30 + Targets)', fontsize=16, fontweight='bold', pad=20)
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.show()
        
        # Print top correlations with targets
        print("\n🎯 TOP FEATURES CORRELATED WITH TARGETS:")
        print("=" * 60)
        
        for target in target_cols:
            print(f"\n{target}:")
            target_corr_series = corr_matrix[target].abs().sort_values(ascending=False)
            # Exclude the target itself
            target_corr_series = target_corr_series[target_corr_series.index != target]
            
            for i, (feature, corr_val) in enumerate(target_corr_series.head(10).items()):
                print(f"  {i+1:2d}. {feature:<25} : {corr_matrix.loc[target, feature]:6.3f}")
    
    # 2. Feature group correlation analysis
    feature_groups = {
        'Price': [col for col in analysis_cols if col in ['Open', 'High', 'Low', 'Close', 'Volume']],
        'Technical': [col for col in analysis_cols if any(tech in col for tech in ['SMA', 'EMA', 'RSI', 'MACD', 'BB', 'volatility'])],
        'News': [col for col in analysis_cols if 'news' in col.lower()],
        'Targets': [col for col in analysis_cols if col.startswith('target_')]
    }
    
    print(f"\n📊 CORRELATION BY FEATURE GROUPS:")
    print("=" * 60)
    
    for group_name, group_features in feature_groups.items():
        if len(group_features) > 1:
            group_corr = corr_matrix.loc[group_features, group_features]
            
            # Calculate average absolute correlation within group
            mask = np.triu(np.ones_like(group_corr, dtype=bool), k=1)
            avg_corr = group_corr.where(mask).abs().mean().mean()
            
            print(f"\n{group_name} Features ({len(group_features)} features):")
            print(f"  Average |correlation|: {avg_corr:.3f}")
            
            # Find most correlated pair within group
            max_corr = 0
            max_pair = None
            for i in range(len(group_features)):
                for j in range(i+1, len(group_features)):
                    corr_val = abs(group_corr.iloc[i, j])
                    if corr_val > max_corr:
                        max_corr = corr_val
                        max_pair = (group_features[i], group_features[j])
            
            if max_pair:
                print(f"  Highest correlation: {max_pair[0]} ↔ {max_pair[1]} ({max_corr:.3f})")

    # 3. Multicollinearity detection
    print(f"\n⚠️ MULTICOLLINEARITY DETECTION:")
    print("=" * 60)
    
    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            corr_val = abs(corr_matrix.iloc[i, j])
            if corr_val > 0.9:  # Very high correlation threshold
                high_corr_pairs.append({
                    'Feature 1': corr_matrix.columns[i],
                    'Feature 2': corr_matrix.columns[j],
                    'Correlation': corr_matrix.iloc[i, j]
                })
    
    if high_corr_pairs:
        print("🚨 HIGH CORRELATION PAIRS (|r| > 0.9) - Potential multicollinearity:")
        for pair in sorted(high_corr_pairs, key=lambda x: abs(x['Correlation']), reverse=True)[:10]:
            print(f"  {pair['Feature 1']:<25} ↔ {pair['Feature 2']:<25}: {pair['Correlation']:6.3f}")
    else:
        print("✅ No severe multicollinearity detected (no |r| > 0.9)")

comprehensive_correlation_analysis(df)

In [None]:
def analyze_time_series_patterns(df):
    """Analyze time series patterns and seasonality"""
    
    print("\n" + "=" * 60)
    print("📈 TIME SERIES PATTERNS ANALYSIS")
    print("=" * 60)
    
    symbols = df['symbol'].unique()
    
    # 1. Seasonality Analysis
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # Day of week patterns
    ax1 = axes[0, 0]
    df_with_dow = df.copy()
    df_with_dow['day_of_week'] = df_with_dow.index.day_name()
    df_with_dow['daily_return'] = df_with_dow.groupby('symbol')['Close'].pct_change() * 100
    
    dow_returns = df_with_dow.groupby('day_of_week')['daily_return'].mean()
    # Reorder for Monday-Sunday
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    dow_returns = dow_returns.reindex([day for day in day_order if day in dow_returns.index])
    
    bars = ax1.bar(range(len(dow_returns)), dow_returns.values, 
                   color=['red' if x < 0 else 'green' for x in dow_returns.values])
    ax1.set_title('📅 Day of Week Effect', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Day of Week')
    ax1.set_ylabel('Average Return (%)')
    ax1.set_xticks(range(len(dow_returns)))
    ax1.set_xticklabels([day[:3] for day in dow_returns.index], rotation=45)
    ax1.grid(True, alpha=0.3)
    ax1.axhline(0, color='black', linestyle='-', alpha=0.7)
    
    # Month of year patterns
    ax2 = axes[0, 1]
    df_with_month = df.copy()
    df_with_month['month'] = df_with_month.index.month
    df_with_month['daily_return'] = df_with_month.groupby('symbol')['Close'].pct_change() * 100
    
    month_returns = df_with_month.groupby('month')['daily_return'].mean()
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    
    bars = ax2.bar(range(len(month_returns)), month_returns.values,
                   color=['red' if x < 0 else 'green' for x in month_returns.values])
    ax2.set_title('📅 Month of Year Effect', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Month')
    ax2.set_ylabel('Average Return (%)')
    ax2.set_xticks(range(len(month_returns)))
    ax2.set_xticklabels([month_names[i-1] for i in month_returns.index], rotation=45)
    ax2.grid(True, alpha=0.3)
    ax2.axhline(0, color='black', linestyle='-', alpha=0.7)
    
    # Volatility clustering
    ax3 = axes[1, 0]
    symbol = symbols[0]  # Use first symbol for detailed analysis
    symbol_data = df[df['symbol'] == symbol].copy()
    symbol_data['daily_return'] = symbol_data['Close'].pct_change() * 100
    symbol_data['abs_return'] = symbol_data['daily_return'].abs()
    symbol_data['volatility_ma'] = symbol_data['abs_return'].rolling(20).mean()
    
    ax3.plot(symbol_data.index, symbol_data['abs_return'], alpha=0.5, color='blue', linewidth=0.5, label='Daily |Return|')
    ax3.plot(symbol_data.index, symbol_data['volatility_ma'], color='red', linewidth=2, label='20-day MA')
    ax3.set_title(f'📉 Volatility Clustering - {symbol}', fontsize=14, fontweight='bold')
    ax3.set_ylabel('Absolute Return (%)')
    ax3.legend()
    ax3.grid(True, alpha=0.3)
    
    # Auto-correlation of returns
    ax4 = axes[1, 1]
    symbol_returns = symbol_data['daily_return'].dropna()
    
    # Calculate autocorrelations for lags 1-20
    lags = range(1, 21)
    autocorrs = [symbol_returns.autocorr(lag) for lag in lags]
    
    ax4.bar(lags, autocorrs, alpha=0.7, color='steelblue')
    ax4.set_title(f'📊 Return Autocorrelation - {symbol}', fontsize=14, fontweight='bold')
    ax4.set_xlabel('Lag (days)')
    ax4.set_ylabel('Autocorrelation')
    ax4.grid(True, alpha=0.3)
    ax4.axhline(0, color='black', linestyle='-', alpha=0.7)
    
    # Add significance bounds (approximate)
    n = len(symbol_returns)
    bound = 1.96 / np.sqrt(n)
    ax4.axhline(bound, color='red', linestyle='--', alpha=0.7, label=f'95% bound (±{bound:.3f})')
    ax4.axhline(-bound, color='red', linestyle='--', alpha=0.7)
    ax4.legend()
    
    plt.tight_layout()
    plt.show()
    
    # Print seasonality insights
    print("\n📅 SEASONALITY INSIGHTS:")
    print("=" * 40)
    
    print(f"📊 Day of Week Effects:")
    for day, return_val in dow_returns.items():
        print(f"  {day}: {return_val:+.3f}%")
    
    best_day = dow_returns.idxmax()
    worst_day = dow_returns.idxmin()
    print(f"  🎯 Best day: {best_day} ({dow_returns[best_day]:+.3f}%)")
    print(f"  📉 Worst day: {worst_day} ({dow_returns[worst_day]:+.3f}%)")
    
    print(f"\n📊 Month of Year Effects:")
    for month_num, return_val in month_returns.items():
        month_name = month_names[month_num-1]
        print(f"  {month_name}: {return_val:+.3f}%")
    
    best_month = month_returns.idxmax()
    worst_month = month_returns.idxmin()
    print(f"  🎯 Best month: {month_names[best_month-1]} ({month_returns[best_month]:+.3f}%)")
    print(f"  📉 Worst month: {month_names[worst_month-1]} ({month_returns[worst_month]:+.3f}%)")

analyze_time_series_patterns(df)

In [None]:
def generate_insights_and_recommendations(df):
    """Generate key insights and recommendations based on the analysis"""
    
    print("\n" + "=" * 60)
    print("💡 KEY INSIGHTS & RECOMMENDATIONS")
    print("=" * 60)
    
    symbols = df['symbol'].unique()
    
    insights = []
    recommendations = []
    
    # 1. Data Quality Insights
    total_missing = df.isnull().sum().sum()
    total_cells = df.shape[0] * df.shape[1]
    missing_pct = (total_missing / total_cells) * 100
    
    if missing_pct < 1:
        insights.append(f"✅ Excellent data quality: Only {missing_pct:.2f}% missing values")
    elif missing_pct < 5:
        insights.append(f"⚠️ Good data quality: {missing_pct:.2f}% missing values")
        recommendations.append("Consider imputation strategies for missing values")
    else:
        insights.append(f"🚨 Data quality concerns: {missing_pct:.2f}% missing values")
        recommendations.append("Prioritize data cleaning and missing value handling")
    
    # 2. Target Variable Insights
    target_cols = [col for col in df.columns if col.startswith('target_')]
    if target_cols:
        for target_col in target_cols:
            returns = df[target_col].dropna() * 100
            horizon = target_col.replace('target_', '').replace('d', '')
            
            sharpe = returns.mean() / returns.std() if returns.std() > 0 else 0
            positive_pct = (returns > 0).mean() * 100
            
            insights.append(f"📊 {horizon}-day forecasting: {positive_pct:.1f}% positive returns, Sharpe: {sharpe:.3f}")
            
            if positive_pct > 55:
                insights.append(f"🎯 {horizon}-day horizon shows positive bias - good for long strategies")
            elif positive_pct < 45:
                insights.append(f"📉 {horizon}-day horizon shows negative bias - consider short strategies")
    
    # 3. Volatility Insights
    if 'volatility_20d' in df.columns:
        avg_vol = df['volatility_20d'].mean() * 100
        insights.append(f"📉 Average portfolio volatility: {avg_vol:.1f}% (20-day)")
        
        if avg_vol > 30:
            recommendations.append("High volatility detected - consider volatility-adjusted position sizing")
        elif avg_vol < 15:
            insights.append("📊 Low volatility environment - suitable for higher leverage strategies")
    
    # 4. News Coverage Insights
    if 'news_count' in df.columns:
        avg_news = df['news_count'].mean()
        insights.append(f"📰 Average news coverage: {avg_news:.1f} articles/day/symbol")
        
        if avg_news > 5:
            insights.append("📈 Rich news environment - sentiment analysis highly valuable")
            recommendations.append("Implement advanced NLP techniques for sentiment extraction")
        elif avg_news < 1:
            insights.append("📰 Limited news coverage - consider expanding news sources")
            recommendations.append("Add more news sources or consider alternative text data")
    
    # 5. Cross-Sectional Insights
    total_return_by_symbol = {}
    vol_by_symbol = {}
    
    for symbol in symbols:
        symbol_data = df[df['symbol'] == symbol]
        total_return = ((symbol_data['Close'].iloc[-1] / symbol_data['Close'].iloc[0]) - 1) * 100
        daily_returns = symbol_data['Close'].pct_change().dropna()
        volatility = daily_returns.std() * np.sqrt(252) * 100
        
        total_return_by_symbol[symbol] = total_return
        vol_by_symbol[symbol] = volatility
    
    best_performer = max(total_return_by_symbol, key=total_return_by_symbol.get)
    worst_performer = min(total_return_by_symbol, key=total_return_by_symbol.get)
    
    insights.append(f"🏆 Best performer: {best_performer} ({total_return_by_symbol[best_performer]:+.1f}%)")
    insights.append(f"📉 Worst performer: {worst_performer} ({total_return_by_symbol[worst_performer]:+.1f}%)")
    
    # 6. Feature Engineering Insights
    feature_groups = {
        'Technical': [col for col in df.columns if any(tech in col for tech in ['SMA', 'EMA', 'RSI', 'MACD', 'BB'])],
        'News': [col for col in df.columns if 'news' in col.lower()],
        'Lag': [col for col in df.columns if 'lag' in col.lower()]
    }
    
    for group, features in feature_groups.items():
        if features:
            insights.append(f"🔧 {group} features: {len(features)} engineered")
        else:
            recommendations.append(f"Consider adding {group.lower()} features")
    
    # Print insights
    print("\n🔍 KEY INSIGHTS:")
    for i, insight in enumerate(insights, 1):
        print(f"{i:2d}. {insight}")
    
    print("\n📋 RECOMMENDATIONS:")
    for i, rec in enumerate(recommendations, 1):
        print(f"{i:2d}. {rec}")
    
    # 7. Model Training Recommendations
    print("\n🤖 MODEL TRAINING RECOMMENDATIONS:")
    print("=" * 50)
    
    model_recs = [
        "Use cross-validation with time series splits (avoid look-ahead bias)",
        "Implement early stopping to prevent overfitting",
        "Consider ensemble methods to improve robustness",
        "Use proper evaluation metrics: Sharpe ratio, directional accuracy, risk-adjusted returns",
        "Implement walk-forward analysis for realistic backtesting",
    ]
    
    # Add specific recommendations based on data characteristics
    if len(symbols) > 1:
        model_recs.append("Leverage cross-sectional information in multi-stock models")
    
    if 'news_count' in df.columns and df['news_count'].mean() > 2:
        model_recs.append("Implement temporal decay for sentiment features (key innovation)")
    
    if len([col for col in df.columns if 'volatility' in col]) > 0:
        model_recs.append("Use volatility features for dynamic position sizing")
    
    for i, rec in enumerate(model_recs, 1):
        print(f"{i:2d}. {rec}")
    
    print("\n🎯 NEXT STEPS:")
    print("=" * 30)
    print("1. Run Step 2: Sentiment Analysis")
    print("   → python run_experiment.py --step 2")
    print("2. Run Step 3: Temporal Decay Processing")
    print("   → python run_experiment.py --step 3")
    print("3. Run Complete Pipeline")
    print("   → python run_experiment.py")

generate_insights_and_recommendations(df)

In [None]:
def create_summary_dashboard(df):
    """Create a comprehensive summary dashboard"""
    
    print("\n" + "=" * 60)
    print("📊 DATASET SUMMARY DASHBOARD")
    print("=" * 60)
    
    # Key metrics
    symbols = df['symbol'].unique()
    date_range = (df.index.max() - df.index.min()).days
    
    print(f"📅 Analysis Period: {df.index.min().date()} to {df.index.max().date()} ({date_range} days)")
    print(f"📈 Symbols Analyzed: {len(symbols)} ({', '.join(symbols)})")
    print(f"📊 Dataset Size: {df.shape[0]:,} rows × {df.shape[1]} features")
    print(f"💾 Memory Usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")
    
    # Feature breakdown
    feature_counts = {
        'Price': len([col for col in df.columns if col in ['Open', 'High', 'Low', 'Close', 'Volume']]),
        'Technical': len([col for col in df.columns if any(tech in col for tech in ['SMA', 'EMA', 'RSI', 'MACD', 'BB', 'volatility'])]),
        'News': len([col for col in df.columns if 'news' in col.lower()]),
        'Targets': len([col for col in df.columns if col.startswith('target_')]),
        'Other': df.shape[1] - sum([
            len([col for col in df.columns if col in ['Open', 'High', 'Low', 'Close', 'Volume']]),
            len([col for col in df.columns if any(tech in col for tech in ['SMA', 'EMA', 'RSI', 'MACD', 'BB', 'volatility'])]),
            len([col for col in df.columns if 'news' in col.lower()]),
            len([col for col in df.columns if col.startswith('target_')])
        ])
    }
    
    print(f"\n🔧 Feature Breakdown:")
    for feature_type, count in feature_counts.items():
        if count > 0:
            print(f"   {feature_type}: {count} features")
    
    # Data quality summary
    missing_pct = (df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100
    print(f"\n🔍 Data Quality:")
    print(f"   Missing Values: {missing_pct:.2f}%")
    print(f"   Complete Cases: {df.dropna().shape[0]:,} ({(df.dropna().shape[0]/df.shape[0])*100:.1f}%)")
    
    # Performance summary
    print(f"\n📈 Performance Summary (Total Returns):")
    for symbol in symbols:
        symbol_data = df[df['symbol'] == symbol]
        total_return = ((symbol_data['Close'].iloc[-1] / symbol_data['Close'].iloc[0]) - 1) * 100
        volatility = symbol_data['Close'].pct_change().std() * np.sqrt(252) * 100
        print(f"   {symbol}: {total_return:+6.1f}% (Vol: {volatility:4.1f}%)")
    
    # News coverage summary
    if 'news_count' in df.columns:
        print(f"\n📰 News Coverage:")
        total_articles = df['news_count'].sum()
        avg_daily = df['news_count'].mean()
        print(f"   Total Articles: {total_articles:,.0f}")
        print(f"   Average/Day: {avg_daily:.1f}")
        
        for symbol in symbols:
            symbol_news = df[df['symbol'] == symbol]['news_count'].mean()
            print(f"   {symbol}: {symbol_news:.1f} articles/day")
    
    print(f"\n✅ Dataset ready for sentiment analysis and temporal decay processing!")
    print(f"🚀 Run: python run_experiment.py --step 2")

create_summary_dashboard(df)