# Comprehensive Architecture Analysis for VIX Forecasting

This notebook analyzes and compares all CNN-LSTM and GRU architectures trained for VIX forecasting, using:
- **Statistical Significance Testing**: Robust comparison between all models
- **Performance Metrics Analysis**: Detailed evaluation of all architectures
- **Confidence Intervals**: Statistical confidence in predictions
- **Walk-Forward Validation**: Time series specific validation
- **Economic Significance Analysis**: Real-world trading implications
- **Best Model Selection**: Data-driven architecture selection

## Architectures Analyzed:

### CNN-LSTM Variants:
1. **Basic CNN-LSTM**: Baseline hybrid architecture
2. **Deep CNN-LSTM**: Enhanced depth with multiple layers
3. **Bidirectional CNN-LSTM**: Bidirectional temporal modeling
4. **Attention CNN-LSTM**: Multi-head attention mechanism
5. **Multiscale CNN-LSTM**: Multi-scale feature extraction

### GRU Variants:
1. **Basic GRU**: Baseline recurrent architecture
2. **Deep GRU**: Enhanced depth with multiple layers
3. **Bidirectional GRU**: Bidirectional temporal modeling
4. **Attention GRU**: Multi-head attention mechanism
5. **Residual GRU**: Residual connections for gradient flow
6. **Dropout-Enhanced GRU**: Advanced regularization strategies

## Block 1: Import Libraries and Setup

In [None]:
# Import shared utilities
# NOTE(review): later cells use `np` and `pd` although neither is imported
# explicitly in this cell, so they are presumably re-exported by this star
# import — TODO confirm against vix_research_utils.
from vix_research_utils import *

# Deep learning imports
import tensorflow as tf
from tensorflow.keras.models import load_model

# Statistical analysis
from scipy import stats
from scipy.stats import ttest_rel, wilcoxon, friedmanchisquare
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import adfuller
from arch.unitroot import ADF

# Additional imports
import joblib
import json
from pathlib import Path
import itertools
from collections import defaultdict
import warnings
# Installs a global "ignore" filter: every warning category is silenced from
# here on, including numerical RuntimeWarnings raised by NumPy/SciPy.
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Global plotting defaults applied to every figure created afterwards.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation output so a failed import is obvious when the cell is run.
print("Libraries imported successfully")
print(f"TensorFlow version: {tf.__version__}")

## Block 2: Load Results and Data

In [None]:
# Rebuild the modelling dataset with the shared helper functions
print("Loading VIX and VVIX data...")
vix_data, vvix_data = download_market_data()
vix_clean = clean_data(vix_data)
vvix_clean = clean_data(vvix_data)
features_df = create_features(vix_clean, vvix_clean)
X, y, feature_names, scaler = prepare_sequences(features_df, sequence_length=30)

# Chronological 80/20 split: the first 80% of the sequences are the training
# set, the remainder is the test set (no shuffling)
split_idx = int(len(X) * 0.8)
X_train = X[:split_idx]
X_test = X[split_idx:]
y_train = y[:split_idx]
y_test = y[split_idx:]

print(f"Data loaded: {X.shape[0]} samples, {X.shape[2]} features")
print(f"Test set: {X_test.shape[0]} samples")

# NOTE: joblib.load unpickles the file contents; only load result files that
# were produced by trusted runs of the training notebooks.

# CNN-LSTM family: one pickle per architecture, missing files are reported
# and skipped
cnn_lstm_architectures = ['Basic_CNN_LSTM', 'Deep_CNN_LSTM', 'Bidirectional_CNN_LSTM', 
                         'Attention_CNN_LSTM', 'Multiscale_CNN_LSTM']
cnn_lstm_results = {}

for arch in cnn_lstm_architectures:
    try:
        results = joblib.load(f'cnn_lstm_results_{arch.lower()}.pkl')
    except FileNotFoundError:
        print(f"Warning: {arch} results not found")
    else:
        cnn_lstm_results[arch] = results
        print(f"Loaded {arch} results")

# GRU family: same convention with the gru_results_ file prefix
gru_architectures = ['Basic_GRU', 'Deep_GRU', 'Bidirectional_GRU', 
                    'Attention_GRU', 'Residual_GRU', 'Dropout_Enhanced_GRU']
gru_results = {}

for arch in gru_architectures:
    try:
        results = joblib.load(f'gru_results_{arch.lower()}.pkl')
    except FileNotFoundError:
        print(f"Warning: {arch} results not found")
    else:
        gru_results[arch] = results
        print(f"Loaded {arch} results")

# Merge both families into one lookup keyed by architecture name
all_results = dict(cnn_lstm_results)
all_results.update(gru_results)
print(f"\nTotal architectures loaded: {len(all_results)}")

## Block 3: Performance Metrics Analysis

In [None]:
def create_performance_summary(results_dict):
    """Build a one-row-per-architecture table of train and test metrics.

    Parameters
    ----------
    results_dict : dict
        Maps an architecture name to its results mapping. Each mapping is
        read for the keys ``'test_metrics'`` and ``'train_metrics'`` (each
        providing ``'mse'``, ``'mae'`` and ``'r2'``), ``'test_predictions'``
        and ``'best_params'``. Entries whose value is ``None`` are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns: Architecture, Type, Test_MSE, Test_MAE, Test_R2, Train_MSE,
        Train_MAE, Train_R2, Overfitting, Pred_Variance, Best_Params.
        Empty when no usable entry is present.
    """
    summary_rows = []

    for arch_name, results in results_dict.items():
        if results is None:
            continue

        test_metrics = results['test_metrics']
        train_metrics = results['train_metrics']

        # Generalization gap, test minus train: a model that fits the
        # training data better than the test data gets a POSITIVE score, so
        # "larger Overfitting" means "more overfit" (also in the scatter
        # plot that sizes markers by this column).
        overfitting = test_metrics['mse'] - train_metrics['mse']

        # Spread of the test predictions
        pred_variance = np.var(results['test_predictions'])

        # Family is derived from the name: anything without CNN_LSTM is GRU
        arch_type = 'CNN-LSTM' if 'CNN_LSTM' in arch_name else 'GRU'

        summary_rows.append({
            'Architecture': arch_name,
            'Type': arch_type,
            'Test_MSE': test_metrics['mse'],
            'Test_MAE': test_metrics['mae'],
            'Test_R2': test_metrics['r2'],
            'Train_MSE': train_metrics['mse'],
            'Train_MAE': train_metrics['mae'],
            'Train_R2': train_metrics['r2'],
            'Overfitting': overfitting,
            'Pred_Variance': pred_variance,
            'Best_Params': str(results['best_params']),
        })

    return pd.DataFrame(summary_rows)

# Summarize every loaded architecture in one table
performance_df = create_performance_summary(all_results)

if performance_df.empty:
    print("No results available for analysis")
else:
    # Ascending test MSE puts the best model in the first row
    performance_df = performance_df.sort_values('Test_MSE')

    print("PERFORMANCE SUMMARY (sorted by Test MSE)")
    print("=" * 80)

    # Only the headline columns are printed; the full table stays available
    display_cols = ['Architecture', 'Type', 'Test_MSE', 'Test_MAE', 'Test_R2', 'Overfitting']
    summary_table = performance_df[display_cols].to_string(index=False, float_format='%.6f')
    print(summary_table)

    # First row after sorting is the lowest test MSE
    best_model = performance_df.iloc[0]
    print(f"\nBEST PERFORMING MODEL: {best_model['Architecture']}")
    print(f"Test MSE: {best_model['Test_MSE']:.6f}")
    print(f"Test MAE: {best_model['Test_MAE']:.6f}")
    print(f"Test R²: {best_model['Test_R2']:.6f}")

## Block 4: Statistical Significance Testing

In [None]:
def perform_statistical_tests(results_dict, y_test):
    """Compare models pairwise on their absolute test errors.

    For every pair of models a paired t-test (``scipy.stats.ttest_rel``) is
    run on the per-sample absolute errors and a paired-samples Cohen's d
    (mean difference divided by the sample standard deviation of the
    differences) is reported. With three or more models a Friedman test is
    printed as well. No multiple-comparison correction is applied to the
    pairwise p-values.

    Parameters
    ----------
    results_dict : dict
        Maps a model name to a results mapping providing
        ``'test_predictions'``. Entries whose value is ``None`` are skipped.
    y_test : array-like
        Target values aligned with every model's test predictions.

    Returns
    -------
    pandas.DataFrame or None
        One row per model pair with columns Model1, Model2, T_Statistic,
        P_Value, Cohens_D and Significant (p < 0.05). ``None`` when fewer
        than two entries are supplied.

    Raises
    ------
    ValueError
        If a model's predictions do not have as many elements as ``y_test``.
    """
    from itertools import combinations

    if len(results_dict) < 2:
        print("Need at least 2 models for statistical comparison")
        return None

    # Flatten everything to 1-D. Predictions shaped (n, 1) subtracted from an
    # (n,) target would otherwise broadcast into an (n, n) matrix and corrupt
    # every statistic below.
    actual = np.asarray(y_test, dtype=float).ravel()

    model_errors = {}
    for arch_name, results in results_dict.items():
        if results is None:
            continue
        predictions = np.asarray(results['test_predictions'], dtype=float).ravel()
        if predictions.shape != actual.shape:
            raise ValueError(
                f"{arch_name}: {predictions.size} predictions do not match "
                f"{actual.size} test targets"
            )
        model_errors[arch_name] = np.abs(predictions - actual)

    print("STATISTICAL SIGNIFICANCE TESTING")
    print("=" * 60)

    # Pairwise t-tests (paired samples)
    print("\n1. PAIRWISE T-TESTS (Paired Samples)")
    print("-" * 40)

    model_names = list(model_errors)
    pairwise_results = []

    for model1, model2 in combinations(model_names, 2):
        errors1 = model_errors[model1]
        errors2 = model_errors[model2]

        t_stat, p_value = ttest_rel(errors1, errors2)

        # Paired-samples effect size. ddof=1 matches the sample standard
        # deviation the t-test itself is based on; a zero (or undefined)
        # spread gives NaN instead of a division by zero.
        diff = errors1 - errors2
        diff_sd = np.std(diff, ddof=1) if diff.size > 1 else 0.0
        cohens_d = np.mean(diff) / diff_sd if diff_sd > 0 else np.nan

        pairwise_results.append({
            'Model1': model1,
            'Model2': model2,
            'T_Statistic': t_stat,
            'P_Value': p_value,
            'Cohens_D': cohens_d,
            'Significant': bool(p_value < 0.05),
        })

        if p_value < 0.001:
            significance = "***"
        elif p_value < 0.01:
            significance = "**"
        elif p_value < 0.05:
            significance = "*"
        else:
            significance = "ns"
        print(f"{model1} vs {model2}: t={t_stat:.3f}, p={p_value:.6f} {significance}")

    # Friedman test (non-parametric)
    print("\n2. FRIEDMAN TEST (Non-parametric)")
    print("-" * 40)

    if len(model_errors) >= 3:
        error_arrays = [model_errors[name] for name in model_names]
        friedman_stat, friedman_p = friedmanchisquare(*error_arrays)
        print(f"Friedman statistic: {friedman_stat:.6f}")
        print(f"P-value: {friedman_p:.6f}")
        print(f"Significant difference: {'Yes' if friedman_p < 0.05 else 'No'}")
    else:
        print("Need at least 3 models for Friedman test")

    return pd.DataFrame(pairwise_results)

# A pairwise comparison needs at least two loaded result sets
if len(all_results) < 2:
    print("Need at least 2 models for statistical testing")
    statistical_results = None
else:
    statistical_results = perform_statistical_tests(all_results, y_test)

## Block 5: Visualization and Analysis

In [None]:
def create_comprehensive_visualizations(results_dict, performance_df, y_test):
    """Draw a 2x3 grid of comparison plots for all models and show it.

    Panels: (1) test MSE per architecture, (2) test R² per architecture,
    (3) train vs. test MSE scatter with a diagonal reference line,
    (4) histograms of predictions and actual values, (5) box plot of
    absolute errors per architecture, (6) the last (up to) 100 actual values
    together with the predictions of the first and last row of
    ``performance_df``.

    Parameters
    ----------
    results_dict : dict
        Maps an architecture name to a results mapping providing
        ``'test_predictions'``. Entries whose value is ``None`` are skipped.
    performance_df : pandas.DataFrame
        Summary table with at least the columns Architecture, Type,
        Test_MSE, Test_R2, Train_MSE and Overfitting. The caller is expected
        to pass it sorted best-first, because panel 6 labels the first row
        "Best" and the last row "Worst".
    y_test : array-like
        Target values aligned with every model's test predictions.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    if len(results_dict) == 0:
        print("No results available for visualization")
        return

    # Work on flat 1-D arrays throughout: predictions shaped (n, 1) minus an
    # (n,) target would broadcast to (n, n) and break the error panel.
    actual = np.asarray(y_test).ravel()
    predictions_by_arch = {
        arch_name: np.asarray(results['test_predictions']).ravel()
        for arch_name, results in results_dict.items()
        if results is not None
    }
    has_summary = not performance_df.empty

    # Set up the plotting environment
    plt.style.use('seaborn-v0_8')
    plt.figure(figsize=(20, 15))

    # 1. Performance Comparison Bar Plot
    ax1 = plt.subplot(2, 3, 1)
    if has_summary:
        sns.barplot(data=performance_df, x='Test_MSE', y='Architecture', hue='Type', ax=ax1)
        ax1.set_title('Test MSE Comparison by Architecture', fontsize=14, fontweight='bold')
        ax1.set_xlabel('Test MSE (lower is better)')

    # 2. R² Comparison
    ax2 = plt.subplot(2, 3, 2)
    if has_summary:
        sns.barplot(data=performance_df, x='Test_R2', y='Architecture', hue='Type', ax=ax2)
        ax2.set_title('Test R² Comparison by Architecture', fontsize=14, fontweight='bold')
        ax2.set_xlabel('Test R² (higher is better)')

    # 3. Overfitting Analysis
    ax3 = plt.subplot(2, 3, 3)
    if has_summary:
        sns.scatterplot(data=performance_df, x='Train_MSE', y='Test_MSE',
                        hue='Type', size='Overfitting', ax=ax3)
        # Diagonal reference: points on the line have equal train and test MSE
        min_val = min(performance_df['Train_MSE'].min(), performance_df['Test_MSE'].min())
        max_val = max(performance_df['Train_MSE'].max(), performance_df['Test_MSE'].max())
        ax3.plot([min_val, max_val], [min_val, max_val], 'k--', alpha=0.5)
        ax3.set_title('Overfitting Analysis', fontsize=14, fontweight='bold')
        ax3.set_xlabel('Train MSE')
        ax3.set_ylabel('Test MSE')

    # 4. Prediction Distribution
    ax4 = plt.subplot(2, 3, 4)
    for arch_name, predictions in predictions_by_arch.items():
        ax4.hist(predictions, alpha=0.6, label=arch_name, bins=20)
    ax4.hist(actual, alpha=0.8, label='Actual', bins=20, color='black')
    ax4.set_title('Prediction Distributions', fontsize=14, fontweight='bold')
    ax4.set_xlabel('VIX Value')
    ax4.set_ylabel('Frequency')
    ax4.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # 5. Error Distribution Boxplot
    ax5 = plt.subplot(2, 3, 5)
    # One frame per architecture, concatenated once, instead of appending a
    # dict for every single error value.
    error_frames = [
        pd.DataFrame({
            'Architecture': arch_name,
            'Absolute_Error': np.abs(predictions - actual),
        })
        for arch_name, predictions in predictions_by_arch.items()
    ]
    if error_frames:
        error_df = pd.concat(error_frames, ignore_index=True)
        if not error_df.empty:
            sns.boxplot(data=error_df, x='Absolute_Error', y='Architecture', ax=ax5)
            ax5.set_title('Error Distribution by Architecture', fontsize=14, fontweight='bold')
            ax5.set_xlabel('Absolute Error')

    # 6. Time Series Predictions (Best vs Worst)
    ax6 = plt.subplot(2, 3, 6)
    if has_summary and len(results_dict) >= 2:
        best_arch = performance_df.iloc[0]['Architecture']
        worst_arch = performance_df.iloc[-1]['Architecture']

        # Plot last 100 predictions (fewer if the test set is shorter)
        n_plot = min(100, len(actual))
        x_range = range(len(actual) - n_plot, len(actual))

        ax6.plot(x_range, actual[-n_plot:], 'k-', label='Actual', linewidth=2)

        if best_arch in predictions_by_arch:
            best_pred = predictions_by_arch[best_arch]
            ax6.plot(x_range, best_pred[-n_plot:], '--', label=f'Best: {best_arch}', linewidth=2)

        if worst_arch in predictions_by_arch:
            worst_pred = predictions_by_arch[worst_arch]
            ax6.plot(x_range, worst_pred[-n_plot:], ':', label=f'Worst: {worst_arch}', linewidth=2)

        ax6.set_title('Time Series Predictions (Last 100 points)', fontsize=14, fontweight='bold')
        ax6.set_xlabel('Time Index')
        ax6.set_ylabel('VIX Value')
        ax6.legend()
        ax6.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Plot only when at least one result set was loaded
if not all_results:
    print("No results available for visualization")
else:
    create_comprehensive_visualizations(all_results, performance_df, y_test)

## Block 6: Economic Significance Analysis

In [None]:
def _mean_or_nan(values):
    """Return the mean of *values*, or NaN when there is nothing to average."""
    return float(np.mean(values)) if values.size else np.nan


def analyze_economic_significance(results_dict, y_test, threshold=0.5):
    """Score each model on direction- and regime-based criteria.

    Per model the following is computed, printed and collected:

    * Direction_Accuracy: share of steps where the sign of the change
      between consecutive predictions matches the sign of the change
      between consecutive actual values.
    * Large_Move_Accuracy: the same share restricted to steps where the
      absolute actual change exceeds ``threshold``.
    * High_Vol_MSE / Low_Vol_MSE: mean squared error on samples whose actual
      value lies above the 75th / below the 25th percentile of ``y_test``.
    * Profit_Ratio: share of steps where predicted and actual change have a
      strictly positive product (0 when there are no steps).

    Metrics that have no samples to average over are reported as NaN.

    Parameters
    ----------
    results_dict : dict
        Maps an architecture name to a results mapping providing
        ``'test_predictions'``. Entries whose value is ``None`` are skipped.
    y_test : array-like
        Target values aligned with every model's test predictions.
    threshold : float, default 0.5
        Minimum absolute actual change for a step to count as a large move.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns Architecture, Direction_Accuracy,
        Large_Move_Accuracy, High_Vol_MSE, Low_Vol_MSE and Profit_Ratio.

    Raises
    ------
    ValueError
        If a model's predictions do not have as many elements as ``y_test``.
    """
    print("ECONOMIC SIGNIFICANCE ANALYSIS")
    print("=" * 60)

    # Flatten to 1-D first: np.diff works along the last axis, so on
    # predictions shaped (n, 1) it would return an empty (n, 0) array.
    actual = np.asarray(y_test, dtype=float).ravel()

    # Everything that depends only on the targets is computed once, not per
    # model.
    actual_changes = np.diff(actual)
    actual_direction = actual_changes > 0
    large_movements = np.abs(actual_changes) > threshold
    if actual.size:
        high_vol_regime = actual > np.percentile(actual, 75)
        low_vol_regime = actual < np.percentile(actual, 25)
    else:
        high_vol_regime = low_vol_regime = np.zeros(0, dtype=bool)

    economic_results = []

    for arch_name, results in results_dict.items():
        if results is None:
            continue

        predictions = np.asarray(results['test_predictions'], dtype=float).ravel()
        if predictions.shape != actual.shape:
            raise ValueError(
                f"{arch_name}: {predictions.size} predictions do not match "
                f"{actual.size} test targets"
            )

        pred_changes = np.diff(predictions)
        direction_hits = actual_direction == (pred_changes > 0)

        # Direction accuracy (up/down prediction), overall and on large moves
        direction_accuracy = _mean_or_nan(direction_hits)
        large_move_accuracy = _mean_or_nan(direction_hits[large_movements])

        # Volatility regime prediction
        squared_errors = (predictions - actual) ** 2
        high_vol_mse = _mean_or_nan(squared_errors[high_vol_regime])
        low_vol_mse = _mean_or_nan(squared_errors[low_vol_regime])

        # Trading signal accuracy (simplified): a step counts as profitable
        # when predicted and actual change share a strictly non-zero sign
        profitable_trades = np.sum((pred_changes * actual_changes) > 0)
        total_trades = len(pred_changes)
        profit_ratio = profitable_trades / total_trades if total_trades > 0 else 0

        economic_results.append({
            'Architecture': arch_name,
            'Direction_Accuracy': direction_accuracy,
            'Large_Move_Accuracy': large_move_accuracy,
            'High_Vol_MSE': high_vol_mse,
            'Low_Vol_MSE': low_vol_mse,
            'Profit_Ratio': profit_ratio,
        })

        print(f"\n{arch_name}:")
        print(f"  Direction Accuracy: {direction_accuracy:.3f}")
        print(f"  Large Move Accuracy: {large_move_accuracy:.3f}")
        print(f"  High Vol MSE: {high_vol_mse:.6f}")
        print(f"  Low Vol MSE: {low_vol_mse:.6f}")
        print(f"  Profit Ratio: {profit_ratio:.3f}")

    return pd.DataFrame(economic_results)

# Economic metrics need at least one loaded result set
if not all_results:
    print("No results available for economic analysis")
    economic_df = None
else:
    economic_df = analyze_economic_significance(all_results, y_test)