# CNN-LSTM Architecture Testing for VIX Forecasting

This notebook implements comprehensive CNN-LSTM architecture testing with:

- **Multiple Architecture Variants**: Basic, Deep, Bidirectional, Attention, Multiscale
- **Hyperparameter Optimization**: Optuna-based optimization for each architecture
- **Time Series Cross-Validation**: `TimeSeriesSplit` folds, so each validation window comes after the data used for training
- **Statistical Significance Testing**: Robust model comparison
- **Comprehensive Training & Evaluation**: Full pipeline for each variant
- **Results Storage**: Systematic storage of all results for comparison

## Architecture Variants Tested:

1. **Basic CNN-LSTM**: Baseline architecture with standard CNN + LSTM layers
2. **Deep CNN-LSTM**: Enhanced depth with multiple CNN and LSTM layers
3. **Bidirectional CNN-LSTM**: Bidirectional LSTM for better temporal modeling
4. **Attention CNN-LSTM**: Multi-head self-attention over the LSTM output sequence, combined with the LSTM output through a residual connection
5. **Multiscale CNN-LSTM**: Multiple kernel sizes for multi-scale feature extraction

## Block 1: Import Libraries and Setup

In [None]:
# Import shared utilities
from vix_research_utils import *

# Deep learning imports
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Input, 
    MultiHeadAttention, LayerNormalization, Bidirectional, 
    BatchNormalization, GlobalAveragePooling1D, GlobalMaxPooling1D,
    Concatenate, Add, GaussianNoise, Flatten
)
from tensorflow.keras.optimizers import Adam, AdamW
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.regularizers import l1_l2

# Hyperparameter optimization
import optuna
from optuna.integration import TFKerasPruningCallback
from optuna.trial import TrialState

# Additional imports
import time
import json
import joblib
from pathlib import Path
from sklearn.model_selection import TimeSeriesSplit

# Seed the NumPy and TensorFlow random number generators
np.random.seed(42)
tf.random.set_seed(42)

# Enable on-demand memory growth for every visible GPU. Iterating an empty
# device list does nothing, so no separate "is there a GPU?" check is needed.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Report the problem and carry on with TensorFlow's default behaviour
    print(e)

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

## Block 2: Data Loading and Preprocessing

In [None]:
# Build the modelling dataset with the shared helpers: raw series ->
# engineered features -> fixed-length sequences.
print("Loading VIX and VVIX data...")
vix_data, vvix_data = download_market_data()

print("\nCreating features...")
features_df = create_features(vix_data, vvix_data)

print("\nPreparing sequences for deep learning...")
# NOTE(review): `scaler` is returned already fitted by prepare_sequences;
# confirm it is not fitted on rows that end up in the test split below.
X, y, feature_names, scaler = prepare_sequences(features_df, sequence_length=30)

print("\nData shapes:")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Number of features: {len(feature_names)}")
print(f"Features: {feature_names}")

# Chronological 80/20 split: the first 80% of the sequences are used for
# training, the remainder for testing (no shuffling).
split_idx = int(len(X) * 0.8)
X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Per-sample shape handed to every model builder (axes 1 and 2 of X_train)
input_shape = (X_train.shape[1], X_train.shape[2])
print(f"Input shape: {input_shape}")

## Block 3: CNN-LSTM Architecture Definitions

In [None]:
def build_basic_cnn_lstm(trial, input_shape):
    """Build and compile the baseline CNN-LSTM regressor.

    Two Conv1D layers (with a max-pool between them) feed two stacked LSTM
    layers, followed by a small dense head with a single linear output.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float`` (for
            example an Optuna trial) that supplies layer sizes, the dropout
            rate and the learning rate.
        input_shape: Shape passed as ``input_shape`` to the first Conv1D layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, MSE loss, MAE metric).
    """
    # Hyperparameters are requested in a fixed order
    conv1_filters = trial.suggest_int('cnn_filters_1', 32, 128, step=32)
    conv2_filters = trial.suggest_int('cnn_filters_2', 16, 64, step=16)
    lstm1_units = trial.suggest_int('lstm_units_1', 32, 128, step=32)
    lstm2_units = trial.suggest_int('lstm_units_2', 16, 64, step=16)
    head_units = trial.suggest_int('dense_units', 8, 32, step=8)
    drop = trial.suggest_float('dropout_rate', 0.1, 0.5)
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    net = Sequential()

    # Convolutional feature extractor
    net.add(Conv1D(filters=conv1_filters, kernel_size=3, activation='relu',
                   padding='same', input_shape=input_shape))
    net.add(MaxPooling1D(pool_size=2))
    net.add(Conv1D(filters=conv2_filters, kernel_size=3, activation='relu',
                   padding='same'))

    # Recurrent part: the first LSTM emits the full sequence for the second
    net.add(LSTM(lstm1_units, return_sequences=True))
    net.add(Dropout(drop))
    net.add(LSTM(lstm2_units))
    net.add(Dropout(drop))

    # Regression head
    net.add(Dense(head_units, activation='relu'))
    net.add(Dense(1))

    net.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return net

def build_deep_cnn_lstm(trial, input_shape):
    """Build and compile the deeper CNN-LSTM variant.

    Three batch-normalised Conv1D layers (max-pooling after the first two)
    are followed by three stacked LSTM layers and a two-layer dense head
    with a single linear output.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float`` (for
            example an Optuna trial) that supplies layer sizes, the dropout
            rate and the learning rate.
        input_shape: Shape passed as ``input_shape`` to the first Conv1D layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, MSE loss, MAE metric).
    """
    # Hyperparameters are requested in a fixed order
    conv_a = trial.suggest_int('cnn_filters_1', 64, 256, step=64)
    conv_b = trial.suggest_int('cnn_filters_2', 32, 128, step=32)
    conv_c = trial.suggest_int('cnn_filters_3', 16, 64, step=16)
    rnn_a = trial.suggest_int('lstm_units_1', 64, 256, step=64)
    rnn_b = trial.suggest_int('lstm_units_2', 32, 128, step=32)
    rnn_c = trial.suggest_int('lstm_units_3', 16, 64, step=16)
    fc_a = trial.suggest_int('dense_units_1', 16, 64, step=16)
    fc_b = trial.suggest_int('dense_units_2', 8, 32, step=8)
    drop = trial.suggest_float('dropout_rate', 0.2, 0.6)
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    # Convolutional stack; only the first two stages down-sample
    conv_stack = [
        Conv1D(filters=conv_a, kernel_size=3, activation='relu',
               padding='same', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=conv_b, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=conv_c, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
    ]

    # Recurrent stack; every LSTM except the last returns the full sequence
    recurrent_stack = [
        LSTM(rnn_a, return_sequences=True),
        Dropout(drop),
        LSTM(rnn_b, return_sequences=True),
        Dropout(drop),
        LSTM(rnn_c),
        Dropout(drop),
    ]

    # Dense head; uses a lighter dropout (70% of the recurrent rate)
    head = [
        Dense(fc_a, activation='relu'),
        Dropout(drop * 0.7),
        Dense(fc_b, activation='relu'),
        Dense(1),
    ]

    net = Sequential(conv_stack + recurrent_stack + head)
    net.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return net

def build_bidirectional_cnn_lstm(trial, input_shape):
    """Build and compile the CNN-LSTM variant with bidirectional LSTM layers.

    Same convolutional front end as the basic model, but both LSTM layers
    are wrapped in ``Bidirectional`` and the dense head has its own dropout.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float`` (for
            example an Optuna trial) that supplies layer sizes, the dropout
            rate and the learning rate.
        input_shape: Shape passed as ``input_shape`` to the first Conv1D layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, MSE loss, MAE metric).
    """
    # Hyperparameters are requested in a fixed order
    conv1_filters = trial.suggest_int('cnn_filters_1', 32, 128, step=32)
    conv2_filters = trial.suggest_int('cnn_filters_2', 16, 64, step=16)
    rnn1_units = trial.suggest_int('lstm_units_1', 32, 128, step=32)
    rnn2_units = trial.suggest_int('lstm_units_2', 16, 64, step=16)
    head_units = trial.suggest_int('dense_units', 16, 64, step=16)
    drop = trial.suggest_float('dropout_rate', 0.1, 0.5)
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    def conv(n_filters, **extra):
        # All convolutions here share kernel size, activation and padding
        return Conv1D(filters=n_filters, kernel_size=3, activation='relu',
                      padding='same', **extra)

    layer_stack = [
        # Convolutional front end
        conv(conv1_filters, input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        conv(conv2_filters),
        # Bidirectional recurrent part
        Bidirectional(LSTM(rnn1_units, return_sequences=True)),
        Dropout(drop),
        Bidirectional(LSTM(rnn2_units)),
        Dropout(drop),
        # Regression head with a lighter dropout (70% of the recurrent rate)
        Dense(head_units, activation='relu'),
        Dropout(drop * 0.7),
        Dense(1),
    ]

    net = Sequential(layer_stack)
    net.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return net

## Block 4: Advanced CNN-LSTM Architectures

In [None]:
def build_attention_cnn_lstm(trial, input_shape):
    """Build and compile the CNN-LSTM variant with multi-head self-attention.

    A two-layer Conv1D front end feeds a sequence-returning LSTM. The LSTM
    output attends to itself, the attention output is layer-normalised and
    added back to the LSTM output, and the result is average-pooled over the
    sequence axis before the dense head.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float`` (for
            example an Optuna trial) that supplies layer sizes, the number
            of attention heads, the dropout rate and the learning rate.
        input_shape: Shape of one input sample, passed to ``Input``.

    Returns:
        A compiled functional ``Model`` (Adam optimizer, MSE loss, MAE metric).
    """
    # Hyperparameters are requested in a fixed order
    conv1_filters = trial.suggest_int('cnn_filters_1', 32, 128, step=32)
    conv2_filters = trial.suggest_int('cnn_filters_2', 16, 64, step=16)
    rnn_units = trial.suggest_int('lstm_units', 32, 128, step=32)
    n_heads = trial.suggest_int('attention_heads', 2, 8, step=2)
    head_units = trial.suggest_int('dense_units', 16, 64, step=16)
    drop = trial.suggest_float('dropout_rate', 0.1, 0.5)
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    inputs = Input(shape=input_shape)

    # Convolutional front end
    features = Conv1D(filters=conv1_filters, kernel_size=3, activation='relu',
                      padding='same')(inputs)
    features = MaxPooling1D(pool_size=2)(features)
    features = Conv1D(filters=conv2_filters, kernel_size=3, activation='relu',
                      padding='same')(features)

    # Sequence-returning LSTM so attention can look at every time step
    sequence = Dropout(drop)(LSTM(rnn_units, return_sequences=True)(features))

    # Self-attention: the LSTM output serves as both query and value.
    # The per-head key size is the LSTM width split across the heads.
    self_attention = MultiHeadAttention(
        num_heads=n_heads,
        key_dim=rnn_units // n_heads,
    )
    attended = LayerNormalization()(self_attention(sequence, sequence))

    # Residual connection around the (normalised) attention output
    residual = Add()([sequence, attended])

    # Collapse the sequence axis, then regress
    summary = GlobalAveragePooling1D()(residual)
    hidden = Dropout(drop)(Dense(head_units, activation='relu')(summary))
    outputs = Dense(1)(hidden)

    net = Model(inputs=inputs, outputs=outputs)
    net.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return net

def build_multiscale_cnn_lstm(trial, input_shape):
    """Build and compile the multi-scale CNN-LSTM variant.

    Three parallel Conv1D branches with kernel sizes 3, 5 and 7 read the
    same input; each is max-pooled and the results are concatenated before
    two stacked LSTM layers and a dense head.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float`` (for
            example an Optuna trial) that supplies per-branch filter counts,
            the LSTM width, the dense width, the dropout rate and the
            learning rate.
        input_shape: Shape of one input sample, passed to ``Input``.

    Returns:
        A compiled functional ``Model`` (Adam optimizer, MSE loss, MAE metric).
    """
    # Hyperparameters are requested in a fixed order
    filters_small = trial.suggest_int('filters_small', 16, 64, step=16)
    filters_medium = trial.suggest_int('filters_medium', 16, 64, step=16)
    filters_large = trial.suggest_int('filters_large', 16, 64, step=16)
    rnn_units = trial.suggest_int('lstm_units', 32, 128, step=32)
    head_units = trial.suggest_int('dense_units', 16, 64, step=16)
    drop = trial.suggest_float('dropout_rate', 0.1, 0.5)
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    inputs = Input(shape=input_shape)

    # One conv + pool branch per kernel width (short, medium, long patterns);
    # every branch reads the raw input directly.
    branch_specs = (
        (filters_small, 3),
        (filters_medium, 5),
        (filters_large, 7),
    )
    branches = []
    for n_filters, kernel_width in branch_specs:
        branch = Conv1D(filters=n_filters, kernel_size=kernel_width,
                        activation='relu', padding='same')(inputs)
        branches.append(MaxPooling1D(pool_size=2)(branch))

    # Stack the branch outputs side by side
    merged = Concatenate()(branches)

    # Two LSTM layers; the second is half as wide and returns only its final output
    recurrent = LSTM(rnn_units, return_sequences=True)(merged)
    recurrent = Dropout(drop)(recurrent)
    recurrent = LSTM(rnn_units // 2)(recurrent)
    recurrent = Dropout(drop)(recurrent)

    # Regression head
    hidden = Dropout(drop)(Dense(head_units, activation='relu')(recurrent))
    outputs = Dense(1)(hidden)

    net = Model(inputs=inputs, outputs=outputs)
    net.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return net

# Registry mapping each architecture's display name to its builder function.
# Every builder takes (trial, input_shape) and returns a compiled model.
CNN_LSTM_ARCHITECTURES = dict(
    Basic_CNN_LSTM=build_basic_cnn_lstm,
    Deep_CNN_LSTM=build_deep_cnn_lstm,
    Bidirectional_CNN_LSTM=build_bidirectional_cnn_lstm,
    Attention_CNN_LSTM=build_attention_cnn_lstm,
    Multiscale_CNN_LSTM=build_multiscale_cnn_lstm,
)

print(f"CNN-LSTM architectures available: {list(CNN_LSTM_ARCHITECTURES)}")

## Block 5: Hyperparameter Optimization Framework

In [None]:
def objective_function(trial, architecture_name, X_train, y_train, input_shape):
    """Score one hyperparameter trial with 3-fold time-series cross-validation.

    For every ``TimeSeriesSplit`` fold a fresh model is built from the
    trial's hyperparameters, trained on the fold's training indices and
    scored by its best (minimum) validation loss. The trial's value is the
    mean of those per-fold scores.

    Args:
        trial: Optuna trial supplying the hyperparameters.
        architecture_name: Key into ``CNN_LSTM_ARCHITECTURES``.
        X_train: Training inputs, indexable by the integer index arrays
            produced by ``TimeSeriesSplit``.
        y_train: Training targets aligned with ``X_train``.
        input_shape: Per-sample input shape handed to the model builder.

    Returns:
        Mean of the per-fold minimum validation losses, or ``float('inf')``
        if the trial failed with an unexpected error.

    Raises:
        optuna.TrialPruned: If the pruning callback stops the trial. This is
            deliberately propagated so Optuna records the trial as pruned
            rather than as a completed trial with an infinite score.
    """
    try:
        model_builder = CNN_LSTM_ARCHITECTURES[architecture_name]

        # Time series cross-validation
        tscv = TimeSeriesSplit(n_splits=3)
        cv_scores = []

        for fold_idx, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
            X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
            y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

            # Release the previous fold's model state, then build a fresh
            # model so no fold starts from weights that were already trained
            # on another fold. Re-suggesting a parameter name within the same
            # trial yields the same value, so every fold uses identical
            # hyperparameters.
            tf.keras.backend.clear_session()
            model = model_builder(trial, input_shape)

            callbacks = [
                EarlyStopping(
                    monitor='val_loss', patience=10, restore_best_weights=True
                ),
                ReduceLROnPlateau(
                    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6
                ),
            ]
            # The pruning callback reports val_loss with the epoch number as
            # the step. Epoch numbers restart in every fold and Optuna keeps
            # only the first report per step, so later folds would be pruned
            # on stale values; restrict pruning to the first fold.
            if fold_idx == 0:
                callbacks.append(TFKerasPruningCallback(trial, 'val_loss'))

            history = model.fit(
                X_fold_train, y_fold_train,
                validation_data=(X_fold_val, y_fold_val),
                epochs=50,
                batch_size=32,
                callbacks=callbacks,
                verbose=0
            )

            # Best validation loss reached in this fold
            cv_scores.append(min(history.history['val_loss']))

        # Mean CV score across folds
        return float(np.mean(cv_scores))

    except optuna.TrialPruned:
        # Let Optuna handle pruning; do not report it as a failure
        raise
    except Exception as e:
        # Best-effort: an unexpected failure scores as the worst possible value
        print(f"Trial failed: {e}")
        return float('inf')
    finally:
        # Free the last fold's model state whether the trial finished or not
        tf.keras.backend.clear_session()

def optimize_architecture(architecture_name, X_train, y_train, input_shape, n_trials=100):
    """Run an Optuna study for one architecture and return it.

    Args:
        architecture_name: Key into ``CNN_LSTM_ARCHITECTURES``; also used to
            name the study.
        X_train: Training inputs forwarded to ``objective_function``.
        y_train: Training targets forwarded to ``objective_function``.
        input_shape: Per-sample input shape forwarded to ``objective_function``.
        n_trials: Maximum number of trials to run.

    Returns:
        The ``optuna`` study after optimization. The best trial's value and
        parameters are printed before returning.
    """
    print(f"\nOptimizing {architecture_name}...")

    def _objective(trial):
        # Bind the fixed arguments so Optuna only has to pass the trial
        return objective_function(trial, architecture_name, X_train, y_train, input_shape)

    # Minimisation study with a seeded TPE sampler
    tpe_sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(
        direction='minimize',
        study_name=f'{architecture_name}_optimization',
        sampler=tpe_sampler,
    )

    # Stop after n_trials or one hour (3600 s), whichever comes first
    study.optimize(_objective, n_trials=n_trials, timeout=3600)

    winner = study.best_trial
    print(f"Best trial for {architecture_name}:")
    print(f"  Value: {winner.value:.6f}")
    print(f"  Params: {winner.params}")

    return study

## Block 6: Training and Evaluation Framework

In [None]:
def train_final_model(architecture_name, best_params, X_train, y_train, X_test, y_test,
                      input_shape, val_fraction=0.1):
    """Train the final model for one architecture and evaluate it.

    The model is rebuilt from ``best_params`` and fitted on the earlier part
    of the training data. The last ``val_fraction`` of the training data (in
    order, no shuffling) is held out as the validation set that drives early
    stopping and learning-rate reduction. The test set is used only for the
    final evaluation, so the reported test metrics are not influenced by
    checkpoint selection.

    Args:
        architecture_name: Key into ``CNN_LSTM_ARCHITECTURES``.
        best_params: Mapping of hyperparameter name to value, as found in a
            finished trial's ``params``.
        X_train: Training inputs (sliceable, with ``len``).
        y_train: Training targets aligned with ``X_train``.
        X_test: Test inputs; only used for prediction and metrics.
        y_test: Test targets aligned with ``X_test``.
        input_shape: Per-sample input shape handed to the model builder.
        val_fraction: Fraction of the training data, taken from its end, to
            use as the validation set. Must be strictly between 0 and 1.

    Returns:
        Dict with the keys ``architecture``, ``best_params``, ``model``,
        ``history``, ``train_metrics``, ``test_metrics``,
        ``train_predictions`` and ``test_predictions``. Train metrics and
        predictions cover the full ``X_train``, including the validation tail.

    Raises:
        ValueError: If ``val_fraction`` is not in (0, 1) or there are fewer
            than two training samples.
    """
    if not 0.0 < val_fraction < 1.0:
        raise ValueError(f"val_fraction must be in (0, 1), got {val_fraction}")
    n_samples = len(X_train)
    if n_samples < 2:
        raise ValueError("Need at least two training samples to hold out a validation set")

    print(f"\nTraining final {architecture_name} model...")

    # Stand-in for an Optuna trial that replays the stored best parameters.
    # The method signatures mirror Optuna's so the builders can call them
    # unchanged; a parameter missing from ``params`` falls back to ``low``.
    class MockTrial:
        def __init__(self, params):
            self.params = params

        def suggest_int(self, name, low, high, step=1, log=False):
            return self.params.get(name, low)

        def suggest_float(self, name, low, high, step=None, log=False):
            return self.params.get(name, low)

    mock_trial = MockTrial(best_params)

    # Build model with best parameters
    model_builder = CNN_LSTM_ARCHITECTURES[architecture_name]
    model = model_builder(mock_trial, input_shape)

    # Chronological hold-out: at least one sample in the validation tail and
    # at least one left for fitting.
    n_val = min(max(1, int(n_samples * val_fraction)), n_samples - 1)
    X_fit, X_val = X_train[:-n_val], X_train[-n_val:]
    y_fit, y_val = y_train[:-n_val], y_train[-n_val:]

    # Callbacks for final training
    early_stopping = EarlyStopping(
        monitor='val_loss', patience=20, restore_best_weights=True
    )
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=10, min_lr=1e-6
    )

    # Validate on the held-out training tail, never on the test set:
    # restore_best_weights would otherwise pick the epoch that happens to
    # score best on the very data used for the final report.
    history = model.fit(
        X_fit, y_fit,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=32,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )

    # Make predictions
    train_pred = model.predict(X_train, verbose=0).flatten()
    test_pred = model.predict(X_test, verbose=0).flatten()

    # Calculate metrics
    train_metrics = calculate_metrics(y_train, train_pred)
    test_metrics = calculate_metrics(y_test, test_pred)

    results = {
        'architecture': architecture_name,
        'best_params': best_params,
        'model': model,
        'history': history,
        'train_metrics': train_metrics,
        'test_metrics': test_metrics,
        'train_predictions': train_pred,
        'test_predictions': test_pred
    }

    print(f"Final {architecture_name} Results:")
    print(f"  Train MSE: {train_metrics['MSE']:.6f}")
    print(f"  Test MSE: {test_metrics['MSE']:.6f}")
    print(f"  Test MAE: {test_metrics['MAE']:.6f}")
    print(f"  Test R²: {test_metrics['R2']:.6f}")

    return results

## Block 7: Execute Comprehensive Testing

In [None]:
# Run the full pipeline (hyperparameter search -> final training -> save)
# for every registered CNN-LSTM architecture.
print("Starting CNN-LSTM Architecture Testing...")
print("=" * 60)

all_results = {}
all_studies = {}

for architecture_name in CNN_LSTM_ARCHITECTURES:
    print(f"\nTesting {architecture_name}...")

    # Hyperparameter search
    study = optimize_architecture(
        architecture_name, X_train, y_train, input_shape, n_trials=50
    )
    all_studies[architecture_name] = study

    # Final training with the best parameters found by the search
    best_params = study.best_trial.params
    results = train_final_model(
        architecture_name, best_params,
        X_train, y_train, X_test, y_test,
        input_shape,
    )
    all_results[architecture_name] = results

    # Persist this architecture's results on their own.
    # NOTE(review): `results` holds the Keras model and its History object;
    # confirm save_model_results can serialize them.
    save_model_results(results, f'cnn_lstm_results_{architecture_name.lower()}.pkl')

    print(f"Completed {architecture_name}")
    print("-" * 40)

# Bundle everything, plus a short description of the data, into one file
data_info = {
    'input_shape': input_shape,
    'train_samples': X_train.shape[0],
    'test_samples': X_test.shape[0],
    'features': feature_names
}
comprehensive_results = {
    'results': all_results,
    'studies': all_studies,
    'data_info': data_info
}

save_model_results(comprehensive_results, 'cnn_lstm_comprehensive_results.pkl')

print("\nCNN-LSTM Architecture Testing Complete!")
print("=" * 60)

## Block 8: Results Analysis and Visualization

In [None]:
# Compare the test-set performance of all trained CNN-LSTM architectures
import matplotlib.pyplot as plt
import seaborn as sns

# One table row per architecture, holding its test metrics
performance_data = []
for arch_name, results in all_results.items():
    test_metrics = results['test_metrics']
    row = {'Architecture': arch_name}
    row.update({
        metric: test_metrics[metric]
        for metric in ('MSE', 'MAE', 'R2', 'Directional_Accuracy')
    })
    performance_data.append(row)

# Lowest test MSE first, so the first row is the best model by MSE
performance_df = pd.DataFrame(performance_data).sort_values('MSE')

print("CNN-LSTM Architecture Performance Comparison:")
print("=" * 60)
print(performance_df.to_string(index=False, float_format='%.6f'))

# 2x2 grid of horizontal bar charts, one metric per panel. `axes.flat`
# walks the grid row by row, matching the order of the panels below.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
panels = (
    ('MSE', 'Test MSE Comparison'),
    ('R2', 'Test R² Comparison'),
    ('MAE', 'Test MAE Comparison'),
    ('Directional_Accuracy', 'Directional Accuracy Comparison'),
)
for panel_ax, (metric_name, panel_title) in zip(axes.flat, panels):
    sns.barplot(data=performance_df, x=metric_name, y='Architecture', ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# Report the top row of the MSE-sorted table
best_model = performance_df.iloc[0]
print(f"\nBest CNN-LSTM Architecture: {best_model['Architecture']}")
print(f"Test MSE: {best_model['MSE']:.6f}")
print(f"Test R²: {best_model['R2']:.6f}")
print(f"Directional Accuracy: {best_model['Directional_Accuracy']:.3f}")