# Comprehensive GRU Architecture Testing for VIX Forecasting

This notebook implements comprehensive GRU architecture testing with:
- **Multiple Architecture Variants**: Basic, Deep, Bidirectional, Attention, Residual, Dropout-Enhanced
- **Hyperparameter Optimization**: Optuna-based optimization for each architecture
- **Time Series Cross-Validation**: Proper temporal validation
- **Statistical Significance Testing**: Robust model comparison
- **Comprehensive Training & Evaluation**: Full pipeline for each variant
- **Results Storage**: Systematic storage of all results for comparison

## Architecture Variants Tested:
1. **Basic GRU**: Baseline architecture with standard GRU layers
2. **Deep GRU**: Enhanced depth with multiple GRU layers
3. **Bidirectional GRU**: Bidirectional GRU for better temporal modeling
4. **Attention GRU**: GRU followed by multi-head self-attention over the time steps of its output sequence
5. **Residual GRU**: Residual connections for better gradient flow
6. **Dropout-Enhanced GRU**: Advanced regularization with multiple dropout strategies

## Block 1: Import Libraries and Setup

In [None]:
# Import shared utilities
from vix_research_utils import *

# Deep learning imports
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Dense, Dropout, GRU, Input, MultiHeadAttention, LayerNormalization,
    Bidirectional, BatchNormalization, GlobalAveragePooling1D, Add,
    GaussianNoise, Concatenate
)
from tensorflow.keras.optimizers import Adam, AdamW
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.regularizers import l1_l2

# Hyperparameter optimization
import optuna
from optuna.integration import TFKerasPruningCallback
from optuna.trial import TrialState

# Additional imports
import time
import json
import joblib
from pathlib import Path
from sklearn.model_selection import TimeSeriesSplit

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Configure TensorFlow: request on-demand GPU memory allocation for every
# visible GPU. Assigning to `tf.config.experimental.enable_memory_growth`
# merely creates an unused attribute; the supported call is
# `set_memory_growth(device, True)`, once per physical device.
gpu_devices = tf.config.list_physical_devices('GPU')
for gpu_device in gpu_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu_device, True)
    except RuntimeError as exc:
        # TensorFlow refuses to change this setting once the device has been
        # initialised (e.g. when this cell is re-run in a live kernel).
        print(f"Could not enable memory growth on {gpu_device.name}: {exc}")

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {gpu_devices}")

## Block 2: Data Loading and Preprocessing

In [None]:
# Pull the raw VIX / VVIX series through the shared utilities
print("Loading VIX and VVIX data...")
vix_data, vvix_data = download_market_data()

# Clean each series independently with the shared helper
print("\nCleaning and preprocessing data...")
vix_clean = clean_data(vix_data)
vvix_clean = clean_data(vvix_data)

# Build the joint feature table from both cleaned series
print("\nCreating features...")
features_df = create_features(vix_clean, vvix_clean)

# Turn the feature table into model-ready windows (sequence_length=30)
print("\nPreparing sequences for deep learning...")
X, y, feature_names, scaler = prepare_sequences(features_df, sequence_length=30)

print(f"\nData shapes:")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Number of features: {len(feature_names)}")
print(f"Features: {feature_names}")

# Chronological 80/20 split: the leading 80% of samples train the models and
# the trailing 20% is held out as the test set (no shuffling).
split_idx = int(len(X) * 0.8)
X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

print(f"\nTrain set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

## Block 3: GRU Architecture Definitions

In [None]:
def build_basic_gru(trial, input_shape):
    """Build and compile the baseline two-layer GRU regressor.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float`` (an Optuna
            trial or a compatible stand-in) used to pick the hyperparameters.
        input_shape: Shape passed to the first GRU layer as ``input_shape``.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, MSE loss,
        MAE metric) ending in a single-unit ``Dense`` output.
    """
    # Search space: the suggestion order is kept stable on purpose.
    units_first = trial.suggest_int('gru_units_1', 32, 128, step=32)
    units_second = trial.suggest_int('gru_units_2', 16, 64, step=16)
    head_units = trial.suggest_int('dense_units', 8, 32, step=8)
    drop = trial.suggest_float('dropout_rate', 0.1, 0.5)
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    net = Sequential()
    # First GRU emits the full sequence so the second GRU can consume it.
    net.add(GRU(units_first, return_sequences=True, input_shape=input_shape))
    net.add(Dropout(drop))
    net.add(GRU(units_second))
    net.add(Dropout(drop))
    # Regression head
    net.add(Dense(head_units, activation='relu'))
    net.add(Dense(1))

    net.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return net

def build_deep_gru(trial, input_shape):
    """Build and compile a three-layer GRU stack with batch normalization.

    Each GRU layer is followed by ``BatchNormalization`` and ``Dropout``; the
    head is two ReLU dense layers and a single-unit output.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float`` used to
            pick the hyperparameters.
        input_shape: Shape passed to the first GRU layer as ``input_shape``.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, MSE loss,
        MAE metric).
    """
    # Search space: the suggestion order is kept stable on purpose.
    units_a = trial.suggest_int('gru_units_1', 64, 256, step=64)
    units_b = trial.suggest_int('gru_units_2', 32, 128, step=32)
    units_c = trial.suggest_int('gru_units_3', 16, 64, step=16)
    head_units_a = trial.suggest_int('dense_units_1', 16, 64, step=16)
    head_units_b = trial.suggest_int('dense_units_2', 8, 32, step=8)
    drop = trial.suggest_float('dropout_rate', 0.2, 0.6)
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    net = Sequential()

    # Recurrent stack: the first two layers return sequences for the next GRU.
    net.add(GRU(units_a, return_sequences=True, input_shape=input_shape))
    net.add(BatchNormalization())
    net.add(Dropout(drop))

    net.add(GRU(units_b, return_sequences=True))
    net.add(BatchNormalization())
    net.add(Dropout(drop))

    net.add(GRU(units_c))
    net.add(BatchNormalization())
    net.add(Dropout(drop))

    # Dense head with a lighter dropout (70% of the recurrent-stack rate).
    net.add(Dense(head_units_a, activation='relu'))
    net.add(Dropout(drop * 0.7))
    net.add(Dense(head_units_b, activation='relu'))
    net.add(Dense(1))

    net.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return net

def build_bidirectional_gru(trial, input_shape):
    """Build and compile a two-layer bidirectional GRU regressor.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float`` used to
            pick the hyperparameters.
        input_shape: Shape passed to the first ``Bidirectional`` wrapper as
            ``input_shape``.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, MSE loss,
        MAE metric) ending in a single-unit ``Dense`` output.
    """
    # Search space: the suggestion order is kept stable on purpose.
    units_first = trial.suggest_int('gru_units_1', 32, 128, step=32)
    units_second = trial.suggest_int('gru_units_2', 16, 64, step=16)
    head_units = trial.suggest_int('dense_units', 16, 64, step=16)
    drop = trial.suggest_float('dropout_rate', 0.1, 0.5)
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    net = Sequential()

    # Bidirectional recurrent stack; the first wrapper keeps the sequence
    # dimension so the second one can consume it.
    net.add(Bidirectional(GRU(units_first, return_sequences=True), input_shape=input_shape))
    net.add(Dropout(drop))
    net.add(Bidirectional(GRU(units_second)))
    net.add(Dropout(drop))

    # Dense head with a lighter dropout (70% of the recurrent-stack rate).
    net.add(Dense(head_units, activation='relu'))
    net.add(Dropout(drop * 0.7))
    net.add(Dense(1))

    net.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return net

def build_attention_gru(trial, input_shape):
    """Build and compile a GRU followed by multi-head self-attention.

    The GRU's output sequence attends to itself; the layer-normalized
    attention output is added back to the GRU sequence, averaged over the
    sequence axis and fed to a small dense head.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float`` used to
            pick the hyperparameters.
        input_shape: Shape given to the Keras ``Input`` layer.

    Returns:
        A compiled functional Keras ``Model`` (Adam optimizer, MSE loss,
        MAE metric) with a single-unit ``Dense`` output.
    """
    # Search space: the suggestion order is kept stable on purpose.
    n_units = trial.suggest_int('gru_units', 32, 128, step=32)
    n_heads = trial.suggest_int('attention_heads', 2, 8, step=2)
    head_units = trial.suggest_int('dense_units', 16, 64, step=16)
    drop = trial.suggest_float('dropout_rate', 0.1, 0.5)
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    net_in = Input(shape=input_shape)

    # Recurrent encoder; sequences are kept because attention needs every step.
    seq = GRU(n_units, return_sequences=True)(net_in)
    seq = Dropout(drop)(seq)

    # Self-attention: the GRU sequence is passed as both call arguments.
    # The per-head key size is the integer share of the GRU width.
    self_attention = MultiHeadAttention(num_heads=n_heads, key_dim=n_units // n_heads)
    attended = self_attention(seq, seq)

    # Normalize the attention branch, then add it back onto the GRU sequence.
    attended = LayerNormalization()(attended)
    merged = Add()([seq, attended])

    # Collapse the sequence axis by averaging.
    summary = GlobalAveragePooling1D()(merged)

    # Regression head
    hidden = Dense(head_units, activation='relu')(summary)
    hidden = Dropout(drop)(hidden)
    net_out = Dense(1)(hidden)

    net = Model(inputs=net_in, outputs=net_out)
    net.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return net

def build_residual_gru(trial, input_shape):
    """Build and compile a GRU stack with an optional skip connection.

    The output of the first GRU block is added to the output of the second
    block only when both blocks were given the same number of units; for
    any other combination the model is a plain three-layer GRU stack.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float`` used to
            pick the hyperparameters.
        input_shape: Shape given to the Keras ``Input`` layer.

    Returns:
        A compiled functional Keras ``Model`` (Adam optimizer, MSE loss,
        MAE metric) with a single-unit ``Dense`` output.
    """
    # Search space: the suggestion order is kept stable on purpose.
    units_first = trial.suggest_int('gru_units_1', 32, 128, step=32)
    units_second = trial.suggest_int('gru_units_2', 32, 128, step=32)
    head_units = trial.suggest_int('dense_units', 16, 64, step=16)
    drop = trial.suggest_float('dropout_rate', 0.1, 0.5)
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    net_in = Input(shape=input_shape)

    # Block 1
    block_one = GRU(units_first, return_sequences=True)(net_in)
    block_one = Dropout(drop)(block_one)

    # Block 2, stacked on block 1
    block_two = GRU(units_second, return_sequences=True)(block_one)
    block_two = Dropout(drop)(block_two)

    # Element-wise addition needs equal widths, so the skip path is only
    # wired in when the two sampled unit counts coincide.
    merged = block_two
    if units_first == units_second:
        merged = Add()([block_one, block_two])

    # Final GRU reduces the sequence to one vector at half of block 2's width.
    encoded = GRU(units_second // 2)(merged)
    encoded = Dropout(drop)(encoded)

    # Regression head
    hidden = Dense(head_units, activation='relu')(encoded)
    hidden = Dropout(drop)(hidden)
    net_out = Dense(1)(hidden)

    net = Model(inputs=net_in, outputs=net_out)
    net.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return net

## Block 4: Advanced GRU Architectures

In [None]:
def build_dropout_enhanced_gru(trial, input_shape):
    """Build and compile a heavily regularized three-layer GRU regressor.

    Regularization comes from Gaussian input noise, input and recurrent
    dropout inside every GRU layer, batch normalization, and explicit
    ``Dropout`` layers whose rates are scaled from the sampled base rate.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float`` used to
            pick the hyperparameters.
        input_shape: Shape passed to the leading ``GaussianNoise`` layer as
            ``input_shape``.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, MSE loss,
        MAE metric) ending in a single-unit ``Dense`` output.
    """
    # Search space: the suggestion order is kept stable on purpose.
    units_a = trial.suggest_int('gru_units_1', 32, 128, step=32)
    units_b = trial.suggest_int('gru_units_2', 16, 64, step=16)
    units_c = trial.suggest_int('gru_units_3', 8, 32, step=8)
    head_units_a = trial.suggest_int('dense_units_1', 16, 64, step=16)
    head_units_b = trial.suggest_int('dense_units_2', 8, 32, step=8)
    drop = trial.suggest_float('dropout_rate', 0.2, 0.6)
    rec_drop = trial.suggest_float('recurrent_dropout', 0.1, 0.4)
    noise_std = trial.suggest_float('noise_level', 0.05, 0.2)
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    # Dropout settings shared by all three GRU layers.
    gru_dropout = {'dropout': drop, 'recurrent_dropout': rec_drop}

    net = Sequential()

    # Noise on the inputs acts as an additional regularizer.
    net.add(GaussianNoise(noise_std, input_shape=input_shape))

    # Recurrent stack; the first two blocks use a 1.2x dropout multiplier.
    net.add(GRU(units_a, return_sequences=True, **gru_dropout))
    net.add(BatchNormalization())
    net.add(Dropout(drop * 1.2))

    net.add(GRU(units_b, return_sequences=True, **gru_dropout))
    net.add(BatchNormalization())
    net.add(Dropout(drop * 1.2))

    net.add(GRU(units_c, **gru_dropout))
    net.add(BatchNormalization())
    net.add(Dropout(drop))

    # Dense head with progressively lighter dropout (0.8x, then 0.6x).
    net.add(Dense(head_units_a, activation='relu'))
    net.add(Dropout(drop * 0.8))
    net.add(Dense(head_units_b, activation='relu'))
    net.add(Dropout(drop * 0.6))
    net.add(Dense(1))

    net.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return net

# Registry mapping each architecture's display name to its builder function.
# Every builder takes (trial, input_shape) and returns a compiled Keras model.
GRU_ARCHITECTURES = dict(
    Basic_GRU=build_basic_gru,
    Deep_GRU=build_deep_gru,
    Bidirectional_GRU=build_bidirectional_gru,
    Attention_GRU=build_attention_gru,
    Residual_GRU=build_residual_gru,
    Dropout_Enhanced_GRU=build_dropout_enhanced_gru,
)

print(f"GRU architectures available: {list(GRU_ARCHITECTURES)}")

## Block 5: Hyperparameter Optimization Framework

In [None]:
def objective_function(trial, architecture_name, X_train, y_train, input_shape):
    """Score one hyperparameter configuration with time-series cross-validation.

    A fresh model is built and trained for each of the three
    ``TimeSeriesSplit`` folds, so no weights carry over from one fold to the
    next. The score of a fold is its lowest validation loss; the objective is
    the mean of the fold scores.

    Args:
        trial: Optuna trial supplying the hyperparameters.
        architecture_name: Key into ``GRU_ARCHITECTURES``.
        X_train: Training inputs, indexable by the fold index arrays.
        y_train: Training targets aligned with ``X_train``.
        input_shape: Input shape forwarded to the model builder.

    Returns:
        Mean of the per-fold minimum validation losses, or ``float('inf')``
        when the trial fails with an unexpected exception.

    Raises:
        optuna.TrialPruned: Propagated unchanged so Optuna records the trial
            as pruned instead of as a completed trial with an infinite score.
    """
    try:
        model_builder = GRU_ARCHITECTURES[architecture_name]

        # Time series cross-validation
        tscv = TimeSeriesSplit(n_splits=3)
        cv_scores = []

        for fold_idx, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
            X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
            y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

            # Rebuild per fold: reusing one model would start every later fold
            # from weights already fitted on the earlier folds. Repeated
            # suggest_* calls with the same name return the same value within
            # a trial, so each fold gets an identical configuration.
            model = model_builder(trial, input_shape)

            callbacks = [
                EarlyStopping(
                    monitor='val_loss', patience=10, restore_best_weights=True
                ),
                ReduceLROnPlateau(
                    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6
                ),
            ]
            # The pruning callback reports val_loss with step=epoch. Later
            # folds would re-report the same steps, so intermediate values
            # are reported from the first fold only.
            if fold_idx == 0:
                callbacks.append(TFKerasPruningCallback(trial, 'val_loss'))

            try:
                history = model.fit(
                    X_fold_train, y_fold_train,
                    validation_data=(X_fold_val, y_fold_val),
                    epochs=50,
                    batch_size=32,
                    callbacks=callbacks,
                    verbose=0
                )
            finally:
                # Drop the fold's model and release Keras global state even
                # when training is pruned or fails.
                del model
                tf.keras.backend.clear_session()

            # Fold score: best validation loss seen during training
            cv_scores.append(min(history.history['val_loss']))

        # Return mean CV score
        return np.mean(cv_scores)

    except optuna.TrialPruned:
        # Pruning is a control-flow signal for Optuna, not a failure.
        raise
    except Exception as e:
        # Best effort: a broken configuration must not abort the whole study.
        print(f"Trial failed: {e}")
        return float('inf')

def optimize_architecture(architecture_name, X_train, y_train, input_shape, n_trials=100):
    """Run an Optuna study for one architecture and return it.

    Args:
        architecture_name: Key into ``GRU_ARCHITECTURES``; also used in the
            study name.
        X_train: Training inputs forwarded to ``objective_function``.
        y_train: Training targets forwarded to ``objective_function``.
        input_shape: Input shape forwarded to ``objective_function``.
        n_trials: Maximum number of trials to run.

    Returns:
        The ``optuna`` study after optimization; its best trial's value and
        parameters are printed before returning.
    """
    print(f"\nOptimizing {architecture_name}...")

    def _score_trial(trial):
        # Bind the fixed arguments so Optuna only has to supply the trial.
        return objective_function(trial, architecture_name, X_train, y_train, input_shape)

    # Minimizing study driven by a TPE sampler with a fixed seed.
    tpe_sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(
        direction='minimize',
        study_name=f'{architecture_name}_optimization',
        sampler=tpe_sampler,
    )

    # Stop after n_trials trials or 3600 seconds (1 hour), whichever is first.
    study.optimize(_score_trial, n_trials=n_trials, timeout=3600)

    print(f"Best trial for {architecture_name}:")
    winner = study.best_trial
    print(f"  Value: {winner.value:.6f}")
    print(f"  Params: {winner.params}")

    return study

## Block 6: Training and Evaluation Framework

In [None]:
def train_final_model(architecture_name, best_params, X_train, y_train, X_test, y_test,
                      input_shape, val_fraction=0.15):
    """Train the final model for one architecture and evaluate it on the test set.

    Early stopping, learning-rate scheduling and best-weight restoration are
    driven by a validation slice taken from the chronological tail of the
    training data. The test set is used only for the final predictions and
    metrics; monitoring it during training would select weights on the very
    data the metrics are reported for and bias them optimistically.

    Args:
        architecture_name: Key into ``GRU_ARCHITECTURES``.
        best_params: Mapping of hyperparameter name to value.
        X_train: Training inputs in chronological order.
        y_train: Training targets aligned with ``X_train``.
        X_test: Held-out test inputs.
        y_test: Held-out test targets.
        input_shape: Input shape forwarded to the model builder.
        val_fraction: Share of the training samples, taken from the end,
            that is held out for validation. Must lie strictly between
            0 and 1.

    Returns:
        Dict with keys ``architecture``, ``best_params``, ``model``,
        ``history``, ``train_metrics``, ``test_metrics``,
        ``train_predictions`` and ``test_predictions``. Train predictions and
        metrics cover all of ``X_train``, including the validation tail.

    Raises:
        ValueError: If ``val_fraction`` is outside (0, 1) or ``X_train`` is
            too small to leave at least one sample for fitting.
    """
    if not 0.0 < val_fraction < 1.0:
        raise ValueError(f"val_fraction must be in (0, 1), got {val_fraction}")

    # Chronological hold-out: the most recent training samples validate.
    n_val = max(1, int(len(X_train) * val_fraction))
    fit_end = len(X_train) - n_val
    if fit_end < 1:
        raise ValueError("X_train is too small to hold out a validation tail")
    X_fit, X_val = X_train[:fit_end], X_train[fit_end:]
    y_fit, y_val = y_train[:fit_end], y_train[fit_end:]

    print(f"\nTraining final {architecture_name} model...")

    # Stand-in for an Optuna trial: replays the stored best parameters so the
    # architecture builders can be reused unchanged.
    class MockTrial:
        def __init__(self, params):
            self.params = params

        def suggest_int(self, name, low, high, step=1):
            # Falls back to the lower bound when the parameter is missing.
            return self.params.get(name, low)

        def suggest_float(self, name, low, high, log=False):
            # Falls back to the lower bound when the parameter is missing.
            return self.params.get(name, low)

    mock_trial = MockTrial(best_params)

    # Build model with best parameters
    model_builder = GRU_ARCHITECTURES[architecture_name]
    model = model_builder(mock_trial, input_shape)

    # Callbacks for final training (monitor the validation tail)
    early_stopping = EarlyStopping(
        monitor='val_loss', patience=20, restore_best_weights=True
    )
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=10, min_lr=1e-6
    )

    # Train final model
    history = model.fit(
        X_fit, y_fit,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=32,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )

    # Make predictions; the full training set is used so that
    # train_predictions stays aligned with y_train.
    train_pred = model.predict(X_train, verbose=0)
    test_pred = model.predict(X_test, verbose=0)

    # Calculate metrics on flattened (1-D) predictions
    train_metrics = calculate_metrics(y_train, train_pred.flatten())
    test_metrics = calculate_metrics(y_test, test_pred.flatten())

    results = {
        'architecture': architecture_name,
        'best_params': best_params,
        'model': model,
        'history': history,
        'train_metrics': train_metrics,
        'test_metrics': test_metrics,
        'train_predictions': train_pred.flatten(),
        'test_predictions': test_pred.flatten()
    }

    print(f"Final {architecture_name} Results:")
    print(f"  Train MSE: {train_metrics['mse']:.6f}")
    print(f"  Test MSE: {test_metrics['mse']:.6f}")
    print(f"  Test MAE: {test_metrics['mae']:.6f}")
    print(f"  Test R²: {test_metrics['r2']:.6f}")

    return results