# GRU Architecture Testing for VIX Forecasting

This notebook implements comprehensive GRU architecture testing with:

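- **Multiple Architecture Variants**: Basic, Deep, and Bidirectional GRUs are defined and registered in Block 3; the Attention, Residual, and Dropout-Enhanced variants are not registered in `GRU_ARCHITECTURES` in this section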
- **Hyperparameter Optimization**: Optuna-based optimization for each architecture
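- **Time Series Cross-Validation**: Temporal validation with scikit-learn's `TimeSeriesSplit`, so each fold validates only on samples that come after its training samples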
- **Statistical Significance Testing**: Robust model comparison
- **Comprehensive Training & Evaluation**: Full pipeline for each variant
- **Results Storage**: Systematic storage of all results for comparison

## Block 1: Import Libraries and Setup

In [None]:
# Import shared utilities
from vix_research_utils import *

# Deep learning imports
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Dense, Dropout, GRU, Input, MultiHeadAttention, LayerNormalization,
    Bidirectional, BatchNormalization, GlobalAveragePooling1D, Add,
    GaussianNoise, Concatenate
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Hyperparameter optimization
import optuna
from optuna.integration import TFKerasPruningCallback

# Additional imports
import time
import joblib
from sklearn.model_selection import TimeSeriesSplit

# Seed the NumPy and TensorFlow random number generators with the same value
seed_value = 42
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Report the runtime environment: TensorFlow version and any visible GPUs
print(f"TensorFlow version: {tf.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU Available: {gpu_devices}")

## Block 2: Data Loading and Preprocessing

In [None]:
# Download the raw series through the shared utilities
print("Loading VIX and VVIX data...")
market_data = download_market_data()
vix_data, vvix_data = market_data

# Derive the feature table from both series
print("Creating features...")
features_df = create_features(vix_data, vvix_data)

# Turn the feature table into fixed-length windows for the recurrent models
# NOTE(review): the scaler is produced inside prepare_sequences, before the
# train/test split below -- confirm it is not fitted on the test window.
print("Preparing sequences for deep learning...")
X, y, feature_names, scaler = prepare_sequences(features_df, sequence_length=30)

# First 80% of the samples go to training, the remainder to testing
# (assumes prepare_sequences returns samples in time order -- TODO confirm)
split_idx = int(0.8 * len(X))
X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

# Per-sample input shape: axes 1 and 2 of the training array
input_shape = (X_train.shape[1], X_train.shape[2])
print(f"Input shape: {input_shape}")
print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")

## Block 3: GRU Architecture Definitions

In [None]:
def build_basic_gru(trial, input_shape):
    """Build and compile a two-layer stacked GRU regressor.

    Hyperparameters are requested from ``trial`` in a fixed order
    (``gru_units_1``, ``gru_units_2``, ``dense_units``, ``dropout_rate``,
    ``learning_rate``).

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_float`` in the
            style of an Optuna trial.
        input_shape: Shape of a single input sample, without the batch axis.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, MSE loss,
        MAE metric) ending in a single linear output unit.
    """
    gru_units_1 = trial.suggest_int('gru_units_1', 32, 128, step=32)
    gru_units_2 = trial.suggest_int('gru_units_2', 16, 64, step=16)
    dense_units = trial.suggest_int('dense_units', 8, 32, step=8)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    model = Sequential([
        # Declare the input with an explicit Input layer rather than the
        # deprecated ``input_shape=`` keyword on the first layer.
        Input(shape=input_shape),
        # First GRU emits the full sequence so the second GRU can consume it
        GRU(gru_units_1, return_sequences=True),
        Dropout(dropout_rate),
        # Second GRU collapses the sequence to its final state
        GRU(gru_units_2),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu'),
        Dense(1)
    ])

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=['mae'])
    return model

def build_deep_gru(trial, input_shape):
    """Build and compile a three-layer GRU regressor with batch normalization.

    Each GRU layer is followed by ``BatchNormalization`` and ``Dropout``.
    Hyperparameters are requested from ``trial`` in a fixed order
    (``gru_units_1``, ``gru_units_2``, ``gru_units_3``, ``dense_units``,
    ``dropout_rate``, ``learning_rate``).

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_float`` in the
            style of an Optuna trial.
        input_shape: Shape of a single input sample, without the batch axis.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, MSE loss,
        MAE metric) ending in a single linear output unit.
    """
    gru_units_1 = trial.suggest_int('gru_units_1', 64, 256, step=64)
    gru_units_2 = trial.suggest_int('gru_units_2', 32, 128, step=32)
    gru_units_3 = trial.suggest_int('gru_units_3', 16, 64, step=16)
    dense_units = trial.suggest_int('dense_units', 16, 64, step=16)
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.6)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    model = Sequential([
        # Declare the input with an explicit Input layer rather than the
        # deprecated ``input_shape=`` keyword on the first layer.
        Input(shape=input_shape),
        # The first two GRUs return full sequences to feed the next GRU
        GRU(gru_units_1, return_sequences=True),
        BatchNormalization(),
        Dropout(dropout_rate),
        GRU(gru_units_2, return_sequences=True),
        BatchNormalization(),
        Dropout(dropout_rate),
        # The last GRU collapses the sequence to its final state
        GRU(gru_units_3),
        BatchNormalization(),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu'),
        Dense(1)
    ])

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=['mae'])
    return model

def build_bidirectional_gru(trial, input_shape):
    """Build and compile a two-layer bidirectional GRU regressor.

    Hyperparameters are requested from ``trial`` in a fixed order
    (``gru_units_1``, ``gru_units_2``, ``dense_units``, ``dropout_rate``,
    ``learning_rate``).

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_float`` in the
            style of an Optuna trial.
        input_shape: Shape of a single input sample, without the batch axis.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, MSE loss,
        MAE metric) ending in a single linear output unit.
    """
    gru_units_1 = trial.suggest_int('gru_units_1', 32, 128, step=32)
    gru_units_2 = trial.suggest_int('gru_units_2', 16, 64, step=16)
    dense_units = trial.suggest_int('dense_units', 16, 64, step=16)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    model = Sequential([
        # Declare the input with an explicit Input layer rather than passing
        # the deprecated ``input_shape=`` keyword to the Bidirectional wrapper.
        Input(shape=input_shape),
        # First bidirectional GRU returns the full sequence for the next layer
        Bidirectional(GRU(gru_units_1, return_sequences=True)),
        Dropout(dropout_rate),
        # Second bidirectional GRU collapses the sequence
        Bidirectional(GRU(gru_units_2)),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu'),
        Dense(1)
    ])

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=['mae'])
    return model

# Architecture registry: display name -> builder taking (trial, input_shape)
GRU_ARCHITECTURES = dict(
    Basic_GRU=build_basic_gru,
    Deep_GRU=build_deep_gru,
    Bidirectional_GRU=build_bidirectional_gru,
)

print(f"GRU architectures available: {list(GRU_ARCHITECTURES)}")

## Block 4: Hyperparameter Optimization

In [None]:
def objective_function(trial, architecture_name, X_train, y_train, input_shape):
    """Score one hyperparameter configuration with time-series cross-validation.

    A fresh model is built for every fold so that weights and optimizer state
    learned on one fold never carry over into the next; each fold's score is
    the lowest validation loss seen during that fold's training.

    Args:
        trial: Optuna trial (or compatible object) supplying hyperparameters.
        architecture_name: Key into ``GRU_ARCHITECTURES``.
        X_train: Training inputs, indexable by the integer index arrays that
            ``TimeSeriesSplit.split`` yields.
        y_train: Training targets aligned with ``X_train``.
        input_shape: Shape of a single input sample, passed to the builder.

    Returns:
        Mean of the per-fold best validation losses, or ``float('inf')`` if
        building or training the model raised.

    Raises:
        KeyError: If ``architecture_name`` is not registered. This is a caller
            error rather than a failed trial, so it is not swallowed.
    """
    # Resolve the builder outside the try block: an unknown name should fail
    # loudly instead of silently turning every trial into an ``inf`` score.
    model_builder = GRU_ARCHITECTURES[architecture_name]

    try:
        # Time series cross-validation
        tscv = TimeSeriesSplit(n_splits=3)
        cv_scores = []

        for train_idx, val_idx in tscv.split(X_train):
            X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
            y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

            # Rebuild per fold. Repeated suggest_* calls with the same name
            # inside one Optuna trial return the same value, so every fold
            # uses identical hyperparameters but starts from fresh weights.
            model = model_builder(trial, input_shape)

            early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
            reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

            history = model.fit(
                X_fold_train, y_fold_train,
                validation_data=(X_fold_val, y_fold_val),
                epochs=50,
                batch_size=32,
                callbacks=[early_stopping, reduce_lr],
                verbose=0
            )

            cv_scores.append(min(history.history['val_loss']))

            # Drop this fold's model and Keras global state before the next build
            del model
            tf.keras.backend.clear_session()

        return float(np.mean(cv_scores))

    except Exception as e:
        # Deliberate best-effort: a configuration that fails to build or train
        # is reported and given the worst possible score so the study continues.
        print(f"Trial failed: {e}")
        return float('inf')

def optimize_architecture(architecture_name, X_train, y_train, input_shape, n_trials=50):
    """Run an Optuna study that minimizes ``objective_function`` for one architecture.

    Args:
        architecture_name: Name forwarded to ``objective_function`` and used
            to label the study.
        X_train: Training inputs forwarded to ``objective_function``.
        y_train: Training targets forwarded to ``objective_function``.
        input_shape: Per-sample input shape forwarded to ``objective_function``.
        n_trials: Number of trials to run.

    Returns:
        The ``optuna`` study after optimization has finished.
    """
    print(f"Optimizing {architecture_name}...")

    def _objective(trial):
        # Bind the fixed arguments so Optuna only has to supply the trial
        return objective_function(trial, architecture_name, X_train, y_train, input_shape)

    # Seeded TPE sampler
    tpe_sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(
        study_name=f'{architecture_name}_optimization',
        direction='minimize',
        sampler=tpe_sampler,
    )

    study.optimize(_objective, n_trials=n_trials)

    best_value = study.best_trial.value
    print(f"Best trial for {architecture_name}: Value={best_value:.6f}")
    return study

## Block 5: Execute Testing

In [None]:
# Execute comprehensive testing for all GRU architectures:
# tune each registered builder, retrain it with the best hyperparameters,
# evaluate on the held-out test set, and store everything for later comparison.
print("Starting GRU Architecture Testing...")
print("=" * 60)

all_results = {}
all_studies = {}

# Use the last 10% of the training window as the validation set for the final
# fit. Early stopping restores the best-scoring weights and ReduceLROnPlateau
# adapts the learning rate from this signal, so monitoring the test set here
# would leak it into model selection and bias the reported test metrics.
val_split_idx = int(len(X_train) * 0.9)
X_fit, X_val = X_train[:val_split_idx], X_train[val_split_idx:]
y_fit, y_val = y_train[:val_split_idx], y_train[val_split_idx:]

for architecture_name, model_builder in GRU_ARCHITECTURES.items():
    print(f"Testing {architecture_name}...")

    # Optimize hyperparameters
    study = optimize_architecture(architecture_name, X_train, y_train, input_shape, n_trials=25)
    all_studies[architecture_name] = study

    # Train final model with best parameters
    best_params = study.best_trial.params

    # FixedTrial replays the stored parameters through the same suggest_* API
    # the builders use, and raises if a parameter is missing instead of
    # silently substituting the lower bound of its range.
    mock_trial = optuna.trial.FixedTrial(best_params)
    final_model = model_builder(mock_trial, input_shape)

    # Train final model
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-6)

    history = final_model.fit(
        X_fit, y_fit,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=32,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )

    # Make predictions and calculate metrics.
    # Train metrics cover the whole training window (fit + validation slices)
    # so 'train_predictions' stays aligned with y_train.
    train_pred = final_model.predict(X_train, verbose=0).flatten()
    test_pred = final_model.predict(X_test, verbose=0).flatten()

    train_metrics = calculate_metrics(y_train, train_pred)
    test_metrics = calculate_metrics(y_test, test_pred)

    results = {
        'architecture': architecture_name,
        'best_params': best_params,
        'model': final_model,
        'history': history,
        'train_metrics': train_metrics,
        'test_metrics': test_metrics,
        'train_predictions': train_pred,
        'test_predictions': test_pred
    }

    all_results[architecture_name] = results

    print(f"Final {architecture_name} Results:")
    print(f"  Test MSE: {test_metrics['MSE']:.6f}")
    print(f"  Test R²: {test_metrics['R2']:.6f}")
    print(f"Completed {architecture_name}")
    print("-" * 40)

# Save comprehensive results
# NOTE(review): the payload contains live Keras model and History objects;
# confirm save_model_results can serialize them.
comprehensive_results = {
    'results': all_results,
    'studies': all_studies,
    'data_info': {
        'input_shape': input_shape,
        'train_samples': X_train.shape[0],
        'test_samples': X_test.shape[0],
        'features': feature_names
    }
}

save_model_results(comprehensive_results, 'gru_comprehensive_results.pkl')
print("GRU Architecture Testing Complete!")