# Ensemble Volatility Prediction

This notebook implements an ensemble approach combining LSTM, Transformer, and Dense Neural Network models for volatility prediction using PyTorch.

## 1. Import Libraries and Setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully")

Libraries imported successfully


## 2. Load and Prepare Data

In [2]:
# Load data
print("Loading data...")
train = pd.read_parquet('train_data.parquet')
test = pd.read_parquet('test_data.parquet')
sample_sub = pd.read_csv('sample_submission.csv')

print(f"\nTrain data shape: {train.shape}")
print(f"Test data shape: {test.shape}")
print(f"Sample submission shape: {sample_sub.shape}")

# Get all IV columns from TEST data
iv_columns = [col for col in test.columns if col.startswith(('call_iv_', 'put_iv_'))]
print(f"\nNumber of IV columns: {len(iv_columns)}")

# Create strike dictionary from TEST columns
strike_dict = {}
for col in iv_columns:
    strike = col.split('_')[-1]
    if strike not in strike_dict:
        strike_dict[strike] = {'call': None, 'put': None}
    
    if col.startswith('call_iv_'):
        strike_dict[strike]['call'] = col
    else:
        strike_dict[strike]['put'] = col

print(f"\nNumber of unique strikes: {len(strike_dict)}")
print("\nStrike dictionary:")
print(strike_dict)

Loading data...

Train data shape: (178340, 97)
Test data shape: (12065, 96)
Sample submission shape: (12065, 53)

Number of IV columns: 52

Number of unique strikes: 36

Strike dictionary:
{'24000': {'call': 'call_iv_24000', 'put': 'put_iv_24000'}, '24100': {'call': 'call_iv_24100', 'put': 'put_iv_24100'}, '24200': {'call': 'call_iv_24200', 'put': 'put_iv_24200'}, '24300': {'call': 'call_iv_24300', 'put': 'put_iv_24300'}, '24400': {'call': 'call_iv_24400', 'put': 'put_iv_24400'}, '24500': {'call': 'call_iv_24500', 'put': 'put_iv_24500'}, '24600': {'call': 'call_iv_24600', 'put': 'put_iv_24600'}, '24700': {'call': 'call_iv_24700', 'put': 'put_iv_24700'}, '24800': {'call': 'call_iv_24800', 'put': 'put_iv_24800'}, '24900': {'call': 'call_iv_24900', 'put': 'put_iv_24900'}, '25000': {'call': 'call_iv_25000', 'put': 'put_iv_25000'}, '25100': {'call': 'call_iv_25100', 'put': 'put_iv_25100'}, '25200': {'call': 'call_iv_25200', 'put': 'put_iv_25200'}, '25300': {'call': 'call_iv_25300', 'put': 

## 3. Define Model Architectures

In [3]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=2, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        last_hidden = lstm_out[:, -1, :]
        return self.fc(last_hidden)

class TransformerModel(nn.Module):
    def __init__(self, input_dim, d_model=128, nhead=8, num_layers=2, dropout=0.2):
        super(TransformerModel, self).__init__()
        
        self.embedding = nn.Linear(input_dim, d_model)
        self.pos_encoder = nn.Dropout(dropout)
        
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=256, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        
        self.decoder = nn.Sequential(
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        x = self.embedding(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)  # Global average pooling
        return self.decoder(x)

class DenseModel(nn.Module):
    def __init__(self, input_dim, hidden_dims=[256, 128, 64], dropout=0.2):
        super(DenseModel, self).__init__()
        
        layers = []
        prev_dim = input_dim
        
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_dim),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim
        
        layers.extend([
            nn.Linear(prev_dim, 1),
            nn.Sigmoid()
        ])
        
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x)

print("Model architectures defined successfully")

Model architectures defined successfully


## 4. Feature Engineering

In [4]:
def create_features(df, sequence_length=10):
    """Create features for the models"""
    print("Creating features...")
    features = df.copy()
    
    # Basic features
    features['log_underlying'] = np.log(features['underlying'])
    features['underlying_returns'] = features['underlying'].pct_change()
    features['underlying_volatility'] = features['underlying_returns'].rolling(window=20).std()
    
    # Technical indicators
    windows = [5, 10, 20]
    for window in windows:
        features[f'underlying_ma_{window}'] = features['underlying'].rolling(window=window).mean()
        features[f'underlying_std_{window}'] = features['underlying'].rolling(window=window).std()
        features[f'underlying_skew_{window}'] = features['underlying_returns'].rolling(window=window).skew()
        features[f'underlying_kurt_{window}'] = features['underlying_returns'].rolling(window=window).kurt()
    
    # Moneyness features
    for strike in strike_dict.keys():
        strike_price = float(strike)
        features[f'moneyness_{strike}'] = features['underlying'] / strike_price
        features[f'log_moneyness_{strike}'] = np.log(features['underlying'] / strike_price)
    
    # Fill missing values
    features = features.fillna(method='ffill').fillna(method='bfill').fillna(0)
    
    # Create sequences for LSTM and Transformer
    feature_cols = [col for col in features.columns if col not in ['timestamp'] + iv_columns]
    X = features[feature_cols].values
    
    sequences = []
    for i in range(len(X) - sequence_length + 1):
        sequences.append(X[i:i + sequence_length])
    
    print(f"Created {len(sequences)} sequences with {len(feature_cols)} features each")
    return np.array(sequences), feature_cols

# Test feature creation on a small sample
sample_sequences, feature_cols = create_features(train.head(100))
print(f"\nSample sequences shape: {sample_sequences.shape}")
print("\nFeature columns:")
print(feature_cols)

Creating features...
Created 91 sequences with 141 features each

Sample sequences shape: (91, 10, 141)

Feature columns:
['underlying', 'expiry', 'call_iv_23500', 'call_iv_23600', 'call_iv_23700', 'call_iv_23800', 'call_iv_23900', 'put_iv_22500', 'put_iv_22600', 'put_iv_22700', 'put_iv_22800', 'put_iv_22900', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41', 'log_underlying', 'underlying_returns', 'underlying_volatility', 'underlying_ma_5', 'underlying_std_5', 'underlying_skew_5', 'underlying_kurt_5', 'underlying_ma_10', 'underlying_std_10', 'underlying_skew_10', 'underlying_kurt_10', 'underlying_ma_20', 'underlying_std_20', 'underlying_skew_20', 'underlying_kurt_20', 'moneyness_24000', 'log_moneyness_24000', 'moneyness_24100', 'log_moneyness_24100', 'mon

## 5. Model Training Function

In [5]:
def train_model(model, train_loader, val_loader, num_epochs=100, patience=10):
    """Train the model with early stopping"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None
    
    print("\nStarting training...")
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                outputs = model(batch_X)
                val_loss += criterion(outputs, batch_y).item()
        
        # Print progress
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss/len(train_loader):.6f} - Val Loss: {val_loss/len(val_loader):.6f}")
        
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            best_model_state = model.state_dict().copy()
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"\nEarly stopping at epoch {epoch+1}")
                break
    
    # Load best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    
    return model

## 6. Ensemble Prediction Function

In [6]:
def predict_iv(data):
    print("Starting ensemble IV prediction...")
    data = data.copy()
    
    # Phase 1: Put-call parity
    print("\nPhase 1: Applying put-call parity...")
    for strike, cols in strike_dict.items():
        call_col = cols['call']
        put_col = cols['put']
        
        if call_col in data.columns and put_col in data.columns:
            call_mask = data[call_col].isna() & data[put_col].notna()
            data.loc[call_mask, call_col] = data.loc[call_mask, put_col]
            
            put_mask = data[put_col].isna() & data[call_col].notna()
            data.loc[put_mask, put_col] = data.loc[put_mask, call_col]
    
    # Phase 2: Feature creation
    print("\nPhase 2: Creating features...")
    sequences, feature_cols = create_features(data)
    
    scaler = StandardScaler()
    X = scaler.fit_transform(sequences.reshape(-1, sequences.shape[-1])).reshape(sequences.shape)
    X_tensor = torch.FloatTensor(X)
    
    # Phase 3: Train models
    print("\nPhase 3: Training models...")
    models = {}
    for col in iv_columns:
        print(f"\nTraining models for {col}...")
        y = data[col].values[9:]  # Adjust for sequence length
        y_tensor = torch.FloatTensor(y).reshape(-1, 1)
        
        train_size = int(0.8 * len(X_tensor))
        train_dataset = TensorDataset(X_tensor[:train_size], y_tensor[:train_size])
        val_dataset = TensorDataset(X_tensor[train_size:], y_tensor[train_size:])
        
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32)
        
        # Train LSTM model
        print("Training LSTM model...")
        lstm_model = LSTMModel(X.shape[-1])
        lstm_model = train_model(lstm_model, train_loader, val_loader)
        
        # Train Transformer model
        print("Training Transformer model...")
        transformer_model = TransformerModel(X.shape[-1])
        transformer_model = train_model(transformer_model, train_loader, val_loader)
        
        # Train Dense model
        print("Training Dense model...")
        dense_model = DenseModel(X.shape[-1])
        dense_model = train_model(dense_model, train_loader, val_loader)
        
        models[col] = {
            'lstm': lstm_model,
            'transformer': transformer_model,
            'dense': dense_model
        }
    
    # Phase 4: Make predictions
    print("\nPhase 4: Making predictions...")
    for col in iv_columns:
        mask = data[col].isna()
        if mask.any():
            # Create sequences for missing values
            missing_indices = mask[mask].index
            for idx in missing_indices:
                if idx >= 9:  # Ensure we have enough history
                    seq = X_tensor[idx-9:idx+1].unsqueeze(0)
                    
                    # Get predictions from each model
                    models[col]['lstm'].eval()
                    models[col]['transformer'].eval()
                    models[col]['dense'].eval()
                    
                    with torch.no_grad():
                        lstm_pred = models[col]['lstm'](seq).item()
                        transformer_pred = models[col]['transformer'](seq).item()
                        dense_pred = models[col]['dense'](seq).item()
                    
                    # Combine predictions (weighted average)
                    pred = 0.4 * lstm_pred + 0.4 * transformer_pred + 0.2 * dense_pred
                    data.at[idx, col] = pred
    
    # Phase 5: Smoothing and consistency
    print("\nPhase 5: Applying smoothing and consistency checks...")
    for idx, row in data.iterrows():
        for strike, cols in strike_dict.items():
            call_col = cols['call']
            put_col = cols['put']
            
            if call_col in data.columns and put_col in data.columns:
                avg_iv = (data.at[idx, call_col] + data.at[idx, put_col]) / 2
                data.at[idx, call_col] = 0.9 * data.at[idx, call_col] + 0.1 * avg_iv
                data.at[idx, put_col] = 0.9 * data.at[idx, put_col] + 0.1 * avg_iv
    
    # Ensure all values are within reasonable bounds
    for col in iv_columns:
        if col in data.columns:
            data[col] = np.clip(data[col], 0.01, 1.0)
    
    print("\nPrediction completed successfully")
    return data

## 7. Validation and Testing

In [7]:
# Create validation split
print("Creating validation split...")
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# Apply to validation set
print("\nRunning validation...")
val_pred = predict_iv(val_df)

# Calculate MSE only on originally masked validation points
mse_vals = []
for col in iv_columns:
    if col in val_df.columns and col in val_pred.columns:
        mask = val_df[col].isna() & val_pred[col].notna()
        if mask.any():
            se = (val_df.loc[mask, col] - val_pred.loc[mask, col]) ** 2
            mse_vals.append(se.mean())

validation_mse = np.mean(mse_vals) if mse_vals else 0
print(f"\nValidation MSE (masked points only): {validation_mse:.12f}")

Creating validation split...
Training set size: 142672
Validation set size: 35668

Running validation...
Starting ensemble IV prediction...

Phase 1: Applying put-call parity...

Phase 2: Creating features...
Creating features...
Created 35659 sequences with 141 features each


MemoryError: Unable to allocate 384. MiB for an array with shape (356590, 141) and data type float64

## 8. Generate Final Predictions

In [None]:
# Apply to test set
print("Generating final predictions...")
test_pred = predict_iv(test)

# Prepare submission
submission = test_pred[['timestamp'] + iv_columns].copy()
submission.columns = sample_sub.columns

# Verify no missing values
assert submission.isna().sum().sum() == 0, "Missing values detected"
submission.to_csv('submission.csv', index=False)

print("\nFinal Submission Preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print(f"Validation MSE: {validation_mse:.12f}")