# Transformer-based Volatility Prediction

This notebook trains a PyTorch transformer-encoder model (self-attention layers) and uses it to fill in missing implied-volatility (IV) values across option strikes, using engineered moneyness, time-to-expiry and rolling-return features.

## 1. Import Libraries and Setup

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
# Silence every warning for cleaner cell output. Be aware that this also hides
# deprecation warnings raised by pandas / PyTorch; comment it out when debugging.
warnings.filterwarnings('ignore')

# Confirms the import cell ran to completion.
print("Libraries imported successfully")

Libraries imported successfully


## 2. Load and Prepare Data

In [15]:
# Read the competition inputs from the working directory
print("Loading data...")
train = pd.read_parquet('train_data.parquet')
test = pd.read_parquet('test_data.parquet')
sample_sub = pd.read_csv('sample_submission.csv')

# Report the dimensions of each loaded table
print(f"\nTrain data shape: {train.shape}")
print(f"Test data shape: {test.shape}")
print(f"Sample submission shape: {sample_sub.shape}")

# Implied-volatility columns are recognised by prefix and taken from the TEST frame
iv_columns = list(filter(lambda name: name.startswith(('call_iv_', 'put_iv_')), test.columns))
print(f"\nNumber of IV columns: {len(iv_columns)}")

# Map every strike (trailing token of the column name, kept as a string) to its
# call/put column names; a side that has no column for that strike stays None
strike_dict = {}
for col in iv_columns:
    strike = col.rsplit('_', 1)[-1]
    side = 'call' if col.startswith('call_iv_') else 'put'
    strike_dict.setdefault(strike, {'call': None, 'put': None})[side] = col

print(f"\nNumber of unique strikes: {len(strike_dict)}")
print("\nStrike dictionary:")
print(strike_dict)

Loading data...

Train data shape: (178340, 97)
Test data shape: (12065, 96)
Sample submission shape: (12065, 53)

Number of IV columns: 52

Number of unique strikes: 36

Strike dictionary:
{'24000': {'call': 'call_iv_24000', 'put': 'put_iv_24000'}, '24100': {'call': 'call_iv_24100', 'put': 'put_iv_24100'}, '24200': {'call': 'call_iv_24200', 'put': 'put_iv_24200'}, '24300': {'call': 'call_iv_24300', 'put': 'put_iv_24300'}, '24400': {'call': 'call_iv_24400', 'put': 'put_iv_24400'}, '24500': {'call': 'call_iv_24500', 'put': 'put_iv_24500'}, '24600': {'call': 'call_iv_24600', 'put': 'put_iv_24600'}, '24700': {'call': 'call_iv_24700', 'put': 'put_iv_24700'}, '24800': {'call': 'call_iv_24800', 'put': 'put_iv_24800'}, '24900': {'call': 'call_iv_24900', 'put': 'put_iv_24900'}, '25000': {'call': 'call_iv_25000', 'put': 'put_iv_25000'}, '25100': {'call': 'call_iv_25100', 'put': 'put_iv_25100'}, '25200': {'call': 'call_iv_25200', 'put': 'put_iv_25200'}, '25300': {'call': 'call_iv_25300', 'put': 

## 3. Define Transformer Model

In [22]:
import torch.nn as nn

class TransformerModel(nn.Module):
    """Transformer-encoder regressor that produces one scalar per input sample.

    Pipeline: linear embedding -> dropout + layer norm -> ``num_layers`` stacked
    ``nn.TransformerEncoderLayer`` blocks (``batch_first=True``) -> mean over the
    sequence axis -> small MLP head with a single output unit.

    Args:
        input_dim: Number of input features per sample / per sequence step.
        d_model: Requested embedding width. It is rounded DOWN to the nearest
            multiple of ``nhead``; the effective width is stored in ``self.d_model``.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability used in every dropout layer.

    Raises:
        ValueError: If ``nhead`` is smaller than 1, or ``d_model`` is smaller than
            ``nhead`` (rounding down would otherwise yield a zero-width model).

    Notes:
        * No positional information is injected. ``pos_encoder`` keeps its
          historical name so that saved state-dict keys stay loadable, but it only
          applies dropout followed by layer normalisation.
        * A 2-D input is turned into a batch of length-1 sequences, so in that
          case each sample's self-attention only ever sees its own single token.
    """

    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=2, dropout=0.1):
        super().__init__()

        if nhead < 1:
            raise ValueError(f"nhead must be >= 1, got {nhead}")
        if d_model < nhead:
            raise ValueError(
                f"d_model ({d_model}) must be >= nhead ({nhead}); "
                "rounding down to a multiple of nhead would give a width of 0"
            )

        # Multi-head attention needs d_model % nhead == 0, so round down
        self.d_model = d_model - (d_model % nhead)

        # Input embedding: project raw features to the model width
        self.embedding = nn.Linear(input_dim, self.d_model)

        # Regularise + normalise the embedding (NOT a positional encoding, see Notes)
        self.pos_encoder = nn.Sequential(
            nn.Dropout(dropout),
            nn.LayerNorm(self.d_model)
        )

        # Transformer encoder stack operating on (batch, seq, feature) tensors
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.d_model,
            nhead=nhead,
            dim_feedforward=self.d_model * 4,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Prediction head: d_model -> d_model // 2 -> 1
        self.decoder = nn.Sequential(
            nn.Linear(self.d_model, self.d_model // 2),
            nn.LayerNorm(self.d_model // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(self.d_model // 2, 1)
        )

    def forward(self, x):
        """Run the model.

        Args:
            x: Tensor of shape ``(batch, input_dim)`` or ``(batch, seq_len, input_dim)``.

        Returns:
            Tensor of shape ``(batch, 1)``.

        Raises:
            ValueError: If ``x`` is neither 2-D nor 3-D.
        """
        if x.dim() == 2:
            # Treat each row as a sequence of length one
            x = x.unsqueeze(1)
        elif x.dim() != 3:
            raise ValueError(
                f"expected input of shape (batch, features) or (batch, seq, features), "
                f"got a tensor with {x.dim()} dimension(s)"
            )

        x = self.embedding(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)

        # Global average pooling over the sequence dimension -> (batch, d_model)
        x = x.mean(dim=1)

        return self.decoder(x)

print("Transformer model class defined successfully")

Transformer model class defined successfully


## 4. Feature Engineering

In [23]:
def create_transformer_features(df, reference_strike=None):
    """Create features for transformer model.

    Works on a copy of ``df`` and appends, in this order: ``strike``,
    ``moneyness``, ``time_to_expiry``, ``returns``, ``volatility``, ``skew``,
    ``kurtosis``, ``moneyness_squared``, ``moneyness_cubed``,
    ``time_to_expiry_squared`` and ``time_to_expiry_cubed``. ``timestamp`` and
    ``expiry`` are converted to datetimes in the returned frame.

    Args:
        df: Frame with ``timestamp``, ``underlying`` and ``expiry`` columns and,
            unless ``reference_strike`` is given, at least one ``call_iv_*`` or
            ``put_iv_*`` column.
        reference_strike: Strike used for ``strike`` / ``moneyness``. When ``None``
            (the default, original behaviour) it is parsed from the FIRST IV
            column of ``df``. That makes the value depend on the column set and
            order of each frame, so pass an explicit value when features from
            frames with different IV columns must be comparable.

    Returns:
        A new DataFrame containing the original columns plus the engineered ones,
        with every missing value filled (forward fill, then backward fill, then 0).
        Note that this filling also applies to the IV columns themselves.

    Raises:
        ValueError: If ``reference_strike`` is ``None`` and ``df`` has no IV columns.

    Caveats:
        * ``returns`` and the 20-row rolling statistics follow the ROW ORDER of
          ``df``; they are only meaningful if rows are in chronological order.
        * ``volatility`` is scaled by ``sqrt(252)``, which presumably assumes one
          row per trading day -- TODO confirm against the data frequency.
    """
    print("Creating transformer features...")
    features = df.copy()

    if reference_strike is None:
        iv_cols = [col for col in features.columns if col.startswith(('call_iv_', 'put_iv_'))]
        if not iv_cols:
            raise ValueError("No IV columns found in the data")
        # A single constant strike is applied to every row
        reference_strike = iv_cols[0].split('_')[-1]
    features['strike'] = float(reference_strike)

    # Basic features
    features['moneyness'] = np.log(features['underlying'] / features['strike'])

    # Time to expiry, expressed in days
    features['timestamp'] = pd.to_datetime(features['timestamp'])
    features['expiry'] = pd.to_datetime(features['expiry'])
    seconds_per_day = 24 * 3600
    features['time_to_expiry'] = (
        (features['expiry'] - features['timestamp']).dt.total_seconds() / seconds_per_day
    )

    # Rolling statistics of row-to-row log returns of the underlying
    features['returns'] = np.log(features['underlying'] / features['underlying'].shift(1))
    rolling_returns = features['returns'].rolling(window=20)
    features['volatility'] = rolling_returns.std() * np.sqrt(252)
    features['skew'] = rolling_returns.skew()
    features['kurtosis'] = rolling_returns.kurt()

    # Polynomial expansions
    features['moneyness_squared'] = features['moneyness'] ** 2
    features['moneyness_cubed'] = features['moneyness'] ** 3
    features['time_to_expiry_squared'] = features['time_to_expiry'] ** 2
    features['time_to_expiry_cubed'] = features['time_to_expiry'] ** 3

    # fillna(method=...) is deprecated in pandas; ffill()/bfill() are the
    # supported equivalents and give the same result
    features = features.ffill().bfill().fillna(0)

    print(f"Created {len(features.columns)} features")
    return features

# Smoke-test the feature builder on the first 100 training rows and show
# the resulting frame dimensions and column names
sample_features = create_transformer_features(train.iloc[:100])
print(f"\nSample features shape: {sample_features.shape}")
print("\nFeature columns:")
print(list(sample_features.columns))

Creating transformer features...
Created 108 features

Sample features shape: (100, 108)

Feature columns:
['timestamp', 'underlying', 'expiry', 'call_iv_23500', 'call_iv_23600', 'call_iv_23700', 'call_iv_23800', 'call_iv_23900', 'call_iv_24000', 'call_iv_24100', 'call_iv_24200', 'call_iv_24300', 'call_iv_24400', 'call_iv_24500', 'call_iv_24600', 'call_iv_24700', 'call_iv_24800', 'call_iv_24900', 'call_iv_25000', 'call_iv_25100', 'call_iv_25200', 'call_iv_25300', 'call_iv_25400', 'call_iv_25500', 'call_iv_25600', 'call_iv_25700', 'call_iv_25800', 'call_iv_25900', 'call_iv_26000', 'put_iv_22500', 'put_iv_22600', 'put_iv_22700', 'put_iv_22800', 'put_iv_22900', 'put_iv_23000', 'put_iv_23100', 'put_iv_23200', 'put_iv_23300', 'put_iv_23400', 'put_iv_23500', 'put_iv_23600', 'put_iv_23700', 'put_iv_23800', 'put_iv_23900', 'put_iv_24000', 'put_iv_24100', 'put_iv_24200', 'put_iv_24300', 'put_iv_24400', 'put_iv_24500', 'put_iv_24600', 'put_iv_24700', 'put_iv_24800', 'put_iv_24900', 'put_iv_25000

## 5. Model Training Function

In [24]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=50, patience=10):
    """Train the transformer model with early stopping.

    Each epoch runs one pass over ``train_loader`` (with gradient updates) and one
    pass over ``val_loader`` (without gradients). Reported losses are the mean of
    the per-batch losses. Whenever the validation loss improves, the weights are
    written to ``best_transformer_model.pth`` in the working directory. Training
    stops after ``num_epochs`` epochs or once ``patience`` consecutive epochs
    fail to improve the validation loss.

    On return, ``model`` holds the weights of the BEST validation epoch (not the
    last one), so it can be used for prediction directly.

    Args:
        model: ``nn.Module`` to train; moved to CUDA when available, else CPU.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        criterion: Loss callable ``criterion(outputs, targets)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Maximum number of epochs.
        patience: Number of non-improving epochs tolerated before stopping.

    Returns:
        ``(train_losses, val_losses)``: two lists with one entry per completed epoch.

    Raises:
        ValueError: If either loader yields no batches.
    """
    import copy  # local import: only needed for the in-memory best-weights snapshot

    print("Starting model training...")
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    best_val_loss = float('inf')
    best_state = None
    patience_counter = 0
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item()

        val_loss /= len(val_loader)
        val_losses.append(val_loss)

        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_transformer_model.pth')
            # Keep an independent copy; state_dict() tensors alias the live weights
            best_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Without this the caller would keep the weights of the last (possibly worse) epoch
    if best_state is not None:
        model.load_state_dict(best_state)

    print("\nTraining completed successfully")
    return train_losses, val_losses

print("Model training function defined successfully")

Model training function defined successfully


## 6. Prediction Function

In [25]:
def predict_iv(data, model=None, scaler=None):
    """Fill missing implied-volatility (IV) values and return the completed copy.

    Relies on the notebook-level ``strike_dict`` and ``iv_columns`` as well as
    ``create_transformer_features`` and ``TransformerModel``.

    Phases:
        1. For strikes that have both a call and a put column in ``data``, copy
           the known side into the missing side.
        2. Build model features with ``create_transformer_features``.
        3. Standardise the feature columns (every column except ``timestamp``,
           ``expiry`` and the names in ``iv_columns``).
        4. Predict ONE value per row and write it into every IV cell of that row
           that is still missing.
        5. For paired strikes, pull call and put 10% towards their average; then
           clip every column listed in ``iv_columns`` to ``[0.01, 1.0]``.

    Args:
        data: Frame holding the IV columns plus the inputs required by
            ``create_transformer_features``. It is not modified.
        model: Trained model. When ``None``, a ``TransformerModel`` is built and
            its weights are loaded from ``best_transformer_model.pth``.
        scaler: Fitted scaler exposing ``transform``. When ``None``, a new
            ``StandardScaler`` is fitted on ``data`` itself.

    Returns:
        Copy of ``data`` with the IV columns filled, smoothed and clipped.

    Raises:
        ValueError: If ``model`` is ``None`` and no saved checkpoint exists.
        RuntimeError: If the model does not yield exactly one prediction per row.
    """
    print("Starting transformer IV prediction...")
    data = data.copy()

    # Strikes for which BOTH sides are present in this frame
    paired_cols = [
        (cols['call'], cols['put'])
        for cols in strike_dict.values()
        if cols['call'] is not None and cols['put'] is not None
        and cols['call'] in data.columns and cols['put'] in data.columns
    ]
    # Every known IV column present in this frame, including strikes that only
    # have a call or only a put (these were previously never filled)
    fill_cols = [
        col
        for cols in strike_dict.values()
        for col in (cols['call'], cols['put'])
        if col is not None and col in data.columns
    ]

    # Phase 1: Put-call parity
    print("\nPhase 1: Applying put-call parity...")
    for call_col, put_col in paired_cols:
        call_mask = data[call_col].isna() & data[put_col].notna()
        data.loc[call_mask, call_col] = data.loc[call_mask, put_col]

        put_mask = data[put_col].isna() & data[call_col].notna()
        data.loc[put_mask, put_col] = data.loc[put_mask, call_col]

    # Phase 2: Create features
    print("\nPhase 2: Creating features...")
    features = create_transformer_features(data)

    # Phase 3: Scale features
    print("\nPhase 3: Scaling features...")
    excluded = set(['timestamp', 'expiry'] + iv_columns)
    feature_cols = [col for col in features.columns if col not in excluded]
    if scaler is None:
        scaler = StandardScaler()
        scaled = scaler.fit_transform(features[feature_cols])
    else:
        scaled = scaler.transform(features[feature_cols])

    # Phase 4: Model prediction
    print("\nPhase 4: Making predictions...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if model is None:
        model = TransformerModel(input_dim=len(feature_cols))
        try:
            # map_location lets a checkpoint saved on GPU load on a CPU-only machine
            model.load_state_dict(torch.load('best_transformer_model.pth', map_location=device))
        except FileNotFoundError:
            raise ValueError("No trained model provided and no saved model found")

    model = model.to(device)
    model.eval()

    X = torch.as_tensor(np.asarray(scaled, dtype=np.float32))
    dataloader = DataLoader(TensorDataset(X), batch_size=64, shuffle=False)

    chunks = []
    with torch.no_grad():
        for (batch_x,) in dataloader:
            outputs = model(batch_x.to(device))
            chunks.append(outputs.cpu().numpy().reshape(-1))
    predictions = np.concatenate(chunks) if chunks else np.empty(0, dtype=np.float32)

    if len(predictions) != len(data):
        raise RuntimeError(
            f"expected one prediction per row ({len(data)}), got {len(predictions)}"
        )

    # Apply predictions to missing values. Predictions are matched to rows by
    # POSITION, so this is correct for any index (e.g. a shuffled split).
    for col in fill_cols:
        values = data[col].to_numpy(dtype=float)
        missing = np.isnan(values)
        if missing.any():
            data[col] = np.where(missing, predictions, values)

    # Phase 5: Smoothing and consistency
    print("\nPhase 5: Applying smoothing and consistency checks...")
    for call_col, put_col in paired_cols:
        call_iv = data[call_col].to_numpy(dtype=float)
        put_iv = data[put_col].to_numpy(dtype=float)
        avg_iv = (call_iv + put_iv) / 2
        data[call_col] = 0.9 * call_iv + 0.1 * avg_iv
        data[put_col] = 0.9 * put_iv + 0.1 * avg_iv

    # Ensure all values are within reasonable bounds
    for col in iv_columns:
        if col in data.columns:
            data[col] = np.clip(data[col], 0.01, 1.0)

    print("\nPrediction completed successfully")
    return data

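## 7. Validation and Testing

> **Execution order:** the first cell below creates the `train_df` / `val_df` split that the training cell uses, but its validation step needs the `model` and `scaler` produced by that training cell. Run the split, then the training cell, then re-run the validation cell.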

In [26]:
# Execution-order note: the training cell further down consumes `train_df` /
# `val_df` created here, while the validation part of this cell needs the
# `model` and `scaler` created by that training cell. On a fresh kernel the
# validation is therefore skipped; re-run this cell after training.

# Create validation split
print("Creating validation split...")
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

validation_mse = float('nan')

if 'model' not in globals() or 'scaler' not in globals():
    print("\nSkipping validation: `model` and `scaler` are not defined yet. "
          "Run the training cell, then re-run this cell.")
else:
    print("\nRunning validation...")

    # An error can only be measured where the true IV is known, so hide a random
    # 30% of the KNOWN values, let the pipeline reconstruct them and compare.
    mask_rng = np.random.default_rng(42)
    # Positional index so that any position-based lookup downstream is valid
    val_truth = val_df.reset_index(drop=True)
    val_masked = val_truth.copy()
    hidden_masks = {}
    for col in iv_columns:
        if col in val_truth.columns:
            hide = val_truth[col].notna().to_numpy() & (mask_rng.random(len(val_truth)) < 0.3)
            hidden_masks[col] = hide
            val_masked.loc[hide, col] = np.nan

    val_pred = predict_iv(val_masked, model=model, scaler=scaler)

    # Calculate MSE only on the deliberately hidden points that were filled
    mse_vals = []
    for col, hide in hidden_masks.items():
        scored = hide & val_pred[col].notna().to_numpy()
        if scored.any():
            se = (val_truth[col].to_numpy()[scored] - val_pred[col].to_numpy()[scored]) ** 2
            mse_vals.append(se.mean())

    # NaN (not 0) when nothing could be scored, so an empty evaluation is not
    # mistaken for a perfect one
    if mse_vals:
        validation_mse = np.mean(mse_vals)
    print(f"\nValidation MSE (masked points only): {validation_mse:.12f}")

Creating validation split...
Training set size: 142672
Validation set size: 35668

Running validation...
Starting transformer IV prediction...

Phase 1: Applying put-call parity...

Phase 2: Creating features...
Creating transformer features...
Created 108 features

Phase 3: Scaling features...

Phase 4: Making predictions...
Created 108 features

Phase 3: Scaling features...

Phase 4: Making predictions...


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x20 and 64x32)

In [21]:
# Prepare data for training
print("Preparing training data...")

# Seed PyTorch so weight initialisation and batch shuffling are repeatable
torch.manual_seed(42)

train_features = create_transformer_features(train_df)

# Scale features
scaler = StandardScaler()
# NOTE(review): `iv_columns` is derived from the TEST frame, so any IV column that
# exists only in the training data stays in `feature_cols` -- confirm this is
# intended, since such columns would be unavailable at test time.
feature_cols = [col for col in train_features.columns if col not in ['timestamp', 'expiry'] + iv_columns]
X_train = scaler.fit_transform(train_features[feature_cols])

# Prepare target variable (we'll use first IV column as target for demonstration)
target_col = iv_columns[0]

# create_transformer_features forward/back-fills missing values, including the
# target column. Label availability is therefore read from the raw split so the
# model is only trained on observed targets, never on filled-in ones.
train_has_target = train_df[target_col].notna().to_numpy()
y_train = train_features[target_col].to_numpy()

# Convert to tensors
X_train_tensor = torch.FloatTensor(X_train[train_has_target])
y_train_tensor = torch.FloatTensor(y_train[train_has_target]).reshape(-1, 1)

# Create data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Prepare validation data (scaled with the statistics fitted on the training set)
val_features = create_transformer_features(val_df)
X_val = scaler.transform(val_features[feature_cols])
val_has_target = val_df[target_col].notna().to_numpy()
y_val = val_features[target_col].to_numpy()

X_val_tensor = torch.FloatTensor(X_val[val_has_target])
y_val_tensor = torch.FloatTensor(y_val[val_has_target]).reshape(-1, 1)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Initialize model and training components
model = TransformerModel(input_dim=len(feature_cols))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train model
print("\nTraining model...")
train_losses, val_losses = train_model(model, train_loader, val_loader, criterion, optimizer)

print("\nModel training completed. Now running validation...")


Preparing training data...
Creating transformer features...
Created 108 features
Created 108 features
Creating transformer features...
Created 108 features
Creating transformer features...
Created 108 features

Training model...
Starting model training...

Training model...
Starting model training...


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x16 and 64x32)

## 8. Generate Final Predictions

In [None]:
# Apply to test set
print("Generating final predictions...")
test_pred = predict_iv(test, model=model, scaler=scaler)

# Prepare submission
expected_cols = list(sample_sub.columns)
if all(col in test_pred.columns for col in expected_cols):
    # Select by NAME so values cannot end up under the wrong header if the
    # sample submission orders its columns differently from `iv_columns`
    submission = test_pred[expected_cols].copy()
else:
    # Fall back to the positional mapping when the header names differ
    submission = test_pred[['timestamp'] + iv_columns].copy()
    if submission.shape[1] != len(expected_cols):
        raise ValueError(
            f"Submission has {submission.shape[1]} columns but the sample "
            f"submission expects {len(expected_cols)}"
        )
    submission.columns = expected_cols

# Verify shape and that no missing values remain
assert len(submission) == len(sample_sub), "Row count differs from sample submission"
assert submission.isna().sum().sum() == 0, "Missing values detected"
submission.to_csv('submission.csv', index=False)

print("\nFinal Submission Preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print(f"Validation MSE: {validation_mse:.12f}")