# Simple Volatility Predictor

This notebook implements a simple volatility prediction model using basic statistical methods and interpolation techniques.

## 1. Import Libraries and Setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully")

Libraries imported successfully


## 2. Load and Prepare Data

In [2]:
# Load data
print("Loading data...")
train = pd.read_parquet('train_data.parquet')
test = pd.read_parquet('test_data.parquet')
sample_sub = pd.read_csv('sample_submission.csv')

print(f"\nTrain data shape: {train.shape}")
print(f"Test data shape: {test.shape}")
print(f"Sample submission shape: {sample_sub.shape}")

# Get all IV columns from TEST data
iv_columns = [col for col in test.columns if col.startswith(('call_iv_', 'put_iv_'))]
print(f"\nNumber of IV columns: {len(iv_columns)}")

# Create strike dictionary from TEST columns
strike_dict = {}
for col in iv_columns:
    strike = col.split('_')[-1]
    if strike not in strike_dict:
        strike_dict[strike] = {'call': None, 'put': None}
    
    if col.startswith('call_iv_'):
        strike_dict[strike]['call'] = col
    else:
        strike_dict[strike]['put'] = col

print(f"\nNumber of unique strikes: {len(strike_dict)}")
print("\nStrike dictionary:")
print(strike_dict)

Loading data...

Train data shape: (178340, 97)
Test data shape: (12065, 96)
Sample submission shape: (12065, 53)

Number of IV columns: 52

Number of unique strikes: 36

Strike dictionary:
{'24000': {'call': 'call_iv_24000', 'put': 'put_iv_24000'}, '24100': {'call': 'call_iv_24100', 'put': 'put_iv_24100'}, '24200': {'call': 'call_iv_24200', 'put': 'put_iv_24200'}, '24300': {'call': 'call_iv_24300', 'put': 'put_iv_24300'}, '24400': {'call': 'call_iv_24400', 'put': 'put_iv_24400'}, '24500': {'call': 'call_iv_24500', 'put': 'put_iv_24500'}, '24600': {'call': 'call_iv_24600', 'put': 'put_iv_24600'}, '24700': {'call': 'call_iv_24700', 'put': 'put_iv_24700'}, '24800': {'call': 'call_iv_24800', 'put': 'put_iv_24800'}, '24900': {'call': 'call_iv_24900', 'put': 'put_iv_24900'}, '25000': {'call': 'call_iv_25000', 'put': 'put_iv_25000'}, '25100': {'call': 'call_iv_25100', 'put': 'put_iv_25100'}, '25200': {'call': 'call_iv_25200', 'put': 'put_iv_25200'}, '25300': {'call': 'call_iv_25300', 'put': 

## 3. Basic Statistical Calculations

In [3]:
def calculate_basic_statistics(df):
    """Calculate basic statistical metrics"""
    print("Calculating basic statistics...")
    stats = df.copy()
    
    # Calculate returns first
    stats['underlying_returns'] = np.log(stats['underlying'] / stats['underlying'].shift(1))
    
    # Realized volatility
    stats['realized_vol'] = stats['underlying_returns'].rolling(window=20).std() * np.sqrt(252)
    
    # Volatility skew
    stats['vol_skew'] = stats['underlying_returns'].rolling(window=20).skew()
    
    # Volatility kurtosis
    stats['vol_kurt'] = stats['underlying_returns'].rolling(window=20).kurt()
    
    # Fill missing values
    stats = stats.fillna(method='ffill').fillna(method='bfill').fillna(0)
    
    print(f"Created {len(stats.columns)} statistical features")
    return stats

# Test statistics calculation on a small sample
sample_stats = calculate_basic_statistics(train.head(100))
print(f"\nSample statistics shape: {sample_stats.shape}")
print("\nStatistical features:")
print(sample_stats.columns.tolist())

Calculating basic statistics...
Created 101 statistical features

Sample statistics shape: (100, 101)

Statistical features:
['timestamp', 'underlying', 'expiry', 'call_iv_23500', 'call_iv_23600', 'call_iv_23700', 'call_iv_23800', 'call_iv_23900', 'call_iv_24000', 'call_iv_24100', 'call_iv_24200', 'call_iv_24300', 'call_iv_24400', 'call_iv_24500', 'call_iv_24600', 'call_iv_24700', 'call_iv_24800', 'call_iv_24900', 'call_iv_25000', 'call_iv_25100', 'call_iv_25200', 'call_iv_25300', 'call_iv_25400', 'call_iv_25500', 'call_iv_25600', 'call_iv_25700', 'call_iv_25800', 'call_iv_25900', 'call_iv_26000', 'put_iv_22500', 'put_iv_22600', 'put_iv_22700', 'put_iv_22800', 'put_iv_22900', 'put_iv_23000', 'put_iv_23100', 'put_iv_23200', 'put_iv_23300', 'put_iv_23400', 'put_iv_23500', 'put_iv_23600', 'put_iv_23700', 'put_iv_23800', 'put_iv_23900', 'put_iv_24000', 'put_iv_24100', 'put_iv_24200', 'put_iv_24300', 'put_iv_24400', 'put_iv_24500', 'put_iv_24600', 'put_iv_24700', 'put_iv_24800', 'put_iv_249

## 4. Simple Interpolation Function

In [4]:
def simple_interpolation(x, y, x_new):
    """Perform simple linear interpolation"""
    return np.interp(x_new, x, y)

print("Simple interpolation function defined successfully")

Simple interpolation function defined successfully


## 5. Prediction Function

In [5]:
def predict_iv(data):
    print("Starting simple volatility prediction...")
    data = data.copy()
    
    # Add missing IV columns from test set
    for col in iv_columns:
        if col not in data.columns:
            data[col] = np.nan
    
    # Phase 1: Put-call parity
    print("\nPhase 1: Applying put-call parity...")
    for strike, cols in strike_dict.items():
        call_col = cols['call']
        put_col = cols['put']
        
        # Ensure both columns exist
        if call_col not in data.columns:
            data[call_col] = np.nan
        if put_col not in data.columns:
            data[put_col] = np.nan
        
        call_mask = data[call_col].isna() & data[put_col].notna()
        data.loc[call_mask, call_col] = data.loc[call_mask, put_col]
        
        put_mask = data[put_col].isna() & data[call_col].notna()
        data.loc[put_mask, put_col] = data.loc[put_mask, call_col]
    
    # Phase 2: Calculate basic statistics
    print("\nPhase 2: Calculating basic statistics...")
    stats = calculate_basic_statistics(data)
    
    # Phase 3: Simple interpolation
    print("\nPhase 3: Performing simple interpolation...")
    for idx, row in data.iterrows():
        # Get available IVs for this timestamp
        available_strikes = []
        available_ivs = []
        
        # Collect all available IVs
        for s, c in strike_dict.items():
            for col_type in ['call', 'put']:
                col = c[col_type]
                if not np.isnan(data.at[idx, col]):
                    available_strikes.append(float(s))
                    available_ivs.append(data.at[idx, col])
        
        # Sort by strike price for proper interpolation
        if len(available_strikes) >= 2:
            sort_idx = np.argsort(available_strikes)
            available_strikes = np.array(available_strikes)[sort_idx]
            available_ivs = np.array(available_ivs)[sort_idx]
            
            # For each missing IV, interpolate using available values
            for strike, cols in strike_dict.items():
                strike_price = float(strike)
                call_col = cols['call']
                put_col = cols['put']
                
                try:
                    if strike_price < min(available_strikes):
                        # Use nearest value for extrapolation
                        iv_pred = available_ivs[0]
                    elif strike_price > max(available_strikes):
                        # Use nearest value for extrapolation
                        iv_pred = available_ivs[-1]
                    else:
                        # Interpolate
                        iv_pred = simple_interpolation(available_strikes, available_ivs, strike_price)
                    
                    if np.isnan(data.at[idx, call_col]):
                        data.at[idx, call_col] = iv_pred
                    
                    if np.isnan(data.at[idx, put_col]):
                        data.at[idx, put_col] = iv_pred
                except:
                    # If interpolation fails, use nearest available IV
                    if np.isnan(data.at[idx, call_col]) or np.isnan(data.at[idx, put_col]):
                        nearest_idx = np.argmin(np.abs(np.array(available_strikes) - strike_price))
                        iv_pred = available_ivs[nearest_idx]
                        
                        if np.isnan(data.at[idx, call_col]):
                            data.at[idx, call_col] = iv_pred
                        
                        if np.isnan(data.at[idx, put_col]):
                            data.at[idx, put_col] = iv_pred
        else:
            # Not enough points for interpolation, use realized volatility as fallback
            realized_vol = stats.at[idx, 'realized_vol']
            default_iv = min(max(realized_vol, 0.01), 1.0) if not np.isnan(realized_vol) else 0.2  # Use 0.2 as last resort
            
            for strike, cols in strike_dict.items():
                call_col = cols['call']
                put_col = cols['put']
                
                if np.isnan(data.at[idx, call_col]):
                    data.at[idx, call_col] = default_iv
                
                if np.isnan(data.at[idx, put_col]):
                    data.at[idx, put_col] = default_iv
    
    # Phase 4: Smoothing and consistency
    print("\nPhase 4: Applying smoothing and consistency checks...")
    for idx, row in data.iterrows():
        for strike, cols in strike_dict.items():
            call_col = cols['call']
            put_col = cols['put']
            
            avg_iv = (data.at[idx, call_col] + data.at[idx, put_col]) / 2
            data.at[idx, call_col] = 0.9 * data.at[idx, call_col] + 0.1 * avg_iv
            data.at[idx, put_col] = 0.9 * data.at[idx, put_col] + 0.1 * avg_iv
    
    # Final check for missing values
    missing_cols = data[iv_columns].isna().any()
    if missing_cols.any():
        print("\nWarning: Still have missing values in columns:")
        print(missing_cols[missing_cols].index.tolist())
        # Use global mean as final fallback
        global_mean = data[iv_columns].mean().mean()
        for col in iv_columns:
            data[col].fillna(global_mean, inplace=True)
    
    # Ensure all values are within reasonable bounds
    for col in iv_columns:
        data[col] = np.clip(data[col], 0.01, 1.0)
    
    print("\nPrediction completed successfully")
    return data

## 6. Validation and Testing

In [6]:
# Create validation split
print("Creating validation split...")
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# Apply to validation set
print("\nRunning validation...")
val_pred = predict_iv(val_df)

# Calculate MSE only on originally masked validation points
mse_vals = []
for col in iv_columns:
    if col in val_df.columns and col in val_pred.columns:
        mask = val_df[col].isna() & val_pred[col].notna()
        if mask.any():
            se = (val_df.loc[mask, col] - val_pred.loc[mask, col]) ** 2
            mse_vals.append(se.mean())

validation_mse = np.mean(mse_vals) if mse_vals else 0
print(f"\nValidation MSE (masked points only): {validation_mse:.12f}")

Creating validation split...
Training set size: 142672
Validation set size: 35668

Running validation...
Starting simple volatility prediction...

Phase 1: Applying put-call parity...

Phase 2: Calculating basic statistics...
Calculating basic statistics...
Created 112 statistical features

Phase 3: Performing simple interpolation...

Phase 4: Applying smoothing and consistency checks...

Prediction completed successfully

Validation MSE (masked points only): 0.000000000000


## 7. Generate Final Predictions

In [None]:
# Apply to test set
print("Generating final predictions...")
test_pred = predict_iv(test)

# Prepare submission
submission = test_pred[['timestamp'] + iv_columns].cosubmission.to_csv('submission.csv', index=False)dex=False)

print("\nFinal Submission Preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print(f"Validation MSE: {validation_mse:.12f}")

Generating final predictions...
Starting simple volatility prediction...

Phase 1: Applying put-call parity...

Phase 2: Calculating basic statistics...
Calculating basic statistics...
Created 100 statistical features

Phase 3: Performing simple interpolation...

Phase 4: Applying smoothing and consistency checks...

Phase 4: Applying smoothing and consistency checks...

Prediction completed successfully

Prediction completed successfully


AssertionError: Missing values detected