## 1. Import Libraries and Setup

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.optimize import minimize
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully")

Libraries imported successfully


In [None]:
def calculate_advanced_statistics(df):
    """Calculate advanced statistical metrics"""
    print("Calculating advanced statistics...")
    stats = df.copy()

    stats['underlying_returns'] = np.log(stats['underlying'] / stats['underlying'].shift(1))
    
    # Volatility of volatility
    stats['vol_of_vol'] = stats['underlying_returns'].rolling(window=20).std().rolling(window=20).std()
    
    # Realized volatility
    stats['realized_vol'] = stats['underlying_returns'].rolling(window=20).std() * np.sqrt(252)
    
    # Volatility skew
    stats['vol_skew'] = stats['underlying_returns'].rolling(window=20).skew()
    
    # Volatility kurtosis
    stats['vol_kurt'] = stats['underlying_returns'].rolling(window=20).kurt()
    
    # Volatility term structure
    for window in [5, 10, 20, 60]:
        stats[f'vol_term_{window}'] = stats['underlying_returns'].rolling(window=window).std() * np.sqrt(252)
    
    # Fill numerical values first
    numerical_cols = stats.select_dtypes(include=[np.number]).columns
    stats[numerical_cols] = stats[numerical_cols].fillna(method='ffill').fillna(method='bfill').fillna(0)
    
    # Volatility regime indicators - create after filling numerical values
    try:
        stats['vol_regime'] = pd.qcut(stats['realized_vol'], q=5, labels=['very_low', 'low', 'medium', 'high', 'very_high'])
    except ValueError:
        # Handle case when there are not enough unique values
        stats['vol_regime'] = pd.Series(['medium'] * len(stats), index=stats.index)
    
    # Fill any remaining categorical columns with their mode
    categorical_cols = stats.select_dtypes(include=['category']).columns
    for col in categorical_cols:
        mode_val = stats[col].mode()[0] if not stats[col].mode().empty else None
        stats[col] = stats[col].fillna(mode_val)
    
    print(f"Created {len(stats.columns)} statistical features")
    return stats

# Test statistics calculation on a small sample
sample_stats = calculate_advanced_statistics(train.head(100))
print(f"\nSample statistics shape: {sample_stats.shape}")
print("\nStatistical features:")
print(sample_stats.columns.tolist())

## 2. Load and Prepare Data

In [3]:
# Load data
print("Loading data...")
train = pd.read_parquet('train_data.parquet')
test = pd.read_parquet('test_data.parquet')
sample_sub = pd.read_csv('sample_submission.csv')

print(f"\nTrain data shape: {train.shape}")
print(f"Test data shape: {test.shape}")
print(f"Sample submission shape: {sample_sub.shape}")

# Get all IV columns from TEST data
iv_columns = [col for col in test.columns if col.startswith(('call_iv_', 'put_iv_'))]
print(f"\nNumber of IV columns: {len(iv_columns)}")

# Create strike dictionary from TEST columns
strike_dict = {}
for col in iv_columns:
    strike = col.split('_')[-1]
    if strike not in strike_dict:
        strike_dict[strike] = {'call': None, 'put': None}
    
    if col.startswith('call_iv_'):
        strike_dict[strike]['call'] = col
    else:
        strike_dict[strike]['put'] = col

print(f"\nNumber of unique strikes: {len(strike_dict)}")
print("\nStrike dictionary:")
print(strike_dict)

Loading data...

Train data shape: (178340, 97)
Test data shape: (12065, 96)
Sample submission shape: (12065, 53)

Number of IV columns: 52

Number of unique strikes: 36

Strike dictionary:
{'24000': {'call': 'call_iv_24000', 'put': 'put_iv_24000'}, '24100': {'call': 'call_iv_24100', 'put': 'put_iv_24100'}, '24200': {'call': 'call_iv_24200', 'put': 'put_iv_24200'}, '24300': {'call': 'call_iv_24300', 'put': 'put_iv_24300'}, '24400': {'call': 'call_iv_24400', 'put': 'put_iv_24400'}, '24500': {'call': 'call_iv_24500', 'put': 'put_iv_24500'}, '24600': {'call': 'call_iv_24600', 'put': 'put_iv_24600'}, '24700': {'call': 'call_iv_24700', 'put': 'put_iv_24700'}, '24800': {'call': 'call_iv_24800', 'put': 'put_iv_24800'}, '24900': {'call': 'call_iv_24900', 'put': 'put_iv_24900'}, '25000': {'call': 'call_iv_25000', 'put': 'put_iv_25000'}, '25100': {'call': 'call_iv_25100', 'put': 'put_iv_25100'}, '25200': {'call': 'call_iv_25200', 'put': 'put_iv_25200'}, '25300': {'call': 'call_iv_25300', 'put': 

## 3. SVI Model Implementation

In [4]:
def svi(k, a, b, rho, m, sigma):
    """SVI parametrization"""
    return a + b * (rho * (k - m) + np.sqrt((k - m)**2 + sigma**2))

def svi_objective(params, k, w):
    """Objective function for SVI fitting"""
    a, b, rho, m, sigma = params
    w_fit = svi(k, a, b, rho, m, sigma)
    return np.sum((w - w_fit)**2)

def fit_svi(k, w, initial_params=None):
    """Fit SVI model to data"""
    if initial_params is None:
        initial_params = [0.04, 0.4, 0.0, 0.0, 0.1]
    
    bounds = [
        (-np.inf, np.inf),  # a
        (0, np.inf),        # b
        (-1, 1),            # rho
        (-np.inf, np.inf),  # m
        (0, np.inf)         # sigma
    ]
    
    result = minimize(
        svi_objective,
        initial_params,
        args=(k, w),
        bounds=bounds,
        method='L-BFGS-B'
    )
    
    return result.x

print("SVI model functions defined successfully")

SVI model functions defined successfully


## 4. Advanced Statistical Calculations

In [7]:
def calculate_advanced_statistics(df):
    """Calculate advanced statistical metrics"""
    print("Calculating advanced statistics...")
    stats = df.copy()

    stats['underlying_returns'] = np.log(stats['underlying'] / stats['underlying'].shift(1))
    
    # Volatility of volatility
    stats['vol_of_vol'] = stats['underlying_returns'].rolling(window=20).std().rolling(window=20).std()
    
    # Realized volatility
    stats['realized_vol'] = stats['underlying_returns'].rolling(window=20).std() * np.sqrt(252)
    
    # Volatility skew
    stats['vol_skew'] = stats['underlying_returns'].rolling(window=20).skew()
    
    # Volatility kurtosis
    stats['vol_kurt'] = stats['underlying_returns'].rolling(window=20).kurt()
    
    # Volatility term structure
    for window in [5, 10, 20, 60]:
        stats[f'vol_term_{window}'] = stats['underlying_returns'].rolling(window=window).std() * np.sqrt(252)
    
    # Fill missing numerical values first
    numerical_cols = stats.select_dtypes(include=[np.number]).columns
    stats[numerical_cols] = stats[numerical_cols].fillna(method='ffill').fillna(method='bfill').fillna(0)
    
    # Now create volatility regime after filling numerical values
    try:
        # Handle edge case where all values might be the same
        unique_vals = stats['realized_vol'].nunique()
        if unique_vals >= 5:
            stats['vol_regime'] = pd.qcut(stats['realized_vol'], q=5, labels=['very_low', 'low', 'medium', 'high', 'very_high'])
        else:
            # If not enough unique values, use a simpler classification
            stats['vol_regime'] = pd.cut(stats['realized_vol'], 
                                        bins=[-np.inf, stats['realized_vol'].quantile(0.33), 
                                              stats['realized_vol'].quantile(0.67), np.inf],
                                        labels=['low', 'medium', 'high'])
    except Exception as e:
        print(f"Warning: Could not create volatility regime categories: {e}")
        stats['vol_regime'] = 'medium'  # Default to medium if categorization fails
    
    print(f"Created {len(stats.columns)} statistical features")
    return stats

# Test statistics calculation on a small sample
sample_stats = calculate_advanced_statistics(train.head(100))
print(f"\nSample statistics shape: {sample_stats.shape}")
print("\nStatistical features:")
print(sample_stats.columns.tolist())

Calculating advanced statistics...
        0.011149884145045185,  0.022417607704103816,  0.022417607704103816],
      dtype='float64', name='realized_vol').
You can drop duplicate edges by setting the 'duplicates' kwarg
Created 107 statistical features

Sample statistics shape: (100, 107)

Statistical features:
['timestamp', 'underlying', 'expiry', 'call_iv_23500', 'call_iv_23600', 'call_iv_23700', 'call_iv_23800', 'call_iv_23900', 'call_iv_24000', 'call_iv_24100', 'call_iv_24200', 'call_iv_24300', 'call_iv_24400', 'call_iv_24500', 'call_iv_24600', 'call_iv_24700', 'call_iv_24800', 'call_iv_24900', 'call_iv_25000', 'call_iv_25100', 'call_iv_25200', 'call_iv_25300', 'call_iv_25400', 'call_iv_25500', 'call_iv_25600', 'call_iv_25700', 'call_iv_25800', 'call_iv_25900', 'call_iv_26000', 'put_iv_22500', 'put_iv_22600', 'put_iv_22700', 'put_iv_22800', 'put_iv_22900', 'put_iv_23000', 'put_iv_23100', 'put_iv_23200', 'put_iv_23300', 'put_iv_23400', 'put_iv_23500', 'put_iv_23600', 'put_iv_23700',

## 5. Prediction Function

In [11]:
def predict_iv(data):
    print("Starting SVI IV prediction...")
    data = data.copy()
    
    # Phase 1: Put-call parity
    print("\nPhase 1: Applying put-call parity...")
    for strike, cols in strike_dict.items():
        call_col = cols['call']
        put_col = cols['put']
        
        if call_col is not None and put_col is not None and call_col in data.columns and put_col in data.columns:
            call_mask = data[call_col].isna() & data[put_col].notna()
            data.loc[call_mask, call_col] = data.loc[call_mask, put_col]
            
            put_mask = data[put_col].isna() & data[call_col].notna()
            data.loc[put_mask, put_col] = data.loc[put_mask, call_col]
    
    # Phase 2: Calculate advanced statistics
    print("\nPhase 2: Calculating advanced statistics...")
    stats = calculate_advanced_statistics(data)
    
    # Phase 3: SVI fitting and prediction
    print("\nPhase 3: Fitting SVI models...")
    for idx, row in data.iterrows():
        for strike, cols in strike_dict.items():
            strike_price = float(strike)
            call_col = cols['call']
            put_col = cols['put']
            
            if call_col is not None and put_col is not None and call_col in data.columns and put_col in data.columns:
                # Get available IVs for this timestamp
                available_strikes = []
                available_ivs = []
                
                for s, c in strike_dict.items():
                    if c['call'] is not None and not np.isnan(data.at[idx, c['call']]):
                        available_strikes.append(float(s))
                        available_ivs.append(data.at[idx, c['call']])
                    if c['put'] is not None and not np.isnan(data.at[idx, c['put']]):
                        available_strikes.append(float(s))
                        available_ivs.append(data.at[idx, c['put']])
                
                if len(available_strikes) >= 3:
                    # Fit SVI model
                    k = np.log(np.array(available_strikes) / row['underlying'])
                    w = np.array(available_ivs)**2
                    
                    try:
                        params = fit_svi(k, w)
                        
                        # Predict missing IVs
                        if np.isnan(data.at[idx, call_col]):
                            k_pred = np.log(strike_price / row['underlying'])
                            w_pred = svi(k_pred, *params)
                            data.at[idx, call_col] = np.sqrt(w_pred)
                        
                        if np.isnan(data.at[idx, put_col]):
                            k_pred = np.log(strike_price / row['underlying'])
                            w_pred = svi(k_pred, *params)
                            data.at[idx, put_col] = np.sqrt(w_pred)
                    except:
                        # If SVI fails, use nearest available IV
                        if np.isnan(data.at[idx, call_col]):
                            if available_strikes:  # Check if we have any available strikes
                                nearest_idx = np.argmin(np.abs(np.array(available_strikes) - strike_price))
                                data.at[idx, call_col] = available_ivs[nearest_idx]
                        
                        if np.isnan(data.at[idx, put_col]):
                            if not np.isnan(data.at[idx, call_col]):
                                data.at[idx, put_col] = data.at[idx, call_col]
    
    # Phase 4: Smoothing and consistency
    print("\nPhase 4: Applying smoothing and consistency checks...")
    for idx, row in data.iterrows():
        for strike, cols in strike_dict.items():
            call_col = cols['call']
            put_col = cols['put']
            
            if call_col is not None and put_col is not None and call_col in data.columns and put_col in data.columns:
                if not np.isnan(data.at[idx, call_col]) and not np.isnan(data.at[idx, put_col]):
                    avg_iv = (data.at[idx, call_col] + data.at[idx, put_col]) / 2
                    data.at[idx, call_col] = 0.9 * data.at[idx, call_col] + 0.1 * avg_iv
                    data.at[idx, put_col] = 0.9 * data.at[idx, put_col] + 0.1 * avg_iv
    
    # Ensure all values are within reasonable bounds
    for col in iv_columns:
        if col in data.columns:
            data[col] = np.clip(data[col], 0.01, 1.0)
    
    print("\nPrediction completed successfully")
    return data

## 6. Validation and Testing

In [None]:
# Create validation split
print("Creating validation split...")
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# Add any missing IV columns to validation set
for col in iv_columns:
    if col not in val_df.columns:
        val_df[col] = np.nan

# Apply to validation set
print("\nRunning validation...")
val_pred = predict_iv(val_df)

# Calculate MSE only on originally masked validation points
mse_vals = []
for col in iv_columns:
    if col in val_df.columns and col in val_pred.columns:
        mask = val_df[col].isna() & val_pred[col].notna()
        if mask.any():
            se = (val_df.loc[mask, col] - val_pred.loc[mask, col]) ** 2
            mse_vals.append(se.mean())

validation_mse = np.mean(mse_vals) if mse_vals else 0
print(f"\nValidation MSE (masked points only): {validation_mse:.12f}")

Creating validation split...
Training set size: 142672
Validation set size: 35668

Running validation...
Starting SVI IV prediction...

Phase 1: Applying put-call parity...

Phase 2: Calculating advanced statistics...
Calculating advanced statistics...
Created 117 statistical features

Phase 3: Fitting SVI models...
Created 117 statistical features

Phase 3: Fitting SVI models...


## 7. Generate Final Predictions

In [None]:
# Apply to test set
print("Generating final predictions...")
test_pred = predict_iv(test)

# Prepare submission
submission = test_pred[['timestamp'] + iv_columns].copy()
submission.columns = sample_sub.columns

# Verify no missing values
assert submission.isna().sum().sum() == 0, "Missing values detected"
submission.to_csv('submission.csv', index=False)

print("\nFinal Submission Preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print(f"Validation MSE: {validation_mse:.12f}")