<a href="https://colab.research.google.com/github/Akhil66624/Volatility-Curve-Prediction/blob/main/Volatility_Curve_Prediction_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticate with Kaggle before any download is attempted. login() renders
# an interactive prompt (see the widget in this cell's output), so no
# credentials are stored in the notebook source.
import kagglehub
kagglehub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.
Kaggle credentials successfully validated.


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the 'nk-iv-prediction' competition data. The return value is kept in
# `nk_iv_prediction_path`; presumably it is the local directory that holds the
# downloaded files -- TODO confirm against the kagglehub documentation.
nk_iv_prediction_path = kagglehub.competition_download('nk-iv-prediction')

print('Data source import complete.')


Data source import complete.


In [None]:
# Shell escape: list the contents of the current working directory.
!ls


sample_data


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input mount and print one full
# path per line, in the order os.walk visits them.
input_listing = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for listed_path in input_listing:
    print(listed_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def resolve_data_path(preferred, fallback=None):
    """Pick the directory the input files should be read from.

    Args:
        preferred: Directory to use whenever it already exists.
        fallback: Directory to use when `preferred` does not exist. Ignored
            when it is None or empty.

    Returns:
        `preferred` if it is an existing directory or no fallback was given,
        otherwise `fallback`.
    """
    if os.path.isdir(preferred) or not fallback:
        return preferred
    return fallback


# '/content/input' is kept as the first choice so a manually prepared data
# directory still wins. The download cell does not create it, though, so when
# it is missing the location reported by kagglehub (if that cell was run) is
# used instead.
data_path = resolve_data_path('/content/input', globals().get('nk_iv_prediction_path'))
output_path = '/content/working'
temp_path = '/content/temp'

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from tqdm.auto import tqdm
import warnings

# Suppress warnings for cleaner output
#warnings.filterwarnings('ignore')

# --- 1. Load Data ---
# Load the training data, the test data and the sample submission.
try:
    # First choice: the configured input directory (`data_path`), so this cell
    # reads from the same place as every other cell that uses that variable.
    train_df = pd.read_parquet(f'{data_path}/train_data.parquet')
    test_df = pd.read_parquet(f'{data_path}/test_data.parquet')
    sample_submission_df = pd.read_csv(f'{data_path}/sample_submission.csv')
except FileNotFoundError:
    # Fallback: look for the three files in the current working directory.
    print("Running in a local environment. Ensure data files are in the current directory.")
    train_df = pd.read_parquet('train_data.parquet')
    test_df = pd.read_parquet('test_data.parquet')
    sample_submission_df = pd.read_csv('sample_submission.csv')

In [None]:
# --- 2. Feature Engineering ---
# Function to create features that summarize the volatility smile for each timestamp.
def create_smile_features(df):
    """
    Add row-wise summary statistics of the call and put IV columns.

    Only columns named exactly ``call_iv_<digits>`` or ``put_iv_<digits>`` are
    treated as per-strike IVs. The summary columns this function adds
    (e.g. ``call_iv_mean``) are therefore never mistaken for strikes, and the
    function can safely be applied to a frame that already carries them.

    Args:
        df: Wide DataFrame with one column per option IV. It is not modified.

    Returns:
        A copy of ``df`` with eight extra columns: ``iv_mean``, ``iv_std``,
        ``iv_skew``, ``iv_kurtosis`` (over all IV columns) and
        ``call_iv_mean``, ``put_iv_mean``, ``call_iv_std``, ``put_iv_std``.
        NaN entries are skipped by the pandas reductions.
    """
    def _strike_columns(prefix):
        # Keep columns of the form `<prefix><digits>`, ordered by numeric strike.
        matched = [
            c for c in df.columns
            if isinstance(c, str) and c.startswith(prefix) and c[len(prefix):].isdecimal()
        ]
        return sorted(matched, key=lambda c: int(c[len(prefix):]))

    call_iv_cols = _strike_columns('call_iv_')
    put_iv_cols = _strike_columns('put_iv_')
    all_iv_cols = call_iv_cols + put_iv_cols

    df_copy = df.copy()

    # Slice the raw IV values once, before any summary column is written, so
    # every statistic is computed from the same untouched inputs.
    all_iv = df_copy[all_iv_cols]
    call_iv = df_copy[call_iv_cols]
    put_iv = df_copy[put_iv_cols]

    # Whole-smile statistics.
    df_copy['iv_mean'] = all_iv.mean(axis=1)
    df_copy['iv_std'] = all_iv.std(axis=1)
    df_copy['iv_skew'] = all_iv.skew(axis=1)
    df_copy['iv_kurtosis'] = all_iv.kurtosis(axis=1)

    # Per-side statistics.
    df_copy['call_iv_mean'] = call_iv.mean(axis=1)
    df_copy['put_iv_mean'] = put_iv.mean(axis=1)
    df_copy['call_iv_std'] = call_iv.std(axis=1)
    df_copy['put_iv_std'] = put_iv.std(axis=1)

    return df_copy

print("Creating smile features for train and test data...")
# Apply the same feature builder to both splits, train first and then test.
train_featured, test_featured = (
    create_smile_features(split) for split in (train_df, test_df)
)
print("Feature engineering complete.")

Creating smile features for train and test data...
Feature engineering complete.


In [None]:
# --- 3. Data Transformation ---
# Function to transform the data from wide to long format.
def transform_to_long_format(df):
    """
    Melt a wide frame into long format, one row per (input row, option).

    Only columns named exactly ``call_iv_<digits>`` or ``put_iv_<digits>`` are
    melted. Every other column, including any summary column whose name
    merely contains ``iv_``, is carried along unchanged as an identifier.

    Args:
        df: Wide DataFrame containing an ``underlying`` column and the
            per-option IV columns.

    Returns:
        A new DataFrame with the identifier columns plus:
        ``option_name`` (original column name), ``iv`` (its value),
        ``option_type`` (1 for call, 0 for put), ``strike`` (integer parsed
        from the column name) and ``moneyness`` (``strike / underlying``).
    """
    def _is_option_iv(name):
        # True for `call_iv_<digits>` / `put_iv_<digits>` only.
        if not isinstance(name, str):
            return False
        side, sep, strike = name.partition('_iv_')
        return side in ('call', 'put') and sep == '_iv_' and strike.isdecimal()

    option_cols = [c for c in df.columns if _is_option_iv(c)]
    option_col_set = set(option_cols)
    feature_cols = [c for c in df.columns if c not in option_col_set]

    # Melt the dataframe
    long_df = pd.melt(
        df,
        id_vars=feature_cols,
        value_vars=option_cols,
        var_name='option_name',
        value_name='iv',
    )

    # Parse each distinct column name once and map the result onto the rows.
    # A Python-level function call per row of the long frame would be far
    # slower for no benefit.
    option_type_by_name = {c: int(c.startswith('call')) for c in option_cols}
    strike_by_name = {c: int(c.rsplit('_', 1)[-1]) for c in option_cols}
    long_df['option_type'] = long_df['option_name'].map(option_type_by_name).astype('int64')
    long_df['strike'] = long_df['option_name'].map(strike_by_name).astype('int64')

    # Create moneyness feature
    long_df['moneyness'] = long_df['strike'] / long_df['underlying']

    return long_df

print("Transforming training data to long format...")
# Rows without an observed IV have no target value, so they are discarded
# right after the melt and never reach the model.
train_long = transform_to_long_format(train_featured).dropna(subset=['iv'])
print("Training data transformation complete.")

Transforming training data to long format...


In [None]:
# --- 4. Model Training ---
# Define features and target for the LightGBM model.
# Model inputs, in the column order handed to LightGBM: the underlying price,
# the smile summary statistics, the per-option descriptors, then X0..X41.
smile_summary_features = [
    'iv_mean', 'iv_std', 'iv_skew', 'iv_kurtosis',
    'call_iv_mean', 'put_iv_mean', 'call_iv_std', 'put_iv_std',
]
option_descriptor_features = ['option_type', 'strike', 'moneyness']
indexed_x_features = ['X' + str(i) for i in range(42)]

FEATURES = (
    ['underlying']
    + smile_summary_features
    + option_descriptor_features
    + indexed_x_features
)

TARGET = 'iv'

# Design matrix and target taken from the long-format training frame.
X_train = train_long.loc[:, FEATURES]
y_train = train_long[TARGET]

# LightGBM hyper-parameters (L1 objective, RMSE reported as the metric).
lgb_params = dict(
    objective='regression_l1',  # MAE is robust to outliers
    metric='rmse',
    n_estimators=2000,
    learning_rate=0.02,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0.1,
    lambda_l2=0.1,
    num_leaves=64,
    verbose=-1,
    n_jobs=-1,
    seed=42,
    boosting_type='gbdt',
)

print("Training LightGBM model...")
# fit() returns the estimator itself, so construction and training are chained.
model = lgb.LGBMRegressor(**lgb_params).fit(X_train, y_train)
print("Model training complete.")

In [None]:
# --- 5. Prediction ---
print("Transforming test data and preparing for prediction...")
# Transform test data to long format
test_long = transform_to_long_format(test_featured)

# Only the options whose IV is missing (NaN) need a prediction; observed IVs
# are left exactly as they are.
missing_iv_mask = test_long['iv'].isna()
X_test_predict = test_long.loc[missing_iv_mask, FEATURES]

print(f"Predicting {len(X_test_predict)} missing IV values...")
if len(X_test_predict) > 0:
    predictions = model.predict(X_test_predict)
    # Fill the missing IV values in the long format dataframe with the predictions
    test_long.loc[missing_iv_mask, 'iv'] = predictions
else:
    # Nothing is missing: skip the model call, which is not meant to be given
    # a zero-row input, and keep `predictions` defined as an empty array.
    predictions = np.empty(0)
print("Prediction complete.")

In [None]:
# --- 6. Generate Submission File ---
print("Pivoting data back to wide format for submission...")
# One row per timestamp and one column per option name; the timestamp index
# is then turned back into an ordinary column.
wide_iv = pd.pivot_table(
    test_long,
    values='iv',
    index='timestamp',
    columns='option_name',
)
submission_pivot = wide_iv.reset_index()

# Select the columns exactly as listed (and ordered) in the sample submission.
final_submission = submission_pivot.loc[:, sample_submission_df.columns]

# Write the result into the current working directory.
final_submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' has been generated successfully.")

In [None]:
# Constant-fill baseline: reload the test set and put 0.2 wherever a value is
# missing.
# NOTE: this rebinds `test_df`. Once this cell has run, `test_df` no longer
# contains any NaN, so re-running the feature-engineering cell afterwards
# would leave nothing for the model to predict.
test_parquet = f'{data_path}/test_data.parquet'
test_df = pd.read_parquet(test_parquet).replace(np.nan, 0.2)

In [None]:
# Use the sample submission only for its column layout.
sample_submission = pd.read_csv(f'{data_path}/sample_submission.csv')
submission_cols = sample_submission.columns.tolist()

# Keep exactly the submission columns, in the sample's order.
submission_df = test_df[submission_cols]

# The output directory is not guaranteed to exist (a fresh Colab runtime has
# no /content/working), and to_csv does not create missing parent
# directories, so create it first.
os.makedirs(output_path, exist_ok=True)
submission_df.to_csv(f'{output_path}/submission.csv', index = False)