 # Setting up essential libraries for data processing, modeling, and visualization. `lightgbm` is used for fast, high-performance gradient boosting.

In [None]:
# === ENVIRONMENT SETUP ===
import numpy as np
import pandas as pd
import os
import gc
!pip install -q lightgbm
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from scipy import stats
import random
from tqdm import tqdm
from sklearn.model_selection import KFold
from scipy.signal import savgol_filter

# Load training and test datasets along with submission column names.

In [None]:
# === PATHS ===
# Input directory holding the competition files and the directory that
# receives every artefact written by this notebook.
data_path = '/kaggle/input/volatility-smile-prediction'
output_path = '/kaggle/working'

# === LOAD DATA ===
# The column list of the sample submission drives the layout of the final
# submission file and is used to pick the target columns.
sample_submission = pd.read_csv(f'{data_path}/sample_submission.csv')
submission_cols = list(sample_submission.columns)

train_df = pd.read_parquet(f'{data_path}/train_data.parquet')
test_df = pd.read_parquet(f'{data_path}/test_data.parquet')

# Data Preprocessing

### This section summarizes the main data preprocessing steps applied before modeling:

## 1. Outlier Removal
### - Remove training rows where any `call_iv_` or `put_iv_` value lies outside the [0.0, 1.0] range (the test set is left unfiltered).

## 2. Cross-IV Feature Engineering
### - For each strike, add new features: each call IV gets the corresponding put IV, and vice versa.

## 3. Feature Selection
### - Keep columns starting with `X` and valid target columns.
### - Add the new cross-IV features to the feature set.

## 4. Drop Sparse and Low-Variance Features
### - Remove `X` columns with more than 50% zeros.
### - Drop features with standard deviation less than 0.01.

## 5. Denoising
### - Apply Exponential Weighted Moving Average (EWM) smoothing to all IV columns.

## 6. Remove Highly Correlated Features
### - Drop one of any pair of features with correlation greater than 0.98.

## 7. Clipping
### - Clip all feature values to the 1st and 99th percentiles to reduce the impact of outliers.

## 8. Save Cleaned Data
### - Export the cleaned train and test datasets for further modeling.



In [None]:
# === REMOVE OUTLIERS FUNCTION ===
def remove_iv_outliers(df):
    """Return a copy of ``df`` without rows holding an out-of-range IV value.

    A row is discarded when any column whose name starts with ``call_iv_`` or
    ``put_iv_`` is below 0.0 or above 1.0.  Missing values never satisfy
    either comparison, so they do not cause a row to be removed.  The number
    of offending rows is printed per column, and the result carries a fresh
    0..n-1 index.
    """
    bad_rows = np.zeros(len(df), dtype=bool)

    for name in df.columns:
        if not name.startswith(('call_iv_', 'put_iv_')):
            continue
        values = df[name]
        flagged = (values < 0.0) | (values > 1.0)
        if not flagged.any():
            continue
        print(f"{name}: {flagged.sum()} outlier rows — removing")
        bad_rows |= flagged

    kept = df[~bad_rows]
    return kept.reset_index(drop=True)

# === CROSS-IV FEATURE ADDITION FUNCTION ===
def add_cross_iv_features(df):
    """Mirror call/put IV columns that share a strike into ``*_extra`` columns.

    For every strike suffix found on both a ``call_iv_<strike>`` and a
    ``put_iv_<strike>`` column, two columns are written to ``df`` in place:

    * ``call_iv_<strike>_extra`` — a copy of ``put_iv_<strike>``
    * ``put_iv_<strike>_extra``  — a copy of ``call_iv_<strike>``

    Strikes are processed in sorted order (numerically when the suffix is an
    integer) so that the column order and the returned feature list are
    reproducible; iterating the raw ``set`` gave a different order per run.
    Columns that already end in ``_extra`` are ignored as inputs, which makes
    a second call on the same frame (e.g. re-running a notebook cell) refresh
    the existing columns instead of stacking ``*_extra_extra`` ones.

    Returns:
        tuple: ``(df, new_features)`` where ``df`` is the same (mutated)
        frame and ``new_features`` lists the ``*_extra`` column names.
    """
    def _by_strike(prefix):
        # Map strike suffix -> source column for one option side.
        return {
            col.rsplit('_', 1)[-1]: col
            for col in df.columns
            if col.startswith(prefix) and not col.endswith('_extra')
        }

    def _strike_order(strike):
        # Integer strikes first, in numeric order; anything else afterwards.
        try:
            return (0, int(strike), strike)
        except ValueError:
            return (1, 0, strike)

    call_strikes = _by_strike('call_iv_')
    put_strikes = _by_strike('put_iv_')
    common_strikes = sorted(call_strikes.keys() & put_strikes.keys(), key=_strike_order)

    new_features = []
    for strike in common_strikes:
        call_col = call_strikes[strike]
        put_col = put_strikes[strike]

        call_extra = f"{call_col}_extra"
        put_extra = f"{put_col}_extra"

        # Each side receives the value observed on the opposite side.
        df[call_extra] = df[put_col]
        df[put_extra] = df[call_col]

        new_features.extend([call_extra, put_extra])

    return df, new_features

# === APPLY CLEANING ===
# Only the training set is filtered: preds_df is later indexed by
# test_df.index, so every test row must survive to receive a prediction.
train_df = remove_iv_outliers(train_df)

# === APPLY CROSS-FEATURES ===
train_df, cross_features_train = add_cross_iv_features(train_df)
test_df, cross_features_test = add_cross_iv_features(test_df)
# Sorted so that the feature order does not depend on set iteration order.
common_cross_features = sorted(set(cross_features_train) & set(cross_features_test))


# === FEATURE AND TARGET COLUMNS ===
feature_cols = [col for col in train_df.columns if col.startswith('X')]
# Targets are the IV columns requested by the submission. Any other column the
# submission shares with the training frame (e.g. an id/timestamp column) must
# not become a regression target or enter the iv_mean / iv_std aggregates.
target_cols = [
    col for col in submission_cols
    if col in train_df.columns and col.startswith(('call_iv_', 'put_iv_'))
]
feature_cols += common_cross_features

# === REMOVE 'X' COLUMNS WITH >50% ZEROS ===
x_cols = [col for col in feature_cols if col.startswith('X')]
cols_to_drop = []

for col in x_cols:
    zero_fraction = (train_df[col] == 0).mean()
    if zero_fraction > 0.5:
        print(f"Dropping column {col} — {zero_fraction:.2%} values are zero")
        cols_to_drop.append(col)

# === REMOVE FEATURES WITH STD < 0.01 ===
low_std_cols = []
for col in feature_cols:
    col_std = train_df[col].std()
    # A NaN std (e.g. an all-NaN column) compares False and is kept.
    if col_std < 0.01:
        low_std_cols.append(col)
        print(f"Dropping column {col} — std is {col_std:.5f} < 0.01")

# The low-variance columns used to be reported but never removed; merge them
# into the drop list (without duplicates) so the message matches what happens.
already_listed = set(cols_to_drop)
cols_to_drop += [col for col in low_std_cols if col not in already_listed]

train_df.drop(columns=cols_to_drop, inplace=True)
test_df.drop(columns=cols_to_drop, inplace=True)

# Keep the feature list in sync with the frames.
dropped_cols = set(cols_to_drop)
feature_cols = [col for col in feature_cols if col not in dropped_cols]

# ====FAST DENOISING==========
def fast_denoise_iv(df, span=3, verbose=True):
    """Smooth every IV column with an exponentially weighted moving average.

    ``df`` is sorted by its ``"timestamp"`` column in place, then each column
    whose name starts with ``call_iv_`` or ``put_iv_`` is overwritten with its
    EWM mean (``adjust=False``) computed down the sorted rows.

    Args:
        df: frame to smooth; it is modified in place and must contain a
            ``"timestamp"`` column.
        span: EWM span passed to ``DataFrame.ewm``.
        verbose: print how many columns are being smoothed.

    Returns:
        The same ``df`` object, now sorted and smoothed.
    """
    smooth_cols = [
        name for name in df.columns
        if name.startswith(('call_iv_', 'put_iv_'))
    ]

    if verbose:
        print(f"📉 Applying EWM denoising to {len(smooth_cols)} columns...")

    # The moving average runs over consecutive rows, so order them first.
    df.sort_values("timestamp", inplace=True)
    smoothed = df[smooth_cols].ewm(span=span, adjust=False).mean()
    df[smooth_cols] = smoothed
    return df



# === APPLY DENOISING ===
train_df = fast_denoise_iv(train_df, span=3)
test_df = fast_denoise_iv(test_df, span=3)

# Forget features whose columns were removed from the frames earlier.
removed_cols = set(cols_to_drop)
feature_cols = [col for col in feature_cols if col not in removed_cols]

# === REMOVE HIGHLY CORRELATED FEATURES (corr > 0.98) ===

print("🔁 Removing highly correlated features...")
corr_matrix = train_df[feature_cols].corr().abs()
# Keep only the strict upper triangle so each pair is inspected once and the
# later column of a correlated pair is the one that gets flagged.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop_corr = [column for column in upper.columns if (upper[column] > 0.98).any()]

for col in to_drop_corr:
    print(f"Dropping {col} — high correlation with another feature")

# The flagged columns used to be reported only; actually take them out of the
# feature set so the models never see them. They stay in the saved frames.
correlated = set(to_drop_corr)
feature_cols = [col for col in feature_cols if col not in correlated]

# === CLIP FEATURES TO THE TRAINING 1st / 99th PERCENTILES ===
# Bounds come from the training data only and are reused for the test data.
for col in feature_cols:
    q1 = train_df[col].quantile(0.01)
    q99 = train_df[col].quantile(0.99)
    train_df[col] = train_df[col].clip(lower=q1, upper=q99)
    test_df[col] = test_df[col].clip(lower=q1, upper=q99)

print(feature_cols)
# === SAVE CLEANED INPUTS ===
train_df.to_csv(f'{output_path}/train_inputs_cleaned.csv', index=False)
test_df.to_csv(f'{output_path}/test_inputs_cleaned.csv', index=False)


# Feature Engineering & Model Training

## 1. IV Aggregates
### - Calculate the mean and standard deviation across all target IV columns for each row (`iv_mean`, `iv_std`).

## 2. Missing Value Indicators
### - For every feature, add a new column indicating if the value is missing (`_na` suffix).

## 3. Strike Extraction
### - Extract the strike value from each target column name and add it as a feature.

## 4. Model Training Loop
### - For each target:
  #### - Filter out rows with missing target values.
  #### - Split the data with a shuffled 5-fold `KFold` and keep only the first split: one fold serves as the validation set for early stopping.
  #### - Train a LightGBM regressor with specified hyperparameters.
  #### - Predict on the test set and store results.


In [None]:
# === FEATURE ENGINEERING ===

# Row-wise summaries of the smile. Restrict them to IV columns: target_cols is
# derived from the submission header, and a non-IV column in it (e.g. an id or
# timestamp) would dominate the mean and standard deviation.
# NOTE(review): these aggregates are built from the very columns being
# predicted — confirm that is intended.
iv_target_cols = [col for col in target_cols if col.startswith(('call_iv_', 'put_iv_'))]

train_df["iv_mean"] = train_df[iv_target_cols].mean(axis=1)
train_df["iv_std"] = train_df[iv_target_cols].std(axis=1)
test_df["iv_mean"] = test_df[iv_target_cols].mean(axis=1)
test_df["iv_std"] = test_df[iv_target_cols].std(axis=1)
for name in ("iv_mean", "iv_std"):
    # Guarded so that re-running the cell does not duplicate feature names.
    if name not in feature_cols:
        feature_cols.append(name)

# Missing-value indicators: one 0/1 "<feature>_na" column per feature.
# Existing indicators are skipped so a re-run does not create "_na_na" columns.
for col in [name for name in feature_cols if not name.endswith("_na")]:
    na_col = col + "_na"
    train_df[na_col] = train_df[col].isna().astype(int)
    test_df[na_col] = test_df[col].isna().astype(int)
    if na_col not in feature_cols:
        feature_cols.append(na_col)

# === STRIKE EXTRACTION FUNCTION ===
def extract_strike(col_name):
    """Return the integer strike encoded after the last ``_`` of ``col_name``.

    Args:
        col_name: column name such as ``"call_iv_24000"``.

    Returns:
        The trailing token converted to ``int``, or ``np.nan`` when the name
        is not a string or its last token is not an integer.
    """
    try:
        return int(col_name.rsplit('_', 1)[-1])
    # Only conversion problems are expected here; a bare ``except`` would also
    # swallow KeyboardInterrupt and SystemExit.
    except (AttributeError, TypeError, ValueError):
        return np.nan

# === MODEL TRAINING LOOP ===
# One LightGBM regressor per target column; test predictions are collected
# column by column in preds_df (same row order as test_df).
preds_df = pd.DataFrame(index=test_df.index)

# "strike" is rewritten for every target below; register it as a feature once.
if "strike" not in feature_cols:
    feature_cols.append("strike")

for target in target_cols:
    if not target.startswith(('call_iv_', 'put_iv_')):
        # target_cols comes from the submission header, so it can contain
        # non-IV columns (e.g. an id/timestamp). Those are not regression
        # targets; the submission step fills any column missing from preds_df.
        continue

    print(f"\n🔧 Training model for target: {target}")
    # The value is constant within one model, so it cannot drive any split.
    strike_val = extract_strike(target)
    train_df["strike"] = strike_val
    test_df["strike"] = strike_val

    train_target_df = train_df.dropna(subset=[target])
    if train_target_df.empty:
        print(f"⚠️ No data to train for {target}")
        # A single random scalar in [0.15, 0.25), broadcast to every row.
        preds_df[target] = 0.2 + (random.random() - 0.5) * 0.1
        continue

    # NOTE(review): feature_cols may contain the "<other side>_iv_<strike>_extra"
    # column that add_cross_iv_features copied from this very target — confirm
    # that this target leakage is intended.
    X = train_target_df[feature_cols]
    y = train_target_df[target]

    # Only the first of the five shuffled splits is used: four folds train the
    # model and the remaining fold drives early stopping.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    train_idx, val_idx = next(kf.split(X, y))
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = LGBMRegressor(
        objective='regression',
        learning_rate=0.01,
        num_leaves=128,
        max_depth=10,
        feature_fraction=0.9,
        bagging_fraction=0.9,
        bagging_freq=1,
        n_estimators=4000,
        lambda_l1=1.0,
        lambda_l2=1.0,
        min_child_samples=20,
        random_state=42,
        n_jobs=-1,
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',  # MSE in LightGBM is 'l2'
        callbacks=[early_stopping(stopping_rounds=200), log_evaluation(100)]
    )

    preds_df[target] = model.predict(test_df[feature_cols])
    # Release the per-target objects before the next model is fitted.
    del model, train_target_df, X, y, X_train, X_val, y_train, y_val
    gc.collect()




# Submission in the Competition

In [None]:
# === SUBMISSION ===
# Every column the submission expects but no model produced receives a single
# random scalar in [0.15, 0.25), broadcast over all rows.
for name in submission_cols:
    if name in preds_df.columns:
        continue
    preds_df[name] = 0.2 + (random.random() - 0.5) * 0.1

# Put the columns in submission order, then overwrite the id column with the
# running row number.
preds_df = preds_df[submission_cols]
preds_df['timestamp'] = range(len(preds_df))

submission_file = os.path.join(output_path, 'submission.csv')
preds_df.to_csv(submission_file, index=False)
print("\n✅ Submission saved to:", submission_file)