In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every
# file found, in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nk-iv-prediction/test_data.parquet
/kaggle/input/nk-iv-prediction/sample_submission.csv
/kaggle/input/nk-iv-prediction/train_data.parquet


In [2]:
import warnings

import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (must be imported before IterativeImputer)
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBRegressor

# --- Load the competition files ---------------------------------------------
train_df = pd.read_parquet('/kaggle/input/nk-iv-prediction/train_data.parquet')
test_df = pd.read_parquet('/kaggle/input/nk-iv-prediction/test_data.parquet')
sample_submission = pd.read_csv('/kaggle/input/nk-iv-prediction/sample_submission.csv')

# --- Target columns -----------------------------------------------------------
# Every submission column whose name starts with call_iv_ or put_iv_.
iv_cols = [
    col for col in sample_submission.columns
    if col.startswith(('call_iv_', 'put_iv_'))
]

# --- Column bookkeeping -------------------------------------------------------
# train_df is only used here, to find which column names the two frames share.
train_only = set(train_df.columns).difference(test_df.columns)
test_only = set(test_df.columns).difference(train_df.columns)

# Numeric test columns that exist in both frames.
numeric_cols = test_df.select_dtypes(include=np.number).columns.tolist()
valid_features = [
    col for col in numeric_cols
    if not (col in train_only or col in test_only)
]

# Keep a feature only when fewer than half of its test values are exactly zero.
filtered_features = [
    col for col in valid_features
    if test_df[col].eq(0).mean() < 0.5
]

# --- Imputation input ---------------------------------------------------------
# Union of the IV targets and the retained features, in sorted order.
impute_cols = sorted({*iv_cols, *filtered_features})
test_impute_df = test_df.loc[:, impute_cols].copy()

# Number of missing IV cells before imputation (reported at the end of the cell).
nan_before = test_df[iv_cols].isna().to_numpy().sum()

# --- Iterative imputation with an XGBoost regressor ---------------------------
# The imputer is fitted on the test frame itself; no rows of train_df are used.
# NOTE(review): tree_method="gpu_hist" / predictor="gpu_predictor" need a GPU
# session and are reported as deprecated in XGBoost 2.x -- confirm against the
# installed version before changing them.
print("🔄 Using IterativeImputer with XGBoost...")
xgb_estimator = XGBRegressor(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.08,
    tree_method="gpu_hist",
    predictor="gpu_predictor",
    verbosity=0,
)
imputer = IterativeImputer(
    estimator=xgb_estimator,
    random_state=42,
    max_iter=10,
    initial_strategy='mean',
)
# Overwrites the imputed columns of test_df in place with the filled values.
test_df[impute_cols] = imputer.fit_transform(test_impute_df)

# 📈 Bayesian Polynomial Regression for smile extrapolation
def fit_iv_smile_bayesian(row, kind='call', degree=5):
    """Fill missing IVs on one option side of a row from a fitted smile.

    A polynomial of ``degree`` in strike is fitted with ``BayesianRidge`` to
    the row's known ``{kind}_iv_<strike>`` values, and the fitted curve is
    evaluated at the strikes that still need a value.

    Reads the module-level names ``iv_cols`` (candidate IV column names) and
    ``test_only`` (column names present only in the test frame):

    * fit points  -- columns of ``iv_cols`` with the ``{kind}_iv_`` prefix
      that are NOT in ``test_only`` and are non-missing in ``row``;
    * fill points -- columns of ``iv_cols`` with the same prefix that ARE in
      ``test_only`` and are missing (or absent) in ``row``.

    Column names whose last ``_``-separated token is not an integer are
    ignored. Nothing is changed when there is nothing to fill, when fewer than
    ``degree + 1`` fit points exist, or when the fit fails (a RuntimeWarning is
    emitted in that last case).

    Args:
        row: pandas Series holding one record; modified in place.
        kind: Column-name prefix selector, e.g. ``'call'`` or ``'put'``.
        degree: Degree of the polynomial used for the smile.

    Returns:
        The same ``row`` object, with fill points set where a fit was possible.
    """
    prefix = f'{kind}_iv_'

    def _strike_of(col):
        # The strike is the trailing integer of the column name; None if absent.
        try:
            return int(col.split('_')[-1])
        except ValueError:
            return None

    known_strikes, known_ivs, targets = [], [], []
    for col in iv_cols:
        if not col.startswith(prefix):
            continue
        if col in test_only:
            if pd.isna(row.get(col, None)):
                strike = _strike_of(col)
                if strike is not None:
                    targets.append((col, strike))
        elif not pd.isna(row[col]):
            strike = _strike_of(col)
            if strike is not None:
                known_strikes.append(strike)
                known_ivs.append(row[col])

    # Skip the (comparatively expensive) fit when it cannot change the row.
    if not targets or len(known_strikes) < degree + 1:
        return row

    try:
        # Centre and scale strikes before the polynomial expansion: raw
        # strikes raised to the 5th power span ~20 orders of magnitude, which
        # makes the design matrix badly conditioned and the fit unreliable.
        x_known = np.asarray(known_strikes, dtype=float)
        center = x_known.mean()
        scale = x_known.std() or 1.0  # guard against a zero spread

        poly = PolynomialFeatures(degree=degree)
        model = BayesianRidge()
        model.fit(poly.fit_transform(((x_known - center) / scale).reshape(-1, 1)), known_ivs)

        x_target = np.asarray([strike for _, strike in targets], dtype=float)
        preds = model.predict(poly.transform(((x_target - center) / scale).reshape(-1, 1)))
    except Exception as exc:
        # Best-effort: keep the row unchanged, but make the failure visible
        # instead of swallowing it (and never trap KeyboardInterrupt/SystemExit).
        warnings.warn(f"IV smile fit skipped for kind={kind!r}: {exc}", RuntimeWarning)
        return row

    for (col, _), pred in zip(targets, preds):
        row[col] = pred
    return row

# Apply the smile extrapolation to both option sides in one row-wise pass.
# Only the IV columns are handed to apply(): fit_iv_smile_bayesian reads and
# writes nothing else, and passing the whole frame would build a mixed-dtype
# Series per row (object upcast when non-numeric columns are present) and walk
# the frame twice.
print("🎯 Applying Bayesian Polynomial Smile Extrapolation (degree=5)...")
test_df[iv_cols] = test_df[iv_cols].apply(
    lambda row: fit_iv_smile_bayesian(
        fit_iv_smile_bayesian(row, 'call', degree=5), 'put', degree=5
    ),
    axis=1,
)

# Report how many IV cells were missing before imputation and how many remain.
nan_after = test_df[iv_cols].isna().sum().sum()
print(f"✅ IV NaNs before: {nan_before}, after: {nan_after}")

# Build the submission from the sample file's layout.
submission = sample_submission.copy()

# The assignment below aligns on the row index, so any submission row without
# a matching test row would silently become NaN. Fail before writing the file.
unmatched_rows = submission.index.difference(test_df.index)
if len(unmatched_rows) > 0:
    raise ValueError(
        f"{len(unmatched_rows)} submission rows have no matching index in test_df; "
        "the written IV columns would contain NaN"
    )

submission[iv_cols] = test_df[iv_cols]
submission.to_csv("imputed_submission.csv", index=False, float_format="%.6f")
print("📁 Submission saved as 'imputed_submission.csv'")


🔄 Using IterativeImputer with XGBoost...
🎯 Applying Bayesian Polynomial Smile Extrapolation (degree=5)...
✅ IV NaNs before: 376504, after: 0
📁 Submission saved as 'imputed_submission.csv'
