In [None]:
# === CONFIGURATION: All Settings in One Place ===
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
from scipy.stats import norm, skew as scipy_skew
from scipy.interpolate import griddata, interp1d
from sklearn.linear_model import LassoCV
import matplotlib.pyplot as plt
import seaborn as sns

# --- Implied-volatility surface grid ---
M_GRID = np.linspace(0.6, 1.4, num=10)          # 10 evenly spaced moneyness nodes
DTE_GRID = [7, 14, 30, 60, 91, 122, 152, 182]   # 8 maturities, in days to expiration

# --- Input data locations (relative to the notebook) ---
OPTIONS_PARQUET = "../data/options_dataset.parquet"
OM_UNDERLYING_CSV = "../data/OMunderlying.csv"

# --- Compute device: use the GPU when torch can see one, otherwise the CPU ---
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

In [None]:
# === SECTION 1: Data Loading Functions ===

def load_om_data_for_volgan(options_path, underlying_path, ticker="SPX"):
    """Load option quotes and underlying closes for one ticker and join them.

    Parameters
    ----------
    options_path : str or path-like
        Parquet file of option quotes. The code reads the columns ``ticker``,
        ``date``, ``exdate``, ``strike_price``, ``best_bid`` and ``best_offer``.
    underlying_path : str or path-like
        CSV file of underlying prices with columns ``ticker``, ``date`` and
        ``close``.
    ticker : str, default "SPX"
        Symbol to keep in both files. The default reproduces the previous
        hard-coded behaviour.

    Returns
    -------
    df_opt : pandas.DataFrame
        Option rows for ``ticker`` that have a spot price on the same date
        (inner join), with added columns ``dte`` (calendar days between
        ``date`` and ``exdate``), ``mid`` (bid/offer average), ``spot`` and
        ``moneyness`` (``strike_price / spot``).
    df_spot : pandas.DataFrame
        One row per date with columns ``date`` and ``spot``, sorted by date.
    """
    # Load options and keep a private copy of the requested ticker's rows.
    df_opt = pd.read_parquet(options_path)
    df_opt = df_opt[df_opt['ticker'] == ticker].copy()
    df_opt['date'] = pd.to_datetime(df_opt['date'])
    df_opt['exdate'] = pd.to_datetime(df_opt['exdate'])

    # Strike scaling: a median above 1e4 is taken to mean the strikes are
    # stored multiplied by 1000, so they are divided back down.
    if df_opt['strike_price'].median() > 1e4:
        df_opt['strike_price'] = df_opt['strike_price'] / 1000.0

    df_opt['dte'] = (df_opt['exdate'] - df_opt['date']).dt.days
    df_opt['mid'] = (df_opt['best_bid'] + df_opt['best_offer']) / 2.0

    # Load spot: one close per date for the same ticker.
    df_spot = pd.read_csv(underlying_path)
    df_spot['date'] = pd.to_datetime(df_spot['date'])
    df_spot = (
        df_spot.loc[df_spot['ticker'] == ticker, ['date', 'close']]
        .rename(columns={'close': 'spot'})
        .drop_duplicates('date')
        .sort_values('date')
    )

    # Inner merge: option rows on dates without a spot price are dropped.
    df_opt = df_opt.merge(df_spot, on='date', how='inner')
    df_opt['moneyness'] = df_opt['strike_price'] / df_opt['spot']

    return df_opt, df_spot

In [None]:
# === SECTION 2: Load and Quality-Filter Data ===

df_options, df_underlying = load_om_data_for_volgan(OPTIONS_PARQUET, OM_UNDERLYING_CSV)
print(f"Raw data loaded: {len(df_options)} option rows")

# Quality Screening
# Work on a copy so df_options stays untouched and this cell can be re-run.
df = df_options.copy()
df['moneyness'] = df['strike_price'] / df['spot']

# Quality filters
# df_1m used to be read here without ever being assigned, which raised a
# NameError on a fresh kernel. It is now derived explicitly from df;
# replace() returns a new frame, so df itself is not modified.
df_1m = df.replace([np.inf, -np.inf], np.nan)
df_1m = df_1m.dropna(subset=['mid', 'delta', 'vega', 'best_bid', 'best_offer'])
# Drop crossed quotes (offer below bid).
df_1m = df_1m[df_1m['best_offer'] >= df_1m['best_bid']].copy()

# Compute spread metrics
df_1m['spread'] = (df_1m['best_offer'] - df_1m['best_bid']).clip(lower=0)
# A zero mid would give +/-inf here; map that to NaN so the comparison in the
# liquidity mask below evaluates to False for those rows.
df_1m['rel_spread'] = (df_1m['spread'] / df_1m['mid']).replace([np.inf, -np.inf], np.nan)

# Liquidity screens: minimum mid price, maximum relative spread, delta within
# a tolerance band around [-1, 1], and strictly positive vega.
liq_mask = (
    (df_1m['mid'] > 0.05) &
    (df_1m['rel_spread'] <= 0.35) &
    df_1m['delta'].between(-1.1, 1.1) &
    (df_1m['vega'] > 0)
)
df_1m = df_1m[liq_mask].copy()

print(f"After quality filters: {len(df_1m)} rows")
print(f"Date range: {df_1m['date'].min().date()} → {df_1m['date'].max().date()}")
print(f"Coverage: {len(df_1m['date'].unique())} unique trading days\n")

# Keep relevant columns (only those actually present in the data)
cols_keep = ['date', 'exdate', 'cp_flag', 'strike_price', 'spot', 'moneyness', 
             'dte', 'mid', 'best_bid', 'best_offer', 'volume', 'open_interest', 
             'delta', 'vega', 'impl_volatility', 'spread', 'rel_spread']
cols_available = [c for c in cols_keep if c in df_1m.columns]
df_1m_filtered = df_1m[cols_available].sort_values(['date', 'dte', 'strike_price']).reset_index(drop=True)

print(f"Final filtered dataset: {len(df_1m_filtered)} rows\n")

# Persist the filtered quotes; the fresh 0..n-1 index is written as the first column.
df_1m_filtered.to_csv('FilteredDataset.csv', index=True)

In [None]:
# === SECTION 3: Build IV Surface Grid ===

def build_iv_surface_grid_robust(df_opt, m_grid, dte_grid, rescale=False):
    """Interpolate each day's option quotes onto a fixed (moneyness, DTE) grid.

    For every distinct ``date`` in ``df_opt`` the rows with non-null
    ``moneyness``, ``dte`` and ``impl_volatility`` are interpolated linearly
    onto the grid; grid nodes outside the convex hull of the day's quotes are
    filled with the nearest quote. Days with too little data are skipped.

    Parameters
    ----------
    df_opt : pandas.DataFrame
        Must contain the columns ``date``, ``moneyness``, ``dte`` and
        ``impl_volatility``.
    m_grid : array-like
        Moneyness nodes of the target grid.
    dte_grid : array-like
        Days-to-expiration nodes of the target grid.
    rescale : bool, default False
        Forwarded to ``scipy.interpolate.griddata``. When True, both
        coordinates are rescaled to the unit cube before interpolating, which
        keeps the much larger DTE values from dominating distances. The
        default keeps the previous numerical results.

    Returns
    -------
    surfaces_array : numpy.ndarray
        Shape ``(n_valid_days, len(m_grid) * len(dte_grid))``. Each row is a
        flattened surface in moneyness-major order: all DTE nodes for
        ``m_grid[0]``, then all DTE nodes for ``m_grid[1]``, and so on.
    valid_dates : list
        The dates (as yielded by ``df_opt['date'].unique()``) for which a
        surface was built, in ascending order.
    m_grid : array-like
        The moneyness grid, returned unchanged.
    tau_grid_years : numpy.ndarray
        ``dte_grid`` divided by 365.

    Raises
    ------
    ValueError
        If no day yields a valid surface.
    """
    dates = sorted(df_opt['date'].unique())
    surfaces = []
    valid_dates = []
    n_failed = 0
    last_error = None

    # The target grid is the same for every day, so build it once.
    grid_m, grid_dte = np.meshgrid(m_grid, dte_grid)

    for d in tqdm(dates, desc="Building IV surfaces"):
        day_df = df_opt[df_opt['date'] == d]
        points = day_df[['moneyness', 'dte', 'impl_volatility']].dropna()

        # Skip days without enough quotes to span a 2-D surface.
        if len(points) < 10:
            continue
        if points['dte'].nunique() < 2 or points['moneyness'].nunique() < 3:
            continue
        if points['dte'].max() - points['dte'].min() < 10:
            continue

        xy = points[['moneyness', 'dte']].values
        iv = points['impl_volatility'].values

        try:
            iv_surface = griddata(
                xy, iv, (grid_m, grid_dte),
                method='linear', fill_value=np.nan, rescale=rescale
            )

            # Linear interpolation leaves NaN outside the convex hull of the
            # quotes; patch those nodes with the nearest observed quote.
            holes = np.isnan(iv_surface)
            if holes.any():
                iv_surface_filled = griddata(
                    xy, iv, (grid_m, grid_dte),
                    method='nearest', rescale=rescale
                )
                iv_surface = np.where(holes, iv_surface_filled, iv_surface)
        except Exception as exc:
            # Best-effort: a day whose interpolation fails is skipped, but the
            # failure is counted and reported below instead of being hidden.
            n_failed += 1
            last_error = exc
            continue

        # Defensive guard: drop the day if most of the grid is still NaN.
        if np.isnan(iv_surface).sum() > 0.5 * iv_surface.size:
            continue

        # meshgrid yields (len(dte_grid), len(m_grid)); transpose so the
        # flattened vector is moneyness-major.
        surfaces.append(iv_surface.T.flatten())
        valid_dates.append(d)

    if n_failed:
        print(f"Interpolation failed on {n_failed} day(s); last error type: {type(last_error).__name__}")

    if len(surfaces) == 0:
        raise ValueError("No valid surfaces could be constructed!")

    surfaces_array = np.array(surfaces)
    tau_grid_years = np.array(dte_grid) / 365.0

    print(f"Surface construction: {len(valid_dates)}/{len(dates)} days ({100*len(valid_dates)/len(dates):.1f}%)")

    return surfaces_array, valid_dates, m_grid, tau_grid_years

# Build the daily IV surface panel from the filtered quotes on the configured grid.
surfaces_transform, dates_volgan, m, tau = build_iv_surface_grid_robust(
    df_opt=df_1m_filtered,
    m_grid=M_GRID,
    dte_grid=DTE_GRID,
)
print(f"Surface grid shape: {surfaces_transform.shape}\n")