In [1]:
# ============================================================
# Core numerical libraries
# ============================================================
import numpy as np
import pandas as pd

# ============================================================
# PyTorch (MLP model)
# ============================================================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ============================================================
# Scaling and metrics
# ============================================================
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# ============================================================
# Reproducibility
# ------------------------------------------------------------
# One seed value shared by every RNG that is seeded here
# (PyTorch's global generator and NumPy's legacy global RNG).
# ============================================================
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [2]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# Under WSL the GPU is expected to be exposed through the NVIDIA drivers.
# Note from the original setup: the PyTorch build with CUDA 12.8 is the one
# that works with the Blackwell-generation RTX 5080.
_backend = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_backend)
print("Using device:", device)
if _backend == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

Using device: cuda
GPU: NVIDIA GeForce RTX 5080


In [3]:
# ============================================================
# Load CLEAN datasets (already prepared earlier)
# ------------------------------------------------------------
# For every asset in ASSETS this cell reads
#   • the cleaned option-chain CSV  -> datasets[asset]["opts"]
#   • the daily price CSV           -> datasets[asset]["daily"]
# Option rows whose QUOTE_DATE / EXPIRE_DATE cannot be parsed are
# dropped, and the number of dropped rows is reported.
# ============================================================
from pathlib import Path

DATA_DIR = Path("../data")

ASSETS = {
    "SPY": {
        "options": DATA_DIR / "options_clean_SPY.csv",
        "daily":   DATA_DIR / "spy_with_garch.csv",
        "daily_date_col": "Date",
        "daily_close_col": "Close"
    },
    "AAPL": {
        "options": DATA_DIR / "options_clean_AAPL.csv",
        "daily":   DATA_DIR / "aapl_with_garch.csv",  # you must create this in Notebook 02
        "daily_date_col": "Date",
        "daily_close_col": "Close"
    }
}

# Check every input up front so a missing file is reported before any of the
# (multi-million-row) CSVs has been read.
missing_files = [
    str(cfg[kind])
    for cfg in ASSETS.values()
    for kind in ("options", "daily")
    if not cfg[kind].exists()
]
if missing_files:
    raise FileNotFoundError(
        "Missing input file(s): " + ", ".join(missing_files)
        + ". Create them with the earlier preparation notebooks first."
    )

datasets = {}

for asset, cfg in ASSETS.items():
    opts = pd.read_csv(cfg["options"])
    daily = pd.read_csv(cfg["daily"], parse_dates=[cfg["daily_date_col"]])

    # errors="coerce" turns unparseable dates into NaT; those rows are removed
    # below, so count them instead of letting them vanish silently.
    n_before = len(opts)
    for date_col in ("QUOTE_DATE", "EXPIRE_DATE"):
        opts[date_col] = pd.to_datetime(opts[date_col], format="mixed", errors="coerce")
    opts = opts.dropna(subset=["QUOTE_DATE", "EXPIRE_DATE"])

    n_dropped = n_before - len(opts)
    if n_dropped:
        print(f"[{asset}] dropped {n_dropped} option rows with missing/unparseable dates")

    # Keep the daily series in chronological order.
    daily = daily.sort_values(cfg["daily_date_col"]).reset_index(drop=True)

    datasets[asset] = {
        "opts": opts,
        "daily": daily
    }

    print(f"[{asset}] options rows:", len(opts))
    print(f"[{asset}] daily rows:", len(daily))

[SPY] options rows: 4195810
[SPY] daily rows: 3521
[AAPL] options rows: 1562105
[AAPL] daily rows: 2011


In [4]:
# ============================================================
# Merge each asset's daily market features into its option dataset
# ------------------------------------------------------------
# Why?
# - Options prices depend on:
#   • underlying level
#   • market returns
#   • volatility regime
#
# Every option row receives the daily close (renamed to "<ASSET>_CLOSE"),
# log_return and garch_vol of its QUOTE_DATE. Option rows without a matching
# daily row (or with missing daily features) are dropped.
# ============================================================

merged_data = {}

for asset, data_dict in datasets.items():
    opts = data_dict["opts"]
    daily = data_dict["daily"]

    # Use the per-asset column names from ASSETS instead of hard-coding them.
    date_col = ASSETS[asset]["daily_date_col"]
    close_col = ASSETS[asset]["daily_close_col"]
    asset_close_col = f"{asset}_CLOSE"

    daily_small = (
        daily[[date_col, close_col, "log_return", "garch_vol"]]
        .rename(columns={date_col: "QUOTE_DATE", close_col: asset_close_col})
        .assign(QUOTE_DATE=lambda frame: pd.to_datetime(frame["QUOTE_DATE"]))
    )

    # Only convert when needed, and never write into the frame held by
    # `datasets`, so re-running this cell cannot change an earlier cell's data.
    if not pd.api.types.is_datetime64_any_dtype(opts["QUOTE_DATE"]):
        opts = opts.assign(QUOTE_DATE=pd.to_datetime(opts["QUOTE_DATE"]))

    # validate="many_to_one" raises if a date occurs more than once in the
    # daily data, which would otherwise silently duplicate option rows.
    merged = opts.merge(daily_small, on="QUOTE_DATE", how="left", validate="many_to_one")
    merged = merged.dropna(subset=["log_return", "garch_vol", asset_close_col])

    merged_data[asset] = merged

    print(f"[{asset}] merged rows:", len(merged))


[SPY] merged rows: 4169845
[AAPL] merged rows: 1552960


In [5]:
# ============================================================
# Feature selection
# ------------------------------------------------------------
# Base option-pricing drivers shared by every asset, each with the
# role it plays. The asset's own daily close (f"{asset}_CLOSE",
# e.g. SPY_CLOSE → market level) is not listed here; it is appended
# per asset when the feature matrices are built.
# ============================================================

_BASE_FEATURE_ROLES = {
    "UNDERLYING_LAST": "spot price",
    "log_return":      "return shock",
    "garch_vol":       "volatility regime",
    "DTE":             "time decay",
    "MONEINESS":       "intrinsic structure",
}

# Column order follows the insertion order of the mapping above.
feature_cols_base = list(_BASE_FEATURE_ROLES)

# Target: Call option mid price
target_col = "C_MID"

In [6]:
# ============================================================
# 80/10/10 split (time-based split)
# ------------------------------------------------------------
# The fractions apply to the sorted list of unique QUOTE_DATEs, not to
# rows: the earliest 80% of dates go to train, the next 10% to validation
# and the latest 10% to test. Row counts per partition can therefore differ
# from 80/10/10 when the number of quotes per day varies.
# ============================================================

TRAIN_END_FRAC = 0.8   # dates before this cut point  -> train
VAL_END_FRAC   = 0.9   # dates up to this cut point   -> validation, the rest -> test

splits = {}

for asset, df in merged_data.items():
    df = df.sort_values("QUOTE_DATE").reset_index(drop=True)

    # df is already in date order, so unique() returns the dates sorted.
    unique_dates = df["QUOTE_DATE"].unique()
    n_dates = len(unique_dates)

    train_cut = int(TRAIN_END_FRAC * n_dates)
    val_cut   = int(VAL_END_FRAC * n_dates)

    train_dates = unique_dates[:train_cut]
    val_dates   = unique_dates[train_cut:val_cut]
    test_dates  = unique_dates[val_cut:]

    train_data = df[df["QUOTE_DATE"].isin(train_dates)]
    val_data   = df[df["QUOTE_DATE"].isin(val_dates)]
    test_data  = df[df["QUOTE_DATE"].isin(test_dates)]

    # An empty partition would only fail later (scaling / training) with a
    # much less helpful message, so stop here.
    if min(len(train_data), len(val_data), len(test_data)) == 0:
        raise ValueError(
            f"[{asset}] time-based split produced an empty partition "
            f"({n_dates} unique dates -> train={len(train_data)}, "
            f"val={len(val_data)}, test={len(test_data)} rows)"
        )

    # The three date ranges are disjoint and cover every date, so no row may
    # be lost or duplicated.
    assert len(train_data) + len(val_data) + len(test_data) == len(df)

    splits[asset] = {
        "train": train_data,
        "val":   val_data,
        "test":  test_data
    }

    print(f"\n[{asset}] split:")
    print("  Train:", len(train_data))
    print("  Val  :", len(val_data))
    print("  Test :", len(test_data))


[SPY] split:
  Train: 2330311
  Val  : 1024018
  Test : 815516

[AAPL] split:
  Train: 1204151
  Val  : 161572
  Test : 187237


In [7]:
# Standardise features and target per asset.
# Both scalers are fitted on the training partition only and then applied
# unchanged to the validation and test partitions.
scaled = {}

for asset, split in splits.items():
    # Base features plus this asset's own close column.
    X_cols = [*feature_cols_base, f"{asset}_CLOSE"]

    scaler_X = StandardScaler().fit(split["train"][X_cols])
    scaler_y = StandardScaler().fit(split["train"][[target_col]])

    X_train, X_val, X_test = (
        scaler_X.transform(split[part][X_cols])
        for part in ("train", "val", "test")
    )
    y_train, y_val, y_test = (
        scaler_y.transform(split[part][[target_col]])
        for part in ("train", "val", "test")
    )

    scaled[asset] = dict(
        X=(X_train, X_val, X_test),
        y=(y_train, y_val, y_test),
        scalers=(scaler_X, scaler_y),
        features=X_cols,
    )

In [8]:
# ============================================================
# Torch Dataset wrapper
# ============================================================

class OptionDataset(Dataset):
    """Map-style dataset pairing a feature array with a target array.

    Parameters
    ----------
    X : array-like
        Features; converted to a ``float32`` tensor.
    y : array-like
        Targets; converted to a ``float32`` tensor. Must have the same
        length (first dimension) as ``X``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

        # A length mismatch would otherwise only show up as an IndexError
        # somewhere in the middle of an epoch.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]

In [9]:
# --------------------------------------------------------
# PyTorch Dataset + DataLoader
# --------------------------------------------------------
# Notes:
# - Tensors are created on the CPU here; batches are moved to the
#   selected device inside the training / evaluation loops.
# - pin_memory is switched on only when the selected device is CUDA.
# - Only the training loader shuffles; validation and test keep their order.
# --------------------------------------------------------
from torch.utils.data import DataLoader, TensorDataset

loaders = {}

for asset, data in scaled.items():
    X_train, X_val, X_test = data["X"]
    y_train, y_val, y_test = data["y"]

    # Scaled NumPy arrays -> float32 tensors, one per partition.
    X_train_t, X_val_t, X_test_t = (
        torch.tensor(arr, dtype=torch.float32) for arr in (X_train, X_val, X_test)
    )
    y_train_t, y_val_t, y_test_t = (
        torch.tensor(arr, dtype=torch.float32) for arr in (y_train, y_val, y_test)
    )

    train_ds = TensorDataset(X_train_t, y_train_t)
    val_ds   = TensorDataset(X_val_t,   y_val_t)
    test_ds  = TensorDataset(X_test_t,  y_test_t)

    pin = (device.type == "cuda")

    # Settings shared by all three loaders of this asset.
    loader_kwargs = dict(
        batch_size=4096,
        num_workers=2,
        pin_memory=pin,
        persistent_workers=True,
    )

    loaders[asset] = {
        "train": DataLoader(train_ds, shuffle=True,  **loader_kwargs),
        "val":   DataLoader(val_ds,   shuffle=False, **loader_kwargs),
        "test":  DataLoader(test_ds,  shuffle=False, **loader_kwargs),
    }

    print(f"[{asset}] loaders ready")

[SPY] loaders ready
[AAPL] loaders ready


In [10]:
# ============================================================
# MLP baseline
# ------------------------------------------------------------
# This sets a performance floor.
# If Transformer can't beat this → it's not worth the complexity.
# ============================================================

class MLP(nn.Module):
    """Feed-forward regressor: Linear → ReLU per hidden layer, then Linear → 1.

    Parameters
    ----------
    n_features : int
        Number of input features per sample.
    hidden_sizes : sequence of int, optional
        Width of each hidden layer, in order. The default ``(64, 32)``
        reproduces the original fixed 64 → 32 architecture, including the
        ``net.0`` / ``net.2`` / ``net.4`` parameter names in ``state_dict``.
    """

    def __init__(self, n_features, hidden_sizes=(64, 32)):
        super().__init__()

        layers = []
        in_dim = n_features
        for width in hidden_sizes:
            layers.append(nn.Linear(in_dim, width))
            layers.append(nn.ReLU())
            in_dim = width
        # Single-output regression head.
        layers.append(nn.Linear(in_dim, 1))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the network to ``x`` and return one output value per row."""
        return self.net(x)

In [11]:
# ============================================================
# Training (GPU-enabled)
# ------------------------------------------------------------
# One MLP is trained per asset with Adam + MSE on the scaled target.
# Early stopping keeps the weights of the epoch with the lowest
# validation MSE.
# ============================================================

EPOCHS = 10
PATIENCE = 2   # stop after this many consecutive epochs without a new best val MSE
results = {}


def _run_epoch(model, loader, loss_fn, optimizer=None):
    """Run one pass over ``loader`` and return the sample-weighted mean loss.

    With an ``optimizer`` the model is put in train mode and updated after
    every batch. Without one the model is put in eval mode and gradients are
    disabled.

    Raises
    ------
    ValueError
        If ``loader`` yields no samples.
    """
    training = optimizer is not None
    model.train(training)

    # Accumulate the summed loss on the device so the host only has to
    # synchronise once per epoch instead of once per batch.
    loss_sum = torch.zeros((), dtype=torch.float64, device=device)
    n_seen = 0

    with torch.set_grad_enabled(training):
        for Xb, yb in loader:
            Xb = Xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)

            if training:
                optimizer.zero_grad(set_to_none=True)

            loss = loss_fn(model(Xb), yb)

            if training:
                loss.backward()
                optimizer.step()

            # loss is the batch mean; weight it by the batch size so the
            # (smaller) last batch does not count as much as a full one.
            batch_n = Xb.shape[0]
            loss_sum += loss.detach().double() * batch_n
            n_seen += batch_n

    if n_seen == 0:
        raise ValueError("DataLoader yielded no samples; cannot compute a loss")
    return loss_sum.item() / n_seen


for asset, data in scaled.items():
    print("\n==============================")
    print(f" Training MLP for {asset}")
    print("==============================")

    feature_cols = data["features"]
    train_loader = loaders[asset]["train"]
    val_loader   = loaders[asset]["val"]

    model = MLP(n_features=len(feature_cols)).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    train_history = []
    val_history = []

    best_val_mse = float("inf")
    best_state   = None
    best_epoch   = None
    patience_ctr = 0

    for epoch in range(EPOCHS):
        train_mse = _run_epoch(model, train_loader, loss_fn, optimizer)
        val_mse   = _run_epoch(model, val_loader, loss_fn)

        train_history.append(train_mse)
        val_history.append(val_mse)

        print(
            f"Epoch {epoch+1:02d} | "
            f"Train MSE: {train_mse:.6f} | "
            f"Val MSE: {val_mse:.6f}"
        )

        # -------- EARLY STOPPING --------
        if val_mse < best_val_mse:
            best_val_mse = val_mse
            # Snapshot on the CPU; clone() keeps the copy independent even
            # when the model already lives on the CPU.
            best_state   = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            best_epoch   = epoch + 1
            patience_ctr = 0
        else:
            patience_ctr += 1
            if patience_ctr >= PATIENCE:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

    # best_state stays None when no epoch ran or when the validation loss was
    # never a finite improvement (e.g. NaN from the first epoch).
    if best_state is None:
        raise RuntimeError(
            f"[{asset}] no epoch produced a usable validation MSE "
            f"(EPOCHS={EPOCHS}, history={val_history}); nothing to restore"
        )

    # load_state_dict copies the values into the existing parameters, so the
    # model stays on `device`.
    model.load_state_dict(best_state)

    results[asset] = {
        "model": model,
        "best_epoch": best_epoch,
        "best_val_mse": best_val_mse,
        "train_mse": train_history,
        "val_mse": val_history
    }


 Training MLP for SPY
Epoch 01 | Train MSE: 0.089446 | Val MSE: 0.046459
Epoch 02 | Train MSE: 0.003775 | Val MSE: 0.020446
Epoch 03 | Train MSE: 0.001729 | Val MSE: 0.014623
Epoch 04 | Train MSE: 0.001152 | Val MSE: 0.011621
Epoch 05 | Train MSE: 0.000953 | Val MSE: 0.012613
Epoch 06 | Train MSE: 0.000848 | Val MSE: 0.012253
Early stopping triggered at epoch 6

 Training MLP for AAPL
Epoch 01 | Train MSE: 0.256753 | Val MSE: 0.026270
Epoch 02 | Train MSE: 0.014129 | Val MSE: 0.017773
Epoch 03 | Train MSE: 0.008422 | Val MSE: 0.016182
Epoch 04 | Train MSE: 0.005390 | Val MSE: 0.016322
Epoch 05 | Train MSE: 0.003553 | Val MSE: 0.014702
Epoch 06 | Train MSE: 0.002388 | Val MSE: 0.011139
Epoch 07 | Train MSE: 0.001686 | Val MSE: 0.009594
Epoch 08 | Train MSE: 0.001265 | Val MSE: 0.006832
Epoch 09 | Train MSE: 0.001002 | Val MSE: 0.006233
Epoch 10 | Train MSE: 0.000804 | Val MSE: 0.007173


In [12]:
# Test-set evaluation: predict with the restored best model, undo the target
# scaling, and report RMSE / MAE in price units and as a percentage of the
# mean true price.
for asset, data in scaled.items():
    print(f"\n==============================")
    print(f" Test evaluation for {asset}")
    print(f"==============================")

    _, scaler_y = data["scalers"]
    test_loader = loaders[asset]["test"]
    model = results[asset]["model"]
    model.eval()

    # Collect predictions and targets batch by batch (still in scaled units).
    preds_list, true_list = [], []
    with torch.no_grad():
        for Xb, yb in test_loader:
            Xb = Xb.to(device, non_blocking=True)
            preds_list.append(model(Xb).cpu().numpy())
            true_list.append(yb.cpu().numpy())

    # Back to original price units before computing any metric.
    preds = scaler_y.inverse_transform(np.concatenate(preds_list, axis=0))
    true  = scaler_y.inverse_transform(np.concatenate(true_list, axis=0))

    rmse = np.sqrt(mean_squared_error(true, preds))
    mae  = mean_absolute_error(true, preds)

    # Express both errors relative to the average true price.
    mean_price = true.mean()
    rmse_pct, mae_pct = (100 * err / mean_price for err in (rmse, mae))

    print(f"[{asset}] Test RMSE: {rmse:.4f}  ({rmse_pct:.2f}%)")
    print(f"[{asset}] Test MAE : {mae:.4f}  ({mae_pct:.2f}%)")


 Test evaluation for SPY
[SPY] Test RMSE: 5.4851  (10.58%)
[SPY] Test MAE : 4.1243  (7.96%)

 Test evaluation for AAPL
[AAPL] Test RMSE: 2.7421  (11.43%)
[AAPL] Test MAE : 2.1292  (8.87%)
