In [1]:
# ============================================================
# Core
# ============================================================
import numpy as np
import pandas as pd

# ============================================================
# PyTorch
# ============================================================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ============================================================
# Scaling & metrics
# ============================================================
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# ============================================================
# Reproducibility
# ============================================================
torch.manual_seed(42)
np.random.seed(42)

In [2]:
# Choose GPU if available (WSL should expose GPU through NVIDIA drivers)
# However since pytorch cuda = 12.8 work with latest blackwell RTX5080
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:
# ============================================================
# Load CLEAN datasets (OPTIONS + DAILY WITH GARCH)
# ============================================================

from pathlib import Path

DATA_DIR = Path("../data")

ASSETS = {
    "SPY": {
        "options": DATA_DIR / "options_clean_SPY.csv",
        "daily":   DATA_DIR / "spy_with_garch.csv",
    },
    "AAPL": {
        "options": DATA_DIR / "options_clean_AAPL.csv",
        "daily":   DATA_DIR / "aapl_with_garch.csv",
    }
}

datasets = {}

for asset, cfg in ASSETS.items():
    opts = pd.read_csv(
        cfg["options"],
        parse_dates=["QUOTE_DATE", "EXPIRE_DATE"]
    )

    daily = pd.read_csv(
        cfg["daily"],
        parse_dates=["Date"]
    )

    assert "log_return" in daily.columns
    assert "garch_vol" in daily.columns

    datasets[asset] = {
        "opts": opts,
        "daily": daily
    }

    print(f"[{asset}] options:", opts.shape)
    print(f"[{asset}] daily+garch:", daily.shape)

[SPY] options: (4195810, 12)
[SPY] daily+garch: (3521, 8)
[AAPL] options: (1562105, 12)
[AAPL] daily+garch: (2011, 8)


In [4]:
# ============================================================
# 80 / 10 / 10 time-based split (per asset)
# ============================================================

# ============================================================
# Merge macro features into options (per asset)
# ============================================================

merged_data = {}

for asset, data_dict in datasets.items():
    opts  = data_dict["opts"]
    daily = data_dict["daily"]

    daily_small = daily[["Date", "Close", "log_return", "garch_vol"]].copy()
    daily_small = daily_small.rename(columns={
        "Date": "QUOTE_DATE",
        "Close": f"{asset}_CLOSE"
    })

    merged = opts.merge(daily_small, on="QUOTE_DATE", how="left")
    merged = merged.dropna(subset=["log_return", "garch_vol", f"{asset}_CLOSE"])
    merged = merged.sort_values("QUOTE_DATE").reset_index(drop=True)

    merged_data[asset] = merged
    print(f"[{asset}] merged rows:", len(merged))

[SPY] merged rows: 4169845
[AAPL] merged rows: 1552960


In [5]:
# ============================================================
# 80 / 10 / 10 time-based split (per asset)
# ============================================================

splits = {}

for asset, df in merged_data.items():
    dates = df["QUOTE_DATE"].sort_values().unique()
    n = len(dates)

    train_end = int(0.8 * n)
    val_end   = int(0.9 * n)

    train_dates = dates[:train_end]
    val_dates   = dates[train_end:val_end]
    test_dates  = dates[val_end:]

    splits[asset] = {
        "train": df[df["QUOTE_DATE"].isin(train_dates)],
        "val":   df[df["QUOTE_DATE"].isin(val_dates)],
        "test":  df[df["QUOTE_DATE"].isin(test_dates)],
    }

    print(f"[{asset}] train={len(splits[asset]['train'])} | "
          f"val={len(splits[asset]['val'])} | "
          f"test={len(splits[asset]['test'])}")

[SPY] train=2330311 | val=1024018 | test=815516
[AAPL] train=1204151 | val=161572 | test=187237


In [6]:
market_features = {}
option_features = ["UNDERLYING_LAST", "DTE", "MONEINESS"]
target_col = "C_MID"

for asset in splits:
    market_features[asset] = [
        f"{asset}_CLOSE",
        "log_return",
        "garch_vol"
    ]

In [7]:
# ============================================================
# Scaling
# ------------------------------------------------------------
# VERY IMPORTANT:
# - Scalers are fit only on training data
# - Prevents future information leakage
# ============================================================
# ============================================================
# Scaling (NO SettingWithCopyWarning)
# ============================================================

scalers = {}
scaled  = {}

for asset, split in splits.items():
    # Explicit copies (IMPORTANT)
    train_df = split["train"].copy()
    val_df   = split["val"].copy()
    test_df  = split["test"].copy()

    scaler_m = StandardScaler()
    scaler_o = StandardScaler()
    scaler_y = StandardScaler()

    # ---- market features ----
    train_df.loc[:, market_features[asset]] = scaler_m.fit_transform(
        train_df[market_features[asset]]
    )
    val_df.loc[:, market_features[asset]] = scaler_m.transform(
        val_df[market_features[asset]]
    )
    test_df.loc[:, market_features[asset]] = scaler_m.transform(
        test_df[market_features[asset]]
    )

    # ---- option features ----
    train_df.loc[:, option_features] = scaler_o.fit_transform(
        train_df[option_features]
    )
    val_df.loc[:, option_features] = scaler_o.transform(
        val_df[option_features]
    )
    test_df.loc[:, option_features] = scaler_o.transform(
        test_df[option_features]
    )

    # ---- target ----
    train_df.loc[:, target_col] = scaler_y.fit_transform(
        train_df[[target_col]]
    )
    val_df.loc[:, target_col] = scaler_y.transform(
        val_df[[target_col]]
    )
    test_df.loc[:, target_col] = scaler_y.transform(
        test_df[[target_col]]
    )

    scalers[asset] = (scaler_m, scaler_o, scaler_y)

    # Overwrite splits with scaled, SAFE versions
    splits[asset] = {
        "train": train_df,
        "val":   val_df,
        "test":  test_df
    }

In [8]:
# ============================================================
# Sequence creation
# ------------------------------------------------------------
# Each sample:
# - Uses past N days of market state
# - Predicts option price today
#
# Example (seq_len=10):
# X = market[t-9 ... t]
# y = option_price[t]
# ============================================================
# ============================================================
# Sequence creation (PER ASSET, SAFE NUMERIC TYPES)
# ------------------------------------------------------------
# Enforces float32 to avoid numpy.object_ issues
# ============================================================

SEQ_LEN = 10

def build_sequences(df, market_cols, option_cols, target_col):
    X_seq = []
    X_opt = []
    y = []

    # Ensure numeric arrays upfront (CRITICAL)
    market_arr = df[market_cols].astype(np.float32).values
    option_arr = df[option_cols].astype(np.float32).values
    target_arr = df[target_col].astype(np.float32).values

    for i in range(SEQ_LEN, len(df)):
        X_seq.append(market_arr[i-SEQ_LEN:i])
        X_opt.append(option_arr[i])
        y.append(target_arr[i])

    return (
        np.asarray(X_seq, dtype=np.float32),
        np.asarray(X_opt, dtype=np.float32),
        np.asarray(y, dtype=np.float32).reshape(-1, 1)
    )

# ----------------------------
# Build sequences PER ASSET
# ----------------------------
X_seq_train, X_opt_train, y_train = {}, {}, {}
X_seq_val,   X_opt_val,   y_val   = {}, {}, {}
X_seq_test,  X_opt_test,  y_test  = {}, {}, {}

for asset, split in splits.items():
    print(f"\nBuilding sequences for {asset}")

    X_seq_train[asset], X_opt_train[asset], y_train[asset] = build_sequences(
        split["train"],
        market_features[asset],
        option_features,
        target_col
    )

    X_seq_val[asset], X_opt_val[asset], y_val[asset] = build_sequences(
        split["val"],
        market_features[asset],
        option_features,
        target_col
    )

    X_seq_test[asset], X_opt_test[asset], y_test[asset] = build_sequences(
        split["test"],
        market_features[asset],
        option_features,
        target_col
    )

    print(f"[{asset}] Train:", X_seq_train[asset].shape)
    print(f"[{asset}] Val  :", X_seq_val[asset].shape)
    print(f"[{asset}] Test :", X_seq_test[asset].shape)


Building sequences for SPY
[SPY] Train: (2330301, 10, 3)
[SPY] Val  : (1024008, 10, 3)
[SPY] Test : (815506, 10, 3)

Building sequences for AAPL
[AAPL] Train: (1204141, 10, 3)
[AAPL] Val  : (161562, 10, 3)
[AAPL] Test : (187227, 10, 3)


In [9]:
print(X_opt_train["SPY"].dtype)
print(X_opt_train["AAPL"].dtype)

float32
float32


In [10]:
# ============================================================
# Torch Dataset for time-series Transformer
# ============================================================

class OptionSeqDataset(Dataset):
    def __init__(self, X_seq, X_opt, y):
        self.X_seq = torch.tensor(X_seq, dtype=torch.float32)
        self.X_opt = torch.tensor(X_opt, dtype=torch.float32)
        self.y     = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X_seq[idx], self.X_opt[idx], self.y[idx]

In [11]:
# ------------------------------------------------------------
# DataLoaders (GPU-friendly)
# ------------------------------------------------------------
# Notes:
# - Keep tensors on CPU, move per-batch to GPU in the training loop
# - pin_memory speeds CPU->GPU transfer
# ------------------------------------------------------------
# ============================================================
# DataLoaders (per asset, GPU-friendly)
# ============================================================

loaders = {}
pin = (device.type == "cuda")

for asset in ["SPY", "AAPL"]:
    print(f"Building DataLoaders for {asset}")

    train_ds = OptionSeqDataset(
        X_seq_train[asset],
        X_opt_train[asset],
        y_train[asset]
    )
    val_ds = OptionSeqDataset(
        X_seq_val[asset],
        X_opt_val[asset],
        y_val[asset]
    )
    test_ds = OptionSeqDataset(
        X_seq_test[asset],
        X_opt_test[asset],
        y_test[asset]
    )

    loaders[asset] = {
        "train": DataLoader(
            train_ds,
            batch_size=512,        # IMPORTANT: smaller batch for Transformer
            shuffle=True,
            num_workers=4,
            pin_memory=pin,
            persistent_workers=True
        ),
        "val": DataLoader(
            val_ds,
            batch_size=512,
            shuffle=False,
            num_workers=4,
            pin_memory=pin,
            persistent_workers=True
        ),
        "test": DataLoader(
            test_ds,
            batch_size=512,
            shuffle=False,
            num_workers=4,
            pin_memory=pin,
            persistent_workers=True
        )
    }

    print(f"[{asset}] loaders ready")

Building DataLoaders for SPY
[SPY] loaders ready
Building DataLoaders for AAPL
[AAPL] loaders ready


In [12]:
# ============================================================
# Transformer model
# ------------------------------------------------------------
# - Encodes market history
# - Combines with option static features
# ============================================================

class OptionTransformer(nn.Module):
    def __init__(self, market_dim, option_dim):
        super().__init__()

        self.market_proj = nn.Linear(market_dim, 32)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=32,
            nhead=4,
            batch_first=True
        )

        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=2
        )

        self.fc = nn.Sequential(
            nn.Linear(32 + option_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, X_seq, X_opt):
        # Project market features
        x = self.market_proj(X_seq)

        # Transformer encoder
        x = self.transformer(x)

        # Use last timestep embedding
        x_last = x[:, -1, :]

        # Concatenate static option features
        combined = torch.cat([x_last, X_opt], dim=1)

        return self.fc(combined)

In [13]:
# ------------------------------------------------------------
# Training loop with EARLY STOPPING (per asset) with GPU
# ------------------------------------------------------------

EPOCHS = 20
PATIENCE = 2

results = {}

for asset in ["SPY", "AAPL"]:
    print(f"\n==============================")
    print(f" Training Transformer for {asset}")
    print(f"==============================")

    train_loader = loaders[asset]["train"]
    val_loader   = loaders[asset]["val"]

    model = OptionTransformer(
        market_dim=len(market_features[asset]),
        option_dim=len(option_features)
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    best_val = float("inf")
    best_state = None
    patience_ctr = 0

    train_hist, val_hist = [], []

    for epoch in range(EPOCHS):
        # -------- TRAIN --------
        model.train()
        train_losses = []

        for X_seq, X_opt, y in train_loader:
            X_seq = X_seq.to(device, non_blocking=True)
            X_opt = X_opt.to(device, non_blocking=True)
            y     = y.to(device, non_blocking=True)

            optimizer.zero_grad(set_to_none=True)
            preds = model(X_seq, X_opt)
            loss = loss_fn(preds, y)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        # -------- VALIDATE --------
        model.eval()
        val_losses = []

        with torch.no_grad():
            for X_seq, X_opt, y in val_loader:
                X_seq = X_seq.to(device, non_blocking=True)
                X_opt = X_opt.to(device, non_blocking=True)
                y     = y.to(device, non_blocking=True)

                preds = model(X_seq, X_opt)
                val_losses.append(loss_fn(preds, y).item())

        train_mse = np.mean(train_losses)
        val_mse   = np.mean(val_losses)

        train_hist.append(train_mse)
        val_hist.append(val_mse)

        print(
            f"Epoch {epoch+1:02d} | "
            f"Train MSE: {train_mse:.6f} | "
            f"Val MSE: {val_mse:.6f}"
        )

        # -------- EARLY STOP --------
        if val_mse < best_val:
            best_val = val_mse
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            patience_ctr = 0
        else:
            patience_ctr += 1
            if patience_ctr >= PATIENCE:
                print(f"Early stopping at epoch {epoch+1}")
                break

    model.load_state_dict(best_state)
    model.to(device)

    results[asset] = {
        "model": model,
        "best_val_mse": best_val,
        "train_mse": train_hist,
        "val_mse": val_hist
    }


 Training Transformer for SPY
Epoch 01 | Train MSE: 0.020654 | Val MSE: 0.028337
Epoch 02 | Train MSE: 0.001089 | Val MSE: 0.048238
Epoch 03 | Train MSE: 0.000929 | Val MSE: 0.023708
Epoch 04 | Train MSE: 0.000665 | Val MSE: 0.021628
Epoch 05 | Train MSE: 0.000671 | Val MSE: 0.024982
Epoch 06 | Train MSE: 0.000697 | Val MSE: 0.019159
Epoch 07 | Train MSE: 0.000619 | Val MSE: 0.019011
Epoch 08 | Train MSE: 0.000559 | Val MSE: 0.020201
Epoch 09 | Train MSE: 0.000484 | Val MSE: 0.018587
Epoch 10 | Train MSE: 0.000360 | Val MSE: 0.021747
Epoch 11 | Train MSE: 0.000422 | Val MSE: 0.020535
Early stopping at epoch 11

 Training Transformer for AAPL
Epoch 01 | Train MSE: 0.114541 | Val MSE: 0.008125
Epoch 02 | Train MSE: 0.005002 | Val MSE: 0.002023
Epoch 03 | Train MSE: 0.001411 | Val MSE: 0.001419
Epoch 04 | Train MSE: 0.000898 | Val MSE: 0.001300
Epoch 05 | Train MSE: 0.000693 | Val MSE: 0.001539
Epoch 06 | Train MSE: 0.000579 | Val MSE: 0.001306
Early stopping at epoch 6


In [14]:
# ============================================================
# Test evaluation (ONCE, after model selection)
# ============================================================

for asset in ["SPY", "AAPL"]:
    print(f"\n==============================")
    print(f" Test evaluation for {asset}")
    print(f"==============================")

    model = results[asset]["model"]
    test_loader = loaders[asset]["test"]
    scaler_y = scalers[asset][2]   # target scaler

    model.eval()
    preds_all = []
    y_all = []

    with torch.no_grad():
        for X_seq, X_opt, y in test_loader:
            X_seq = X_seq.to(device, non_blocking=True)
            X_opt = X_opt.to(device, non_blocking=True)

            preds = model(X_seq, X_opt)

            preds_all.append(preds.cpu().numpy())
            y_all.append(y.cpu().numpy())

    preds = scaler_y.inverse_transform(np.vstack(preds_all))
    true  = scaler_y.inverse_transform(np.vstack(y_all))

    rmse = np.sqrt(mean_squared_error(true, preds))
    mae  = mean_absolute_error(true, preds)

    mean_price = true.mean()
    print(f"[{asset}] Test RMSE: {rmse:.4f} ({100*rmse/mean_price:.2f}%)")
    print(f"[{asset}] Test MAE : {mae:.4f} ({100*mae/mean_price:.2f}%)")


 Test evaluation for SPY
[SPY] Test RMSE: 7.2919 (14.07%)
[SPY] Test MAE : 5.1454 (9.93%)

 Test evaluation for AAPL
[AAPL] Test RMSE: 2.2521 (9.39%)
[AAPL] Test MAE : 1.5432 (6.43%)
