## Libraries & Constants


In [62]:
import pandas as pd
import numpy as np
from keras.models import Model, Sequential, load_model
from keras.layers import Input, LSTM, Dense, Concatenate, BatchNormalization, LeakyReLU
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Data Pre-Processing

In [49]:
# Load and combine data
PATH_TO_FILE_1 = "../raw data/aapl_2016_2020.csv"
PATH_TO_FILE_2 = "../raw data/aapl_2021_2023.csv"
half_1 = pd.read_csv(PATH_TO_FILE_1, low_memory=False)
half_2 = pd.read_csv(PATH_TO_FILE_2, low_memory=False)
df = pd.concat([half_1, half_2], ignore_index=True)
df.columns = df.columns.str.strip()
print(df.columns.tolist())

def cleaned_options(option_type, df):
    # Convert Unix timestamps to datetime
    df['[QUOTE_DATETIME]'] = pd.to_datetime(df['[QUOTE_UNIXTIME]'], unit='s')
    df['EXPIRE_DATETIME'] = pd.to_datetime(df['[EXPIRE_UNIX]'], unit='s')

    # Load underlying price data for volatility computation
    underlying_prices = df[['[QUOTE_DATETIME]', '[UNDERLYING_LAST]']].drop_duplicates().set_index('[QUOTE_DATETIME]').sort_index()

    # Function to compute historical volatility
    def historical_volatility(series, window=20):
        return np.sqrt(252) * series.pct_change().rolling(window=window).std()

    # Compute 20-day historical volatility
    underlying_prices['hist_vol_20d'] = historical_volatility(underlying_prices['[UNDERLYING_LAST]'])
    underlying_prices.dropna(inplace=True)

    # Merge historical volatility back into main dataframe
    df = df.merge(underlying_prices[['hist_vol_20d']], left_on='[QUOTE_DATETIME]', right_index=True, how='inner')

    # Columns to numeric conversion: choose columns based on option type
    if option_type == 'call':
        numeric_cols = ['[QUOTE_DATETIME]', '[UNDERLYING_LAST]', '[DTE]', '[STRIKE]', '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]',
                        '[C_THETA]', '[C_RHO]', '[C_BID]', '[C_ASK]']
    else:
        numeric_cols = ['[QUOTE_DATETIME]', '[UNDERLYING_LAST]', '[DTE]', '[STRIKE]', '[P_DELTA]', '[P_GAMMA]', '[P_VEGA]',
                        '[P_THETA]', '[P_RHO]', '[P_BID]', '[P_ASK]']

    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Calculate mid-price based on option type
    if option_type == 'call':
        df['CALL_EQUI_PRICE'] = (df['[C_BID]'] + df['[C_ASK]']) / 2
    else:
        df['PUT_EQUI_PRICE'] = (df['[P_BID]'] + df['[P_ASK]']) / 2

    # Drop rows with missing essential data (dynamically chosen based on option type)
    essential_cols = ['[QUOTE_DATETIME]', '[UNDERLYING_LAST]', '[DTE]', '[STRIKE]', 'hist_vol_20d']
    if option_type == 'call':
        essential_cols.append('CALL_EQUI_PRICE')
    else:
        essential_cols.append('PUT_EQUI_PRICE')
    df.dropna(subset=essential_cols, inplace=True)

    # Prepare option data based on the option type
    if option_type == 'call':
        option_cols = ['[QUOTE_DATETIME]', '[UNDERLYING_LAST]', '[STRIKE]', '[DTE]', 'hist_vol_20d', 
                    '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]', '[C_THETA]', '[C_RHO]', 'CALL_EQUI_PRICE']
        col_rename = ['date', 'underlying_price', 'strike_price', 'days_to_expiry', 'hist_volatility', 
                    'delta', 'gamma', 'vega', 'theta', 'rho', 'equilibrium_price']
    else:
        option_cols = ['[QUOTE_DATETIME]', '[UNDERLYING_LAST]', '[STRIKE]', '[DTE]', 'hist_vol_20d', 
                    '[P_DELTA]', '[P_GAMMA]', '[P_VEGA]', '[P_THETA]', '[P_RHO]', 'PUT_EQUI_PRICE']
        col_rename = ['date', 'underlying_price', 'strike_price', 'days_to_expiry', 'hist_volatility', 
                    'delta', 'gamma', 'vega', 'theta', 'rho', 'equilibrium_price']

    option_df = df[option_cols].copy()
    option_df.columns = col_rename

    # Normalize strike price and compute time to expiry in years
    option_df['strike_price'] = option_df['strike_price'] / 1000
    option_df['time_to_expiry'] = option_df['days_to_expiry'] / 365
    option_df.drop('days_to_expiry', axis=1, inplace=True)
    return option_df

calls_df = cleaned_options("call", df)
puts_df = cleaned_options("put", df)
print(calls_df.sample(5))
print(puts_df.sample(5))

['[QUOTE_UNIXTIME]', '[QUOTE_READTIME]', '[QUOTE_DATE]', '[QUOTE_TIME_HOURS]', '[UNDERLYING_LAST]', '[EXPIRE_DATE]', '[EXPIRE_UNIX]', '[DTE]', '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]', '[C_THETA]', '[C_RHO]', '[C_IV]', '[C_VOLUME]', '[C_LAST]', '[C_SIZE]', '[C_BID]', '[C_ASK]', '[STRIKE]', '[P_BID]', '[P_ASK]', '[P_SIZE]', '[P_LAST]', '[P_DELTA]', '[P_GAMMA]', '[P_VEGA]', '[P_THETA]', '[P_RHO]', '[P_IV]', '[P_VOLUME]', '[STRIKE_DISTANCE]', '[STRIKE_DISTANCE_PCT]']
                        date  underlying_price  strike_price  hist_volatility  \
1188485  1646082000000000000            164.95        0.1750         0.229550   
1421487  1614891600000000000            120.30        0.0575         0.305353   
502995   1541624400000000000            209.96        0.2250         0.411440   
883798   1494273600000000000            153.01        0.1500         0.154287   
478257   1524600000000000000            162.94        0.1850         0.268627   

           delta    gamma     vega    theta     

## Hyperparameters

In [59]:
N_TIMESTEPS = 20
NUM_FEATURES = 7
HIDDEN_LAYERS = 2
BATCH_SIZE = 2048
EPOCHS = 5

## Prepare Test and Training Split

In [None]:
def build_price_sequences(df, n_timesteps=20, date_col='date', price_col='underlying_price'):
    """
    Sorts by 'date', then builds rolling sequences of length n_timesteps
    for the 'underlying_price', returning:
      - X_seq: shape (num_rows, n_timesteps, 1)
      - valid_mask: boolean array indicating which rows have a full n_timesteps history
    """
    df_sorted = df.sort_values(date_col).reset_index(drop=True)
    n_rows = len(df_sorted)
    
    X_seq = np.zeros((n_rows, n_timesteps, 1), dtype=np.float32)
    valid_mask = np.ones(n_rows, dtype=bool)
    
    for i in range(n_rows):
        if i < n_timesteps:
            valid_mask[i] = False
            continue
        # last n_timesteps of underlying_price up to row i-1
        window = df_sorted.loc[i - n_timesteps : i - 1, price_col].values
        X_seq[i] = window.reshape(n_timesteps, 1)
    
    return X_seq, valid_mask

# -------------------- Calls --------------------
N_TIMESTEPS = 20
call_seq, call_valid = build_price_sequences(calls_df, n_timesteps=N_TIMESTEPS)

# Keep only rows with a complete sequence
calls_clean = calls_df[call_valid].copy().reset_index(drop=True)
X_call = call_seq[call_valid]  # shape = (valid_count, n_timesteps, 1)
y_call = calls_clean['equilibrium_price'].values  # shape = (valid_count,)

# 80% train, 20% test (time-based split)
train_ratio = 0.8
n_call = len(X_call)
n_call_train = int(n_call * train_ratio)
X_call_train, X_call_test = X_call[:n_call_train], X_call[n_call_train:]
y_call_train, y_call_test = y_call[:n_call_train], y_call[n_call_train:]

print("CALLS:")
print("Train shapes:", X_call_train.shape, y_call_train.shape)
print("Test shapes :", X_call_test.shape, y_call_test.shape)

# -------------------- Puts --------------------
put_seq, put_valid = build_price_sequences(puts_df, n_timesteps=N_TIMESTEPS)

puts_clean = puts_df[put_valid].copy().reset_index(drop=True)
X_put = put_seq[put_valid]
y_put = puts_clean['equilibrium_price'].values

n_put = len(X_put)
n_put_train = int(n_put * train_ratio)
X_put_train, X_put_test = X_put[:n_put_train], X_put[n_put_train:]
y_put_train, y_put_test = y_put[:n_put_train], y_put[n_put_train:]

print("\nPUTS:")
print("Train shapes:", X_put_train.shape, y_put_train.shape)
print("Test shapes :", X_put_test.shape, y_put_test.shape)

CALLS:
Train shapes: (1239606, 20, 1) (1239606,)
Test shapes : (309902, 20, 1) (309902,)

PUTS:
Train shapes: (1239600, 20, 1) (1239600,)
Test shapes : (309900, 20, 1) (309900,)


## LSTM Model

In [64]:
def make_model(n_timesteps=N_TIMESTEPS, features=NUM_FEATURES, layers=HIDDEN_LAYERS):
    # Time-series input (20x1)
    close_history = Input(shape=(n_timesteps, 1), name='lstm_input')
    
    # Static features input
    input2 = Input(shape=(features,), name='static_features')
    
    # Build LSTM stack
    lstm_seq = Sequential([
        LSTM(8, return_sequences=True, input_shape=(n_timesteps, 1)),
        LSTM(8, return_sequences=True),
        LSTM(8, return_sequences=True),
        LSTM(8, return_sequences=False)
    ])
    x1 = lstm_seq(close_history)
    
    # Concatenate LSTM output + static features
    x = Concatenate()([x1, input2])
    
    # Dense layers
    for _ in range(layers - 1):
        x = Dense(100)(x)
        x = BatchNormalization()(x)
        x = LeakyReLU()(x)

    # Final output
    predict = Dense(1, activation='relu')(x)
    
    return Model(inputs=[close_history, input2], outputs=predict)

calls_model = make_model()
puts_model = make_model()

calls_model.summary()

  super().__init__(**kwargs)


## Define Static Features

In [60]:
static_cols = ['strike_price','hist_volatility','delta','gamma','vega','theta','rho']
N_TIMESTEPS = 20
train_ratio = 0.8

#################################
# Calls
#################################
call_seq, call_valid = build_price_sequences(calls_df, n_timesteps=N_TIMESTEPS)
calls_clean = calls_df[call_valid].reset_index(drop=True)

X_call = call_seq[call_valid]                                 # shape: (num_valid_rows, 20, 1)
y_call = calls_clean['equilibrium_price'].values              # shape: (num_valid_rows,)
calls_static = calls_clean[static_cols].values                # shape: (num_valid_rows, len(static_cols))

n_call = len(X_call)
n_call_train = int(n_call * train_ratio)

X_call_train, X_call_test       = X_call[:n_call_train],       X_call[n_call_train:]
y_call_train, y_call_test       = y_call[:n_call_train],       y_call[n_call_train:]
calls_static_train, calls_static_test = calls_static[:n_call_train], calls_static[n_call_train:]

print("Calls train shapes:", X_call_train.shape, calls_static_train.shape, y_call_train.shape)
print("Calls test  shapes:", X_call_test.shape,  calls_static_test.shape,  y_call_test.shape)

#################################
# Puts
#################################
put_seq, put_valid = build_price_sequences(puts_df, n_timesteps=N_TIMESTEPS)
puts_clean = puts_df[put_valid].reset_index(drop=True)

X_put = put_seq[put_valid]
y_put = puts_clean['equilibrium_price'].values
puts_static = puts_clean[static_cols].values

n_put = len(X_put)
n_put_train = int(n_put * train_ratio)

X_put_train, X_put_test       = X_put[:n_put_train],       X_put[n_put_train:]
y_put_train, y_put_test       = y_put[:n_put_train],       y_put[n_put_train:]
puts_static_train, puts_static_test = puts_static[:n_put_train], puts_static[n_put_train:]

print("\nPuts train shapes:", X_put_train.shape, puts_static_train.shape, y_put_train.shape)
print("Puts test  shapes:", X_put_test.shape,  puts_static_test.shape,  y_put_test.shape)

Calls train shapes: (1239606, 20, 1) (1239606, 7) (1239606,)
Calls test  shapes: (309902, 20, 1) (309902, 7) (309902,)

Puts train shapes: (1239600, 20, 1) (1239600, 7) (1239600,)
Puts test  shapes: (309900, 20, 1) (309900, 7) (309900,)


## Training for Calls

In [61]:
learning_rates = [1e-2, 1e-3, 1e-4]

for lr in learning_rates:
    calls_model.compile(optimizer=Adam(learning_rate=lr), loss='mse')
    
    history = calls_model.fit(
        [X_call_train, calls_static_train],
        y_call_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=0.01,
        callbacks=[TensorBoard()],
        verbose=1
    )
    
    model_name = f"call-lstm-lr-{lr}.h5"
    calls_model.save(model_name)
    print(f"Saved calls_model with lr={lr} to {model_name}")

Epoch 1/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 58ms/step - loss: 1144.4031 - val_loss: 470.0856
Epoch 2/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 59ms/step - loss: 865.6392 - val_loss: 667.5805
Epoch 3/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 57ms/step - loss: 796.8112 - val_loss: 697.2238
Epoch 4/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 58ms/step - loss: 802.0097 - val_loss: 509.7815
Epoch 5/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 58ms/step - loss: 770.5034 - val_loss: 462.4358




Saved calls_model with lr=0.01 to call-lstm-lr-0.01.h5
Epoch 1/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 60ms/step - loss: 635.7473 - val_loss: 374.6650
Epoch 2/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 58ms/step - loss: 601.8231 - val_loss: 385.5637
Epoch 3/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 58ms/step - loss: 598.0319 - val_loss: 289.7827
Epoch 4/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 55ms/step - loss: 589.1680 - val_loss: 362.0814
Epoch 5/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 54ms/step - loss: 577.1650 - val_loss: 352.9912




Saved calls_model with lr=0.001 to call-lstm-lr-0.001.h5
Epoch 1/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 56ms/step - loss: 562.2892 - val_loss: 316.0703
Epoch 2/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 57ms/step - loss: 569.1586 - val_loss: 338.1693
Epoch 3/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 56ms/step - loss: 555.9469 - val_loss: 363.1645
Epoch 4/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 57ms/step - loss: 564.8047 - val_loss: 298.2140
Epoch 5/5
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 57ms/step - loss: 554.7857 - val_loss: 339.4699




Saved calls_model with lr=0.0001 to call-lstm-lr-0.0001.h5


## Training for Puts

In [None]:
learning_rates = [1e-2, 1e-3, 1e-4]

for lr in learning_rates:
    puts_model.compile(optimizer=Adam(learning_rate=lr), loss='mse')
    
    history = calls_model.fit(
        [X_put_train, puts_static_train],  # TWO inputs
        y_call_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=0.01,  # or a separate val set
        callbacks=[TensorBoard()],
        verbose=1
    )
    
    model_name = f"call-lstm-lr-{lr}.h5"
    calls_model.save(model_name)
    print(f"Saved calls_model with lr={lr} to {model_name}")

## Evaluation and Plot

In [63]:
# Test 10 random entries
loaded_model = load_model('call-lstm-lr-0.0001.h5')
sample_entries = X_call.sample(10)
scaler = StandardScaler()
sample_scaled = scaler.transform(sample_entries)
predictions = loaded_model.predict(sample_scaled)

for i, idx in enumerate(sample_entries.index):
    print(f"Entry {idx} - Predicted price: {predictions[i][0]:.2f}, Actual price: {y.loc[idx]:.2f}")

TypeError: Could not locate function 'mse'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'module': 'keras.metrics', 'class_name': 'function', 'config': 'mse', 'registered_name': 'mse'}