In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.utils import Sequence
import math

In [2]:
# Seed NumPy's legacy global RNG and TensorFlow's global RNG with one shared
# value so both libraries start from the same, explicitly named seed.
SEED = 3888
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# --- Training configuration -------------------------------------------------
# Number of consecutive rows fed to the model as one input window.
SEQ_LEN = 30
# Number of windows per batch produced by the data generators.
BATCH_SIZE = 128
# Number of passes over the training generator.
EPOCHS = 100
# Step size handed to the Adam optimizer.
# NOTE(review): 1e-2 is on the high side for Adam with attention layers —
# confirm this value was chosen deliberately.
LEARNING_RATE = 1e-2

In [4]:
# Location of the feature-engineered parquet file.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is replaced by a configurable / relative path.
DATA_PATH = "/Users/ayush/Documents/University/Year 03/Sem 01/DATA3888/Optiver-07/Data/FE30Stocks.parquet"
df = pd.read_parquet(DATA_PATH)

In [5]:
# Columns used as model inputs, in the order they are fed to the network.
# NOTE(review): 'stock_id' is min-max scaled later like a numeric feature —
# confirm treating the identifier as a continuous input is intended.
feature_cols_mod = [
    # identifier and price / book state
    'stock_id',
    'mid_price',
    'spread',
    'imbalance',
    'book_pressure',
    'LOB_entropy',
    # returns and variation
    'log_return',
    'bipower_var',
    'log_wap_return',
    # lagged features
    'imbalance_lag1',
    'imbalance_lag2',
    'book_pressure_lag1',
    'book_pressure_lag2',
    'log_return_lag1',
    'log_return_lag2',
    # rolling statistics
    'rolling_vol_30',
    'rolling_imbalance_mean_30',
    # cyclical time encoding
    'sec_sin',
    'sec_cos',
    # log-transformed book sizes
    'bid_size1_log',
    'ask_size1_log',
    'bid_size2_log',
    'ask_size2_log',
]

# Name of the raw (untransformed) prediction target column.
original_target_col = "rv_future"

In [6]:
# Fail fast when the raw target is missing: the assignment below would raise a
# bare KeyError anyway, so surface the descriptive message as the exception
# instead of printing it and continuing.
if original_target_col not in df.columns:
    raise KeyError(f"Error: Original target column '{original_target_col}' not found in DataFrame.")

# Name of the log1p-transformed target column that the model is trained on.
transformed_target_col = 'rv_future_log'
# log1p(x) = log(1 + x); predictions made in this space must be mapped back
# with np.expm1 before being compared against the raw target.
df[transformed_target_col] = np.log1p(df[original_target_col])

In [7]:
# Distinct time_id values in ascending order; the train/test split below is
# made on these session ids rather than on individual rows.
unique_sessions = df["time_id"].sort_values().unique()

# A split needs at least one session on each side. Stop here instead of
# printing and letting later cells fail on empty frames.
if len(unique_sessions) < 2:
    raise ValueError("Error: Not enough unique time_id sessions for train/test split.")

In [8]:
# Position in `unique_sessions` where the test sessions start (80 % train).
n_sessions = len(unique_sessions)
split_idx = int(n_sessions * 0.8)
if n_sessions > 0:
    # Keep at least one session for training ...
    split_idx = max(split_idx, 1)
    # ... and at least one for testing (index may not reach the end).
    split_idx = min(split_idx, n_sessions - 1)

In [9]:
# Earlier sessions go to training, later sessions to testing (the ids are
# sorted ascending, so the split follows time_id order).
train_sessions, test_sessions = unique_sessions[:split_idx], unique_sessions[split_idx:]

In [10]:
# Both sides of the split must contain at least one session. Abort rather than
# print, because every later cell assumes non-empty train and test data.
if len(train_sessions) == 0 or len(test_sessions) == 0:
    raise ValueError("Error: Train or test set is empty after splitting sessions. Adjust split or check data.")

In [11]:
# Materialise independent copies so the scaling cells can overwrite columns
# without touching `df`.
is_train_row = df["time_id"].isin(train_sessions)
is_test_row = df["time_id"].isin(test_sessions)
train_df = df[is_train_row].copy()
test_df = df[is_test_row].copy()

# Stop immediately on an empty partition instead of printing and continuing.
if train_df.empty or test_df.empty:
    raise ValueError("Error: train_df or test_df is empty after filtering by time_id.")

In [12]:
# Min-max scaler for the input features; fitted on the training rows only so
# no statistics from the test sessions leak into training.
x_scaler = MinMaxScaler()

if train_df[feature_cols_mod].empty:
    raise ValueError("Error: Training data for feature scaling is empty.")

x_scaler.fit(train_df[feature_cols_mod])

# Assign whole columns (`frame[cols] = ...`) rather than `.loc[:, cols] = ...`:
# the latter writes into the existing columns in place, which relies on silent
# dtype upcasting when a feature column is integer-typed and receives floats.
train_df[feature_cols_mod] = x_scaler.transform(train_df[feature_cols_mod])
test_df[feature_cols_mod] = x_scaler.transform(test_df[feature_cols_mod])

In [None]:
# Min-max scaler for the log1p-transformed target, fitted on training rows only.
y_scaler = MinMaxScaler(feature_range=(0, 1))

if train_df[[transformed_target_col]].empty:
    raise ValueError("Error: Training data for target scaling is empty.")

# Rebuild the log1p target from the untouched raw column instead of reading
# the column this cell overwrites. Otherwise re-running the cell would refit
# the scaler on already-scaled values and `y_scaler.inverse_transform` would
# no longer map predictions back to log1p space.
# NOTE: this must stay in sync with the transform that creates
# `transformed_target_col` (np.log1p of `original_target_col`).
train_target_log = np.log1p(train_df[[original_target_col]].to_numpy())
test_target_log = np.log1p(test_df[[original_target_col]].to_numpy())

y_scaler.fit(train_target_log)

# Store the scaled targets as float32 1-D columns.
train_df[transformed_target_col] = (
    y_scaler.transform(train_target_log).ravel().astype(np.float32)
)
test_df[transformed_target_col] = (
    y_scaler.transform(test_target_log).ravel().astype(np.float32)
)



In [14]:
class TimeSeriesSequence(Sequence):
    """Keras ``Sequence`` that serves sliding windows taken inside each ``time_id``.

    For every session (distinct ``time_id``) with more than ``seq_len`` rows,
    one sample is created per start offset ``i`` in
    ``range(session_len - seq_len)``: the input is rows ``i .. i+seq_len-1`` of
    ``feature_cols`` and the label is ``target_col`` at row ``i + seq_len``.
    Rows are used in the order they appear in ``df_part``.

    NOTE(review): windows are formed over all rows sharing a ``time_id``; if a
    session contains rows of several stocks, a window may span more than one
    stock — verify against the data layout.

    Args:
        df_part: DataFrame holding ``time_id``, the feature columns and the
            target column. A private copy is stored on the instance.
        feature_cols: list of column names used as model inputs.
        target_col: name of the label column.
        seq_len: number of consecutive rows per input window.
        batch_size: number of samples per batch.
        shuffle: when true, the sample order is shuffled at construction and
            again at the end of every epoch (via ``np.random.shuffle``).

    Attributes:
        indices: list of ``(session_id, start_offset)`` tuples, one per sample,
            in the order in which samples are served.
    """

    def __init__(self, df_part, feature_cols, target_col, seq_len, batch_size, shuffle=False):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.df_part = df_part.copy()
        self.feature_cols = feature_cols
        self.target_col = target_col
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.session_ids = self.df_part['time_id'].unique()

        # Per-session NumPy views of features/targets, built once here so that
        # __getitem__ only slices arrays instead of re-filtering the whole
        # DataFrame for every single sample.
        self._session_features = {}
        self._session_targets = {}
        self.indices = []
        # sort=False keeps sessions in order of first appearance, matching
        # the order of `self.session_ids`.
        for session_id, session_rows in self.df_part.groupby('time_id', sort=False):
            n_windows = len(session_rows) - self.seq_len
            if n_windows <= 0:
                # Too short to yield a window plus the following target row.
                continue
            self._session_features[session_id] = session_rows[self.feature_cols].to_numpy()
            self._session_targets[session_id] = session_rows[self.target_col].to_numpy()
            self.indices.extend((session_id, start) for start in range(n_windows))

        if not self.indices:
            print("Warning: No sequences generated. Check SEQ_LEN, data per time_id, or filtering.")
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __len__(self):
        """Return the number of batches per epoch (last batch may be partial)."""
        if not self.indices:
            return 0
        return math.ceil(len(self.indices) / self.batch_size)

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(X, y)`` NumPy arrays.

        ``X`` stacks one ``seq_len``-row feature window per sample and ``y``
        holds the target value of the row right after each window.

        Raises:
            IndexError: if the generator holds no samples at all.
        """
        if not self.indices:
            raise IndexError("Attempting to get item from an empty sequence generator.")
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]

        batch_X = [
            self._session_features[session_id][start:start + self.seq_len]
            for session_id, start in batch_indices
        ]
        batch_y = [
            self._session_targets[session_id][start + self.seq_len]
            for session_id, start in batch_indices
        ]
        return np.array(batch_X), np.array(batch_y)

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

In [15]:
# Both generators serve windows in a fixed order (shuffle=False). The evaluation
# cells rely on `test_generator.indices` staying aligned with the prediction
# order.
# NOTE(review): with shuffle=False every training batch holds consecutive,
# overlapping windows — confirm this is intended for training.
train_generator = TimeSeriesSequence(
    train_df,
    feature_cols_mod,
    transformed_target_col,
    seq_len=SEQ_LEN,
    batch_size=BATCH_SIZE,
    shuffle=False,
)
test_generator = TimeSeriesSequence(
    test_df,
    feature_cols_mod,
    transformed_target_col,
    seq_len=SEQ_LEN,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [16]:
def build_transformer_model(seq_len, num_features, d_model=64, num_heads=4, num_layers=2, ff_dim_factor=4, dropout_rate=0.1):
    """Build an encoder-style attention model that regresses one scalar per window.

    Architecture: a per-timestep Dense projection to ``d_model``, followed by
    ``num_layers`` blocks of (self-attention -> residual add -> LayerNorm ->
    two-layer feed-forward -> dropout -> residual add -> LayerNorm), then
    global average pooling over the time axis, dropout and a single-unit
    Dense output.

    NOTE(review): no positional encoding is added before the attention
    blocks and the output is an average over time steps — confirm that
    ignoring the order of rows inside a window is intended.

    Args:
        seq_len: number of time steps per input window.
        num_features: number of features per time step.
        d_model: width of the internal representation.
        num_heads: number of attention heads; each head uses
            ``key_dim = d_model // num_heads``.
        num_layers: number of stacked attention/feed-forward blocks.
        ff_dim_factor: the hidden feed-forward width is ``d_model * ff_dim_factor``.
        dropout_rate: dropout probability used inside attention, after the
            feed-forward sub-layer and before the output layer (default 0.1,
            the value previously hard-coded).

    Returns:
        An uncompiled Keras ``Model`` mapping ``(seq_len, num_features)``
        inputs to a single output unit.
    """
    inputs = layers.Input(shape=(seq_len, num_features))
    # Project raw features to the model width, independently per time step.
    x = layers.Dense(d_model)(inputs)
    for _ in range(num_layers):
        # Self-attention sub-layer with post-norm residual connection.
        attn_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=d_model // num_heads,
            dropout=dropout_rate
        )(query=x, value=x, key=x)
        x = layers.Add()([x, attn_output])
        x = layers.LayerNormalization(epsilon=1e-6)(x)
        # Position-wise feed-forward sub-layer with post-norm residual connection.
        ffn_out = layers.Dense(d_model * ff_dim_factor, activation="relu")(x)
        ffn_out = layers.Dense(d_model)(ffn_out)
        ffn_out = layers.Dropout(dropout_rate)(ffn_out)
        x = layers.Add()([x, ffn_out])
        x = layers.LayerNormalization(epsilon=1e-6)(x)
    # Collapse the time axis, then regress a single value.
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(dropout_rate)(x)
    output = layers.Dense(1)(x)
    return models.Model(inputs, output)

# Build the network for the configured window length and feature count, then
# compile it for mean-squared-error regression with Adam.
num_model_features = len(feature_cols_mod)
model = build_transformer_model(seq_len=SEQ_LEN, num_features=num_model_features)
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=adam_optimizer, loss="mse")
model.summary()

In [None]:
# Train on the window generator for the configured number of epochs.
# NOTE(review): no validation data or callbacks are passed here, so training
# always runs for the full EPOCHS — confirm this is intended.
history = model.fit(train_generator, epochs=EPOCHS, verbose=1)

In [None]:
# Walk the test generator batch by batch, keeping the model output and the
# scaled labels of each batch in matching order.
pred_scaled_list, actual_scaled_list = [], []

n_test_batches = len(test_generator)
for batch_no in range(n_test_batches):
    X_batch, y_batch_scaled = test_generator[batch_no]
    pred_scaled_list.append(model.predict_on_batch(X_batch))
    actual_scaled_list.append(y_batch_scaled)

In [None]:
# Flatten the per-batch outputs into one 1-D array each, in generator order.
pred_scaled = np.concatenate([p.flatten() for p in pred_scaled_list])
actual_scaled_from_generator = np.concatenate([a.flatten() for a in actual_scaled_list])

# Undo the target pipeline in reverse order:
#   1. min-max scaling  -> back to log1p space
#   2. log1p            -> back to the raw target via expm1
# Step 2 was missing, so log1p-space predictions were being scored against
# raw `original_target_col` values.
predictions_log = y_scaler.inverse_transform(pred_scaled.reshape(-1, 1)).flatten()
predictions_original = np.expm1(predictions_log)

In [None]:
# Raw (unscaled, untransformed) target values aligned with the predictions.
# `test_df` still holds `original_target_col` unchanged because only the
# feature columns and the transformed target column were overwritten.
#
# Group the test frame once and index into per-session arrays, instead of
# re-filtering the whole frame for every single sample.
raw_target_by_session = {
    session_id: session_rows[original_target_col].to_numpy()
    for session_id, session_rows in test_df.groupby('time_id', sort=False)
}

# The label of a window starting at offset `s` is the row at `s + SEQ_LEN`,
# mirroring how the generator picks its targets.
original_actuals_list = [
    raw_target_by_session[session_id][start_in_session_idx + SEQ_LEN]
    for session_id, start_in_session_idx in test_generator.indices
]
actuals_original = np.array(original_actuals_list)

In [None]:
# Negative predictions are floored at zero before scoring.
predictions_original = np.clip(predictions_original, 0, None)

# Root-mean-squared error on the raw target scale.
residuals = predictions_original - actuals_original
mse_original = np.mean(np.square(residuals))
rmse_original = np.sqrt(mse_original)

print(f"Test RMSE (original volatility): {rmse_original:.9f}")

In [None]:
# Coefficient of determination between raw targets and predictions.
r2_original = r2_score(y_true=actuals_original, y_pred=predictions_original)
print(f"Test R² score (original volatility): {r2_original:.6f}")

In [None]:
def qlike_safe(actual, forecast, eps=1e-9):
    """Mean QLIKE loss ``a/f - log(a/f) - 1`` over the usable observations.

    Both inputs are clipped from below at ``eps``. An observation is used only
    if its clipped actual AND its clipped forecast are strictly greater than
    ``eps``; everything else (including values at or below ``eps`` and NaNs)
    is left out of the mean.

    NOTE(review): because forecasts at or below ``eps`` are excluded rather
    than penalised, zero forecasts do not worsen the score — confirm this is
    the intended definition.

    Args:
        actual: array-like of realised values.
        forecast: array-like of predicted values, same length as ``actual``.
        eps: lower clipping bound and validity threshold.

    Returns:
        The mean loss over the usable observations, or ``np.nan`` when there
        are none.
    """
    a = np.clip(np.asarray(actual), eps, None)
    f = np.clip(np.asarray(forecast), eps, None)

    # After clipping, a forecast can never be below eps, so no further
    # adjustment of `f` is needed before building the validity mask.
    usable = (a > eps) & (f > eps)
    if not np.any(usable):
        return np.nan

    ratio = a[usable] / f[usable]
    return np.mean(ratio - np.log(ratio) - 1.0)

# QLIKE between the raw targets and the (non-negative) predictions.
ql_original = qlike_safe(actual=actuals_original, forecast=predictions_original)
print(f"Test QLIKE (original volatility): {ql_original:.6f}")