In [None]:

# Define positional encoding
def positional_encoding(seq_length, d_model):
    position = np.arange(seq_length)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
    pos_encoding = np.zeros((seq_length, d_model))
    pos_encoding[:, 0::2] = np.sin(position * div_term)
    pos_encoding[:, 1::2] = np.cos(position * div_term)
    return tf.cast(pos_encoding, dtype=tf.float32)

# Reshape data for Transformer (same as CNN-LSTM)
timesteps = 10
n_features = X_train_scaled.shape[1]
X_train_reshaped = np.array([X_train_scaled[i:i+timesteps] for i in range(0, len(X_train_scaled) - timesteps)])
X_test_reshaped = np.array([X_test_scaled[i:i+timesteps] for i in range(0, len(X_test_scaled) - timesteps)])
y_train_reshaped = y_train[timesteps:]
y_test_reshaped = y_test[timesteps:]

# Define the Transformer model
inputs = Input(shape=(timesteps, n_features))
x = Dense(64, activation='relu')(inputs)  # Feature embedding

# Add positional encoding
pos_encoding = positional_encoding(timesteps, 64)
x = x + pos_encoding

# Transformer encoder layers
for _ in range(2):  # 2 layers
    # Multi-head attention
    attention_output = MultiHeadAttention(num_heads=4, key_dim=64)(x, x)
    x = LayerNormalization(epsilon=1e-6)(x + attention_output)
    # Feed-forward network
    ffn_output = Dense(128, activation='relu')(x)
    ffn_output = Dense(64)(ffn_output)
    x = LayerNormalization(epsilon=1e-6)(x + ffn_output)

x = Flatten()(x)
x = Dense(128, activation='relu')(x)
outputs = Dense(3, activation='softmax')(x)

transformer = Model(inputs, outputs)

# Compile the model
transformer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])

# Train the model
transformer.fit(X_train_reshaped, y_train_reshaped,
                epochs=20,
                batch_size=1024,
                validation_split=0.2,
                verbose=1)

# Predict and evaluate
y_pred_transformer = np.argmax(transformer.predict(X_test_reshaped), axis=1)
metrics_transformer = compute_metrics(y_test_reshaped, y_pred_transformer, label_encoder, model_name="Transformer")
all_metrics['Transformer'] = metrics_transformer

# Save the model
transformer.save('/content/drive/MyDrive/FYP_Dataset/LOBSTER_SampleFile_AAPL_2012-06-21_5/transformer_model.keras')

Epoch 1/20
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m400s[0m 880ms/step - accuracy: 0.3355 - loss: 1.1387 - val_accuracy: 0.3321 - val_loss: 1.1040
Epoch 2/20
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m450s[0m 900ms/step - accuracy: 0.3356 - loss: 1.1007 - val_accuracy: 0.3321 - val_loss: 1.0992
Epoch 3/20
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 897ms/step - accuracy: 0.3343 - loss: 1.0996 - val_accuracy: 0.3346 - val_loss: 1.0988
Epoch 4/20
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m444s[0m 902ms/step - accuracy: 0.3356 - loss: 1.0991 - val_accuracy: 0.3325 - val_loss: 1.0988
Epoch 5/20
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m435s[0m 887ms/step - accuracy: 0.3369 - loss: 1.0987 - val_accuracy: 0.3319 - val_loss: 1.0989
Epoch 6/20
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m464s[0m 937ms/step - accuracy: 0.3394 - loss: 1.0984 - val_accuracy: 0.3344 - val_loss: 1.0988
Epoc

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, MultiHeadAttention, Flatten
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, matthews_corrcoef, cohen_kappa_score
import scipy.stats as stats

In [3]:
# Load the preprocessed dataset (if not already loaded)
# If you've already run the preprocessing, you can skip loading and go straight to df_cleaned
dataset_path = '/content/drive/MyDrive/FYP_Dataset/LOBSTER_SampleFile_AAPL_2012-06-21_5/AAPL_2012-06-21_balanced_dataset.csv'
orderbook_df = pd.read_csv(dataset_path)

# Define features (including the new ones added previously)
features = ['Spread', 'Imbalance1', 'Imbalance2', 'Imbalance3', 'Imbalance4', 'Imbalance5',
            'AskPriceDiff1', 'AskPriceDiff2', 'AskPriceDiff3', 'AskPriceDiff4',
            'BidPriceDiff1', 'BidPriceDiff2', 'BidPriceDiff3', 'BidPriceDiff4',
            'MidPriceChange_Lag1', 'MidPriceChange_Lag5', 'MidPriceChange_Lag10',
            'EventCount_Type1', 'EventCount_Type2', 'EventCount_Type3', 'EventCount_Type4', 'EventCount_Type5',
            'AskPrice1', 'AskPrice2', 'AskPrice3', 'AskPrice4', 'AskPrice5',
            'BidPrice1', 'BidPrice2', 'BidPrice3', 'BidPrice4', 'BidPrice5',
            'AskSize1', 'AskSize2', 'AskSize3', 'AskSize4', 'AskSize5',
            'BidSize1', 'BidSize2', 'BidSize3', 'BidSize4', 'BidSize5',
            'MidPrice_Volatility_10', 'CumulativeOrderFlow', 'MidPrice_MA10',
            'AskSize1_MA10', 'BidSize1_MA10', 'TimeWeightedImbalance1',
            'EventIntensity', 'AskPrice1_Relative', 'BidPrice1_Relative',
            'DepthRatio', 'MidPriceChange_Cumsum10',
            'MidPrice_EMA5', 'MidPrice_EMA20', 'VWAP_Imbalance',
            'RealizedVol_1sec', 'TimeSinceOpen']

# Handle NaNs (if not already done)
lagged_features = ['MidPriceChange_Lag1', 'MidPriceChange_Lag5', 'MidPriceChange_Lag10']
orderbook_df[lagged_features] = orderbook_df[lagged_features].fillna(0)

rolling_features = ['MidPrice_Volatility_10', 'MidPrice_MA10', 'AskSize1_MA10', 'BidSize1_MA10', 'MidPriceChange_Cumsum10', 'RealizedVol_1sec']
orderbook_df[rolling_features] = orderbook_df[rolling_features].ffill()

orderbook_df['TimeWeightedImbalance1'] = orderbook_df['TimeWeightedImbalance1'].fillna(0)

event_features = ['EventCount_Type1', 'EventCount_Type2', 'EventCount_Type3', 'EventCount_Type4', 'EventCount_Type5']
orderbook_df[event_features] = orderbook_df[event_features].fillna(0)
df_cleaned = orderbook_df.dropna(subset=features + ['Movement'])

X = df_cleaned[features]
y = df_cleaned['Movement']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Define a function to compute all performance metrics (used by all models)
def compute_metrics(y_true, y_pred, label_encoder, model_name="Model"):
    metrics = {}
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    metrics['Weighted Precision'] = precision
    metrics['Weighted Recall'] = recall
    metrics['Weighted F1'] = f1
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Cohen Kappa'] = cohen_kappa_score(y_true, y_pred)

    # Per-class metrics
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None)
    classes = label_encoder.classes_
    for i, cls in enumerate(classes):
        metrics[f'Precision_{cls}'] = precision[i]
        metrics[f'Recall_{cls}'] = recall[i]
        metrics[f'F1_{cls}'] = f1[i]

    # Print metrics
    print(f"{model_name} Performance:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    print()

    return metrics

# Dictionary to store metrics for all models (for summary table)
all_metrics = {}

In [6]:
# Define top features (from Random Forest)
top_features = [
    'MidPriceChange_Cumsum10', 'RealizedVol_1sec', 'MidPrice_Volatility_10',
    'MidPriceChange_Lag1', 'EventCount_Type4', 'TimeWeightedImbalance1',
    'AskPriceDiff2', 'EventCount_Type3', 'EventCount_Type1', 'AskPriceDiff1'
]

# Handle NaNs for the selected features
lagged_features = ['MidPriceChange_Lag1']
orderbook_df[lagged_features] = orderbook_df[lagged_features].fillna(0)

rolling_features = ['MidPrice_Volatility_10', 'MidPriceChange_Cumsum10', 'RealizedVol_1sec']
orderbook_df[rolling_features] = orderbook_df[rolling_features].ffill()

orderbook_df['TimeWeightedImbalance1'] = orderbook_df['TimeWeightedImbalance1'].fillna(0)

event_features = ['EventCount_Type1', 'EventCount_Type3', 'EventCount_Type4']
orderbook_df[event_features] = orderbook_df[event_features].fillna(0)

df_cleaned = orderbook_df.dropna(subset=top_features + ['Movement'])

# Prepare features and target
X = df_cleaned[top_features]
y = df_cleaned['Movement']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reshape into sequences
timesteps = 10
X_sequences = []
y_sequences = []
for i in range(len(X_scaled) - timesteps):
    X_sequences.append(X_scaled[i:i+timesteps])
    y_sequences.append(y_encoded[i+timesteps])
X_sequences = np.array(X_sequences)
y_sequences = np.array(y_sequences)

# Split the reshaped data
X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(X_sequences, y_sequences, test_size=0.2, random_state=42)

# Define a function to compute all performance metrics
def compute_metrics(y_true, y_pred, label_encoder, model_name="Model"):
    metrics = {}
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    metrics['Weighted Precision'] = precision
    metrics['Weighted Recall'] = recall
    metrics['Weighted F1'] = f1
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Cohen Kappa'] = cohen_kappa_score(y_true, y_pred)

    # Per-class metrics
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None)
    classes = label_encoder.classes_
    for i, cls in enumerate(classes):
        metrics[f'Precision_{cls}'] = precision[i]
        metrics[f'Recall_{cls}'] = recall[i]
        metrics[f'F1_{cls}'] = f1[i]

    # Print metrics
    print(f"{model_name} Performance:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    print()

    return metrics

# Dictionary to store metrics for all models
all_metrics = {}

# Define positional encoding
def positional_encoding(seq_length, d_model):
    position = np.arange(seq_length)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
    pos_encoding = np.zeros((seq_length, d_model))
    pos_encoding[:, 0::2] = np.sin(position * div_term)
    pos_encoding[:, 1::2] = np.cos(position * div_term)
    return tf.cast(pos_encoding, dtype=tf.float32)

# Define the Transformer model
inputs = Input(shape=(timesteps, len(top_features)))
x = Dense(32, activation='relu')(inputs)  # Feature embedding

# Add positional encoding
pos_encoding = positional_encoding(timesteps, 32)
x = x + pos_encoding

# Single Transformer encoder layer
attention_output = MultiHeadAttention(num_heads=2, key_dim=32)(x, x)
x = LayerNormalization(epsilon=1e-6)(x + attention_output)
ffn_output = Dense(64, activation='relu')(x)
ffn_output = Dense(32)(ffn_output)
x = LayerNormalization(epsilon=1e-6)(x + ffn_output)

x = Flatten()(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)
outputs = Dense(3, activation='softmax')(x)

transformer = Model(inputs, outputs)

# Compile with early stopping and learning rate scheduling
transformer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

# Train
transformer.fit(X_train_seq, y_train_seq,
                epochs=50,
                batch_size=128,
                validation_split=0.2,
                callbacks=[early_stopping, reduce_lr],
                verbose=1)

# Predict and evaluate
y_pred_transformer = np.argmax(transformer.predict(X_test_seq), axis=1)
metrics_transformer = compute_metrics(y_test_seq, y_pred_transformer, label_encoder, model_name="Transformer-Optimized")
all_metrics['Transformer-Optimized'] = metrics_transformer

# Save the model
transformer.save('/content/drive/MyDrive/FYP_Dataset/LOBSTER_SampleFile_AAPL_2012-06-21_5/transformer_optimized.keras')

Epoch 1/50
[1m3559/3559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 25ms/step - accuracy: 0.8560 - loss: 0.4136 - val_accuracy: 0.9050 - val_loss: 0.2832 - learning_rate: 0.0010
Epoch 2/50
[1m3559/3559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 24ms/step - accuracy: 0.9048 - loss: 0.2872 - val_accuracy: 0.9073 - val_loss: 0.2807 - learning_rate: 0.0010
Epoch 3/50
[1m3559/3559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 25ms/step - accuracy: 0.9070 - loss: 0.2797 - val_accuracy: 0.9077 - val_loss: 0.2741 - learning_rate: 0.0010
Epoch 4/50
[1m3559/3559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 24ms/step - accuracy: 0.9072 - loss: 0.2770 - val_accuracy: 0.9067 - val_loss: 0.2757 - learning_rate: 0.0010
Epoch 5/50
[1m3559/3559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 25ms/step - accuracy: 0.9087 - loss: 0.2731 - val_accuracy: 0.9081 - val_loss: 0.2720 - learning_rate: 0.0010
Epoch 6/50
[1m3559/3559[0m [32m━━━━━━━━━━━━━━