In [None]:
# Attach Google Drive to the Colab runtime; the dataset and the saved model
# used further down live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Install TensorFlow into the runtime; the LSTM cell below imports tensorflow.keras.
# NOTE(review): the version is unpinned, so results may drift between runs; consider
# `%pip install tensorflow==<version>` so the install targets the kernel's environment.
!pip install tensorflow



In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, matthews_corrcoef, cohen_kappa_score

# --- Load -------------------------------------------------------------------
# Read the preprocessed order-book CSV from the mounted Drive folder.
dataset_path = '/content/drive/MyDrive/FYP_Dataset/LOBSTER_SampleFile_AAPL_2012-06-21_5/AAPL_2012-06-21_balanced_dataset.csv'
orderbook_df = pd.read_csv(dataset_path)

# --- Feature columns ----------------------------------------------------------
# Order matters: it fixes the column order of X (and of the scaled arrays).
features = [
    'Spread',
    'Imbalance1', 'Imbalance2', 'Imbalance3', 'Imbalance4', 'Imbalance5',
    'AskPriceDiff1', 'AskPriceDiff2', 'AskPriceDiff3', 'AskPriceDiff4',
    'BidPriceDiff1', 'BidPriceDiff2', 'BidPriceDiff3', 'BidPriceDiff4',
    'MidPriceChange_Lag1', 'MidPriceChange_Lag5', 'MidPriceChange_Lag10',
    'EventCount_Type1', 'EventCount_Type2', 'EventCount_Type3',
    'EventCount_Type4', 'EventCount_Type5',
    'AskPrice1', 'AskPrice2', 'AskPrice3', 'AskPrice4', 'AskPrice5',
    'BidPrice1', 'BidPrice2', 'BidPrice3', 'BidPrice4', 'BidPrice5',
    'AskSize1', 'AskSize2', 'AskSize3', 'AskSize4', 'AskSize5',
    'BidSize1', 'BidSize2', 'BidSize3', 'BidSize4', 'BidSize5',
    'MidPrice_Volatility_10', 'CumulativeOrderFlow', 'MidPrice_MA10',
    'AskSize1_MA10', 'BidSize1_MA10', 'TimeWeightedImbalance1',
    'EventIntensity', 'AskPrice1_Relative', 'BidPrice1_Relative',
    'DepthRatio', 'MidPriceChange_Cumsum10',
    'MidPrice_EMA5', 'MidPrice_EMA20', 'VWAP_Imbalance',
    'RealizedVol_1sec', 'TimeSinceOpen',
]

# --- Missing values -----------------------------------------------------------
# Column groups, kept as separate names because each group has its own fill rule.
lagged_features = ['MidPriceChange_Lag1', 'MidPriceChange_Lag5', 'MidPriceChange_Lag10']
rolling_features = ['MidPrice_Volatility_10', 'MidPrice_MA10', 'AskSize1_MA10',
                    'BidSize1_MA10', 'MidPriceChange_Cumsum10', 'RealizedVol_1sec']
event_features = ['EventCount_Type1', 'EventCount_Type2', 'EventCount_Type3',
                  'EventCount_Type4', 'EventCount_Type5']

# Lagged changes, the time-weighted imbalance and the event counts: missing -> 0.
zero_fill_columns = lagged_features + ['TimeWeightedImbalance1'] + event_features
orderbook_df[zero_fill_columns] = orderbook_df[zero_fill_columns].fillna(0)

# Rolling statistics: carry the last known value forward. Leading gaps stay NaN
# and are removed by the dropna below.
orderbook_df[rolling_features] = orderbook_df[rolling_features].ffill()

# Any row still missing a feature or the target is discarded.
required_columns = [*features, 'Movement']
df_cleaned = orderbook_df.dropna(subset=required_columns)
print(f"Number of rows after handling NaNs: {len(df_cleaned)}")

# --- Features / target --------------------------------------------------------
X, y = df_cleaned[features], df_cleaned['Movement']

# Map the Movement labels to integer codes (class names stay in label_encoder.classes_).
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# --- Train / test split -------------------------------------------------------
# NOTE(review): this is a shuffled 80/20 split; if rows are time-ordered, neighbouring
# rows end up on both sides of the split — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=42, test_size=0.2
)

# --- Scaling ------------------------------------------------------------------
# Statistics come from the training rows only and are then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the compute_metrics function
def compute_metrics(y_true, y_pred, label_encoder, model_name="Model"):
    """Compute, print and return classification metrics for one model.

    Args:
        y_true: Ground-truth labels as integer codes produced by ``label_encoder``
            (each value is an index into ``label_encoder.classes_``).
        y_pred: Predicted labels in the same integer encoding as ``y_true``.
        label_encoder: Fitted encoder; ``classes_`` provides the class names used
            in the per-class metric keys.
        model_name: Name shown in the header of the printed report.

    Returns:
        dict mapping metric name to value: ``'Balanced Accuracy'``,
        ``'Weighted Precision'``, ``'Weighted Recall'``, ``'Weighted F1'``,
        ``'MCC'``, ``'Cohen Kappa'`` and, for every class in
        ``label_encoder.classes_``, ``'Precision_<cls>'``, ``'Recall_<cls>'``
        and ``'F1_<cls>'``.
    """
    metrics = {}

    # Aggregate scores
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    w_precision, w_recall, w_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted')
    metrics['Weighted Precision'] = w_precision
    metrics['Weighted Recall'] = w_recall
    metrics['Weighted F1'] = w_f1
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Cohen Kappa'] = cohen_kappa_score(y_true, y_pred)

    # Per-class metrics.
    # Pass the full list of class codes explicitly: without `labels`, sklearn only
    # returns entries for labels that occur in y_true/y_pred, so a class missing
    # from both would shift every later score onto the wrong class name and then
    # raise IndexError when indexing by position in `classes`.
    classes = label_encoder.classes_
    class_ids = list(range(len(classes)))
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=class_ids, average=None)
    for i, cls in enumerate(classes):
        metrics[f'Precision_{cls}'] = precision[i]
        metrics[f'Recall_{cls}'] = recall[i]
        metrics[f'F1_{cls}'] = f1[i]

    # Print metrics
    print(f"{model_name} Performance:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    print()

    return metrics

# Registry of results: one metrics dict per model, keyed by model name.
# Re-running this line clears everything collected so far.
all_metrics = dict()

Number of rows after handling NaNs: 711705


In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import numpy as np

# This cell reuses df_cleaned, label_encoder, StandardScaler, compute_metrics and
# all_metrics from the preprocessing cell; in a fresh kernel run that cell first,
# otherwise this one stops with a NameError.
# NOTE(review): no random seed is set, so training results vary between runs.

# Select top features
top_features = [
    'MidPriceChange_Cumsum10', 'RealizedVol_1sec', 'MidPrice_Volatility_10',
    'MidPriceChange_Lag1', 'EventCount_Type4', 'TimeWeightedImbalance1',
    'AskPriceDiff2', 'EventCount_Type3', 'EventCount_Type1', 'AskPriceDiff1'
]

timesteps = 10       # rows per input window
test_fraction = 0.2  # share of windows held out for testing

# Subset data and encode the target
X_top = df_cleaned[top_features]
y = label_encoder.fit_transform(df_cleaned['Movement'])

# Window i covers rows i .. i+timesteps-1 and predicts the label of row i+timesteps.
n_windows = len(X_top) - timesteps
if n_windows < 2:
    raise ValueError(
        f"Need at least {timesteps + 2} rows to build train and test windows, "
        f"got {len(X_top)}"
    )
n_test = int(np.ceil(test_fraction * n_windows))
n_train = n_windows - n_test

# Fit the scaler only on rows that feed training windows, so test-set statistics
# do not leak into the training inputs; then apply it to every row.
scaler = StandardScaler()
scaler.fit(X_top.iloc[:n_train + timesteps - 1])
X_scaled = scaler.transform(X_top)

# Reshape into sequences (vectorised sliding windows instead of a Python loop).
# sliding_window_view puts the window axis last -> (n, features, timesteps);
# transpose to the (n, timesteps, features) layout the LSTM expects.
all_windows = np.lib.stride_tricks.sliding_window_view(X_scaled, timesteps, axis=0)
X_sequences = np.ascontiguousarray(all_windows[:n_windows].transpose(0, 2, 1))
y_sequences = y[timesteps:]

# Split data in row order: the first n_train windows train, the rest test.
# Consecutive windows share timesteps-1 rows, so a shuffled split would place
# near-identical windows on both sides and inflate the test scores.
# Assumes the rows of df_cleaned are in time order (the windowing itself already
# relies on that) — TODO confirm for the "balanced" dataset file.
X_train_seq, X_test_seq = X_sequences[:n_train], X_sequences[n_train:]
y_train_seq, y_test_seq = y_sequences[:n_train], y_sequences[n_train:]

# Define LSTM model; the output width follows the number of encoded classes.
n_classes = len(label_encoder.classes_)
model = Sequential([
    LSTM(64, activation='tanh', input_shape=(timesteps, len(top_features))),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(n_classes, activation='softmax')
])

# Compile
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

# Train (validation_split holds out part of the training windows for val_loss,
# which drives both callbacks)
model.fit(X_train_seq, y_train_seq,
          epochs=50,
          batch_size=128,
          validation_split=0.2,
          callbacks=[early_stopping, reduce_lr],
          verbose=1)

# Predict and evaluate: argmax over the softmax outputs gives the class code
y_pred = np.argmax(model.predict(X_test_seq), axis=1)
metrics = compute_metrics(y_test_seq, y_pred, label_encoder, "LSTM-Simplified")
all_metrics['LSTM-Simplified'] = metrics

# Save model
model.save('/content/drive/MyDrive/FYP_Dataset/LOBSTER_SampleFile_AAPL_2012-06-21_5/lstm_simplified.h5')

NameError: name 'df_cleaned' is not defined