In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, matthews_corrcoef, cohen_kappa_score
from sklearn.preprocessing import LabelEncoder
import scipy.stats as stats

# Load the dataset
dataset_path = '/content/drive/MyDrive/FYP_Dataset/LOBSTER_SampleFile_AAPL_2012-06-21_5/AAPL_2012-06-21_balanced_dataset.csv'
df = pd.read_csv(dataset_path)

# Define features
features = ['Spread', 'Imbalance1', 'Imbalance2', 'Imbalance3', 'Imbalance4', 'Imbalance5',
            'AskPriceDiff1', 'AskPriceDiff2', 'AskPriceDiff3', 'AskPriceDiff4',
            'BidPriceDiff1', 'BidPriceDiff2', 'BidPriceDiff3', 'BidPriceDiff4',
            'MidPriceChange_Lag1', 'MidPriceChange_Lag5', 'MidPriceChange_Lag10',
            'EventCount_Type1', 'EventCount_Type2', 'EventCount_Type3', 'EventCount_Type4', 'EventCount_Type5',
            'AskPrice1', 'AskPrice2', 'AskPrice3', 'AskPrice4', 'AskPrice5',
            'BidPrice1', 'BidPrice2', 'BidPrice3', 'BidPrice4', 'BidPrice5',
            'AskSize1', 'AskSize2', 'AskSize3', 'AskSize4', 'AskSize5',
            'BidSize1', 'BidSize2', 'BidSize3', 'BidSize4', 'BidSize5',
            'MidPrice_Volatility_10', 'CumulativeOrderFlow', 'MidPrice_MA10',
            'AskSize1_MA10', 'BidSize1_MA10', 'TimeWeightedImbalance1',
            'EventIntensity', 'AskPrice1_Relative', 'BidPrice1_Relative',
            'DepthRatio', 'MidPriceChange_Cumsum10']

# Handle NaNs
lagged_features = ['MidPriceChange_Lag1', 'MidPriceChange_Lag5', 'MidPriceChange_Lag10']
df[lagged_features] = df[lagged_features].fillna(0)

rolling_features = ['MidPrice_Volatility_10', 'MidPrice_MA10', 'AskSize1_MA10', 'BidSize1_MA10', 'MidPriceChange_Cumsum10']
df[rolling_features] = df[rolling_features].fillna(method='ffill')

df['TimeWeightedImbalance1'] = df['TimeWeightedImbalance1'].fillna(0)

event_features = ['EventCount_Type1', 'EventCount_Type2', 'EventCount_Type3', 'EventCount_Type4', 'EventCount_Type5']
df[event_features] = df[event_features].fillna(0)

# Drop remaining NaNs
df_cleaned = df.dropna(subset=features + ['Movement'])
print("Number of rows after handling NaNs:", len(df_cleaned))

# Define features and target
X = df_cleaned[features]
y = df_cleaned['Movement']

# Encode the target labels (q_-1, q_0, q_+1) to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Define a function to compute all performance metrics
def compute_metrics(y_true, y_pred, label_encoder):
    metrics = {}
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    metrics['Weighted Precision'] = precision
    metrics['Weighted Recall'] = recall
    metrics['Weighted F1'] = f1
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Cohen Kappa'] = cohen_kappa_score(y_true, y_pred)

    # Per-class metrics
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None)
    classes = label_encoder.classes_
    for i, cls in enumerate(classes):
        metrics[f'Precision_{cls}'] = precision[i]
        metrics[f'Recall_{cls}'] = recall[i]
        metrics[f'F1_{cls}'] = f1[i]

    return metrics

# 1. Random Model
np.random.seed(42)
y_pred_random = np.random.randint(0, 3, size=len(y_test))
metrics_random = compute_metrics(y_test, y_pred_random, label_encoder)
print("Random Model Performance:")
for metric, value in metrics_random.items():
    print(f"{metric}: {value:.4f}")
print()

# 2. Naive Model
most_frequent_class = stats.mode(y_train, keepdims=True)[0][0]
y_pred_naive = np.full_like(y_test, most_frequent_class)
metrics_naive = compute_metrics(y_test, y_pred_naive, label_encoder)
print("Naive Model Performance:")
for metric, value in metrics_naive.items():
    print(f"{metric}: {value:.4f}")
print()

# 3. Multinomial Logistic Regression
logreg = LogisticRegression(multi_class='multinomial', solver='sag', max_iter=20, tol=1e-1, random_state=42)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
metrics_logreg = compute_metrics(y_test, y_pred_logreg, label_encoder)
print("Multinomial Logistic Regression Performance:")
for metric, value in metrics_logreg.items():
    print(f"{metric}: {value:.4f}")
print()

# Summary Table of Performance Metrics
metrics_summary = pd.DataFrame({
    'Random Model': metrics_random,
    'Naive Model': metrics_naive,
    'Logistic Regression': metrics_logreg
})
print("\nSummary of Performance Metrics:")
print(metrics_summary.T)

  df[rolling_features] = df[rolling_features].fillna(method='ffill')


Number of rows after handling NaNs: 711705
Random Model Performance:
Balanced Accuracy: 0.3349
Weighted Precision: 0.3349
Weighted Recall: 0.3349
Weighted F1: 0.3349
MCC: 0.0023
Cohen Kappa: 0.0023
Precision_q_+1: 0.3357
Recall_q_+1: 0.3366
F1_q_+1: 0.3362
Precision_q_-1: 0.3348
Recall_q_-1: 0.3359
F1_q_-1: 0.3353
Precision_q_0: 0.3342
Recall_q_0: 0.3322
F1_q_0: 0.3332

Naive Model Performance:
Balanced Accuracy: 0.3333
Weighted Precision: 0.1106
Weighted Recall: 0.3326
Weighted F1: 0.1660
MCC: 0.0000
Cohen Kappa: 0.0000
Precision_q_+1: 0.0000
Recall_q_+1: 0.0000
F1_q_+1: 0.0000
Precision_q_-1: 0.3326
Recall_q_-1: 1.0000
F1_q_-1: 0.4992
Precision_q_0: 0.0000
Recall_q_0: 0.0000
F1_q_0: 0.0000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Multinomial Logistic Regression Performance:
Balanced Accuracy: 0.3328
Weighted Precision: 0.3525
Weighted Recall: 0.3321
Weighted F1: 0.1819
MCC: -0.0031
Cohen Kappa: -0.0009
Precision_q_+1: 0.4449
Recall_q_+1: 0.0138
F1_q_+1: 0.0267
Precision_q_-1: 0.3317
Recall_q_-1: 0.9712
F1_q_-1: 0.4945
Precision_q_0: 0.2809
Recall_q_0: 0.0133
F1_q_0: 0.0254


Summary of Performance Metrics:
                     Balanced Accuracy  Weighted Precision  Weighted Recall  \
Random Model                  0.334887            0.334884         0.334886   
Naive Model                   0.333333            0.110634         0.332617   
Logistic Regression           0.332755            0.352530         0.332069   

                     Weighted F1      MCC  Cohen Kappa  Precision_q_+1  \
Random Model            0.334883  0.00233     0.002330        0.335706   
Naive Model             0.166040  0.00000     0.000000        0.000000   
Logistic Regression     0.181861 -0.00312    -0.000864        0.444898   

  