In [1]:
# Colab-only: mount Google Drive at /content/drive; the dataset and the
# exported evaluation matrix used later in this notebook live under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, matthews_corrcoef, cohen_kappa_score

In [4]:
def compute_metrics(y_true, y_pred, label_encoder, model_name="Model"):
    """Compute, print and return aggregate and per-class classification metrics.

    Args:
        y_true: Ground-truth class indices.
        y_pred: Predicted class indices.
        label_encoder: Object exposing ``classes_``. Entry ``i`` of ``classes_``
            is used as the display name of class index ``i``.
        model_name (str): Heading printed above the metric listing.

    Returns:
        dict: Metric name -> value. Contains 'Balanced Accuracy',
        'Weighted Precision', 'Weighted Recall', 'Weighted F1', 'MCC',
        'Cohen Kappa' and, for every class name ``c`` in ``classes_``,
        'Precision_c', 'Recall_c' and 'F1_c'.
    """
    classes = label_encoder.classes_
    # Per-class scores are looked up by position below, so request one score
    # per class index explicitly. Without `labels`, sklearn only reports the
    # labels that occur in y_true/y_pred, which shifts or truncates the arrays
    # whenever a class is missing from both.
    label_ids = list(range(len(classes)))

    metrics = {}
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    metrics['Weighted Precision'] = precision
    metrics['Weighted Recall'] = recall
    metrics['Weighted F1'] = f1
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Cohen Kappa'] = cohen_kappa_score(y_true, y_pred)

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=label_ids, average=None
    )
    for i, cls in enumerate(classes):
        metrics[f'Precision_{cls}'] = precision[i]
        metrics[f'Recall_{cls}'] = recall[i]
        metrics[f'F1_{cls}'] = f1[i]

    print(f"{model_name} Performance:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    print()
    return metrics

# Notebook-global registry of metric dicts; evaluate_model writes its train and
# test results into it and create_evaluation_matrix reads a dict of this shape.
all_metrics = {}

In [6]:
# Seed the global torch and NumPy RNGs for reproducibility.
# (The sklearn splitters used in this notebook are given their own random_state=42.)
torch.manual_seed(42)
np.random.seed(42)

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

def load_and_preprocess_data(data_path, sample_fraction=0.25):
    """
    Load and preprocess financial data for GNN training with stratified sampling.

    The 'Movement' column is mapped from 'q_-1' / 'q_0' / 'q_+1' to 0 / 1 / 2,
    rows with any other value are dropped, and a class-stratified subsample of
    the remaining rows is kept. Sixteen fixed feature columns are extracted,
    NaNs are replaced by 0, and the result is standardised with a
    StandardScaler fitted on the subsample.

    Args:
        data_path (str): Path to the CSV file.
        sample_fraction (float): Fraction of rows to keep, in (0, 1]
            (default: 0.25). A value of 1 keeps every valid row.

    Returns:
        df (pd.DataFrame): Subsampled DataFrame (all original columns).
        X (np.ndarray): Standardised feature matrix, one row per row of df.
        y (np.ndarray): int64 target array with values in [0, 1, 2].

    Raises:
        ValueError: If sample_fraction is outside (0, 1], the file has no rows,
            no row has a recognised 'Movement' value, or a feature column is
            missing.

    NOTE(review): the indices produced by StratifiedShuffleSplit are not sorted
    here, so the returned rows are presumably in shuffled rather than
    chronological order - confirm before treating them as a time series.
    """
    # Reject fractions the splitter cannot represent. Previously 1.0 crashed
    # inside sklearn (test_size == 0) and an int 0 was read as "1 test sample".
    if not 0 < sample_fraction <= 1:
        raise ValueError(f"Error: sample_fraction must be in (0, 1], got {sample_fraction}")

    # Load data
    df = pd.read_csv(data_path)
    print(f"Initial rows: {len(df)}")

    # Debug: Inspect initial state
    print("Columns in the dataset:", df.columns.tolist())
    if df.empty:
        raise ValueError("Error: The dataset is empty. Please check the file path and contents.")

    # Define movement mapping
    movement_mapping = {'q_-1': 0, 'q_0': 1, 'q_+1': 2}

    # Map Movement and filter invalid values (unmapped labels become NaN)
    print("Unique values in 'Movement' (before):", df['Movement'].unique())
    df['Movement'] = df['Movement'].map(movement_mapping)
    valid_idx = df['Movement'].isin([0, 1, 2])
    print("Rows dropped due to invalid 'Movement':", (~valid_idx).sum())
    df = df[valid_idx]
    if df.empty:
        raise ValueError("Error: No rows with a valid 'Movement' value remain.")

    # Stratified subsampling: the "train" side of a single shuffle split is the
    # kept sample. Skipped entirely when the caller asks for all rows.
    if sample_fraction < 1:
        sss = StratifiedShuffleSplit(n_splits=1, test_size=1 - sample_fraction, random_state=42)
        keep_idx, _ = next(sss.split(df, df['Movement']))
        df = df.iloc[keep_idx]
    print(f"Rows after subsampling: {len(df)}")

    # Target variable
    y = df['Movement'].values.astype(np.int64)  # [0, 1, 2]
    print("Unique values in y:", np.unique(y))

    # Define feature columns
    feature_cols = [
        'BidPrice1', 'AskPrice1', 'BidSize1', 'AskSize1',
        'Spread', 'Imbalance1', 'MidPrice_Volatility_10', 'CumulativeOrderFlow',
        'TotalAskSize', 'TotalBidSize', 'DepthRatio', 'AskVWAP', 'BidVWAP',
        'VWAP_Imbalance', 'LogReturn', 'RealizedVol_1sec'
    ]

    # Fail early with the full list of absent columns instead of a KeyError
    missing_cols = [col for col in feature_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Error: Missing feature columns: {missing_cols}")

    # Extract and preprocess features: NaN -> 0, then zero-mean / unit-variance
    X = df[feature_cols].fillna(0).values
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return df, X, y

In [8]:
# Step 2: Graph Construction
from sklearn.neighbors import NearestNeighbors

def construct_graph_knn(df, X, y, k=10):
    """Build an undirected k-nearest-neighbour graph over the 'Time' column.

    Each row of ``df`` becomes a node. A node is linked to its nearest
    neighbours by absolute difference in ``df['Time']``; every link is stored
    in both directions, without self-loops and without duplicate edges.

    Args:
        df (pd.DataFrame): Must contain a 'Time' column; one row per node.
        X: Node feature matrix, converted to a float tensor.
        y: Node labels, converted to a long tensor.
        k (int): Number of neighbours requested per node (default: 10).
            Clamped so that at most ``len(df)`` points are queried.

    Returns:
        Data: Graph with ``x``, ``edge_index`` of shape (2, E) and ``y``.
    """
    num_nodes = len(df)
    node_features = torch.tensor(X, dtype=torch.float)
    labels = torch.tensor(y, dtype=torch.long)
    timestamps = df['Time'].values.reshape(-1, 1)  # Reshape for sklearn

    # Query k + 1 points because a node normally finds itself as well; clamp so
    # small inputs do not ask for more neighbours than there are samples.
    n_neighbors = min(k + 1, num_nodes)
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto').fit(timestamps)
    _, indices = nbrs.kneighbors(timestamps)

    # Pair every node with each returned neighbour, then drop self-pairs by
    # value. With tied timestamps the node itself is not guaranteed to be in
    # column 0, so slicing off the first column could leave self-loops behind.
    src = np.repeat(np.arange(num_nodes), indices.shape[1])
    dst = indices.reshape(-1)
    not_self = src != dst
    src, dst = src[not_self], dst[not_self]

    # Symmetrise, then deduplicate by encoding each (src, dst) pair as a single
    # integer; mutual neighbours would otherwise appear twice per direction.
    codes = np.unique(np.concatenate([src * num_nodes + dst, dst * num_nodes + src]))
    edge_pairs = np.stack([codes // num_nodes, codes % num_nodes])

    edge_index = torch.tensor(edge_pairs, dtype=torch.long).contiguous()
    data = Data(x=node_features, edge_index=edge_index, y=labels)
    return data

In [12]:
# Step 3: GNN Model
class GNNModel(nn.Module):
    """Three stacked GCNConv layers producing one score vector per node.

    The first two layers are each followed by ReLU and dropout (p=0.5, active
    only in training mode); the third layer's output is returned unactivated.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the layers: input_dim -> hidden_dim -> hidden_dim -> output_dim."""
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Run the network on ``data.x`` / ``data.edge_index`` and return the final layer output."""
        hidden = data.x
        for conv in (self.conv1, self.conv2):
            hidden = F.relu(conv(hidden, data.edge_index))
            hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.conv3(hidden, data.edge_index)


# Step 4: Training and Evaluation
def train_model(data, model, optimizer, criterion, num_epochs=100):
    """Full-batch training loop with early stopping on the test-mask loss.

    Each epoch performs one optimisation step on the nodes selected by
    ``data.train_mask``, then re-runs the model in eval mode and measures the
    loss on the nodes selected by ``data.test_mask``. Training stops once that
    loss has failed to improve for 10 consecutive epochs.

    Args:
        data: Graph object with ``y``, ``train_mask`` and ``test_mask``; it is
            passed unchanged to ``model``.
        model: Module called as ``model(data)``, returning per-node scores.
        optimizer: Optimiser over ``model``'s parameters.
        criterion: Loss called as ``criterion(scores, targets)``.
        num_epochs (int): Maximum number of epochs (default: 100).

    Returns:
        None. Progress is printed every 10 epochs.

    NOTE(review): early stopping is driven by the same mask that is later
    reported as the test set; a separate validation mask would avoid
    selecting the stopping point on test data.
    """
    best_test_loss = float('inf')
    patience = 10
    patience_counter = 0

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        # Evaluate with a fresh forward pass: the training-mode output above was
        # produced with dropout active and with the pre-step weights, so
        # reusing it would make the switch to eval mode meaningless.
        model.eval()
        with torch.no_grad():
            eval_out = model(data)
            test_loss = criterion(eval_out[data.test_mask], data.y[data.test_mask]).item()

        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Train Loss: {loss.item():.4f}, Test Loss: {test_loss:.4f}")

        # Track the best loss as a plain float rather than holding on to a tensor
        if test_loss < best_test_loss:
            best_test_loss = test_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

In [13]:
# Ensure the label encoder maps classes correctly to q_-1, q_0, q_+1
def evaluate_model(data, model, label_encoder, model_name="GNN"):
    """Score ``model`` on the train and test masks and record the results.

    Predictions are the argmax over dimension 1 of ``model(data)``. For each
    mask, ``compute_metrics`` is called (which also prints the metrics) and
    the returned dict is stored in the global ``all_metrics`` under
    ``'<model_name>_Train'`` / ``'<model_name>_Test'``.

    Args:
        data: Graph object with ``y``, ``train_mask`` and ``test_mask``.
        model: Module called as ``model(data)``.
        label_encoder: Forwarded to ``compute_metrics``.
        model_name (str): Prefix for printed headings and ``all_metrics`` keys.

    Returns:
        None.
    """
    model.eval()
    with torch.no_grad():
        predicted = model(data).argmax(dim=1)  # Predicted class indices

        # Pull both splits out as NumPy arrays before any metric is computed
        masks = (("Train", data.train_mask), ("Test", data.test_mask))
        split_arrays = [
            (split, data.y[mask].cpu().numpy(), predicted[mask].cpu().numpy())
            for split, mask in masks
        ]

        # Train metrics first, then test metrics
        split_metrics = [
            (split, compute_metrics(truth, guess, label_encoder, model_name=f"{model_name} ({split})"))
            for split, truth, guess in split_arrays
        ]

        # Register both results only once both have been computed
        for split, result in split_metrics:
            all_metrics[f'{model_name}_{split}'] = result

# Function to create the evaluation matrix
def create_evaluation_matrix(all_metrics):
    """Arrange per-model metric dicts into a metrics-by-models table.

    Args:
        all_metrics (dict): Model name -> dict mapping metric name to value.

    Returns:
        pd.DataFrame: One row per metric in a fixed order, one column per key
        of ``all_metrics``. A metric that a model does not provide is shown as
        the string 'N/A'.
    """
    aggregate_rows = [
        'Balanced Accuracy', 'Weighted Precision', 'Weighted Recall', 'Weighted F1',
        'MCC', 'Cohen Kappa',
    ]
    # Per-class rows are listed for both naming schemes a label encoder may use
    class_names = ['q_-1', 'q_0', 'q_+1', '0', '1', '2']
    per_class_rows = [
        f'{kind}_{cls}' for cls in class_names for kind in ('Precision', 'Recall', 'F1')
    ]
    metrics_order = aggregate_rows + per_class_rows

    model_names = list(all_metrics.keys())

    # One column per model, with 'N/A' wherever a metric is missing
    columns = {
        name: [all_metrics[name].get(metric, 'N/A') for metric in metrics_order]
        for name in model_names
    }

    # dtype=object keeps numbers and 'N/A' markers side by side in every column
    return pd.DataFrame(columns, index=metrics_order, columns=model_names, dtype=object)

# Model with LSTM
class GNNWithLSTM(nn.Module):
    """An LSTM followed by two GCNConv layers.

    ``data.x`` is given a leading dimension of size 1 before the LSTM
    (``batch_first=True``), so for a 2-D node-feature matrix the LSTM treats
    the rows, in their stored order, as a single sequence. The LSTM output is
    then passed through GCNConv -> ReLU -> dropout(p=0.3) -> GCNConv.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the LSTM (input_dim -> hidden_dim) and the two graph convolutions."""
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.conv1 = GCNConv(hidden_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return the output of the second graph convolution for ``data``."""
        edges = data.edge_index
        # Add the size-1 leading dimension for the LSTM and remove it again afterwards
        sequence_out, _state = self.lstm(data.x.unsqueeze(0))
        node_embed = sequence_out.squeeze(0)
        node_embed = F.dropout(
            F.relu(self.conv1(node_embed, edges)), p=0.3, training=self.training
        )
        return self.conv2(node_embed, edges)


# Update the main execution to use the modified evaluate_model
if __name__ == "__main__":
    # Clear all_metrics to start fresh
    all_metrics = {}

    # Class display names in class-index order (the same order as
    # movement_mapping in load_and_preprocess_data). Set classes_ directly:
    # LabelEncoder.fit would sort the strings and put 'q_+1' first.
    # Defining it here also lets this cell run on a fresh kernel; previously
    # label_encoder was only created by a later cell.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.array(['q_-1', 'q_0', 'q_+1'])

    # Load balanced dataset (done once; the loader is deterministic, so the
    # former second identical load/graph/split sequence was redundant)
    data_path = "/content/drive/MyDrive/FYP_Dataset/LOBSTER_SampleFile_AAPL_2012-06-21_5/AAPL_2012-06-21_balanced_dataset.csv"
    df, X, y = load_and_preprocess_data(data_path, sample_fraction=0.1)  # Smaller sample for speed

    # Enhanced graph construction: 20 nearest neighbours in the standardised
    # (Time, MidPriceChange_Cumsum10, Imbalance1) space
    feature_cols = ['Time', 'MidPriceChange_Cumsum10', 'Imbalance1']
    combined = StandardScaler().fit_transform(df[feature_cols])
    nbrs = NearestNeighbors(n_neighbors=21).fit(combined)
    distances, indices = nbrs.kneighbors(combined)

    # Directed edges i -> each neighbour in columns 1..20 (column 0 is skipped
    # as the query point itself), built with array ops instead of a Python loop
    neighbor_ids = indices[:, 1:]
    source_ids = np.repeat(np.arange(len(df)), neighbor_ids.shape[1])
    edge_index = torch.tensor(np.stack([source_ids, neighbor_ids.reshape(-1)]), dtype=torch.long)
    data = Data(x=torch.tensor(X, dtype=torch.float), edge_index=edge_index, y=torch.tensor(y, dtype=torch.long))

    # Stratified 80/20 node split expressed as boolean masks
    train_idx, test_idx = train_test_split(range(len(y)), test_size=0.2, stratify=y, random_state=42)
    data.train_mask = torch.zeros(len(y), dtype=torch.bool)
    data.test_mask = torch.zeros(len(y), dtype=torch.bool)
    data.train_mask[train_idx] = True
    data.test_mask[test_idx] = True

    # GNNWithLSTM model, inverse-frequency class weights
    model = GNNWithLSTM(X.shape[1], 128, 3)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    class_weights = 1.0 / torch.tensor(np.bincount(y), dtype=torch.float)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    # NOTE(review): train_model is not given this scheduler and nothing here
    # calls scheduler.step(), so it currently has no effect on the learning rate.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5)
    train_model(data, model, optimizer, criterion, num_epochs=200)
    evaluate_model(data, model, label_encoder, model_name="GNNWithLSTM")

    # Create and display the evaluation matrix
    evaluation_matrix = create_evaluation_matrix(all_metrics)
    print("\nEvaluation Matrix:")
    print(evaluation_matrix)

    # Optionally, save to CSV
    evaluation_matrix.to_csv('/content/drive/MyDrive/FYP_Dataset/evaluation_matrix.csv')

Initial rows: 711705
Columns in the dataset: ['AskPrice1', 'AskSize1', 'BidPrice1', 'BidSize1', 'AskPrice2', 'AskSize2', 'BidPrice2', 'BidSize2', 'AskPrice3', 'AskSize3', 'BidPrice3', 'BidSize3', 'AskPrice4', 'AskSize4', 'BidPrice4', 'BidSize4', 'AskPrice5', 'AskSize5', 'BidPrice5', 'BidSize5', 'MidPrice', 'Time', 'MidPriceChange', 'Spread', 'Imbalance1', 'Imbalance2', 'Imbalance3', 'Imbalance4', 'Imbalance5', 'AskPriceDiff1', 'BidPriceDiff1', 'AskPriceDiff2', 'BidPriceDiff2', 'AskPriceDiff3', 'BidPriceDiff3', 'AskPriceDiff4', 'BidPriceDiff4', 'MidPriceChange_Lag1', 'MidPriceChange_Lag5', 'MidPriceChange_Lag10', 'EventCount_Type1', 'EventCount_Type2', 'EventCount_Type3', 'EventCount_Type4', 'EventCount_Type5', 'MidPrice_Volatility_10', 'CumulativeOrderFlow', 'MidPrice_MA10', 'AskSize1_MA10', 'BidSize1_MA10', 'TimeWeightedImbalance1', 'EventIntensity', 'AskPrice1_Relative', 'BidPrice1_Relative', 'AskPrice2_Relative', 'BidPrice2_Relative', 'AskPrice3_Relative', 'BidPrice3_Relative', 'Ask

In [None]:
def evaluate_model(data, model, label_encoder, model_name="GNN"):
    """Score ``model`` on the train and test masks and record the results.

    This definition replaces any earlier ``evaluate_model`` in the notebook, so
    it accepts the same optional ``model_name`` argument; with the default
    value the printed headings and ``all_metrics`` keys are 'GNN (Train)' /
    'GNN (Test)' and 'GNN_Train' / 'GNN_Test'.

    Args:
        data: Graph object with ``y``, ``train_mask`` and ``test_mask``.
        model: Module called as ``model(data)``; predictions are the argmax
            over dimension 1 of its output.
        label_encoder: Forwarded to ``compute_metrics``.
        model_name (str): Prefix for printed headings and ``all_metrics`` keys
            (default: "GNN").

    Returns:
        None. Results are stored in the global ``all_metrics``.
    """
    model.eval()
    with torch.no_grad():
        out = model(data)
        pred = out.argmax(dim=1)  # Predicted class indices

        # Convert to NumPy for sklearn metrics
        y_train_true = data.y[data.train_mask].cpu().numpy()
        y_train_pred = pred[data.train_mask].cpu().numpy()
        y_test_true = data.y[data.test_mask].cpu().numpy()
        y_test_pred = pred[data.test_mask].cpu().numpy()

        # Compute metrics for train and test sets
        train_metrics = compute_metrics(y_train_true, y_train_pred, label_encoder, model_name=f"{model_name} (Train)")
        test_metrics = compute_metrics(y_test_true, y_test_pred, label_encoder, model_name=f"{model_name} (Test)")

        # Store in all_metrics under keys unique to this model
        all_metrics[f'{model_name}_Train'] = train_metrics
        all_metrics[f'{model_name}_Test'] = test_metrics

In [None]:
# Step 5: Main Execution
if __name__ == "__main__":
    data_path = '/content/drive/MyDrive/FYP_Dataset/LOBSTER_SampleFile_AAPL_2012-06-21_5/AAPL_2012-06-21_balanced_dataset.csv'
    df, X, y = load_and_preprocess_data(data_path, sample_fraction=0.25)

    class_counts = np.bincount(y)
    if len(class_counts) != 3:
        raise ValueError(f"Error: Expected 3 classes, found {len(class_counts)}: {class_counts}")
    # bincount can still report a zero for class 0 or 1; that would turn the
    # inverse-frequency weights below into inf/NaN.
    if (class_counts == 0).any():
        raise ValueError(f"Error: Every class needs at least one sample, found counts: {class_counts}")

    # Inverse-frequency weights, normalised to sum to 1. (The former "cap at
    # 1.0" step was removed: after normalisation no weight can exceed 1.)
    class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float)
    class_weights = class_weights / class_weights.sum()  # Normalize
    print("Class counts:", class_counts)
    print("Class weights:", class_weights)

    # Per-class metric names use the integer class indices
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.array([0, 1, 2])

    # Time-based kNN graph, then a stratified 80/20 node split (stratified for
    # consistency with the other experiment cells in this notebook)
    data = construct_graph_knn(df, X, y, k=50)
    train_idx, test_idx = train_test_split(range(len(y)), test_size=0.2, stratify=y, random_state=42)
    data.train_mask = torch.zeros(len(y), dtype=torch.bool)
    data.test_mask = torch.zeros(len(y), dtype=torch.bool)
    data.train_mask[train_idx] = True
    data.test_mask[test_idx] = True

    input_dim = X.shape[1]
    hidden_dim = 64
    output_dim = 3
    model = GNNModel(input_dim, hidden_dim, output_dim)  # Updated model

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    train_model(data, model, optimizer, criterion, num_epochs=100)
    evaluate_model(data, model, label_encoder)

Initial rows: 711705
Columns in the dataset: ['AskPrice1', 'AskSize1', 'BidPrice1', 'BidSize1', 'AskPrice2', 'AskSize2', 'BidPrice2', 'BidSize2', 'AskPrice3', 'AskSize3', 'BidPrice3', 'BidSize3', 'AskPrice4', 'AskSize4', 'BidPrice4', 'BidSize4', 'AskPrice5', 'AskSize5', 'BidPrice5', 'BidSize5', 'MidPrice', 'Time', 'MidPriceChange', 'Spread', 'Imbalance1', 'Imbalance2', 'Imbalance3', 'Imbalance4', 'Imbalance5', 'AskPriceDiff1', 'BidPriceDiff1', 'AskPriceDiff2', 'BidPriceDiff2', 'AskPriceDiff3', 'BidPriceDiff3', 'AskPriceDiff4', 'BidPriceDiff4', 'MidPriceChange_Lag1', 'MidPriceChange_Lag5', 'MidPriceChange_Lag10', 'EventCount_Type1', 'EventCount_Type2', 'EventCount_Type3', 'EventCount_Type4', 'EventCount_Type5', 'MidPrice_Volatility_10', 'CumulativeOrderFlow', 'MidPrice_MA10', 'AskSize1_MA10', 'BidSize1_MA10', 'TimeWeightedImbalance1', 'EventIntensity', 'AskPrice1_Relative', 'BidPrice1_Relative', 'AskPrice2_Relative', 'BidPrice2_Relative', 'AskPrice3_Relative', 'BidPrice3_Relative', 'Ask

In [None]:
# Load balanced dataset
data_path = "/content/drive/MyDrive/FYP_Dataset/LOBSTER_SampleFile_AAPL_2012-06-21_5/AAPL_2012-06-21_balanced_dataset.csv"
df, X, y = load_and_preprocess_data(data_path, sample_fraction=0.1)  # Smaller sample for speed

# Enhanced graph construction: 20 nearest neighbours in the standardised
# (Time, MidPriceChange_Cumsum10, Imbalance1) space
feature_cols = ['Time', 'MidPriceChange_Cumsum10', 'Imbalance1']
combined = StandardScaler().fit_transform(df[feature_cols])
nbrs = NearestNeighbors(n_neighbors=21).fit(combined)
distances, indices = nbrs.kneighbors(combined)

# Directed edges i -> each neighbour in columns 1..20 (column 0 is skipped as
# the query point itself). Built with array operations rather than a Python
# list of pairs; the resulting edges and their order are the same.
neighbor_ids = indices[:, 1:]
source_ids = np.repeat(np.arange(len(df)), neighbor_ids.shape[1])
edge_index = torch.tensor(np.stack([source_ids, neighbor_ids.reshape(-1)]), dtype=torch.long)
data = Data(x=torch.tensor(X, dtype=torch.float), edge_index=edge_index, y=torch.tensor(y, dtype=torch.long))

# Split: stratified 80/20 over nodes, expressed as boolean masks
train_idx, test_idx = train_test_split(range(len(y)), test_size=0.2, stratify=y, random_state=42)
data.train_mask = torch.zeros(len(y), dtype=torch.bool)
data.train_mask[train_idx] = True
data.test_mask = torch.zeros(len(y), dtype=torch.bool)
data.test_mask[test_idx] = True

# Model with LSTM
class GNNWithLSTM(nn.Module):
    """LSTM feature extractor feeding a two-layer GCN.

    The node-feature matrix gets a leading dimension of size 1 and is run
    through a ``batch_first`` LSTM, i.e. for 2-D ``data.x`` the rows are
    consumed as one sequence in stored order. The per-row LSTM outputs then go
    through GCNConv -> ReLU -> dropout(p=0.3) -> GCNConv.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Build the LSTM (input_dim -> hidden_dim) and both graph convolutions."""
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.conv1 = GCNConv(hidden_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return the second graph convolution's output for ``data``."""
        graph_edges = data.edge_index
        # Leading size-1 dimension in, LSTM, leading dimension out
        lstm_out, _ = self.lstm(data.x.unsqueeze(0))
        features = lstm_out.squeeze(0)
        activated = F.relu(self.conv1(features, graph_edges))
        regularised = F.dropout(activated, p=0.3, training=self.training)
        return self.conv2(regularised, graph_edges)

# Train the LSTM+GCN model on the graph built above (hidden size 128, 3 output classes)
model = GNNWithLSTM(X.shape[1], 128, 3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Inverse class-frequency weights (not normalised)
class_weights = 1.0 / torch.tensor(np.bincount(y), dtype=torch.float)
criterion = nn.CrossEntropyLoss(weight=class_weights)
# NOTE(review): this scheduler is not passed to train_model and scheduler.step()
# is never called in this cell, so it does not change the learning rate.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5)
train_model(data, model, optimizer, criterion, num_epochs=200)
# NOTE(review): no model name is supplied, so the results are stored under the
# 'GNN_Train' / 'GNN_Test' keys of all_metrics - confirm this is not meant to
# be kept separate from the plain GNN model's metrics.
evaluate_model(data, model, LabelEncoder().fit([0, 1, 2]))

Initial rows: 711705
Columns in the dataset: ['AskPrice1', 'AskSize1', 'BidPrice1', 'BidSize1', 'AskPrice2', 'AskSize2', 'BidPrice2', 'BidSize2', 'AskPrice3', 'AskSize3', 'BidPrice3', 'BidSize3', 'AskPrice4', 'AskSize4', 'BidPrice4', 'BidSize4', 'AskPrice5', 'AskSize5', 'BidPrice5', 'BidSize5', 'MidPrice', 'Time', 'MidPriceChange', 'Spread', 'Imbalance1', 'Imbalance2', 'Imbalance3', 'Imbalance4', 'Imbalance5', 'AskPriceDiff1', 'BidPriceDiff1', 'AskPriceDiff2', 'BidPriceDiff2', 'AskPriceDiff3', 'BidPriceDiff3', 'AskPriceDiff4', 'BidPriceDiff4', 'MidPriceChange_Lag1', 'MidPriceChange_Lag5', 'MidPriceChange_Lag10', 'EventCount_Type1', 'EventCount_Type2', 'EventCount_Type3', 'EventCount_Type4', 'EventCount_Type5', 'MidPrice_Volatility_10', 'CumulativeOrderFlow', 'MidPrice_MA10', 'AskSize1_MA10', 'BidSize1_MA10', 'TimeWeightedImbalance1', 'EventIntensity', 'AskPrice1_Relative', 'BidPrice1_Relative', 'AskPrice2_Relative', 'BidPrice2_Relative', 'AskPrice3_Relative', 'BidPrice3_Relative', 'Ask