# Dataset Construction

In [1]:
import numpy as np
import pandas as pd
from torch_geometric_temporal.signal import StaticGraphTemporalSignal
import torch

class StaticSP500DatasetLoader(object):
    """Builds a static-graph temporal signal from S&P 500 CSV files.

    The graph comes from a thresholded correlation matrix read from
    ``s&p500_<correlation_type>.csv``; the node signals come from the columns
    of ``s&p500.csv`` after z-score normalization.

    Args:
        correlation_type (str): Suffix selecting which correlation file to
            read (``s&p500_<correlation_type>.csv``).
    """

    def __init__(self, correlation_type='pearsons'):
        self._read_csv(correlation_type)

    def _read_csv(self, correlation_type):
        """Load the correlation matrix and the normalized time series.

        Side effects: sets ``self._correlation_matrix`` and ``self._dataset``
        and writes the normalized series to ``s&p500_z_scores.csv``.

        Raises:
            ValueError: If the correlation file does not hold a square matrix.
        """
        # The correlation file is read as one flat run of comma-separated
        # numbers and folded back into an N x N matrix.
        flat = np.fromfile(f's&p500_{correlation_type}.csv', sep=',')
        N = int(np.sqrt(len(flat)))
        if N * N != len(flat):
            raise ValueError(
                f'correlation file holds {len(flat)} values, which is not a square matrix'
            )
        self._correlation_matrix = flat.reshape(N, N)

        # Keep only strong correlations; everything below the threshold is
        # zeroed so that it does not become an edge.
        _correlation_threshold = 0.9
        self._correlation_matrix[self._correlation_matrix < _correlation_threshold] = 0

        df = pd.read_csv('s&p500.csv')
        df = df.set_index('Date')
        data = torch.from_numpy(df.to_numpy()).to(torch.float32)

        # Truncate (round down) to a whole number of 64-row blocks.
        days_in_quarter = 64
        num_quarters = data.size(0) // days_in_quarter
        num_days = num_quarters * days_in_quarter
        data = data[:num_days]

        # z-score normalization with training data following GERU: the mean and
        # std of each column are taken from the first 80% of the blocks only.
        train_days = int(0.8 * num_quarters) * days_in_quarter
        data = (data - data[:train_days].mean(dim=0)) / data[:train_days].std(dim=0)

        pd.DataFrame(data).to_csv('s&p500_z_scores.csv')

        self._dataset = data.numpy()

    def _edge_mask(self):
        """Boolean N x N mask of the matrix entries that form edges.

        Edges and edge weights must be selected with the same predicate.
        ``> 0`` is False for NaN, whereas ``nonzero()`` on the raw matrix
        would count NaN as an edge and leave it without a matching weight.
        """
        return self._correlation_matrix > 0

    def _get_edges(self):
        """Set ``self._edges`` to a (2, num_edges) array of (row, col) indices."""
        self._edges = np.array(self._edge_mask().nonzero())

    def _get_edge_weights(self):
        """Set ``self._edge_weights`` to the correlations of the kept edges.

        Values are in the same row-major order as ``self._edges``.
        """
        self._edge_weights = self._correlation_matrix[self._edge_mask()]

    def _get_targets_and_features(self):
        """Build sliding-window features and next-step targets.

        For every start row ``i`` the feature is the window of ``self.lags``
        consecutive rows transposed to (num_columns, lags), and the target is
        the row immediately after that window, shape (num_columns,).
        """
        stacked_target = self._dataset
        num_windows = stacked_target.shape[0] - self.lags
        self.features = [
            stacked_target[i : i + self.lags, :].T
            for i in range(num_windows)
        ]
        # predict next-day stock price directly
        self.targets = [
            stacked_target[i + self.lags, :]
            for i in range(num_windows)
        ]

    def get_dataset(self, lags: int) -> StaticGraphTemporalSignal:
        """Returning the data iterator.

        Args types:
            * **lags** *(int)* - The number of time lags.
        Return types:
            * **dataset** *(StaticGraphTemporalSignal)*
        """
        self.lags = lags
        self._get_edges()
        self._get_edge_weights()
        self._get_targets_and_features()
        dataset = StaticGraphTemporalSignal(
            self._edges, self._edge_weights, self.features, self.targets
        )
        return dataset

In [2]:
from torch_geometric_temporal.signal import temporal_signal_split

# Model and tensors stay on the CPU.
device = 'cpu'

# Number of past time steps each snapshot exposes as node features.
lags = 63

loader = StaticSP500DatasetLoader()
dataset = loader.get_dataset(lags)

# 80% of the snapshots go to training; the remaining 20% are then split in
# half into validation and test sets (80/10/10 overall).
train_dataset, test_val_dataset = temporal_signal_split(
    dataset,
    train_ratio=0.8,
)
val_dataset, test_dataset = temporal_signal_split(
    test_val_dataset,
    train_ratio=0.5,
)

# Evaluation

In [3]:
import math
from torcheval.metrics.functional import binary_f1_score

def mse(y_hats, ys):
    """Mean squared error between predictions and targets, as a 0-d tensor."""
    return torch.nn.functional.mse_loss(input=y_hats, target=ys, reduction='mean')

def rmse(y_hats, ys):
    """Root mean squared error between predictions and targets, as a Python float."""
    mean_squared = mse(y_hats, ys)
    return math.sqrt(mean_squared)

def mae(y_hats, ys):
    """Mean absolute error between predictions and targets, as a 0-d tensor."""
    return torch.nn.functional.l1_loss(input=y_hats, target=ys, reduction='mean')

def accuracy(y_hats, y_prevs, ys):
    """Fraction of entries whose predicted direction matches the actual one.

    An entry counts as "up" when its value is strictly greater than the
    corresponding entry of ``y_prevs``; ties count as "not up".
    """
    predicted_up = y_hats > y_prevs
    actual_up = ys > y_prevs
    num_agreeing = (predicted_up == actual_up).sum().item()
    return num_agreeing / y_hats.numel()

def f1(y_hats, y_prevs, ys):
    """Binary F1 score of the predicted "up" direction, as a Python float.

    An entry is labelled 1 when its value is strictly greater than the
    corresponding entry of ``y_prevs`` and 0 otherwise; predictions and
    ground truth are flattened to 1-D integer label vectors before scoring.
    """
    predicted_up = (y_hats > y_prevs).int().flatten()
    actual_up = (ys > y_prevs).int().flatten()
    # NOTE(review): threshold=1 presumably leaves the 0/1 predictions
    # unchanged - confirm against the torcheval documentation.
    score = binary_f1_score(predicted_up, actual_up, threshold=1)
    return score.item()


# RGCN

In [4]:
import torch
import torch.nn.functional as F
from torch_geometric_temporal.nn.recurrent import DCRNN

class RecurrentGCN(torch.nn.Module):
    """A DCRNN layer followed by ReLU and a linear read-out.

    The read-out maps the 32 hidden channels to a single value, so the
    output's last dimension has size 1.

    Args:
        node_features: Size of the input feature vector passed to DCRNN.
    """

    def __init__(self, node_features):
        super().__init__()
        hidden_channels = 32
        # Third positional argument is 1; presumably the DCRNN filter size K
        # (TODO confirm against the torch_geometric_temporal docs).
        self.recurrent = DCRNN(node_features, hidden_channels, 1)
        self.linear = torch.nn.Linear(hidden_channels, 1)

    def forward(self, x, edge_index, edge_weight):
        """Apply DCRNN, then ReLU, then the linear read-out."""
        hidden = self.recurrent(x, edge_index, edge_weight)
        activated = F.relu(hidden)
        return self.linear(activated)

In [5]:
from tqdm import tqdm
import wandb

model = RecurrentGCN(node_features = lags).to(device)

lr = 0.001
num_epochs = 50

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

track_with_wandb = True


def _run_snapshots(net, signal):
    """Run ``net`` on every snapshot of ``signal`` and stack the results.

    Returns:
        (y_hats, ys, y_prevs): stacked predictions, stacked targets, and the
        stacked last feature column of each snapshot (``snapshot.x[:, -1]``).
    """
    y_hats, ys, y_prevs = zip(*[
        (net(snapshot.x, snapshot.edge_index, snapshot.edge_attr), snapshot.y, snapshot.x[:, -1])
        for snapshot in signal
    ])
    # Drop only the trailing size-1 output axis produced by the linear
    # read-out. A bare squeeze() would also collapse a size-1 node or
    # snapshot axis and make the loss broadcast against mis-shaped targets.
    return torch.stack(y_hats).squeeze(-1), torch.stack(ys), torch.stack(y_prevs)


if track_with_wandb:
    wandb.init(project="cs224w-stock-market-prediction", config={
        "dataset": "S&P500",
        "learning_rate": lr,
        "epochs": num_epochs,
        "architecture": "DCRNN",
    })

for epoch in tqdm(range(num_epochs)):
    # One full-batch gradient step per epoch: the loss is the MSE over all
    # training snapshots at once.
    model.train()
    y_hats, ys, y_prevs = _run_snapshots(model, train_dataset)
    loss = mse(y_hats, ys)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if track_with_wandb:
        model.eval()
        with torch.no_grad():
            # Training metrics reuse the predictions from the step above
            # (made before the weight update).
            metrics = {"epoch": epoch,
                       "train/rmse": rmse(y_hats, ys),
                       "train/mae": mae(y_hats, ys),
                       "train/acc": accuracy(y_hats, y_prevs, ys),
                       "train/f1": f1(y_hats, y_prevs, ys)}
            y_hats, ys, y_prevs = _run_snapshots(model, val_dataset)
            metrics.update({"val/rmse": rmse(y_hats, ys),
                            "val/mae": mae(y_hats, ys),
                            "val/acc": accuracy(y_hats, y_prevs, ys),
                            "val/f1": f1(y_hats, y_prevs, ys)})
        # A single log call per epoch keeps train and validation metrics on
        # the same W&B step.
        wandb.log(metrics)


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkevinxli[0m. Use [1m`wandb login --relogin`[0m to force relogin


 50%|█████     | 25/50 [07:15<07:29, 17.98s/it]

In [28]:
# Final evaluation on the held-out test snapshots. The metrics are logged
# against the last value of `epoch` left behind by the training loop.
if track_with_wandb:
    model.eval()
    with torch.no_grad():
        y_hats, ys, y_prevs = zip(*[
            (model(snapshot.x, snapshot.edge_index, snapshot.edge_attr), snapshot.y, snapshot.x[:, -1])
            for snapshot in test_dataset
        ])
        # Drop only the trailing size-1 output axis. A bare squeeze() would
        # also collapse a size-1 node or snapshot axis and mis-align the
        # predictions with the targets.
        y_hats = torch.stack(y_hats).squeeze(-1)
        ys = torch.stack(ys)
        y_prevs = torch.stack(y_prevs)
        wandb.log({"epoch": epoch,
                "test/rmse": rmse(y_hats, ys),
                "test/mae": mae(y_hats, ys),
                "test/acc": accuracy(y_hats, y_prevs, ys),
                "test/f1": f1(y_hats, y_prevs, ys) })