# Dataset Construction

In [9]:
import numpy as np
import pandas as pd
from torch_geometric_temporal.signal import StaticGraphTemporalSignal
import torch

class StaticSP500DatasetLoader(object):
    def __init__(self, correlation_type='pearsons'):
        self._read_csv(correlation_type)

    def _read_csv(self, correlation_type):
        self._correlation_matrix = np.fromfile(f's&p500_{correlation_type}.csv', sep=',')
        N = int(np.sqrt(len(self._correlation_matrix)))
        self._correlation_matrix = self._correlation_matrix.reshape(N, N)
        _correlation_threshold = 0.9
        self._correlation_matrix[self._correlation_matrix < _correlation_threshold] = 0
        
        df = pd.read_csv('s&p500.csv')
        df = df.set_index('Date')
        data = torch.from_numpy(df.to_numpy()).to(torch.float32)
        ratios = torch.zeros_like(data)
        # calculate daily return ratio
        for d in range(1, data.size(0)):
            ratios[d-1] = (data[d] - data[d-1]) / data[d-1]
        # skip the first day which cannot calculate daily return ratio
        # and round data size to nearest multiple of batch_size
        days_in_quarter = 64
        num_quarters = data.size(0) // days_in_quarter
        num_days = num_quarters * days_in_quarter
        ratios = ratios[:num_days]
        self._dataset = ratios.numpy()

    def _get_edges(self):
        self._edges = np.array(self._correlation_matrix.nonzero())

    def _get_edge_weights(self):
        self._edge_weights = self._correlation_matrix[self._correlation_matrix > 0]

    def _get_targets_and_features(self):
        stacked_target = self._dataset
        self.features = [
            stacked_target[i : i + self.lags, :].T
            for i in range(stacked_target.shape[0] - self.lags)
        ]
        self.targets = [
            stacked_target[i + self.lags, :].T
            for i in range(stacked_target.shape[0] - self.lags)
        ]

    def get_dataset(self, lags: int) -> StaticGraphTemporalSignal:
        """Returning the data iterator.

        Args types:
            * **lags** *(int)* - The number of time lags.
        Return types:
            * **dataset** *(StaticGraphTemporalSignal)*
        """
        self.lags = lags
        self._get_edges()
        self._get_edge_weights()
        self._get_targets_and_features()
        dataset = StaticGraphTemporalSignal(
            self._edges, self._edge_weights, self.features, self.targets
        )
        return dataset

In [11]:
from torch_geometric_temporal.signal import temporal_signal_split
import torch_geometric

device = 'cpu'

loader = StaticSP500DatasetLoader()

lags = 64

dataset = loader.get_dataset(lags)

train_dataset, test_dataset = temporal_signal_split(dataset, train_ratio=0.8)

# RGCN

In [12]:
import torch
import torch.nn.functional as F
from torch_geometric_temporal.nn.recurrent import DCRNN

class RecurrentGCN(torch.nn.Module):
    def __init__(self, node_features):
        super(RecurrentGCN, self).__init__()
        self.recurrent = DCRNN(node_features, 32, 1)
        self.linear = torch.nn.Linear(32, 1)

    def forward(self, x, edge_index, edge_weight):
        h = self.recurrent(x, edge_index, edge_weight)
        h = F.relu(h)
        h = self.linear(h)
        return h

In [14]:
from tqdm import tqdm

model = RecurrentGCN(node_features = lags).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

model.train()

for epoch in tqdm(range(100)):
    cost = 0
    for time, snapshot in enumerate(train_dataset):
        y_hat = model(snapshot.x.to(device), snapshot.edge_index.to(device), snapshot.edge_attr.to(device))
        cost = cost + torch.mean((y_hat-snapshot.y.to(device))**2)
    cost = cost / (time+1)
    print(f'Epoch {epoch}, MSE: {cost.item()}')
    cost.backward()
    optimizer.step()
    optimizer.zero_grad()

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch 0, MSE: 0.006118872202932835


  0%|          | 1/200 [00:07<25:39,  7.74s/it]

Epoch 1, MSE: 0.003357746172696352


  1%|          | 2/200 [00:14<24:31,  7.43s/it]

Epoch 2, MSE: 0.001397061045281589


  2%|▏         | 3/200 [00:22<24:06,  7.34s/it]

Epoch 3, MSE: 0.000503134448081255


  2%|▏         | 4/200 [00:29<23:46,  7.28s/it]

Epoch 4, MSE: 0.0007674872758798301


  2%|▎         | 5/200 [00:36<23:35,  7.26s/it]

Epoch 5, MSE: 0.0014428674476221204


  3%|▎         | 6/200 [00:43<23:29,  7.26s/it]

Epoch 6, MSE: 0.0016559084178879857


  4%|▎         | 7/200 [00:51<23:16,  7.24s/it]

Epoch 7, MSE: 0.0014035552740097046


  4%|▍         | 8/200 [00:58<23:04,  7.21s/it]

Epoch 8, MSE: 0.0009823220316320658


  4%|▍         | 9/200 [01:05<22:58,  7.22s/it]

Epoch 9, MSE: 0.0006320746033452451


  5%|▌         | 10/200 [01:12<22:50,  7.22s/it]

Epoch 10, MSE: 0.00045646927901543677


  6%|▌         | 11/200 [01:19<22:37,  7.18s/it]

Epoch 11, MSE: 0.0004459126212168485


  6%|▌         | 12/200 [01:27<22:36,  7.21s/it]

Epoch 12, MSE: 0.0005333911976777017


  6%|▋         | 13/200 [01:34<22:25,  7.19s/it]

Epoch 13, MSE: 0.000645472202450037


  7%|▋         | 14/200 [01:41<22:19,  7.20s/it]

Epoch 14, MSE: 0.0007302474696189165


  8%|▊         | 15/200 [01:48<22:10,  7.19s/it]

Epoch 15, MSE: 0.0007636560476385057


  8%|▊         | 16/200 [01:55<22:02,  7.18s/it]

Epoch 16, MSE: 0.0007442952482961118


  8%|▊         | 17/200 [02:02<21:53,  7.18s/it]

Epoch 17, MSE: 0.0006852075457572937


  9%|▉         | 18/200 [02:10<21:47,  7.18s/it]

Epoch 18, MSE: 0.0006064282497391105


 10%|▉         | 19/200 [02:17<21:44,  7.21s/it]

Epoch 19, MSE: 0.0005290175904519856


 10%|█         | 20/200 [02:24<21:38,  7.21s/it]

Epoch 20, MSE: 0.00047033329610712826


 10%|█         | 21/200 [02:36<22:15,  7.46s/it]


KeyboardInterrupt: 

In [None]:
model.eval()
cost = 0
for time, snapshot in enumerate(test_dataset):
    y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
    cost = cost + torch.mean((y_hat-snapshot.y)**2)
cost = cost / (time+1)
cost = cost.item()
print("MSE: {:.4f}".format(cost))