<a href="https://colab.research.google.com/github/ArturAzarskyy/CSC413-Stock-Prediction/blob/main/transformer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer model for Stock prediction

## Preparations:

Note that the torch dataset and sampler were inspired by `yousefnami`'s article
[Reading .h5 Files Faster with PyTorch Datasets](https://towardsdatascience.com/reading-h5-files-faster-with-pytorch-datasets-3ff86938cc)

### Getting the pre-processed data from Google Drive

In [1]:
# Mount Google Drive under /amd/ so the data archives stored there can be
# copied into the Colab runtime by the cells below.
from google.colab import drive
drive.mount('/amd/')

Mounted at /amd/


In [2]:
# Selects which archive the next cell copies from Drive:
# True -> sp_data_orig_m_avg.zip, False -> sp_data_orig.zip.
load_mvg_avg_f = False

In [3]:
if load_mvg_avg_f:
    !cp /amd/My\ Drive/CSC413/Data/sp_data_orig_m_avg.zip /content/
    !unzip sp_data_orig.zip
else:
    !cp /amd/My\ Drive/CSC413/Data/sp_data_orig.zip /content/
    !unzip sp_data_orig.zip

Archive:  sp_data_orig.zip
  inflating: test_data.hdf5          
  inflating: train_data.hdf5         
  inflating: val_data.hdf5           


Imports:

In [4]:
from torch.utils.data import Dataset, DataLoader, Sampler, BatchSampler
from torchvision.transforms import Compose
import tables
import torch as ty
import torch.nn as nn
import os.path
import numpy as np
import time
# Sequence length used by the model code; the Encoder class reads this global
# directly. The loader output below shows samples shaped [batch, 128, 5].
seq_len = 128

### Creating a custom dataset for pytorch

In [5]:
class StockDataset(Dataset):
    """PyTorch dataset backed by an HDF5 file with ``data`` and ``labels`` nodes.

    The dataset supports two access patterns:

    * an ``int`` index returns a single ``(X, y)`` sample (default
      ``DataLoader`` batching collates them afterwards);
    * a ``list`` of indices returns a whole batch read in one call.  The
      labels of such a batch are flattened to 1-D, and when ``shuffle`` is
      true the rows of the batch are permuted (features and labels together).

    Args:
        file_name: Path of the HDF5 file to open read-only.
        shuffle: Whether list-index reads permute the rows of the batch.

    Raises:
        ValueError: If the file has no ``data`` or no ``labels`` node.
    """

    def __init__(self, file_name, shuffle=True):
        super().__init__()
        hdf5_file = tables.open_file(file_name, mode='r')
        # Explicit validation: asserts are stripped under ``python -O`` and
        # would also leave the file handle open on failure.
        missing = [node for node in ('data', 'labels')
                   if node not in hdf5_file.root]
        if missing:
            hdf5_file.close()
            raise ValueError(
                f"{file_name} is missing required node(s): {', '.join(missing)}")
        # Keep the handle so the file can be released through close().
        self._file = hdf5_file
        self.f_name = file_name
        self.data = hdf5_file.root.data
        # (sic) attribute name kept as-is for backward compatibility.
        self.lables = hdf5_file.root.labels
        self.size = self.data.shape[0]
        self.shuffle = shuffle
        self.trans_data = Compose([self._from_numpy])
        self.trans_labels = Compose([self._from_numpy, self._prepare_class_task])

    def __getitem__(self, index):
        """Return ``(X, y)`` float tensors for an int index or a list of indices."""
        X = np.array(self.data[index, :])
        y = np.array(self.lables[index])
        if isinstance(index, list):
            if self.shuffle:
                # One permutation applied to both arrays keeps samples and
                # labels aligned; a numpy index avoids relying on implicit
                # tensor-to-array conversion.
                permute = ty.randperm(len(index)).numpy()
                X = X[permute, :]
                y = y[permute]
            # Batched labels are flattened whether or not rows were shuffled.
            y = self.trans_labels(y)
        else:
            y = self.trans_data(y)
        X = self.trans_data(X)
        return X, y

    def _prepare_class_task(self, tensor):
        """Flatten a label tensor to one dimension."""
        return ty.reshape(tensor, (-1,))

    def _from_numpy(self, tensor):
        """Convert a numpy array to a float32 torch tensor."""
        return ty.from_numpy(tensor).float()

    def __len__(self):
        return self.size

    def close(self):
        """Close the underlying HDF5 file; the dataset must not be read afterwards."""
        handle = getattr(self, '_file', None)
        if handle is not None and handle.isopen:
            handle.close()

### Creating samplers and two ways of sampling

**TODO  --> check if we actually do need that**



Both the RandomBatchSampler and the loader factories are the same as the ones `yousefnami` used in his article
[Reading .h5 Files Faster with PyTorch Datasets](https://towardsdatascience.com/reading-h5-files-faster-with-pytorch-datasets-3ff86938cc)

In [6]:
class RandomBatchSampler(Sampler):
    """Yield dataset indices as contiguous, batch-sized runs in random batch order.

    The dataset is split into consecutive chunks of ``batch_size`` indices.
    Full chunks are visited in a random order drawn once at construction;
    indices inside a chunk stay consecutive.  If the dataset length is not a
    multiple of ``batch_size``, the leftover tail is always yielded last.

    Args:
        dataset: Any object supporting ``len()``.
        batch_size: Number of consecutive indices per chunk.
    """

    def __init__(self, dataset, batch_size):
        self.batch_size = batch_size
        self.dataset_length = len(dataset)
        # Kept as a float so a fractional part signals a partial last chunk.
        self.n_batches = self.dataset_length / self.batch_size
        self.batch_ids = ty.randperm(int(self.n_batches))

    def __len__(self):
        # A sampler's length is the number of indices it yields; wrappers
        # such as BatchSampler derive their own length from this value.
        return self.dataset_length

    def __iter__(self):
        for batch_id in self.batch_ids.tolist():
            start = batch_id * self.batch_size
            yield from range(start, start + self.batch_size)
        # Leftover indices (empty range when the length divides evenly).
        tail_start = int(self.n_batches) * self.batch_size
        yield from range(tail_start, self.dataset_length)

def normal_loader(dataset, batch_size=32, drop_last=False, shuffle=True):
    """Build a standard ``DataLoader`` that fetches samples one index at a time.

    Args:
        dataset: Dataset to read from.
        batch_size: Number of samples per batch.
        drop_last: Whether to drop a final, smaller batch.
        shuffle: Whether sample order is shuffled.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = {
        "batch_size": batch_size,
        "drop_last": drop_last,
        "shuffle": shuffle,
    }
    return DataLoader(dataset, **loader_options)
def fast_loader(dataset, batch_size=32, drop_last=False, transforms=None):
    """Build a ``DataLoader`` that asks the dataset for a whole batch per call.

    Indices from ``RandomBatchSampler`` are grouped by ``BatchSampler`` into
    lists of ``batch_size``; with ``batch_size=None`` the ``DataLoader`` does
    no batching of its own and hands each list straight to the dataset.

    Args:
        dataset: Dataset whose ``__getitem__`` accepts a list of indices.
        batch_size: Number of indices per batch.
        drop_last: Whether to drop a final, smaller batch.
        transforms: Accepted but not used.

    Returns:
        The configured ``DataLoader``.
    """
    index_stream = RandomBatchSampler(dataset, batch_size)
    batched_indices = BatchSampler(index_stream,
                                   batch_size=batch_size,
                                   drop_last=drop_last)
    return DataLoader(dataset, batch_size=None, sampler=batched_indices)

In [7]:
# Open the training split that the unzip step extracted.
train_data = StockDataset("train_data.hdf5")

In [8]:
# Build both loader flavours over the same dataset (default batch size 32)
# so the timing cells below can compare them.
train_loader = normal_loader(train_data)
train_loader_f = fast_loader(train_data)

In [9]:
# Time how long the batched ("fast") loader takes to deliver its first batch.
start_load = time.time()
for i, (X,y) in enumerate(train_loader_f):
    end_load = time.time()
    print(i, X.shape, y.shape)
    break  # only the first batch is timed
print( f'Time taken: load({end_load - start_load:.3g}), ')

0 torch.Size([32, 128, 5]) torch.Size([32])
Time taken: load(0.896), 


In [10]:

# Same measurement for the standard per-sample loader.
start_load = time.time()
for i, (X,y) in enumerate(train_loader):
    end_load = time.time()
    print(i, X.shape, y.shape)
    break  # only the first batch is timed
print( f'Time taken: load({end_load - start_load:.3g}), ')

0 torch.Size([32, 128, 5]) torch.Size([32])
Time taken: load(1.53), 


In [39]:
# Batch size shared by the train/validation/test loaders below.
batch_size = 128

# One dataset per split, each opening its own HDF5 file.
train_data = StockDataset("train_data.hdf5")
val_data = StockDataset("val_data.hdf5")
test_data = StockDataset("test_data.hdf5")

# The standard loaders are the ones in use; the batched ("fast") variants
# are left commented out as alternatives.
train_loader = normal_loader(train_data, batch_size=batch_size)
# train_loader_f = fast_loader(train_data, batch_size=batch_size)
val_loader = normal_loader(val_data, batch_size=batch_size)
# val_loader_f = fast_loader(val_data, batch_size=batch_size)
test_loader = normal_loader(test_data, batch_size=batch_size)
# test_loader_f = fast_loader(test_data, batch_size=batch_size)

## Model 

**Time2vec**

In order to consider both periodic and non-periodic patterns & time rescaling invariance (representation not affected by different time units), we use time2vec.

Idea:
* Initially use a linear function for first iteration
* Call upon a function of the linear function (sin) for every other iteration

In [12]:
use_cuda = True  # set this variable if you want to use cuda
# Always resolve to a concrete device, so every later `.to(device)` and
# `device=device` gets an explicit target even without CUDA.
device = ty.device('cuda:0' if use_cuda and ty.cuda.is_available() else 'cpu')

In [13]:
def time2vec(tau, w0, b0, w1, b1):
    """Compute a periodic and a linear projection of ``tau``, joined on dim 1.

    Args:
        tau: Input tensor; presumably the per-step feature average of a
            [32, 128, 5] batch -- TODO confirm.
        w0, b0: Weight and bias of the periodic (sine) component.
        w1, b1: Weight and bias of the linear component.

    Returns:
        ``cat([sin(tau @ w0 + b0), tau @ w1 + b1], dim=1)``.
    """
    # Sine is used because it outperforms the other candidate functions.
    periodic_part = ty.sin(tau @ w0 + b0)
    linear_part = tau @ w1 + b1
    return ty.cat((periodic_part, linear_part), dim=1)

In [14]:
class Time2VecTest(nn.Module):
    """Time embedding producing one linear and one periodic feature per step.

    Args:
        seq_len: Size of the sequence dimension; both projections map
            ``seq_len`` inputs to ``seq_len`` outputs.
        device: Device the layers are created on and inputs are moved to.
    """

    def __init__(self, seq_len, device):
        super().__init__()
        self.seq_len = seq_len
        self.device = device
        self.time_lin_weight = nn.Linear(seq_len, seq_len, device=device)
        self.time_periodic_weight = nn.Linear(seq_len, seq_len, device=device)

    def forward(self, x):
        """Map ``x`` of shape (..., seq_len, features) to (..., seq_len, 2)."""
        # Collapse the feature axis so each time step is a single value.
        averaged = x.to(self.device).mean(dim=-1)
        linear_feat = self.time_lin_weight(averaged)
        periodic_feat = ty.sin(self.time_periodic_weight(averaged))
        # Stacking on a new last axis puts the linear feature first.
        return ty.stack((linear_feat, periodic_feat), dim=-1)


### Our manually created model


Note that our model is a mix of the classic definition of the transformer discussed in class and some modifications taken from the paper Attention Is All You Need and [Jan Schmitz's implementation](https://towardsdatascience.com/stock-predictions-with-state-of-the-art-transformer-and-time-embeddings-3a4485237de6#:~:text=A%20Transformer%20is%20a%20neural,and%20Multi%2DHead%20Attention%20layer). Unlike his implementation, we use PyTorch and write our own training functionality.

In [17]:
# Model sizes handed to the Decoder constructor in the smoke-test cell below.
dim_cases = 128  # passed as dim_keys (query/key projection width)
dim_vals = 128  # passed as dim_val (value projection width)
filter_dim = 128  # passed as filter_dim (hidden channels of the 1x1 convolutions)
class OneHead(nn.Module):
    """Single scaled dot-product self-attention head.

    Args:
        dim_keys: Output width of the query and key projections.
        dim_val: Output width of the value projection.
        device: Device the layers are created on and inputs are moved to.
        in_features: Size of the last input dimension.  Defaults to 7 =
            5 (open, close, high, low, volume) + 2 (time vec).
    """

    def __init__(self, dim_keys, dim_val, device, in_features=7):
        super().__init__()
        self.dim_keys = dim_keys
        self.dim_val = dim_val
        self.device = device
        self.in_features = in_features
        self.query = nn.Linear(in_features, dim_keys, device=device)
        self.keys = nn.Linear(in_features, dim_keys, device=device)
        self.values = nn.Linear(in_features, dim_val, device=device)
        self.softmax = nn.Softmax(-1)

    def forward(self, x):
        """Attend over the sequence axis.

        Expects ``x`` of shape [batch_size, seq_len, in_features]; returns a
        tensor of shape [batch_size, seq_len, dim_val].
        """
        x = x.to(self.device)
        q = self.query(x)
        k = self.keys(x)
        v = self.values(x)
        # Scale by sqrt(dim_keys) before the softmax, as in scaled
        # dot-product attention.
        scores = (q @ ty.transpose(k, 1, 2)) / (self.dim_keys ** 0.5)
        attention = self.softmax(scores)
        return attention @ v

In [16]:

class MultiHead(nn.Module):
    """Multi-head self-attention: ``n_heads`` ``OneHead`` modules plus a mixing layer.

    Each head's output (width ``dim_val``) is concatenated on the last axis
    and projected back to 7 features by ``lin_final_attention``.

    Args:
        dim_keys: Query/key projection width of every head.
        dim_val: Value projection width of every head.
        n_heads: Number of attention heads.
        device: Device the layers are created on and inputs are moved to.
    """

    def __init__(self, dim_keys, dim_val, n_heads, device):
        super().__init__()
        self.dim_keys = dim_keys
        self.dim_val = dim_val
        self.device = device
        self.n_heads = n_heads
        self.lin_dim = n_heads * dim_val
        # nn.ModuleList (not a plain list) registers the heads as submodules,
        # so their weights appear in parameters() / state_dict() and follow
        # .to(), .train() and .eval().
        self.multi_head = nn.ModuleList(
            OneHead(dim_keys, dim_val, device) for _ in range(n_heads))
        self.lin_final_attention = nn.Linear(self.lin_dim, 7,  device=device)

    def forward(self, x):
        """Run every head on ``x`` and mix the concatenated outputs down to 7 features."""
        x = x.to(self.device)
        attn = [head(x) for head in self.multi_head]
        concat_attn = ty.concat(attn, -1)
        mult_attn = self.lin_final_attention(concat_attn)
        return mult_attn

In [29]:
class Encoder(nn.Module):
    """One encoder layer: multi-head attention, then a 1x1-convolution feed-forward.

    Both sub-layers are followed by dropout, a residual connection to the
    layer input and an ``InstanceNorm1d``.

    Args:
        dim_keys: Query/key projection width of the attention heads.
        dim_val: Value projection width of the attention heads.
        n_heads: Number of attention heads.
        filter_dim: Hidden channel count of the feed-forward convolutions.
        device: Device the layers are created on and inputs are moved to.
        dropout: Dropout probability used after both sub-layers.

    Note:
        The normalisation layers are sized with the module-level ``seq_len``.
    """

    def __init__(self, dim_keys, dim_val, n_heads,
                 filter_dim, device, dropout):
        super().__init__()
        self.dim_keys = dim_keys
        self.dim_val = dim_val
        self.device = device
        self.n_heads = n_heads
        self.multi_hed = MultiHead(dim_keys, dim_val, n_heads, device)
        self.drop_out = nn.Dropout(dropout)
        self.normilize = nn.InstanceNorm1d(seq_len, eps=1e-6)
        # Create the convolutions on `device`, like every other layer here,
        # so their weights share a device with the tensors they receive.
        self.f1 = nn.Conv1d(7, filter_dim, 1, device=device)
        self.relu = nn.ReLU()
        self.f2 = nn.Conv1d(filter_dim, 7, 1, device=device)
        self.f_drop_out = nn.Dropout(dropout)
        self.f_norm = nn.InstanceNorm1d(seq_len, eps=1e-6)

    def forward(self, x):
        """Transform ``x`` (assumed [batch, seq_len, 7]) into a tensor of the same shape."""
        x = x.to(self.device)
        attentions = self.multi_hed(x)
        attentions = self.drop_out(attentions)
        attentions = self.normilize(attentions + x)
        # Conv1d convolves over the last axis with channels on axis 1, so the
        # 7 features are moved to the channel position and back afterwards.
        attentions = attentions.permute(0, 2, 1)
        filtered = self.relu(self.f1(attentions))
        filtered = self.f2(filtered)
        filtered = filtered.permute(0, 2, 1)
        filtered = self.f_drop_out(filtered)
        # The second residual also adds the layer input `x`.
        filtered = self.f_norm(filtered + x)
        return filtered

In [40]:
class Decoder(nn.Module):
    """Full model: time embedding, three encoder layers and a regression head.

    Despite its name, this module wires the whole network together.  The
    input is extended with the two ``Time2VecTest`` features, passed through
    three ``Encoder`` layers, average-pooled over the feature axis and mapped
    to a single output per sample by two linear layers.

    Args:
        dim_keys: Query/key projection width of the attention heads.
        dim_val: Value projection width of the attention heads.
        seq_len: Sequence length of the input.
        hidden: Width of the hidden linear layer in the head.
        n_heads: Number of attention heads per encoder layer.
        filter_dim: Hidden channel count of the encoder feed-forward part.
        device: Device the layers are created on and inputs are moved to.
        dropout: Dropout probability inside the encoder layers (the head
            uses a fixed 0.1).
    """

    def __init__(self, dim_keys, dim_val, seq_len, hidden,
                 n_heads, filter_dim, device, dropout=0.1):
        super().__init__()
        # forward() reads self.device, so it must be stored here.
        self.device = device
        self.time_enc = Time2VecTest(seq_len, device)
        self.enc_1 = Encoder(dim_keys, dim_val, n_heads,
                             filter_dim, device, dropout)
        self.enc_2 = Encoder(dim_keys, dim_val, n_heads,
                             filter_dim, device, dropout)
        self.enc_3 = Encoder(dim_keys, dim_val, n_heads,
                             filter_dim, device, dropout)
        self.avg_pool = nn.AdaptiveAvgPool1d(1)

        self.drop_out = nn.Dropout(0.1)

        self.relu = nn.ReLU()
        # The head is created on `device` like the rest of the network.
        self.lin1 = nn.Linear(seq_len, hidden, device=device)
        self.lin2 = nn.Linear(hidden, 1, device=device)

    def forward(self, x):
        """Map ``x`` (assumed [batch, seq_len, 5]) to predictions of shape [batch, 1]."""
        x = x.to(self.device)
        time_vec = self.time_enc(x)
        x = ty.concat([x, time_vec], -1)
        x = self.enc_1(x)
        x = self.enc_2(x)
        x = self.enc_3(x)
        # Pool the feature axis down to size 1 and drop only that axis; a
        # bare squeeze() would also remove a batch dimension of size 1.
        x = self.avg_pool(x).squeeze(-1)
        x = self.drop_out(x)
        x = self.relu(self.lin1(x))
        x = self.drop_out(x)
        x = self.lin2(x)
        return x



In [37]:
# Smoke test: build the full model and push one training batch through it
# to check the output shape.
# time2vec_m = Time2VecTest(seq_len, device)
one_h = Decoder(dim_cases, dim_vals,seq_len, 64,  12, filter_dim, device)  # hidden=64, n_heads=12
# transformer_model = nn.Transformer(d_model=5, nhead=5, num_encoder_layers=12, batch_first=True )

for i, (X, y) in enumerate(train_loader):
    print(i, X.shape, y.shape)
    # x_new  = ty.concat([X, time2vec_m(X)], -1)
    # print(x_new.shape)
    print(one_h(X).shape)
    # transformer_model(x_new)
    break  # a single batch is enough for a shape check

0 torch.Size([32, 128, 5]) torch.Size([32])
torch.Size([32, 128, 7])
torch.Size([32, 128])
torch.Size([32, 1])


#### Training

In [None]:
# def get_acuracy(model, data):


In [None]:
def _mean_loss(model, loader, criterion):
    """Return the average per-batch loss of ``model`` over ``loader`` without gradients."""
    model.eval()
    total, n_batches = 0.0, 0
    with ty.no_grad():
        for X, y in loader:
            out = model(X).squeeze(-1)
            total += criterion(out, y.to(out.device)).item()
            n_batches += 1
    return total / n_batches if n_batches else float('nan')


def train(model, train_l, val_l, lr=5.0, epoches=20, criterion=None):
    """Train ``model`` with SGD, gradient clipping and a per-epoch LR decay.

    Args:
        model: Module to train; its output's trailing size-1 axis is dropped
            before the loss is computed.
        train_l: Iterable of ``(X, y)`` training batches.
        val_l: Iterable of ``(X, y)`` validation batches, evaluated after
            every epoch; pass ``None`` to skip validation.
        lr: Initial learning rate.
        epoches: Number of passes over ``train_l``.
        criterion: Loss callable ``criterion(output, target)``.  Defaults to
            ``nn.CrossEntropyLoss()``.

    Returns:
        None.  Progress is printed.
    """
    if criterion is None:
        # NOTE(review): the model emits one value per sample and the labels
        # are floats, so a cross-entropy over the batch axis is probably not
        # the intended objective -- confirm and pass e.g. nn.MSELoss() or
        # nn.BCEWithLogitsLoss() explicitly.
        criterion = nn.CrossEntropyLoss()
    optim = ty.optim.SGD(model.parameters(), lr=lr)
    # Multiply the learning rate by 0.95 after every epoch.
    scheduler = ty.optim.lr_scheduler.StepLR(optim, 1, gamma=0.95)
    log_interval = 200

    for j in range(epoches):
        model.train()
        total_loss = 0.0

        for i, (X, y) in enumerate(train_l):
            out = model(X).squeeze(-1)
            # The model may move X to another device; keep targets with it.
            loss = criterion(out, y.to(out.device))
            optim.zero_grad()
            loss.backward()
            # Clip after backward so the freshly computed gradients are
            # the ones being bounded.
            ty.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optim.step()
            total_loss += loss.item()
            if i % log_interval == 0 and i > 0:
                cur_lr = scheduler.get_last_lr()[0]
                cur_loss = total_loss / log_interval
                print(f"[Epoch {j+1:3d}] batch: {i} lr:{cur_lr:02.2f}, [Loss : {cur_loss:5.2f}]")
                total_loss = 0.0

        if val_l is not None:
            val_loss = _mean_loss(model, val_l, criterion)
            print(f"[Epoch {j+1:3d}] validation loss: {val_loss:5.2f}")
        scheduler.step()



### PyTorch Version of transformer 