<a href="https://colab.research.google.com/github/ArturAzarskyy/CSC413-Stock-Prediction/blob/main/transformer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer model for Stock prediction

## Preparations:

Note that the torch dataset and sampler were inspired by `yousefnami`'s article about
[Reading .h5 Files Faster with PyTorch Datasets](https://towardsdatascience.com/reading-h5-files-faster-with-pytorch-datasets-3ff86938cc)

### Getting the pre-processed data from Google Drive

In [9]:
# Mount Google Drive at /amd/ so the data archives stored there can be copied
# into the Colab runtime by the cells below.
from google.colab import drive
drive.mount('/amd/')

Drive already mounted at /amd/; to attempt to forcibly remount, call drive.mount("/amd/", force_remount=True).


In [10]:
# Dataset switch read by the copy/unzip cell below: True copies the
# sp_data_orig_m_avg.zip archive, False copies sp_data_orig.zip.
load_mvg_avg_f = False

In [11]:
if load_mvg_avg_f:
    !cp /amd/My\ Drive/CSC413/Data/sp_data_orig_m_avg.zip /content/
    !unzip sp_data_orig.zip
else:
    !cp /amd/My\ Drive/CSC413/Data/sp_data_orig.zip /content/
    !unzip sp_data_orig.zip

Archive:  sp_data_orig.zip
replace test_data.hdf5? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [4]:
import tables
import torch as ty
import os.path
import numpy as np

### Creating a custom dataset for pytorch

In [127]:
from torch.utils.data import Dataset, DataLoader, Sampler, BatchSampler
from torchvision.transforms import Compose
import tables


class StockDataset(Dataset):
    """Map-style dataset over the ``data`` and ``labels`` arrays of an HDF5 file.

    The file is opened read-only with PyTables and stays open until
    :meth:`close` is called; samples are read from it on every
    ``__getitem__`` call.

    Args:
        file_name: Path of the HDF5 file. Its root group must contain the
            nodes ``data`` and ``labels``.
        shuffle: If true, the rows of a *list-indexed* batch are permuted
            after they have been read. Integer indexing is unaffected.

    Attributes:
        f_name: The path passed as ``file_name``.
        data / labels: The PyTables nodes the samples are read from.
        lables: Alias of ``labels`` kept for backward compatibility with the
            original (misspelled) attribute name.
        size: Number of samples, i.e. ``data.shape[0]``.
        trans_data / trans_labels: Transform pipelines applied to the numpy
            arrays read from ``data`` and ``labels`` respectively.

    Raises:
        AssertionError: If ``data`` or ``labels`` is missing from the file.
    """

    def __init__(self, file_name, shuffle=True):
        super().__init__()
        hdf5_file = tables.open_file(file_name, mode='r')
        # Validate explicitly (a bare ``assert`` disappears under ``python -O``)
        # and close the handle again if the file is unusable.
        for node_name in ('data', 'labels'):
            if node_name not in hdf5_file.root:
                hdf5_file.close()
                raise AssertionError(
                    f"{file_name!r} has no '/{node_name}' node")
        self.f_name = file_name
        # Keep the handle so that close() can release it later.
        self._file = hdf5_file
        self.data = hdf5_file.root.data
        self.labels = hdf5_file.root.labels
        self.lables = self.labels  # backward-compatible alias (original typo)
        self.size = self.data.shape[0]
        self.shuffle = shuffle
        self.trans_data = Compose([self._from_numpy])
        self.trans_labels = Compose([self._from_numpy, self._prepare_class_task])

    def close(self):
        """Close the underlying HDF5 file; calling it more than once is safe."""
        file_handle = getattr(self, '_file', None)
        if file_handle is not None and file_handle.isopen:
            file_handle.close()

    def _prepare_class_task(self, tensor):
        """Flatten ``tensor`` to 1-D and cast it to ``LongTensor``."""
        return ty.reshape(tensor, (-1,)).type(ty.LongTensor)

    def __getitem__(self, index):
        """Return ``(X, y)`` for an integer index or a list of indices.

        With a list, all requested rows are fetched in a single read and,
        when ``self.shuffle`` is set, permuted identically in ``X`` and ``y``
        so that samples and labels stay aligned.
        """
        X = np.array(self.data[index, :])
        y = np.array(self.labels[index])
        if self.shuffle and isinstance(index, list):
            # Index the numpy arrays with a numpy permutation rather than a
            # torch tensor.
            permute = ty.randperm(len(index)).numpy()
            X = X[permute, :]
            y = y[permute]
        return self.trans_data(X), self.trans_labels(y)

    def _from_numpy(self, tensor):
        """Convert a numpy array to a float32 torch tensor."""
        return ty.from_numpy(tensor).float()

    def __len__(self):
        """Number of samples in the dataset."""
        return self.size

### Creating samplers and two ways of sampling

Both the `RandomBatchSampler` and the loader factories follow the ones `yousefnami` used in his article

[Reading .h5 Files Faster with PyTorch Datasets](https://towardsdatascience.com/reading-h5-files-faster-with-pytorch-datasets-3ff86938cc)

In [128]:
class RandomBatchSampler(Sampler):
    """Yield dataset indices in contiguous chunks, visiting the chunks in random order.

    The index range ``[0, len(dataset))`` is cut into consecutive chunks of
    ``batch_size`` indices. The full chunks are emitted in a shuffled order;
    the indices inside each chunk stay consecutive. Any leftover indices
    (when the length is not a multiple of ``batch_size``) are emitted last.

    Args:
        dataset: Any object supporting ``len()``.
        batch_size: Number of consecutive indices per chunk.

    Attributes:
        n_batches: ``len(dataset) / batch_size`` as a float (kept as in the
            original for backward compatibility).
        batch_ids: Permutation of the full-chunk ids.
    """

    def __init__(self, dataset, batch_size):
        self.batch_size = batch_size
        self.dataset_length = len(dataset)
        self.n_batches = self.dataset_length / self.batch_size
        # NOTE: the order is drawn once here, so every pass over the sampler
        # visits the chunks in the same order.
        self.batch_ids = ty.randperm(self.dataset_length // self.batch_size)

    def __len__(self):
        # A sampler's length is the number of indices it yields. Wrappers such
        # as BatchSampler derive their own length (the number of batches) from it.
        return self.dataset_length

    def __iter__(self):
        for batch_id in self.batch_ids.tolist():
            first = batch_id * self.batch_size
            yield from range(first, first + self.batch_size)
        # Leftover indices that do not fill a whole chunk; empty when the
        # dataset length is a multiple of batch_size.
        tail_start = (self.dataset_length // self.batch_size) * self.batch_size
        yield from range(tail_start, self.dataset_length)

def normal_loader(dataset, batch_size=32, drop_last=False, shuffle=True):
    """Build a plain ``DataLoader`` that fetches one sample per index.

    Args:
        dataset: Dataset to draw samples from.
        batch_size: Number of samples per batch.
        drop_last: Whether to drop a final batch smaller than ``batch_size``.
        shuffle: Whether the sample order is shuffled.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = {
        'batch_size': batch_size,
        'drop_last': drop_last,
        'shuffle': shuffle,
    }
    return DataLoader(dataset, **loader_options)
def fast_loader(dataset, batch_size=32, drop_last=False, transforms=None):
    """Build a ``DataLoader`` that requests a whole batch of indices at once.

    A ``RandomBatchSampler`` produces the indices and a ``BatchSampler``
    groups them into lists of ``batch_size``, so the dataset is indexed with
    a list of indices per batch.

    Args:
        dataset: Dataset to draw from; it is indexed with lists of indices.
        batch_size: Number of indices per batch.
        drop_last: Passed to ``BatchSampler``; whether a final short batch is
            dropped.
        transforms: Accepted but not used.

    Returns:
        The configured ``DataLoader``.
    """
    index_sampler = RandomBatchSampler(dataset, batch_size)
    batch_sampler = BatchSampler(index_sampler,
                                 batch_size=batch_size,
                                 drop_last=drop_last)
    # batch_size=None switches off the DataLoader's own batching: every list
    # produced by batch_sampler is passed to the dataset as one index.
    return DataLoader(dataset, batch_size=None, sampler=batch_sampler)

In [129]:
# Open the training split; samples are read from the HDF5 file on access.
train_data = StockDataset("train_data.hdf5")


In [130]:
# Two loaders over the same dataset, timed against each other below:
# train_loader indexes one sample at a time, train_loader_f one batch at a time.
train_loader = normal_loader(train_data)
train_loader_f = fast_loader(train_data)

In [134]:
import time

# Time how long the batch-indexed loader needs to deliver its first batch.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and may jump when the system clock is adjusted.
start_load = time.perf_counter()
for i, (X, y) in enumerate(train_loader_f):
    end_load = time.perf_counter()
    print(i, X.shape, y.shape)
    break  # only the first batch is timed
print( f'Time taken: load({end_load - start_load:.3g}), ')

0 torch.Size([32, 128, 5]) torch.Size([32])
Time taken: load(1.25), 


In [133]:

# Time how long the sample-indexed loader needs to deliver its first batch.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and may jump when the system clock is adjusted.
start_load = time.perf_counter()
for i, (X, y) in enumerate(train_loader):
    end_load = time.perf_counter()
    # Labels are flattened so the printed shape is comparable with the
    # batch-indexed loader's output.
    print(i, X.shape, y.reshape((-1,)).shape)
    break  # only the first batch is timed
print( f'Time taken: load({end_load - start_load:.3g}), ')

0 torch.Size([32, 128, 5]) torch.Size([32])
Time taken: load(1.78), 


In [12]:
# Open the training file directly with PyTables (read-only) to inspect it.
train_file = tables.open_file("train_data.hdf5", mode='r')
print(train_file.root.data.shape, train_file.root.labels.shape)
# Shapes printed: data is (N, 128, 5) and labels is (N,).

(12573778, 128, 5) (12573778,)


In [45]:
# Display the file's node tree (atoms, compression filters, chunk shapes).
train_file

File(filename=train_data.hdf5, title='', mode='r', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) ''
/data (EArray(12573778, 128, 5)shuffle, blosc(5)) ''
  atom := Float64Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := (204, 128, 5)
/labels (EArray(12573778,)shuffle, blosc(5)) ''
  atom := Float64Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := (16384,)

In [63]:
# Indexing labels with an integer array selects one label per listed index.
train_file.root.labels[np.array([1, 3, 45, 85,])].shape

(4,)

In [60]:
# The same selection on data returns one (128, 5) block per listed index.
train_file.root.data[np.array([1,3, 45, 85,]), :].shape

(4, 128, 5)

In [66]:
# Release the read-only inspection handle opened above.
train_file.close()

In [39]:
# Sanity check: the training file exposes both arrays the dataset needs.
# The file is opened here rather than reusing train_file, which the previous
# cell has already closed, so this cell also works on a top-to-bottom run.
with tables.open_file("train_data.hdf5", mode='r') as h5_check:
    assert 'data' in h5_check.root
    assert 'labels' in h5_check.root

## Model 