In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from collections import deque

from torch.utils.tensorboard import SummaryWriter
from utils.nn_data_classifier import load_data, Classifier
from utils.preprocess import preprocess, RNNDataset

writer = SummaryWriter(log_dir='logs')



Load the dataset

In [2]:
historical_data = load_data()

historical_data['minute'] = historical_data.time.dt.minute
historical_data = historical_data[historical_data.minute == 0]
historical_data.drop('minute', axis=1, inplace=True)

historical_data

Unnamed: 0,time,Price,Volume_ETH,Price_BTC,Volume_BTC
128,2016-03-14 10:00:00,13.100,0.008360,414.70,1.500000
201,2016-03-14 14:00:00,14.000,248.973956,414.95,9.077592
318,2016-03-14 19:00:00,14.750,8.442300,414.50,1.679111
468,2016-03-15 00:00:00,12.650,651.351595,415.90,0.481963
490,2016-03-15 01:00:00,12.576,4.558064,415.79,0.615654
...,...,...,...,...,...
2903842,2022-11-12 16:00:00,1274.800,9.022555,16934.00,0.065520
2903902,2022-11-12 17:00:00,1269.900,52.030265,16888.00,0.006411
2903962,2022-11-12 18:00:00,1272.700,0.252397,16899.00,0.006868
2904022,2022-11-12 19:00:00,1269.600,2.803365,16877.00,0.130167


Classify data

In [3]:
data = historical_data.copy()
time_outlook = 1
data['Prediction'] = data.Price.shift(-time_outlook)
data.dropna(inplace=True)
data.drop('time', axis=1,inplace=True)
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,Price,Volume_ETH,Price_BTC,Volume_BTC,Prediction
0,13.100,0.008360,414.70,1.500000,14.000
1,14.000,248.973956,414.95,9.077592,14.750
2,14.750,8.442300,414.50,1.679111,12.650
3,12.650,651.351595,415.90,0.481963,12.576
4,12.576,4.558064,415.79,0.615654,12.662
...,...,...,...,...,...
50202,1276.600,21.640138,16923.00,0.381346,1274.800
50203,1274.800,9.022555,16934.00,0.065520,1269.900
50204,1269.900,52.030265,16888.00,0.006411,1272.700
50205,1272.700,0.252397,16899.00,0.006868,1269.600


Sort data

In [4]:
SEQ_LEN = 24

In [5]:
dataset = []
sequence = deque(maxlen=SEQ_LEN)
y = data.Prediction
X = data.drop('Prediction', axis=1)

for seq, price in zip(X.values, y):
    sequence.append(seq)
    length = len(sequence)

    if length == SEQ_LEN:
        new_seq = np.array(sequence)
        entry = [new_seq, price]
        dataset.append([new_seq, price])
dataset = np.array(dataset)
dataset

  dataset = np.array(dataset)


array([[array([[1.31000000e+01, 8.36000000e-03, 4.14700000e+02, 1.50000000e+00],
               [1.40000000e+01, 2.48973956e+02, 4.14950000e+02, 9.07759197e+00],
               [1.47500000e+01, 8.44230000e+00, 4.14500000e+02, 1.67911106e+00],
               [1.26500000e+01, 6.51351595e+02, 4.15900000e+02, 4.81963470e-01],
               [1.25760000e+01, 4.55806440e+00, 4.15790000e+02, 6.15654120e-01],
               [1.26620000e+01, 8.92037234e+00, 4.18000000e+02, 1.47154553e+00],
               [1.26300000e+01, 1.75970454e+00, 4.16710000e+02, 2.07328463e+00],
               [1.30240000e+01, 1.75996693e+01, 4.15570000e+02, 1.85775689e+01],
               [1.30390000e+01, 6.41923300e+00, 4.16160000e+02, 3.98193240e+00],
               [1.26460000e+01, 2.00000000e+02, 4.15990000e+02, 3.87356600e-01],
               [1.30000000e+01, 5.00000000e+00, 4.15720000e+02, 2.82285100e+00],
               [1.33670000e+01, 2.59978869e+01, 4.14950000e+02, 1.61245856e+00],
               [1.32910000e+

In [6]:
class PredDataSet(Dataset):
    def __init__(self, data: np.array, transform = None, target_transform = None):
        self._data = data
        self._transform = transform
        self._target_transform = target_transform

    def __len__(self):
        return len(self._data)

    def __getitem__(self, x:int):
        X = torch.tensor(self._data[x][0])
        y = torch.tensor(self._data[x][1])
        return X.float(), y.float()

In [7]:
total_len = len(dataset)
training_size = int(0.6 * total_len)
validation_size = int(0.2 * total_len)
testing_size = int(0.2 * total_len)

train_df = dataset[:training_size]
validation_df = dataset[training_size:training_size+validation_size]
testing_df = dataset[-testing_size:]

training = PredDataSet(train_df)
validation = PredDataSet(validation_df)
testing = PredDataSet(testing_df)

In [8]:
from torch import nn
class RNN_module(nn.Module):
    def __init__(self, hidden_size, input_size, num_layers):
        super(RNN_module, self).__init__()
        self._num_layers = num_layers
        self._input_size = input_size
        self._hidden_size = hidden_size
        self._output_size = 1

        self.lstm = nn.LSTM(input_size = self._input_size, hidden_size = self._hidden_size, 
                            num_layers = self._num_layers, batch_first = True)
        self.fc = nn.Linear(in_features=self._hidden_size, out_features= self._output_size)

    def __str__(self):
        return f"RNN LSTM Model w/ {self._input_size} features and {self._num_layers} layers and {self._hidden_size} of hidden size"

    def forward(self, input):
        lstm_output, (h_n, c_n) = self.lstm(input)
        pred = self.fc(lstm_output[:, -1, :])
        
        return pred

In [9]:
from torch.utils.data import DataLoader

learning_rate = 0.01
dim_size = training[0][0].shape[1]
hidden_size = 60
batch_size = 64
epochs = 10
layers = 1

train_dataloader = DataLoader(training, batch_size = batch_size, shuffle = True)
validation_dataloader = DataLoader(validation, batch_size = batch_size, shuffle = True)
test_dataloader = DataLoader(testing, batch_size = batch_size, shuffle = False)

model = RNN_module(hidden_size = hidden_size, input_size = dim_size,
                   num_layers = layers)

In [10]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
CHECKPOINT_PATH = './models_parameters/LSTM/checkpoints_2/'
BEST_PATH = './models_parameters/LSTM/best_model.pth'

def epoch_training(model, train_dataloader, criterion, epoch, total_epochs, optimizer):
    n_of_steps = len(train_dataloader)
    running_loss = 0

    for current_batch, (sequence, prices) in enumerate(train_dataloader):
        #forward: we are calculating the loss given the parameters
        outputs = model(sequence).view(-1)
        loss = criterion(input=outputs, target = prices)

        #backward: lets update the parameters given the current loss
        optimizer.zero_grad() #nullifies the current gradients. If you don't do this, gradients will be added up (you don't want that)
        loss.backward() #computates the bwrd-prop gradient for each model parameter
        optimizer.step() #updates the model current parameter using the gradients.

        running_loss += loss.item()

        if (current_batch + 1) % 50 == 0:
            print(f"epoch {epoch+1}/{total_epochs}, current step(batch): {current_batch+1}/{n_of_steps}, loss = {loss.item():.4f} ")
            writer.add_scalar('training loss: ', running_loss/50, epoch * n_of_steps + current_batch)
            running_loss = 0
            
    writer.add_scalar('Epoch loss: ', loss, epoch + 1)


def epoch_validate(model, validation_dataloader, criterion, epoch, total_epochs):
    with torch.no_grad():
        n_samples = 0
        total_loss = 0
        for current_batch, (sequence, prices) in enumerate(validation_dataloader):

            #forward: we are calculating the loss given the parameters
            outputs = model(sequence).view(-1)
            loss = criterion(input=outputs, target = prices).item()

            batch_size = outputs.shape[0]
            loss *= batch_size
            total_loss += loss
            n_samples += batch_size

        final_loss = total_loss / n_samples

        print(f"epoch {epoch+1}/{total_epochs} final_loss: {final_loss}")
        writer.add_scalar('Validation Accuracy: ', final_loss, epoch+1)

    return final_loss


def train_loop(model: RNN_module, train_dataloader: DataLoader, criterion: torch.nn, validation_dataloader: DataLoader, epochs: int, optimizer: torch.optim):
    
    max_precision = 0
    is_best = False

    for epoch in range(epochs):
        epoch_training(model, train_dataloader, criterion, epoch, epochs, optimizer)

        precision = epoch_validate(model, validation_dataloader, criterion, epoch, epochs)

        if precision < max_precision:
            is_best = True
            max_precision = precision
        else:
            is_best = False
        
        checkpoint = {
            'epoch': epoch+1,
            'model_state': model.state_dict(),
            'optim_state': optimizer.state_dict()
        }

        if is_best:
            torch.save(checkpoint, BEST_PATH)
        
        torch.save(checkpoint, CHECKPOINT_PATH+f'model_{epoch+1}.pth')

def overfit_batch(model: RNN_module, train_dataloader: DataLoader, criterion: torch.nn, validation_dataloader: DataLoader, epochs: int, optimizer: torch.optim):
    
    sequence, prices = next(iter(train_dataloader))
    running_loss = 0

    for epoch in range(epochs):
        n_of_steps = len(prices)
        
        #forward: we are calculating the loss given the parameters
        outputs = model(sequence).view(-1)
        loss = criterion(input=outputs, target = prices)

        #backward: lets update the parameters given the current loss
        optimizer.zero_grad() #nullifies the current gradients. If you don't do this, gradients will be added up (you don't want that)
        loss.backward() #computates the bwrd-prop gradient for each model parameter
        optimizer.step() #updates the model current parameter using the gradients.

        running_loss += loss.item()

        if (epoch + 1) % 100 == 0:
            print(f"epoch {epoch+1}/{epochs}, loss = {loss.item():.4f} ")
            writer.add_scalar('training loss: ', running_loss/100, epoch * n_of_steps)
            running_loss = 0
                


In [12]:
c = model.state_dict()

In [14]:
def test_loop(test_dataloader: DataLoader, model: nn.Module):
    with torch.no_grad():
        n_corrects = 0
        n_samples = 0

        for current_batch, (sequence, label) in enumerate(test_dataloader):
            #forward: we are calculating the loss given the parameters
            outputs = model(sequence)
            predictions = torch.argmax(outputs, 1)

            n_samples += outputs.shape[0]
            n_corrects += (predictions == label).sum().item()

            if (current_batch + 1) % 200 == 0:
                print(f"test batch: {current_batch+1}/{len(test_dataloader)}, current accuracy: {100 * n_corrects / n_samples}")

        acc = 100.0 * n_corrects / n_samples
        print(f"final test accuracy: {acc}")


In [15]:
epochs = 10000
overfit_batch(model, train_dataloader=train_dataloader, criterion=criterion, validation_dataloader = validation_dataloader, epochs=epochs, optimizer=optimizer)
# train_loop(model, train_dataloader=train_dataloader, criterion=criterion, validation_dataloader = validation_dataloader, epochs=epochs, optimizer=optimizer)
# test_loop(test_dataloader=test_dataloader, model=model)

epoch 100/10000, loss = 105710.3203 
epoch 200/10000, loss = 94911.7344 
epoch 300/10000, loss = 84946.8359 
epoch 400/10000, loss = 76505.4922 
epoch 500/10000, loss = 69261.5156 
epoch 600/10000, loss = 63060.6172 
epoch 700/10000, loss = 57784.7031 
epoch 800/10000, loss = 53330.8047 
epoch 900/10000, loss = 49604.9922 
epoch 1000/10000, loss = 46520.0234 
epoch 1100/10000, loss = 43994.4648 
epoch 1200/10000, loss = 41952.4258 
epoch 1300/10000, loss = 40323.5234 
epoch 1400/10000, loss = 39043.0898 
epoch 1500/10000, loss = 38052.3438 
epoch 1600/10000, loss = 37298.5586 
epoch 1700/10000, loss = 36735.2578 
epoch 1800/10000, loss = 36322.1289 
epoch 1900/10000, loss = 36024.9766 
epoch 2000/10000, loss = 35815.3984 
epoch 2100/10000, loss = 35670.4102 
epoch 2200/10000, loss = 35571.8398 
epoch 2300/10000, loss = 35505.7578 
epoch 2400/10000, loss = 35461.7852 
epoch 2500/10000, loss = 35432.4102 
epoch 2600/10000, loss = 35412.4062 
epoch 2700/10000, loss = 35398.2188 
epoch 280

In [21]:
for p in model.parameters():
    print('===========\ngradient:\n----------\n{}'.format(p.grad.norm))

gradient:
----------
<bound method Tensor.norm of tensor([[-3.9372e-33, -1.5202e-33, -2.0430e-31, -8.6199e-34],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 6.0891e-28,  1.3080e-27,  3.1140e-26,  6.3675e-28],
        [ 4.7624e-28,  3.7155e-26,  2.5959e-26,  3.1659e-29],
        [-6.8102e-28, -1.4144e-28, -3.5121e-26, -1.7453e-28],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 4.2524e-38,  3.2859e-36,  2.2272e-36,  1.2228e-38],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 3.1323e-15,  9.7263e-16,  1.6381e-13,  1.1995e-15],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00

In [None]:
for name, param in model.named_parameters():
    print(name, param.grad.abs().sum())


Using best model in validation

In [None]:
best_model = RNN_module(hidden_size = hidden_size, input_size = dim_size,
                     output_size = number_of_classes, num_layers = 1)

checkpoint = torch.load(BEST_PATH)
print(f'Model type: {best_model}')
print(f'Best performing model found at {checkpoint["epoch"]}ºepoch')

best_model.load_state_dict(state_dict=checkpoint['model_state'], strict=True)
best_model.eval()

test_loop(test_dataloader=test_dataloader, model=best_model)

In [None]:
checkpoint = torch.load('./models_parameters/LSTM/checkpoints_2/model_1.pth')

a = checkpoint['model_state']

checkpoint = torch.load('./models_parameters/LSTM/checkpoints_2/model_10.pth')

b = checkpoint['model_state']

In [None]:
writer.close()