In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
from utils.nn_data_classifier import load_data, Classifier
from utils.preprocess import preprocess, RNNDataset
from utils.nn_data_classifier import Classifier
writer = SummaryWriter(log_dir='logs')



In [None]:
classi = Classifier(max_drop=-0.05, max_raise=0.05, max_period=240)
historical_data = load_data()

historical_data['minute'] = historical_data.Day.dt.minute
historical_data = historical_data[historical_data.minute == 0]
historical_data.drop('minute', axis=1, inplace=True)

classified_data = classi.classify_data(historical_data)
classified_data.to_csv('./historicals/BTC-min-005-005_with_max_period.csv')
classified_data

In [2]:
classified_data_path = './historicals/BTC-min-005-005_with_max_period.csv'
data = pd.read_csv(classified_data_path, parse_dates=['Day']).iloc[:, 1:]
data.sort_values("Day", inplace=True)
data


Unnamed: 0,Day,Price,Volume,Classification
0,2017-01-01 01:00:00,966.60,113.092200,1.0
1,2017-01-01 02:00:00,964.35,43.352316,1.0
2,2017-01-01 03:00:00,963.97,0.000000,1.0
3,2017-01-01 04:00:00,960.60,10.717626,1.0
4,2017-01-01 05:00:00,963.46,0.000000,1.0
...,...,...,...,...
44579,2022-02-28 23:00:00,43053.74,129851.644413,2.0
44580,2022-03-01 00:00:00,43237.60,13215.276262,2.0
44581,2022-03-01 01:00:00,43548.62,5412.571754,2.0
44582,2022-03-01 02:00:00,43296.76,10547.801236,2.0


In [3]:
SEQ_LEN = 300

In [4]:
processor = preprocess(sequence_length=SEQ_LEN)
training, validation, testing  = processor.preprocess(dataframe=data, validation_size=0.2)

In [5]:
from torch import nn
class RNN_module(nn.Module):
    def __init__(self, hidden_size, input_size, output_size, num_layers):
        super(RNN_module, self).__init__()
        self._num_layers = num_layers
        self._input_size = input_size
        self._hidden_size = hidden_size
        self._output_size = output_size

        self.lstm = nn.LSTM(input_size = self._input_size, hidden_size = self._hidden_size, 
                            num_layers = self._num_layers, batch_first = True)
        self.fc = nn.Linear(in_features=self._hidden_size, out_features= self._output_size)

    def forward(self, input):
        h_0 = torch.zeros(self._num_layers, input.size(0), self._hidden_size, dtype=torch.float64)
        c_0 = torch.zeros(self._num_layers, input.size(0), self._hidden_size, dtype=torch.float64)

        lstm_output, (h_n, c_n) = self.lstm(input)

        pred = self.fc(lstm_output[:, -1, :])
        
        return pred

In [37]:
from torch.utils.data import DataLoader

learning_rate = 0.01
dim_size = training._features.size(dim=-1)
hidden_size = 60
batch_size = 64
number_of_classes = training[1][1].shape[0]

train_dataloader = DataLoader(training, batch_size = batch_size, shuffle = True)
validation_dataloader = DataLoader(validation, batch_size = batch_size, shuffle = True)
test_dataloader = DataLoader(testing, batch_size = batch_size, shuffle = False)

model = RNN_module(hidden_size = hidden_size, input_size = dim_size,
                     output_size = number_of_classes, num_layers = 1)

In [38]:
labels = training._labels
classes = torch.unique(labels, dim = 0)

freq = {}
for _class in classes:
    freq[_class.item()] = 0

weights = torch.zeros((classes.shape[0],))
total = torch.tensor(len(training))
for input in range(training._beg, training._end):
    _, output = torch.max(training[input][1], 0)
    freq[output.item()] += 1


for idx, _class in enumerate(classes):
    weights[idx] +=  1 - freq[_class.item()]/total

weights

tensor([0.5900, 0.5023, 0.8981])

In [39]:
criterion = nn.CrossEntropyLoss(weight= weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [40]:
CHECKPOINT_PATH = './models_parameters/LSTM/checkpoints_2/'
BEST_PATH = './models_parameters/LSTM/best_model.pth'

def epoch_training(train_dataloader, epoch, total_epochs):
    running_correct = 0
    running_loss = 0.0
    n_of_steps = len(train_dataloader)

    for current_batch, (sequence, label) in enumerate(train_dataloader):
        #forward: we are calculating the loss given the parameters
        outputs = model(sequence)
        print(outputs)
        print(label)
        loss = criterion(input=outputs, target = label)

        #backward: lets update the parameters given the current loss
        optimizer.zero_grad() #nullifies the current gradients. If you don't do this, gradients will be added up (you don't want that)
        loss.backward() #computates the bwrd-prop gradient for each model parameter
        optimizer.step() #updates the model current parameter using the gradients.

        _, predictions = torch.max(outputs, 1)
        _, correct = torch.max(label, 1)

        running_correct += (predictions == correct).sum().item()
        running_loss += loss.item()

        if (current_batch + 1) % 50 == 0:
            print(f"epoch {epoch+1}/{total_epochs}, current step(batch): {current_batch+1}/{n_of_steps}, loss = {loss.item():.4f} ")
            writer.add_scalar('training loss: ', running_loss/50, epoch * n_of_steps + current_batch)
            writer.add_scalar('accuracy: ', running_correct/50, epoch * n_of_steps + current_batch)
            running_loss = 0.0
            running_correct = 0

    writer.add_scalar('Epoch loss: ', loss, epoch + 1)


def epoch_validate(validation_dataloader, epoch, total_epochs):
    with torch.no_grad():
        n_corrects = 0
        n_samples = 0
        
        for current_batch, (sequence, label) in enumerate(validation_dataloader):

            #forward: we are calculating the loss given the parameters
            outputs = model(sequence)
            _, predictions = torch.max(outputs, 1)
            
            n_samples += outputs.shape[0]
            n_corrects += (predictions == label).sum().item()

        acc = 100.0 * n_corrects / n_samples

        print(f"epoch {epoch+1}/{total_epochs} accuracy: {acc}")
        writer.add_scalar('Validation Accuracy: ', acc, epoch+1)

    return acc


def train_loop(train_dataloader: DataLoader, validation_dataloader: DataLoader, epochs: int):
    
    n_of_steps = len(train_dataloader) #total number of batches
    running_correct = 0
    running_loss = 0.0
    max_accuracy = 0
    is_best = False

    for epoch in range(epochs):
        epoch_training(train_dataloader, epoch, epochs)

        accuracy = epoch_validate(validation_dataloader, epoch, epochs)

        if accuracy > max_accuracy:
            is_best = True
        
        checkpoint = {
            'epoch': epoch,
            'model_state': model.state_dict(),
            'optim_state': optimizer.state_dict()
        }

        if is_best:
            torch.save(checkpoint, BEST_PATH)
        
        torch.save(checkpoint, CHECKPOINT_PATH+f'model_{epoch+1}.pth')

In [41]:
def test_loop(test_dataloader: DataLoader):
    with torch.no_grad():
        n_corrects = 0
        n_samples = 0

        for current_batch, (sequence, label) in enumerate(test_dataloader):

            #forward: we are calculating the loss given the parameters
            outputs = model(sequence)
            _, predictions = torch.max(outputs, 1)

            n_samples += outputs.shape[0]
            n_corrects += (predictions == label).sum().item()

            if (current_batch + 1) % 200 == 0:
                print(f"test batch: {current_batch+1}/{len(test_dataloader)}, current accuracy: {100 * n_corrects / n_samples}")

        acc = 100.0 * n_corrects / n_samples
        print(f"final test accuracy: {acc}")


In [42]:
epochs = 10
train_loop(train_dataloader=train_dataloader, validation_dataloader = validation_dataloader, epochs=epochs)
test_loop(test_dataloader=test_dataloader) #curioso: quando o dataloader tava em lista (tuple) o modelo tava treinando bem mais rápido...

tensor([[ 0.1316,  0.0738,  0.0572],
        [ 0.1895,  0.2140,  0.1606],
        [ 0.1152,  0.0830, -0.0336],
        [ 0.1925,  0.1458, -0.0221],
        [-0.0469,  0.0824,  0.1614],
        [-0.0524,  0.0504,  0.1513],
        [-0.0267, -0.2763,  0.0413],
        [ 0.0489, -0.2374, -0.1050],
        [-0.0710, -0.0626,  0.0337],
        [ 0.0242,  0.0827,  0.1156],
        [ 0.1263,  0.1797,  0.0126],
        [ 0.0558,  0.0943,  0.1023],
        [ 0.1159,  0.0969,  0.2475],
        [ 0.2252,  0.2156,  0.1389],
        [-0.0870, -0.0865,  0.1776],
        [ 0.0164, -0.3269,  0.0223],
        [-0.2119, -0.0809,  0.2650],
        [ 0.2088,  0.2021, -0.0112],
        [ 0.2258,  0.2158,  0.1387],
        [-0.0708, -0.0096, -0.0227],
        [ 0.2964,  0.3012,  0.0490],
        [ 0.0404, -0.2152,  0.0240],
        [ 0.1595, -0.3815, -0.2442],
        [ 0.1943,  0.2041,  0.1520],
        [ 0.2259,  0.2158,  0.1386],
        [ 0.0228, -0.2230,  0.0161],
        [ 0.0530,  0.0969,  0.1046],
 

RuntimeError: Expected floating point type for target with class probabilities, got Long

In [None]:
writer.close()