In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
from utils.nn_data_classifier import load_data, Classifier
from utils.preprocess import preprocess, RNNDataset
# TensorBoard writer shared by the training/validation functions below; events go to ./logs.
writer = SummaryWriter(log_dir='logs')



In [2]:
# Load the historical data through the project helper.
historical_data = load_data()

# Keep only the rows whose timestamp falls exactly on the hour.
# The mask is built straight from the `Day` column, so no temporary `minute`
# column has to be added and dropped again, and `.copy()` detaches the result
# from the source frame (avoids SettingWithCopyWarning on later writes).
on_the_hour = historical_data.Day.dt.minute == 0
historical_data = historical_data[on_the_hour].copy()

classi = Classifier(historical_data)

historical_data

# Alternative labelling, currently disabled. NOTE(review): this is the only
# visible code that writes the CSV below.
# classified_data = classi.classify_data_rd(historical_data)
# classified_data.to_csv('./historicals/BTC-min-005-005_with_max_period.csv')
# classified_data

Unnamed: 0,Day,open,high,low,Price,Volume BTC,Volume
525539,2017-01-01 01:00:00,966.60,966.60,966.60,966.60,0.117000,113.092200
525479,2017-01-01 02:00:00,964.35,964.35,964.35,964.35,0.044955,43.352316
525419,2017-01-01 03:00:00,963.97,963.97,963.97,963.97,0.000000,0.000000
525359,2017-01-01 04:00:00,960.61,960.61,960.60,960.60,0.011157,10.717626
525299,2017-01-01 05:00:00,963.46,963.46,963.46,963.46,0.000000,0.000000
...,...,...,...,...,...,...,...
2064801,2022-02-28 23:00:00,43085.30,43250.00,43027.93,43053.74,3.016036,129851.644413
2064741,2022-03-01 00:00:00,43221.71,43244.68,43214.95,43237.60,0.305643,13215.276262
2064681,2022-03-01 01:00:00,43594.14,43594.14,43530.98,43548.62,0.124288,5412.571754
2064621,2022-03-01 02:00:00,43290.16,43301.26,43277.55,43296.76,0.243616,10547.801236


In [3]:
# Label the hourly data with the classifier built in the previous cell.
# NOTE(review): time_outlook=2 is presumably the look-ahead horizon (in rows) — confirm in utils.nn_data_classifier.
classified_data = classi.classify_data_strict_time(time_outlook=2)
classified_data

[-0.00272088 -0.00388863 -0.00052906 ... -0.00889603         nan
         nan]
                        Day      open      high       low     Price  \
525539  2017-01-01 01:00:00    966.60    966.60    966.60    966.60   
525479  2017-01-01 02:00:00    964.35    964.35    964.35    964.35   
525419  2017-01-01 03:00:00    963.97    963.97    963.97    963.97   
525359  2017-01-01 04:00:00    960.61    960.61    960.60    960.60   
525299  2017-01-01 05:00:00    963.46    963.46    963.46    963.46   
...                     ...       ...       ...       ...       ...   
2064801 2022-02-28 23:00:00  43085.30  43250.00  43027.93  43053.74   
2064741 2022-03-01 00:00:00  43221.71  43244.68  43214.95  43237.60   
2064681 2022-03-01 01:00:00  43594.14  43594.14  43530.98  43548.62   
2064621 2022-03-01 02:00:00  43290.16  43301.26  43277.55  43296.76   
2064561 2022-03-01 03:00:00  43160.97  43161.72  43160.97  43161.21   

         Volume BTC         Volume  Classification  
525539     0.11

  outputs = ufunc(*inputs)


Unnamed: 0,Day,open,high,low,Price,Volume BTC,Volume,Classification
525539,2017-01-01 01:00:00,966.60,966.60,966.60,966.60,0.117000,113.092200,0
525479,2017-01-01 02:00:00,964.35,964.35,964.35,964.35,0.044955,43.352316,0
525419,2017-01-01 03:00:00,963.97,963.97,963.97,963.97,0.000000,0.000000,0
525359,2017-01-01 04:00:00,960.61,960.61,960.60,960.60,0.011157,10.717626,1
525299,2017-01-01 05:00:00,963.46,963.46,963.46,963.46,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...
2064801,2022-02-28 23:00:00,43085.30,43250.00,43027.93,43053.74,3.016036,129851.644413,1
2064741,2022-03-01 00:00:00,43221.71,43244.68,43214.95,43237.60,0.305643,13215.276262,1
2064681,2022-03-01 01:00:00,43594.14,43594.14,43530.98,43548.62,0.124288,5412.571754,0
2064621,2022-03-01 02:00:00,43290.16,43301.26,43277.55,43296.76,0.243616,10547.801236,0


In [None]:
# NOTE(review): this cell reads a pre-computed CSV instead of the `classified_data`
# frame produced above; on a fresh checkout the file must already exist on disk.
classified_data_path = './historicals/BTC-min-005-005_with_max_period.csv'

# Method chain instead of `inplace=True`: sorting in place on the result of
# `.iloc` mutates a slice and can raise SettingWithCopyWarning.
data = (
    pd.read_csv(classified_data_path, parse_dates=['Day'])
    .iloc[:, 1:]         # drop the first column (presumably the saved index — confirm)
    .sort_values("Day")  # chronological order
)
data


In [None]:
# Sequence length handed to the preprocessor (used as `sequence_length=SEQ_LEN` below).
SEQ_LEN = 300

In [None]:
# Build the project preprocessor and split `data` into three datasets.
# validation_size=0.2 is forwarded as-is; how the test split is sized is decided
# inside utils.preprocess and is not visible from this notebook.
processor = preprocess(sequence_length=SEQ_LEN)
training, validation, testing  = processor.preprocess(dataframe=data, validation_size=0.2)

In [None]:
from torch import nn
class RNN_module(nn.Module):
    """LSTM followed by a linear layer applied to the last time step.

    Args:
        hidden_size: number of features in the LSTM hidden state.
        input_size: number of features per time step of the input.
        output_size: number of values produced by the final linear layer.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, hidden_size, input_size, output_size, num_layers):
        super().__init__()
        self._num_layers = num_layers
        self._input_size = input_size
        self._hidden_size = hidden_size
        self._output_size = output_size

        # batch_first=True: inputs/outputs are laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(input_size=self._input_size, hidden_size=self._hidden_size,
                            num_layers=self._num_layers, batch_first=True)
        self.fc = nn.Linear(in_features=self._hidden_size, out_features=self._output_size)

    def __str__(self):
        return f"RNN LSTM Model w/ {self._input_size} features and {self._num_layers} layers and {self._hidden_size} of hidden size"

    def forward(self, input):
        """Return the linear projection of the LSTM output at the last time step.

        Args:
            input: batch-first tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size) with the raw (un-normalised) scores.
        """
        # No explicit (h_0, c_0) is built: nn.LSTM starts from zero states when
        # none are supplied. The previous version allocated float64 zero tensors
        # on every call and never passed them to the LSTM.
        lstm_output, _ = self.lstm(input)

        # Only the last time step feeds the classifier head.
        return self.fc(lstm_output[:, -1, :])

In [None]:
from torch.utils.data import DataLoader

# Hyper-parameters and data loaders for the LSTM classifier.
learning_rate = 0.01
# Number of input features: size of the last dimension of the training features tensor.
dim_size = training._features.size(dim=-1)
hidden_size = 60
batch_size = 64
# Length of one label vector (item 1 of a dataset sample); used as the model's output size.
number_of_classes = training[1][1].shape[0]

# NOTE(review): the validation loader is shuffled as well; accuracy does not depend on order.
train_dataloader = DataLoader(training, batch_size = batch_size, shuffle = True)
validation_dataloader = DataLoader(validation, batch_size = batch_size, shuffle = True)
test_dataloader = DataLoader(testing, batch_size = batch_size, shuffle = False)

# Single-layer LSTM sized from the values above.
model = RNN_module(hidden_size = hidden_size, input_size = dim_size,
                     output_size = number_of_classes, num_layers = 1)

In [None]:
# Per-class loss weights: weight = 1 - (class frequency in the training set),
# so classes that occur less often get a larger weight.
labels = training._labels
classes = torch.unique(labels, dim=0)

# Occurrence counter, one entry per distinct class value.
freq = {_class.item(): 0 for _class in classes}

weights = torch.zeros((classes.shape[0],))
total = torch.tensor(len(training))

# The loop variable used to be called `input`, which leaked into the kernel
# namespace and shadowed the builtin for every later cell.
for sample_idx in range(training._beg, training._end):
    # Item 1 of a sample is the label vector; its argmax is the class index.
    _, output = torch.max(training[sample_idx][1], 0)
    freq[output.item()] += 1

for idx, _class in enumerate(classes):
    weights[idx] += 1 - freq[_class.item()] / total

weights

In [None]:
# Cross-entropy weighted by the per-class weights computed above.
criterion = nn.CrossEntropyLoss(weight=weights)
# Use the `learning_rate` constant defined with the other hyper-parameters
# instead of repeating the literal 0.01 (the two could silently drift apart).
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Prefix for the per-epoch checkpoints; train_loop appends 'model_<epoch>.pth' to it.
CHECKPOINT_PATH = './models_parameters/LSTM/checkpoints_2/'
# File that train_loop writes when it flags an epoch as the best one.
BEST_PATH = './models_parameters/LSTM/best_model.pth'

def epoch_training(train_dataloader, epoch, total_epochs):
    """Run one optimisation pass over ``train_dataloader``.

    Relies on the notebook-level ``model``, ``criterion``, ``optimizer`` and
    ``writer``. Every 50 batches it prints the current loss and logs the mean
    loss and the accuracy (in percent) of that 50-batch window to TensorBoard.
    At the end it logs the mean loss of the whole epoch.

    Args:
        train_dataloader: sized iterable yielding ``(sequence, label)`` batches,
            where ``label`` holds one row of per-class values per sample.
        epoch: zero-based index of the current epoch.
        total_epochs: total number of epochs (only used in the progress message).
    """
    log_every = 50
    window_correct = 0
    window_samples = 0
    window_loss = 0.0
    epoch_loss = 0.0
    n_of_steps = len(train_dataloader)

    # Make sure layers such as dropout/batch-norm are in training mode.
    model.train()

    for current_batch, (sequence, label) in enumerate(train_dataloader):
        # forward: compute the loss for the current parameters
        outputs = model(sequence)
        loss = criterion(input=outputs, target=label.float())

        # backward: update the parameters from the current loss
        optimizer.zero_grad()  # gradients accumulate unless they are cleared first
        loss.backward()        # back-propagate to get a gradient for every parameter
        optimizer.step()       # apply the update

        _, predictions = torch.max(outputs, 1)
        _, correct = torch.max(label, 1)

        batch_loss = loss.item()
        window_correct += (predictions == correct).sum().item()
        window_samples += label.shape[0]
        window_loss += batch_loss
        epoch_loss += batch_loss

        if (current_batch + 1) % log_every == 0:
            step = epoch * n_of_steps + current_batch
            print(f"epoch {epoch+1}/{total_epochs}, current step(batch): {current_batch+1}/{n_of_steps}, loss = {batch_loss:.4f} ")
            writer.add_scalar('training loss: ', window_loss / log_every, step)
            # Accuracy is correct samples over samples seen (in percent, like the
            # validation metric); dividing by the number of batches gave the mean
            # number of correct samples per batch instead.
            writer.add_scalar('accuracy: ', 100.0 * window_correct / window_samples, step)
            window_loss = 0.0
            window_correct = 0
            window_samples = 0

    # Log the mean loss of the epoch rather than the last batch's loss tensor;
    # the guard also avoids a NameError when the loader yields no batches.
    if n_of_steps > 0:
        writer.add_scalar('Epoch loss: ', epoch_loss / n_of_steps, epoch + 1)


def epoch_validate(validation_dataloader, epoch, total_epochs):
    """Measure the accuracy (in percent) of the notebook-level ``model``.

    The result is printed, logged to the notebook-level ``writer`` and returned.

    Args:
        validation_dataloader: iterable yielding ``(sequence, label)`` batches.
            ``label`` may hold one row of per-class values per sample (as in
            the training loop) or plain class indices.
        epoch: zero-based index of the current epoch.
        total_epochs: total number of epochs (only used in the message).

    Returns:
        Accuracy in percent; ``0.0`` when the loader yields no samples.
    """
    n_corrects = 0
    n_samples = 0

    # Evaluate in eval mode, then put the model back in whatever mode it was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for sequence, label in validation_dataloader:
                outputs = model(sequence)
                _, predictions = torch.max(outputs, 1)

                # Predictions are class indices, so per-class label rows must be
                # reduced to indices too; comparing them directly broadcasts
                # (or fails) and gives a wrong count.
                targets = torch.max(label, 1)[1] if label.dim() > 1 else label

                n_samples += outputs.shape[0]
                n_corrects += (predictions == targets).sum().item()
    finally:
        model.train(was_training)

    # Guard against an empty loader instead of dividing by zero.
    acc = 100.0 * n_corrects / n_samples if n_samples else 0.0

    print(f"epoch {epoch+1}/{total_epochs} accuracy: {acc}")
    writer.add_scalar('Validation Accuracy: ', acc, epoch+1)

    return acc


def train_loop(train_dataloader: DataLoader, validation_dataloader: DataLoader, epochs: int):
    """Train for ``epochs`` epochs, validating and checkpointing after each one.

    Every epoch is saved to ``CHECKPOINT_PATH + 'model_<epoch>.pth'``; an epoch
    whose validation accuracy beats all previous ones is additionally saved to
    ``BEST_PATH``. Uses the notebook-level ``model`` and ``optimizer``.

    Args:
        train_dataloader: batches used by ``epoch_training``.
        validation_dataloader: batches used by ``epoch_validate``.
        epochs: number of epochs to run.
    """
    import os  # local import: only needed to create the checkpoint directories

    max_accuracy = 0

    for epoch in range(epochs):
        epoch_training(train_dataloader, epoch, epochs)

        accuracy = epoch_validate(validation_dataloader, epoch, epochs)

        # Re-evaluate the flag every epoch and remember the new maximum.
        # Previously the flag was never reset and the maximum never updated, so
        # BEST_PATH was overwritten on every epoch and ended up holding the last
        # model, not the best one.
        is_best = accuracy > max_accuracy
        if is_best:
            max_accuracy = accuracy

        checkpoint = {
            'epoch': epoch,  # zero-based
            'model_state': model.state_dict(),
            'optim_state': optimizer.state_dict()
        }

        if is_best:
            # torch.save does not create missing directories.
            os.makedirs(os.path.dirname(BEST_PATH) or '.', exist_ok=True)
            torch.save(checkpoint, BEST_PATH)

        checkpoint_file = CHECKPOINT_PATH + f'model_{epoch+1}.pth'
        os.makedirs(os.path.dirname(checkpoint_file) or '.', exist_ok=True)
        torch.save(checkpoint, checkpoint_file)

In [None]:
def test_loop(test_dataloader: DataLoader, model: nn.Module):
    with torch.no_grad():
        n_corrects = 0
        n_samples = 0

        for current_batch, (sequence, label) in enumerate(test_dataloader):
            #forward: we are calculating the loss given the parameters
            outputs = model(sequence)
            _, predictions = torch.max(outputs, 1)

            n_samples += outputs.shape[0]
            n_corrects += (predictions == label).sum().item()

            if (current_batch + 1) % 200 == 0:
                print(f"test batch: {current_batch+1}/{len(test_dataloader)}, current accuracy: {100 * n_corrects / n_samples}")

        acc = 100.0 * n_corrects / n_samples
        print(f"final test accuracy: {acc}")


In [None]:
# Train, then evaluate the final (last-epoch) model on the test split.
epochs = 10
train_loop(train_dataloader=train_dataloader, validation_dataloader=validation_dataloader, epochs=epochs)
# `test_loop` declares a `model` parameter; pass the trained network explicitly
# (calling it with only the loader raises TypeError).
# Curious: when the dataloader was a plain list (tuple) the model trained much faster...
test_loop(test_dataloader=test_dataloader, model=model)

In [None]:
# Re-run the test evaluation for the current model; `test_loop` declares a
# `model` parameter, so pass the network explicitly.
test_loop(test_dataloader, model=model)

Using the model that performed best on the validation set

In [None]:
# Fresh (untrained) network with the same architecture as the trained model;
# printing it shows the description produced by RNN_module.__str__.
best_model = RNN_module(
    hidden_size=hidden_size,
    input_size=dim_size,
    output_size=number_of_classes,
    num_layers=1,
)

print(str(best_model))

In [None]:
# Rebuild the architecture, restore the weights saved at BEST_PATH and
# evaluate that model on the test split.
best_model = RNN_module(hidden_size=hidden_size, input_size=dim_size,
                        output_size=number_of_classes, num_layers=1)
checkpoint = torch.load(BEST_PATH)
print(f'Model type: {best_model}')
# The checkpoint stores a zero-based epoch index; report it one-based so it
# matches the 'model_<epoch>.pth' file names and reads correctly as an ordinal.
print(f'Best performing model found at {checkpoint["epoch"] + 1}ºepoch')

# strict=True: every key in the checkpoint must match the module's parameters.
best_model.load_state_dict(state_dict=checkpoint['model_state'], strict=True)
best_model.eval()

test_loop(test_dataloader=test_dataloader, model=best_model)


In [None]:
# Close the TensorBoard writer once all logging is done.
writer.close()