In [36]:
import torch
from torch import nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch import optim
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import gc
import math
from sklearn.metrics import r2_score 

In [49]:
# Hyperparameters and data locations used by the rest of the notebook.
input_size = 200          # number of leading feature columns read from each x file / model input width
batch_size = 512          # mini-batch size for the training and dev loaders
nbr_epochs = 20           # epochs run on every training chunk
data_split_ratio = 0.8    # train share, applied both across chunks and inside each chunk
chunksize = 1_000_000     # CSV rows loaded per chunk
lr = 1e-4                 # optimizer learning rate
y_column = "30s"          # column of the y file used as regression target

# Feature files and their matching target files (paired by position).
files_x = ["../python-docker/Swedbank_A/x_Swedbank_A_200_p.csv"]
files_y = ["../python-docker/Swedbank_A/y_Swedbank_A_200_tmp.csv"]

In [50]:
def splitData(xs, ys, trainRatio):
    """Split features and targets positionally into a training and a dev dataset.

    The first ``round(len(xs) * trainRatio)`` rows form the training set and the
    remaining rows form the dev set; no shuffling is performed.

    Args:
        xs: Sliceable feature container exposing ``.values`` (e.g. a pandas DataFrame).
        ys: Sliceable target container exposing ``.values`` (e.g. a pandas Series).
        trainRatio: Fraction of the rows assigned to the training set.

    Returns:
        Tuple ``(train_dataset, dev_dataset)`` of ``TensorDataset`` objects holding
        float32 tensors.
    """
    cut = round(len(xs) * trainRatio)

    def as_float_tensor(part):
        # Convert a slice of the input container into a float32 tensor.
        return torch.tensor(part.values, dtype=torch.float32)

    train_set = TensorDataset(as_float_tensor(xs[:cut]), as_float_tensor(ys[:cut]))
    dev_set = TensorDataset(as_float_tensor(xs[cut:]), as_float_tensor(ys[cut:]))
    return train_set, dev_set

In [51]:
# Compute device for the model: the first CUDA GPU when one is available, otherwise the CPU,
# so the notebook also runs on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [52]:
# Tensor type passed to the `.type(dtype)` calls in this notebook (float32 in both cases):
# the CUDA variant when a GPU is available, otherwise the CPU variant.
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

In [53]:
class StockModel(nn.Module):
    """Fully connected regressor: input_size -> 300 -> 20 -> 1.

    Every linear layer, including the output layer, is followed by a LeakyReLU.
    The stack is cast with the notebook-level ``dtype`` and the linear weights
    are then re-initialised with Xavier-uniform values.
    """

    # Positions of the nn.Linear layers inside ``self.net``.
    _LINEAR_POSITIONS = (0, 2, 4)

    def __init__(self, input_size):
        super().__init__()

        layers = [
            nn.Linear(input_size, 300),
            nn.LeakyReLU(),
            nn.Linear(300, 20),
            nn.LeakyReLU(),
            nn.Linear(20, 1),
            nn.LeakyReLU(),
        ]
        # Cast the whole stack at once; only the linear layers own parameters.
        self.net = nn.Sequential(*layers).type(dtype)

        # Re-initialise the weights after the cast, first layer to last.
        for position in self._LINEAR_POSITIONS:
            nn.init.xavier_uniform_(self.net[position].weight)

    def forward(self, x):
        """Run ``x`` through the layer stack and return its output."""
        return self.net(x)

In [54]:
def evaluate_model(data, model, loss_fn):
    """Evaluate ``model`` on every batch of ``data`` without tracking gradients.

    Args:
        data: Iterable of ``(x, y)`` tensor batches, e.g. a ``DataLoader``.
        model: Module to evaluate. It is switched to eval mode and left in it.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.

    Returns:
        Tuple ``(avg_loss, predictions, r2)``:
        ``avg_loss`` is the unweighted mean of the per-batch losses,
        ``predictions`` is the list of model outputs in iteration order
        (as produced by ``Tensor.tolist()``), and ``r2`` is the R^2 score of
        the predictions against the targets.

    Raises:
        ValueError: If ``data`` yields no batches.
    """
    losses = []
    ys = []
    predictions = []
    model.eval()
    with torch.no_grad():
        for x, y in data:
            # Cast the batch with the notebook-level tensor type.
            y = y.type(dtype)
            x = x.type(dtype)
            pred = model(x)
            # Drop only a trailing size-1 dimension. A bare squeeze() would also
            # collapse the batch dimension of a one-sample batch into a 0-d
            # tensor, whose tolist() is a float and cannot be passed to extend().
            if pred.dim() > 1:
                pred = pred.squeeze(-1)
            loss = loss_fn(pred, y)
            losses.append(loss.item())
            ys.extend(y.tolist())
            predictions.extend(pred.tolist())

    if not losses:
        raise ValueError(
            "evaluate_model received no batches; the dataset is empty or "
            "smaller than the batch size of a loader using drop_last=True"
        )
    avg_loss = sum(losses) / len(losses)

    r2 = r2_score(ys, predictions)
    return avg_loss, predictions, r2

In [55]:
def train_model(model, train_data_loader, dev_data_loader, loss_fn, optimizer, epochrange, batchsize):
    """Train ``model`` for ``epochrange`` epochs, evaluating on the dev set after each one.

    Args:
        model: Module to train.
        train_data_loader: Iterable of ``(x, y)`` training batches.
        dev_data_loader: Iterable of ``(x, y)`` batches passed to ``evaluate_model``.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer that updates the parameters of ``model``.
        epochrange: Number of epochs to run.
        batchsize: Unused; kept so existing callers keep working.

    Raises:
        ValueError: If ``train_data_loader`` yields no batches.

    Prints one line per epoch with the mean training loss, the dev loss and
    the dev R^2 score.
    """
    for epoch in range(epochrange):
        losses = []
        model.train()
        for x, y in train_data_loader:
            # Cast the batch with the notebook-level tensor type.
            y = y.type(dtype)
            x = x.type(dtype)
            # Clear gradients before the backward pass so that gradients left
            # over from any earlier backward() call cannot leak into this step.
            optimizer.zero_grad()
            pred = model(x)
            # Drop only a trailing size-1 dimension; a bare squeeze() would also
            # remove the batch dimension of a one-sample batch.
            if pred.dim() > 1:
                pred = pred.squeeze(-1)
            loss = loss_fn(pred, y)
            losses.append(loss.item())
            loss.backward()
            optimizer.step()

        if not losses:
            raise ValueError(
                "train_model received no training batches; the training set is "
                "empty or smaller than the batch size of a loader using drop_last=True"
            )

        # Mean of the per-batch training losses recorded during this epoch
        train_avg_loss = sum(losses)/len(losses)

        dev_avg_loss,_,r2 = evaluate_model(dev_data_loader, model, loss_fn)

        # Display metrics
        display_str = 'Epoch {} '
        display_str += '\tLoss: {:.6f} '
        display_str += '\tLoss (val): {:.6f}'
        display_str += '\tR^2 score: {:.4f}'
        print(display_str.format(epoch, train_avg_loss, dev_avg_loss, r2))

In [56]:
def train_chunk(model, loss_fn, optimizer, nbr_epochs, x_data, y_data, data_split_ratio, batch_size):
    """Train ``model`` on one chunk of data.

    The chunk is split positionally into a training and a dev part according to
    ``data_split_ratio``. Both parts are wrapped in loaders that drop the final
    incomplete batch, and the model is trained for ``nbr_epochs`` epochs.
    """
    train_set, dev_set = splitData(x_data, y_data, data_split_ratio)

    # Both loaders share the same settings; incomplete trailing batches are dropped.
    loader_options = dict(batch_size=batch_size, drop_last=True)
    train_loader = DataLoader(train_set, **loader_options)
    dev_loader = DataLoader(dev_set, **loader_options)

    train_model(
        model,
        train_loader,
        dev_loader,
        loss_fn,
        optimizer,
        nbr_epochs,
        batch_size,
    )

In [None]:
#Start training
# Move the model to its device before building the optimizer so the optimizer
# is created from the parameters as they will be used during training.
model = StockModel(input_size).to(device)
loss_fn = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=lr, eps=0.001)

# Held-out chunks are collected in lists and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0 and copied all data on every call.
test_chunks_x = []
test_chunks_y = []
for i in range(len(files_x)):
    print("Current file: " + files_x[i])
    # Count the lines of the x file (this includes the line read_csv treats as
    # the header); the context manager closes the handle again.
    with open(files_x[i], 'r') as row_source:
        total_rows = sum(1 for _ in row_source)
    # Floor division: a trailing partial chunk is not counted here.
    number_of_loops = total_rows // chunksize
    print("Number of chunks: " + str(number_of_loops))
    # Avoid a ZeroDivisionError in the progress print for files shorter than one chunk.
    progress_total = max(number_of_loops, 1)
    current_loop = 0
    # NOTE(review): passing both dtype= and converters= for 'ts' is likely what
    # triggers the pandas warning visible in this cell's output - confirm.
    # NOTE(review): zip stops at the shorter reader, so the x and y files are
    # assumed to contain the same number of rows - confirm.
    with pd.read_csv(files_x[i], sep=";", dtype="float32", usecols = [j for j in range(input_size)], chunksize=chunksize) as reader_x,\
    pd.read_csv(files_y[i], sep=";", dtype="float32", converters = {'ts': int}, chunksize=chunksize) as reader_y:
        for chunk_x, chunk_y in zip(reader_x, reader_y):
            print("Progress: " + "{:.2f}".format(100 * current_loop/progress_total) + "%")
            # The leading share of the chunks is used for training; the rest is held out.
            if current_loop < data_split_ratio * number_of_loops:
                train_chunk(model, loss_fn, optimizer, nbr_epochs, chunk_x, chunk_y[y_column], data_split_ratio, batch_size)
            else:
                print("Append test data")
                test_chunks_x.append(chunk_x)
                # Only the target column is needed for the test set.
                test_chunks_y.append(chunk_y[y_column])
            current_loop+=1

if not test_chunks_x:
    raise ValueError(
        "No chunk was held out for testing; check chunksize and data_split_ratio "
        "against the size of the input files"
    )

test_data_x = torch.tensor(pd.concat(test_chunks_x).values, dtype=torch.float32)
test_data_y = torch.tensor(pd.concat(test_chunks_y).values, dtype=torch.float32)
test_data = TensorDataset(test_data_x, test_data_y)

Current file: ../python-docker/Swedbank_A/x_Swedbank_A_200_p.csv
Number of chunks: 4


  app.launch_new_instance()


Progress: 0.00%
Epoch 0 	Loss: 69.295307 	Loss (val): 0.065076	R^2 score: 0.9375
Epoch 1 	Loss: 0.399226 	Loss (val): 1.087863	R^2 score: -0.0456
Epoch 2 	Loss: 0.674165 	Loss (val): 0.364510	R^2 score: 0.6497
Epoch 3 	Loss: 0.882123 	Loss (val): 0.146160	R^2 score: 0.8595
Epoch 4 	Loss: 0.973030 	Loss (val): 0.124758	R^2 score: 0.8801
Epoch 5 	Loss: 0.966249 	Loss (val): 0.169450	R^2 score: 0.8371
Epoch 6 	Loss: 0.951875 	Loss (val): 0.266490	R^2 score: 0.7439
Epoch 7 	Loss: 0.932138 	Loss (val): 0.390163	R^2 score: 0.6250
Epoch 8 	Loss: 0.911748 	Loss (val): 0.524433	R^2 score: 0.4960
Epoch 9 	Loss: 0.893053 	Loss (val): 0.709130	R^2 score: 0.3184
Epoch 10 	Loss: 0.870688 	Loss (val): 0.861051	R^2 score: 0.1724
Epoch 11 	Loss: 0.848649 	Loss (val): 0.999828	R^2 score: 0.0390
Epoch 12 	Loss: 0.827177 	Loss (val): 1.062042	R^2 score: -0.0207
Epoch 13 	Loss: 0.808323 	Loss (val): 0.973482	R^2 score: 0.0644
Epoch 14 	Loss: 0.790097 	Loss (val): 0.752422	R^2 score: 0.2768
Epoch 15 	Loss: 

  app.launch_new_instance()


Progress: 25.00%
Epoch 0 	Loss: 0.679074 	Loss (val): 0.488552	R^2 score: 0.4946
Epoch 1 	Loss: 0.627874 	Loss (val): 0.501737	R^2 score: 0.4810
Epoch 2 	Loss: 0.610543 	Loss (val): 0.503239	R^2 score: 0.4794
Epoch 3 	Loss: 0.594560 	Loss (val): 0.495292	R^2 score: 0.4876
Epoch 4 	Loss: 0.578827 	Loss (val): 0.476173	R^2 score: 0.5074
Epoch 5 	Loss: 0.571962 	Loss (val): 0.226938	R^2 score: 0.7652
Epoch 6 	Loss: 0.529304 	Loss (val): 0.243966	R^2 score: 0.7476
Epoch 7 	Loss: 0.508670 	Loss (val): 0.252647	R^2 score: 0.7386
Epoch 8 	Loss: 0.506095 	Loss (val): 0.189357	R^2 score: 0.8041
Epoch 9 	Loss: 0.510699 	Loss (val): 0.065778	R^2 score: 0.9320
Epoch 10 	Loss: 0.504896 	Loss (val): 0.029971	R^2 score: 0.9690
Epoch 11 	Loss: 0.500020 	Loss (val): 0.032689	R^2 score: 0.9662
Epoch 12 	Loss: 0.487612 	Loss (val): 0.052024	R^2 score: 0.9462
Epoch 13 	Loss: 0.475381 	Loss (val): 0.077426	R^2 score: 0.9199
Epoch 14 	Loss: 0.463658 	Loss (val): 0.119151	R^2 score: 0.8767


In [None]:
# Evaluate on the held-out test set using the configured batch size instead of
# a hard-coded batch of 2, which needed far more forward passes for the same data.
# The reported loss is the mean of the per-batch losses.
test_data_loader = DataLoader(test_data, batch_size=batch_size)
loss, preds, r2 = evaluate_model(test_data_loader, model, loss_fn)
print("Test loss: " + str(loss))
print("Test R^2: " + str(r2))

In [None]:
# Overlay the test predictions on the true targets, both indexed by sample position.
axes = plt.gca()
axes.plot(range(len(preds)), preds, label="Predictions")
axes.plot(range(len(test_data_y)), test_data_y.tolist(), label="Target")
axes.legend()

# Optional zoom windows; uncomment one pair of limits to inspect a region.
#axes.set_ylim([146,148])
#axes.set_xlim([260000,261200])
#axes.set_xlim([100000,120000])
#axes.set_xlim([140000,160000])
#axes.set_xlim([540000,560000])
#axes.set_xlim([610000,660000])
#axes.set_xlim([660000,660100])
#axes.set_ylim([157,158])
plt.show()