In [1]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import torch
from torch import nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datetime import timedelta
import warnings
from preprocessing import get_consecutive
from sklearn.cluster import KMeans
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


# Preprocesamiento de los datos

In [2]:
# abriendo datos
with open('./data/data_product_02.pk', 'rb') as f:
    data = pickle.load(f)
f.close()
data_std = (data - data.mean())/data.std()
data.head()

Unnamed: 0_level_0,wind_dir,wind_vel,rel_hum,precipitations,temp,mp_25
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-03-24 14:00:00,210.0,3.55,49.0,0.0,16.15,8.5
2014-03-24 16:00:00,211.5,4.15,47.5,0.0,17.0,5.5
2014-03-24 18:00:00,243.5,3.5,56.0,0.0,14.85,16.0
2014-03-24 20:00:00,269.5,1.6,73.5,0.0,11.4,8.5
2014-03-24 22:00:00,300.5,0.799999,86.5,0.0,9.9,14.0


In [3]:
data_with_clusters = data_std.copy()
data_with_clusters['mp_25'] = data['mp_25']
for i in range(2, 11):
    kmeans = KMeans(i)
    kmeans.fit(data_with_clusters)
    data_with_clusters['{}_clusters'.format(i)] = kmeans.labels_
data_with_clusters.head()    

Unnamed: 0_level_0,wind_dir,wind_vel,rel_hum,precipitations,temp,mp_25,2_clusters,3_clusters,4_clusters,5_clusters,6_clusters,7_clusters,8_clusters,9_clusters,10_clusters
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2014-03-24 14:00:00,0.129948,1.434579,-1.733582,-0.207012,1.01723,8.5,1,0,3,0,5,0,1,0,0
2014-03-24 16:00:00,0.14596,1.881879,-1.811555,-0.207012,1.210544,5.5,1,0,3,0,5,0,1,0,0
2014-03-24 18:00:00,0.487556,1.397304,-1.369708,-0.207012,0.721575,16.0,1,0,3,0,5,0,1,0,9
2014-03-24 20:00:00,0.765102,-0.019146,-0.460023,-0.207012,-0.06305,8.5,1,0,3,0,5,0,1,0,0
2014-03-24 22:00:00,1.096023,-0.615547,0.215742,-0.207012,-0.404191,14.0,1,0,3,0,5,0,1,0,9


# Generando feature vectors pero separando la parte timeserie

In [4]:
X, y = get_consecutive(data, 128)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
y_train = [i.mp_25 for i in y_train]
y_test = [i.mp_25 for i in y_test]

Ahora deberíamos tener ``X_train`` que tiene todas las features menos la serie de tiempo justo anterior a su output, esa serie está contenida en ``mp_25_train`` para poder darsela a la red de manera independiente y separar la parte temporal de las características

# Parte con Torch

In [5]:
128*15

1920

In [6]:
input_size = 6
sequence_len = 128
num_layers = 1
hidden_size =1000
learning_rate = 0.001
batch_size = 64
num_epochs = 2

In [7]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        # Batch x time sequence x time features
        self.fc1 = nn.Linear(hidden_size * sequence_len, num_classes)


        self.name = 'rnn'
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        
        # forward
        out, _ = self.rnn(x, h0)
        out = F.relu(out.reshape(out.shape[0], -1))
        out = self.fc1(out)
        return out

In [8]:
model = RNN(input_size, hidden_size, num_layers, 1)

In [9]:
class MP25Dataset(Dataset):
    """Dataset para el proyecto"""
    def __init__(self, X, y, n):
        self.X = [torch.from_numpy(x.values.reshape(1, n, 6)) for x in X]
        self.y = [torch.from_numpy(i.reshape(-1)) for i in y]
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        sample = self.X[idx], self.y[idx]
        return sample
    
    def __repr__(self):
        return 'MP25Dataset'

In [10]:
training_set = MP25Dataset(X_train, y_train, 128)
test_set = MP25Dataset(X_test, y_test, 128)

In [11]:
def test(model, x_test, y_test, loss_function, batch_size=365*24):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for x_i, y_i in zip(x_test, y_test):
            # predict data using the given model
            prediction = model(x_i)
            # Compute loss
            total_loss += loss_function(prediction, y_i).item()

    print(total_loss)

    return total_loss

def train(model, train_set, optimizer, loss_function, epochs=5, batch_size=365*24):
    model.train()
    total_loss = 0

    for i in range(epochs):
        # each epoch
        epoch_loss = 0
        best_test_loss = float('inf')

        
        for j in range(len(train_set)):
        # get the inputs; data is a list of [inputs, labels]
            x_i, y_i = train_set[j]

            x_i = x_i.to(device).float()
            y_i = y_i.to(device).float()
            optimizer.zero_grad()
            y_pred = model(x_i)

            loss = loss_function(y_pred, y_i)
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()

        # Save results from the best trained model
        if epoch_loss < best_test_loss:
            best_test_loss = epoch_loss
            torch.save(model.state_dict(), '{}.pt'.format(model.name))
            
        total_loss += epoch_loss
        print(f'epoch: {i} loss: {epoch_loss:10.8f}')

    print(f'Average loss: {total_loss/len(train_set):4f}')
    return total_loss

In [12]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')
rnn = RNN(input_size, hidden_size, num_layers, 1)
rnn.to(device)

RNN(
  (rnn): RNN(6, 1000, batch_first=True)
  (fc1): Linear(in_features=128000, out_features=1, bias=True)
)

In [13]:
loss_func = torch.nn.MSELoss().to(device)
opt = optim.Adam(rnn.parameters())

In [14]:
train(rnn, training_set, opt, loss_func)

epoch: 0 loss: 44736195.15032203
epoch: 1 loss: 41284824.10247315
epoch: 2 loss: 40281581.25878270
epoch: 3 loss: 39773577.57589075
epoch: 4 loss: 40369754.19670726
Average loss: 13755.725765


206445932.28417587

In [15]:
n = 100
rnn(training_set[n][0].to(device).float()), training_set[n][1]

(tensor([[26.5034]], device='cuda:0', grad_fn=<AddmmBackward>),
 tensor([11.5205], dtype=torch.float64))

In [16]:
input_size = 24
sequence_len = 128
num_layers = 1
hidden_size =1000
learning_rate = 0.001
batch_size = 64
num_epochs = 2

rnn2 = RNN(input_size, hidden_size, num_layers, 10)
rnn2.to(device)
loss_func = torch.nn.CrossEntropyLoss().to(device)
opt = optim.Adam(rnn2.parameters())

In [17]:
def test(model, x_test, y_test, loss_function, batch_size=365*24):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for x_i, y_i in zip(x_test, y_test):
            # predict data using the given model
            prediction = model(x_i)
            # Compute loss
            total_loss += loss_function(prediction, y_i).item()

    print(total_loss)

    return total_loss

def train(model, train_set, optimizer, loss_function, epochs=5, batch_size=365*24):
    model.train()
    total_loss = 0

    for i in range(epochs):
        # each epoch
        epoch_loss = 0
        best_test_loss = float('inf')

        
        for j in range(len(train_set)):
        # get the inputs; data is a list of [inputs, labels]
            x_i, y_i = train_set[j]

            x_i = x_i.to(device).float()
            y_i = y_i.to(device).float()
            optimizer.zero_grad()
            y_pred = model(x_i)

            loss = loss_function(y_pred.view(1, 10), torch.argmax(y_i).view(-1))
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()

        # Save results from the best trained model
        if epoch_loss < best_test_loss:
            best_test_loss = epoch_loss
            torch.save(model.state_dict(), '{}.pt'.format(model.name))
            
        total_loss += epoch_loss
        print(f'epoch: {i} loss: {epoch_loss:10.8f}')

    print(f'Average loss: {total_loss/len(train_set):4f}')
    return total_loss



In [18]:
from sklearn.preprocessing import KBinsDiscretizer

discretizador = KBinsDiscretizer(10)
mp_25_vectors = discretizador.fit_transform(data[['mp_25']]).toarray()
X, y = get_consecutive(data_with_clusters, 128)
for i, x in enumerate(X):
    yy = discretizador.transform(x[['mp_25']]).toarray()
    x.drop('mp_25', axis=1, inplace=True)
    for j in range(10):
        x['onehot_{}'.format(j)] = yy[:, j]
        
y = [discretizador.transform(i.mp_25.reshape(-1, 1)).toarray() for i in y]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [19]:

class MP25Dataset2(Dataset):
    """Dataset para el proyecto"""
    def __init__(self, X, y, n):
        self.X = [torch.from_numpy(x.values.reshape(1, n, 24)) for x in X]
        self.y = [torch.from_numpy(i.reshape(-1)) for i in y]
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        sample = self.X[idx], self.y[idx]
        return sample
    
    def __repr__(self):
        return 'MP25Dataset'

In [20]:
training_set2 = MP25Dataset2(X_train, y_train, 128)
test_set2 = MP25Dataset2(X_test, y_test, 128)

In [None]:
train(rnn2, training_set2, opt, loss_func)

epoch: 0 loss: 530782.18175894
epoch: 1 loss: 453484.54944919
