In [88]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import torch
import torch.nn as nn
import torch.optim as optim
from plotly.offline import iplot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from torch.utils.data import TensorDataset, DataLoader

In [50]:
def plot_dataset(df, title):
    """Render the ``value`` column of ``df`` as an interactive Plotly line chart.

    Args:
        df: DataFrame exposing a ``value`` column; its index is used both as
            the x axis and as the hover text.
        title: Text shown as the figure title.
    """
    # Single semi-transparent line trace for the series.
    trace = go.Scatter(
        x=df.index,
        y=df.value,
        mode="lines",
        name="values",
        marker={},
        text=df.index,
        line={"color": "rgba(0,0,0, 0.3)"},
    )

    figure = {
        "data": [trace],
        "layout": {
            "title": title,
            "xaxis": {"title": "Date", "ticklen": 5, "zeroline": False},
            "yaxis": {"title": "Value", "ticklen": 5, "zeroline": False},
        },
    }
    iplot(figure)

# LSTM

In [51]:
# Load the per-municipality CSV and discard the two 'Unnamed' columns
# (presumably index columns left over from earlier exports -- TODO confirm).
dataframe = (
    pd.read_csv('../Finais/3500105.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
)
dataframe

Unnamed: 0,Município,Código IBGE,Populacao,Total 1ª Dose,Total 2ª Dose,Total Unica,Total Doses Aplicadas,População Vacinada dose1/População Total,População Vacinada/População Total,data,diagnostico_covid19,obito,media_movel_casos,media_movel_obitos,ocupacao_leitos,media_isolamento,Mean.R
0,ADAMANTINA,3500105.0,35111.0,2068.0,543.0,0.0,2611.0,5.889892,1.546524,2021-02-21,11.0,1.0,16.571429,0.571429,79.81,43.00,0.0
1,ADAMANTINA,3500105.0,35111.0,2079.0,547.0,0.0,2626.0,5.921221,1.557916,2021-02-22,24.0,1.0,17.428571,0.571429,80.29,37.25,0.0
2,ADAMANTINA,3500105.0,35111.0,2104.0,608.0,0.0,2712.0,5.992424,1.731651,2021-02-23,13.0,0.0,15.142857,0.571429,80.77,38.25,0.0
3,ADAMANTINA,3500105.0,35111.0,2110.0,615.0,0.0,2725.0,6.009513,1.751588,2021-02-24,10.0,1.0,14.285714,0.571429,79.81,50.25,0.0
4,ADAMANTINA,3500105.0,35111.0,2125.0,739.0,0.0,2864.0,6.052234,2.104753,2021-02-25,7.0,1.0,11.714286,0.714286,77.03,38.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,ADAMANTINA,3500105.0,35111.0,21276.0,7906.0,632.0,29814.0,60.596394,24.317166,2021-07-15,9.0,0.0,10.428571,0.142857,69.18,39.25,0.0
145,ADAMANTINA,3500105.0,35111.0,21559.0,7968.0,632.0,30159.0,61.402410,24.493748,2021-07-16,6.0,0.0,9.857143,0.142857,72.95,37.75,0.0
146,ADAMANTINA,3500105.0,35111.0,21747.0,7997.0,632.0,30376.0,61.937854,24.576344,2021-07-17,3.0,0.0,8.142857,0.000000,68.59,40.25,0.0
147,ADAMANTINA,3500105.0,35111.0,21747.0,7997.0,632.0,30376.0,61.937854,24.576344,2021-07-18,5.0,0.0,7.714286,0.000000,65.58,45.00,0.0


In [52]:
# Index the frame by the 'data' (date) column, parsed as datetimes, and expose
# the daily diagnosed-case count under the generic name 'value' used by the
# plotting and lag helpers.
dataframe = dataframe.set_index(['data'])
dataframe.index = pd.to_datetime(dataframe.index)
dataframe = dataframe.rename(columns={'diagnostico_covid19': 'value'})

# `Index.is_monotonic` was deprecated in pandas 1.5 and removed in 2.0;
# `is_monotonic_increasing` is the equivalent check available in every version.
if not dataframe.index.is_monotonic_increasing:
    dataframe = dataframe.sort_index()

plot_dataset(dataframe, title='Casos')

In [53]:
def generate_time_lags(df, n_lags):
    """Return a copy of ``df`` extended with lagged versions of ``value``.

    Columns ``lag1`` .. ``lag{n_lags}`` hold ``df["value"]`` shifted down by
    that many rows. The first ``n_lags`` rows, which cannot have a complete
    set of lags, are removed. ``df`` itself is not modified.

    Args:
        df: DataFrame containing a ``value`` column.
        n_lags: Number of lag columns to create.

    Returns:
        A new DataFrame with the original columns followed by the lag columns.
    """
    lag_columns = {
        f"lag{step}": df["value"].shift(step) for step in range(1, n_lags + 1)
    }
    lagged = df.assign(**lag_columns)
    # Skip the leading rows whose lag window is incomplete.
    return lagged.iloc[n_lags:]
    
# Number of lagged observations generated per row.
# (`input_dim` is assigned again further down, before the model is built.)
input_dim = 12

df_lags = generate_time_lags(df=dataframe, n_lags=input_dim)
df_lags

Unnamed: 0_level_0,Município,Código IBGE,Populacao,Total 1ª Dose,Total 2ª Dose,Total Unica,Total Doses Aplicadas,População Vacinada dose1/População Total,População Vacinada/População Total,value,...,lag3,lag4,lag5,lag6,lag7,lag8,lag9,lag10,lag11,lag12
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-03-05,ADAMANTINA,3500105.0,35111.0,2972.0,1242.0,0.0,4214.0,8.464584,3.537353,12.0,...,9.0,14.0,3.0,8.0,5.0,7.0,10.0,13.0,24.0,11.0
2021-03-06,ADAMANTINA,3500105.0,35111.0,3277.0,1321.0,0.0,4598.0,9.333257,3.762354,12.0,...,12.0,9.0,14.0,3.0,8.0,5.0,7.0,10.0,13.0,24.0
2021-03-07,ADAMANTINA,3500105.0,35111.0,3277.0,1321.0,0.0,4598.0,9.333257,3.762354,9.0,...,7.0,12.0,9.0,14.0,3.0,8.0,5.0,7.0,10.0,13.0
2021-03-08,ADAMANTINA,3500105.0,35111.0,3305.0,1329.0,0.0,4634.0,9.413004,3.785139,13.0,...,12.0,7.0,12.0,9.0,14.0,3.0,8.0,5.0,7.0,10.0
2021-03-09,ADAMANTINA,3500105.0,35111.0,3395.0,1332.0,0.0,4727.0,9.669334,3.793683,16.0,...,12.0,12.0,7.0,12.0,9.0,14.0,3.0,8.0,5.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-15,ADAMANTINA,3500105.0,35111.0,21276.0,7906.0,632.0,29814.0,60.596394,24.317166,9.0,...,9.0,8.0,15.0,10.0,12.0,20.0,5.0,14.0,12.0,12.0
2021-07-16,ADAMANTINA,3500105.0,35111.0,21559.0,7968.0,632.0,30159.0,61.402410,24.493748,6.0,...,10.0,9.0,8.0,15.0,10.0,12.0,20.0,5.0,14.0,12.0
2021-07-17,ADAMANTINA,3500105.0,35111.0,21747.0,7997.0,632.0,30376.0,61.937854,24.576344,3.0,...,12.0,10.0,9.0,8.0,15.0,10.0,12.0,20.0,5.0,14.0
2021-07-18,ADAMANTINA,3500105.0,35111.0,21747.0,7997.0,632.0,30376.0,61.937854,24.576344,5.0,...,9.0,12.0,10.0,9.0,8.0,15.0,10.0,12.0,20.0,5.0


In [54]:
# Derive calendar features from the datetime index in a single `assign` call.
df_features = dataframe.assign(
    day=dataframe.index.day,
    month=dataframe.index.month,
    day_of_week=dataframe.index.dayofweek,
    week_of_year=dataframe.index.isocalendar().week,
)
df_features

Unnamed: 0_level_0,Município,Código IBGE,Populacao,Total 1ª Dose,Total 2ª Dose,Total Unica,Total Doses Aplicadas,População Vacinada dose1/População Total,População Vacinada/População Total,value,obito,media_movel_casos,media_movel_obitos,ocupacao_leitos,media_isolamento,Mean.R,day,month,day_of_week,week_of_year
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2021-02-21,ADAMANTINA,3500105.0,35111.0,2068.0,543.0,0.0,2611.0,5.889892,1.546524,11.0,1.0,16.571429,0.571429,79.81,43.00,0.0,21,2,6,7
2021-02-22,ADAMANTINA,3500105.0,35111.0,2079.0,547.0,0.0,2626.0,5.921221,1.557916,24.0,1.0,17.428571,0.571429,80.29,37.25,0.0,22,2,0,8
2021-02-23,ADAMANTINA,3500105.0,35111.0,2104.0,608.0,0.0,2712.0,5.992424,1.731651,13.0,0.0,15.142857,0.571429,80.77,38.25,0.0,23,2,1,8
2021-02-24,ADAMANTINA,3500105.0,35111.0,2110.0,615.0,0.0,2725.0,6.009513,1.751588,10.0,1.0,14.285714,0.571429,79.81,50.25,0.0,24,2,2,8
2021-02-25,ADAMANTINA,3500105.0,35111.0,2125.0,739.0,0.0,2864.0,6.052234,2.104753,7.0,1.0,11.714286,0.714286,77.03,38.00,0.0,25,2,3,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-15,ADAMANTINA,3500105.0,35111.0,21276.0,7906.0,632.0,29814.0,60.596394,24.317166,9.0,0.0,10.428571,0.142857,69.18,39.25,0.0,15,7,3,28
2021-07-16,ADAMANTINA,3500105.0,35111.0,21559.0,7968.0,632.0,30159.0,61.402410,24.493748,6.0,0.0,9.857143,0.142857,72.95,37.75,0.0,16,7,4,28
2021-07-17,ADAMANTINA,3500105.0,35111.0,21747.0,7997.0,632.0,30376.0,61.937854,24.576344,3.0,0.0,8.142857,0.000000,68.59,40.25,0.0,17,7,5,28
2021-07-18,ADAMANTINA,3500105.0,35111.0,21747.0,7997.0,632.0,30376.0,61.937854,24.576344,5.0,0.0,7.714286,0.000000,65.58,45.00,0.0,18,7,6,28


In [55]:
def onehot_encode_pd(df, col_name):
    """Append one-hot indicator columns for each listed column.

    Args:
        df: Source DataFrame; it is not modified.
        col_name: Iterable of column names to encode. Each name is also used
            as the prefix of the indicator columns it produces.

    Returns:
        A DataFrame with the original columns (including the encoded ones)
        followed by the indicator columns, in the order of ``col_name``.
    """
    encoded = df
    for column in col_name:
        indicator_cols = pd.get_dummies(encoded[column], prefix=column)
        encoded = pd.concat([encoded, indicator_cols], axis=1)
    return encoded

# One-hot encode the calendar columns; the original columns are kept as well.
df_features = onehot_encode_pd(
    df_features,
    col_name=['month', 'day', 'day_of_week', 'week_of_year'],
)
df_features.columns

Index(['Município', 'Código IBGE', 'Populacao', 'Total 1ª Dose',
       'Total 2ª Dose', 'Total Unica', 'Total Doses Aplicadas',
       'População Vacinada dose1/População Total',
       'População Vacinada/População Total', 'value', 'obito',
       'media_movel_casos', 'media_movel_obitos', 'ocupacao_leitos',
       'media_isolamento', 'Mean.R', 'day', 'month', 'day_of_week',
       'week_of_year', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6',
       'day_7', 'day_8', 'day_9', 'day_10', 'day_11', 'day_12', 'day_13',
       'day_14', 'day_15', 'day_16', 'day_17', 'day_18', 'day_19', 'day_20',
       'day_21', 'day_22', 'day_23', 'day_24', 'day_25', 'day_26', 'day_27',
       'day_28', 'day_29', 'day_30', 'day_31', 'day_of_week_0',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6', 'week_of_year_7', 'week_of_year_8',
       'week_of_year_9', '

In [56]:
# Make the date features cyclical
def generate_cyclical_features(df, col_name, period, start_num=0):
    """Replace a periodic column by its sine/cosine encoding.

    Args:
        df: Source DataFrame; it is not modified.
        col_name: Name of the column to encode.
        period: Length of one full cycle, in the units of the column.
        start_num: Value of the column that maps to angle zero.

    Returns:
        A new DataFrame without ``col_name`` and with ``sin_<col_name>`` and
        ``cos_<col_name>`` appended.
    """
    angle = 2*np.pi*(df[col_name]-start_num)/period
    encoded = df.assign(**{
        f'sin_{col_name}': np.sin(angle),
        f'cos_{col_name}': np.cos(angle),
    })
    return encoded.drop(columns=[col_name])

# Encode each calendar column on the unit circle; the raw column is dropped.
df_features = generate_cyclical_features(df_features, 'day_of_week', period=7, start_num=0)
df_features = generate_cyclical_features(df_features, 'month', period=12, start_num=1)
df_features = generate_cyclical_features(df_features, 'week_of_year', period=52, start_num=0)
df_features

Unnamed: 0_level_0,Município,Código IBGE,Populacao,Total 1ª Dose,Total 2ª Dose,Total Unica,Total Doses Aplicadas,População Vacinada dose1/População Total,População Vacinada/População Total,value,...,week_of_year_26,week_of_year_27,week_of_year_28,week_of_year_29,sin_day_of_week,cos_day_of_week,sin_month,cos_month,sin_week_of_year,cos_week_of_year
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-02-21,ADAMANTINA,3500105.0,35111.0,2068.0,543.0,0.0,2611.0,5.889892,1.546524,11.0,...,0,0,0,0,-0.781831,0.623490,5.000000e-01,0.866025,0.748511,0.663123
2021-02-22,ADAMANTINA,3500105.0,35111.0,2079.0,547.0,0.0,2626.0,5.921221,1.557916,24.0,...,0,0,0,0,0.000000,1.000000,5.000000e-01,0.866025,0.822984,0.568065
2021-02-23,ADAMANTINA,3500105.0,35111.0,2104.0,608.0,0.0,2712.0,5.992424,1.731651,13.0,...,0,0,0,0,0.781831,0.623490,5.000000e-01,0.866025,0.822984,0.568065
2021-02-24,ADAMANTINA,3500105.0,35111.0,2110.0,615.0,0.0,2725.0,6.009513,1.751588,10.0,...,0,0,0,0,0.974928,-0.222521,5.000000e-01,0.866025,0.822984,0.568065
2021-02-25,ADAMANTINA,3500105.0,35111.0,2125.0,739.0,0.0,2864.0,6.052234,2.104753,7.0,...,0,0,0,0,0.433884,-0.900969,5.000000e-01,0.866025,0.822984,0.568065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-15,ADAMANTINA,3500105.0,35111.0,21276.0,7906.0,632.0,29814.0,60.596394,24.317166,9.0,...,0,0,1,0,0.433884,-0.900969,1.224647e-16,-1.000000,-0.239316,-0.970942
2021-07-16,ADAMANTINA,3500105.0,35111.0,21559.0,7968.0,632.0,30159.0,61.402410,24.493748,6.0,...,0,0,1,0,-0.433884,-0.900969,1.224647e-16,-1.000000,-0.239316,-0.970942
2021-07-17,ADAMANTINA,3500105.0,35111.0,21747.0,7997.0,632.0,30376.0,61.937854,24.576344,3.0,...,0,0,1,0,-0.974928,-0.222521,1.224647e-16,-1.000000,-0.239316,-0.970942
2021-07-18,ADAMANTINA,3500105.0,35111.0,21747.0,7997.0,632.0,30376.0,61.937854,24.576344,5.0,...,0,0,1,0,-0.781831,0.623490,1.224647e-16,-1.000000,-0.239316,-0.970942


In [58]:
test_ratio = 0.2

# Predictors: every column of the lagged frame except the target and the
# three municipality columns dropped below.
# NOTE(review): X is built from `df_lags`, so the calendar / one-hot / cyclical
# columns of `df_features` are not part of the model input -- confirm intended.
X = df_lags.drop(columns=['value', 'Município', 'Código IBGE', 'Populacao'])
# Target kept as a one-column DataFrame (2-D), as the scaler expects.
y = df_lags[['value']].copy()

# Share of the remaining (non-test) rows that makes the validation set the
# same size as the test set.
val_ratio = test_ratio / (1 - test_ratio)

# shuffle=False keeps chronological order: train, then validation, then test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_ratio, shuffle=False
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=val_ratio, shuffle=False
)

In [75]:
# Features and target each get their own scaler. Re-fitting a single scaler
# object on the target would overwrite the fitted feature ranges, making it
# impossible to transform new feature rows later.
feature_scaler = MinMaxScaler()
X_train_arr = feature_scaler.fit_transform(X_train)
X_val_arr = feature_scaler.transform(X_val)
X_test_arr = feature_scaler.transform(X_test)

# `scaler` ends up fitted on the training target only, so it can be used to
# map scaled values back to the original target scale.
scaler = MinMaxScaler()
y_train_arr = scaler.fit_transform(y_train)
y_val_arr = scaler.transform(y_val)
y_test_arr = scaler.transform(y_test)

In [28]:
###### MAYBE DELETE
def get_scaler(scaler):
    """Instantiate a scikit-learn scaler from its short name.

    Args:
        scaler: One of ``"minmax"``, ``"standard"``, ``"maxabs"`` or
            ``"robust"`` (case-insensitive).

    Returns:
        A new, unfitted scaler instance of the requested kind.

    Raises:
        ValueError: If ``scaler`` is not one of the supported names.
    """
    scalers = {
        "minmax": MinMaxScaler,
        "standard": StandardScaler,
        "maxabs": MaxAbsScaler,
        "robust": RobustScaler,
    }
    key = scaler.lower()
    # Fail with a clear message instead of "'NoneType' object is not callable".
    if key not in scalers:
        raise ValueError(
            f"Unknown scaler {scaler!r}; expected one of {sorted(scalers)}"
        )
    return scalers[key]()
    
# NOTE(review): this rebinds the module-level name `scaler` to a new, unfitted
# RobustScaler. When the cells run top to bottom, the MinMaxScaler that was
# fitted under the same name in the scaling cell is no longer reachable
# through `scaler` afterwards -- confirm this cell is still wanted.
scaler = get_scaler('robust')

In [76]:
batch_size = 64

# Wrap each scaled array as a float tensor.
train_features, train_targets = torch.Tensor(X_train_arr), torch.Tensor(y_train_arr)
val_features, val_targets = torch.Tensor(X_val_arr), torch.Tensor(y_val_arr)
test_features, test_targets = torch.Tensor(X_test_arr), torch.Tensor(y_test_arr)

# Pair features with targets, one dataset per split.
train = TensorDataset(train_features, train_targets)
val = TensorDataset(val_features, val_targets)
test = TensorDataset(test_features, test_targets)

# shuffle=False preserves the chronological order of the samples.
# NOTE(review): with drop_last=True a final batch smaller than `batch_size`
# is discarded, so a split with fewer than `batch_size` rows yields no batches
# at all -- check the sizes of the validation and test splits.
train_loader = DataLoader(
    train, batch_size=batch_size, shuffle=False, drop_last=True
)
val_loader = DataLoader(
    val, batch_size=batch_size, shuffle=False, drop_last=True
)
test_loader = DataLoader(
    test, batch_size=batch_size, shuffle=False, drop_last=True
)
# Single-sample loader used for prediction on the test split.
test_loader_one = DataLoader(
    test, batch_size=1, shuffle=False, drop_last=True
)

In [81]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Number of values predicted per sample.
        dropout_prob: Dropout probability between stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):
        super().__init__()

        # Kept as attributes because forward() needs them to size the states.
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and only triggers a warning,
        # so it is disabled in that case.
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            layer_dim,
            batch_first=True,
            dropout=dropout_prob if layer_dim > 1 else 0.0,
        )

        # Maps the last hidden output to the prediction.
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run the network on a batch.

        Args:
            x: Tensor of shape (batch, seq_len, input_dim).

        Returns:
            Tensor of shape (batch, output_dim).
        """
        # Zero initial hidden/cell states, created on the same device and with
        # the same dtype as the input so the model also works on GPU or in
        # double precision. Fresh tensors carry no autograd history, so no
        # explicit detach is needed.
        h0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)
        c0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)

        out, _ = self.lstm(x, (h0, c0))

        # Keep only the output of the last time step: (batch, hidden_dim).
        last_step = out[:, -1, :]

        return self.fc(last_step)

In [82]:

class Optimization:
    """Bundle a model, a loss function and an optimizer with training helpers.

    ``train``, ``evaluate`` and ``plot_losses`` are defined inside the class
    body so that they are bound methods (``opt.train(...)``); as free
    functions in separate cells they would not be reachable through an
    instance.
    """

    def __init__(self, model, loss_fn, optimizer):
        """Store the collaborators and start with empty loss histories.

        Args:
            model: ``torch.nn.Module`` to optimise.
            loss_fn: Callable invoked as ``loss_fn(prediction, target)``.
            optimizer: ``torch.optim`` optimizer built on ``model``'s parameters.
        """
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_losses = []  # mean training loss, one entry per epoch
        self.val_losses = []  # mean validation loss, one entry per epoch

    def _model_device(self):
        """Return the device of the model's parameters (CPU if it has none)."""
        first_param = next(self.model.parameters(), None)
        if first_param is None:
            return torch.device("cpu")
        return first_param.device

    @staticmethod
    def _mean_loss(losses):
        """Return the mean of ``losses``, or NaN when no batch was processed."""
        return float(np.mean(losses)) if losses else float("nan")

    def train_step(self, x, y):
        """Run one optimisation step on a single batch and return its loss.

        Args:
            x: Input batch passed to the model.
            y: Target batch.

        Returns:
            The batch loss as a Python float.
        """
        self.model.train()

        yhat = self.model(x)

        # Loss modules expect (prediction, target); the order matters for
        # losses that are not symmetric in their arguments.
        loss = self.loss_fn(yhat, y)
        loss.backward()

        # Update the parameters, then clear gradients for the next batch.
        self.optimizer.step()
        self.optimizer.zero_grad()

        return loss.item()

    def train(self, train_loader, val_loader, batch_size=64, n_epochs=50, n_features=1):
        """Train the model, track per-epoch losses and save the final weights.

        Args:
            train_loader: Iterable of ``(features, targets)`` training batches.
            val_loader: Iterable of ``(features, targets)`` validation batches.
            batch_size: Kept for backward compatibility; the batch dimension
                is read from each batch so partial batches are supported.
            n_epochs: Number of passes over ``train_loader``.
            n_features: Size of the last dimension each feature batch is
                reshaped to: ``(batch, -1, n_features)``.

        Side effects:
            Appends to ``train_losses`` / ``val_losses``, prints progress for
            the first 10 epochs and every 50th epoch, and writes the model's
            ``state_dict`` under the ``models`` directory.
        """
        device = self._model_device()

        # File name built from the model's class name and a timestamp without
        # spaces or colons, so it is a valid file name on every platform.
        os.makedirs("models", exist_ok=True)
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        model_path = os.path.join("models", f"{type(self.model).__name__}_{timestamp}")

        for epoch in range(1, n_epochs + 1):
            batch_losses = []
            for x_batch, y_batch in train_loader:
                x_batch = x_batch.view([x_batch.size(0), -1, n_features]).to(device)
                y_batch = y_batch.to(device)
                batch_losses.append(self.train_step(x_batch, y_batch))
            training_loss = self._mean_loss(batch_losses)
            self.train_losses.append(training_loss)

            self.model.eval()
            batch_val_losses = []
            with torch.no_grad():
                for x_val, y_val in val_loader:
                    x_val = x_val.view([x_val.size(0), -1, n_features]).to(device)
                    y_val = y_val.to(device)
                    yhat = self.model(x_val)
                    batch_val_losses.append(self.loss_fn(yhat, y_val).item())
            # NaN here means the validation loader produced no batches.
            validation_loss = self._mean_loss(batch_val_losses)
            self.val_losses.append(validation_loss)

            if epoch <= 10 or epoch % 50 == 0:
                print(
                    f"[{epoch}/{n_epochs}] Training loss: {training_loss:.4f}\t Validation loss: {validation_loss:.4f}"
                )

        torch.save(self.model.state_dict(), model_path)

    def evaluate(self, test_loader, batch_size=1, n_features=1):
        """Predict every batch of ``test_loader`` without tracking gradients.

        Args:
            test_loader: Iterable of ``(features, targets)`` batches.
            batch_size: Kept for backward compatibility; the batch dimension
                is read from each batch.
            n_features: Size of the last dimension each feature batch is
                reshaped to: ``(batch, -1, n_features)``.

        Returns:
            ``(predictions, values)``: two lists of NumPy arrays, one entry
            per batch, holding the model outputs and the matching targets.
        """
        device = self._model_device()
        predictions = []
        values = []

        self.model.eval()
        with torch.no_grad():
            for x_test, y_test in test_loader:
                x_test = x_test.view([x_test.size(0), -1, n_features]).to(device)
                yhat = self.model(x_test)
                # Tensors must be on the CPU before conversion to NumPy.
                predictions.append(yhat.detach().cpu().numpy())
                values.append(y_test.detach().cpu().numpy())

        return predictions, values

    def plot_losses(self):
        """Plot the recorded training and validation loss per epoch."""
        plt.plot(self.train_losses, label="Training loss")
        plt.plot(self.val_losses, label="Validation loss")
        plt.legend()
        plt.title("Losses")
        plt.show()
        plt.close()

In [94]:
def get_model(model, model_params):
    """Build a model from its short name.

    Args:
        model: Name of the architecture (case-insensitive); currently only
            ``"lstm"`` is registered.
        model_params: Keyword arguments forwarded to the model constructor.

    Returns:
        A new instance of the requested model class.

    Raises:
        ValueError: If ``model`` is not a registered name.
    """
    models = {
        "lstm": LSTMModel,
    }
    key = model.lower()
    # Fail with a clear message instead of "'NoneType' object is not callable".
    if key not in models:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(models)}"
        )
    return models[key](**model_params)

# --- Hyper-parameters ---
# One model input per predictor column of the training frame.
input_dim = len(X_train.columns)
output_dim = 1
hidden_dim = 64
layer_dim = 3
batch_size = 64
dropout = 0.2
n_epochs = 100
learning_rate = 1e-3
weight_decay = 1e-6

# Keyword arguments for the model constructor.
model_params = dict(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    layer_dim=layer_dim,
    output_dim=output_dim,
    dropout_prob=dropout,
)

# --- Model, loss and optimizer ---
model = get_model('lstm', model_params)

loss_fn = nn.MSELoss(reduction="mean")
optimizer = optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=weight_decay
)

# --- Training ---
opt = Optimization(model=model, loss_fn=loss_fn, optimizer=optimizer)
opt.train(
    train_loader,
    val_loader,
    batch_size=batch_size,
    n_epochs=n_epochs,
    n_features=input_dim,
)
opt.plot_losses()

# --- Prediction on the test split, one sample at a time ---
predictions, values = opt.evaluate(
    test_loader_one, batch_size=1, n_features=input_dim
)

AttributeError: module 'torch' has no attribute '_six'