In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import iplot

In [53]:
def plot_dataset(df, title):
    
    data = []
    value = go.Scatter(
        x=df.index,
        y=df.value,
        mode="lines",
        name="values",
        marker=dict(),
        text=df.index,
        line=dict(color="rgba(0,0,0, 0.3)"),
    )
    data.append(value)

    layout = dict(
        title=title,
        xaxis=dict(title="Date", ticklen=5, zeroline=False),
        yaxis=dict(title="Value", ticklen=5, zeroline=False),
    )

    fig = dict(data=data, layout=layout)
    iplot(fig)

# LSTM

In [54]:
dataframe = pd.read_csv('../normalizados/dados/3500105.csv')
dataframe

Unnamed: 0.1,Unnamed: 0,Município,Código IBGE,Populacao,Total 1ª Dose,Total 2ª Dose,Total Unica,Total Doses Aplicadas,População Vacinada dose1/População Total,População Vacinada/População Total,data,diagnostico_covid19,obito,media_movel_casos,media_movel_obitos,ocupacao_leitos,media_isolamento,Mean.R
0,0,ADAMANTINA,3500105.0,35111.0,-1.388394,-1.677764,-0.250856,-1.497988,-1.388394,-1.658040,2021-02-21,-0.544871,0.590619,0.427559,-0.043846,-1.657499,0.800494,0.0
1,1,ADAMANTINA,3500105.0,35111.0,-1.386443,-1.676090,-0.250856,-1.496095,-1.386443,-1.656399,2021-02-22,1.488685,0.590619,0.620413,-0.043846,-1.590281,-1.072060,0.0
2,2,ADAMANTINA,3500105.0,35111.0,-1.382008,-1.650562,-0.250856,-1.485242,-1.382008,-1.631381,2021-02-23,-0.232016,-0.806242,0.106135,-0.043846,-1.523063,-0.746399,0.0
3,3,ADAMANTINA,3500105.0,35111.0,-1.380944,-1.647632,-0.250856,-1.483601,-1.380944,-1.628510,2021-02-24,-0.701299,0.590619,-0.086720,-0.043846,-1.657499,3.161540,0.0
4,4,ADAMANTINA,3500105.0,35111.0,-1.378283,-1.595739,-0.250856,-1.466059,-1.378283,-1.577653,2021-02-25,-1.170581,0.590619,-0.665283,0.391689,-2.046804,-0.827814,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,144,ADAMANTINA,3500105.0,35111.0,2.018727,1.403590,4.468007,1.935050,2.018727,1.620987,2021-07-15,-0.857726,-0.806242,-0.954564,-1.350450,-3.146099,-0.420737,0.0
145,145,ADAMANTINA,3500105.0,35111.0,2.068926,1.429536,4.468007,1.978589,2.068926,1.646415,2021-07-16,-1.327008,-0.806242,-1.083134,-1.350450,-2.618157,-0.909230,0.0
146,146,ADAMANTINA,3500105.0,35111.0,2.102273,1.441673,4.468007,2.005975,2.102273,1.658309,2021-07-17,-1.796290,-0.806242,-1.468843,-1.785985,-3.228721,-0.095076,0.0
147,147,ADAMANTINA,3500105.0,35111.0,2.102273,1.441673,4.468007,2.005975,2.102273,1.658309,2021-07-18,-1.483435,-0.806242,-1.565270,-1.785985,-3.650234,1.451817,0.0


In [55]:
dataframe = dataframe.set_index(['data'])
dataframe.index = pd.to_datetime(dataframe.index)
dataframe = dataframe.rename(columns={'diagnostico_covid19': 'value'})
if not dataframe.index.is_monotonic:
    dataframe = dataframe.sort_index()
    
plot_dataset(dataframe, title='Casos')

In [56]:
def generate_time_lags(df, n_lags):
    df_n = df.copy()
    for n in range(1, n_lags + 1):
        df_n[f"lag{n}"] = df_n["value"].shift(n)
    df_n = df_n.iloc[n_lags:]
    return df_n
    
input_dim = 12

df_generated = generate_time_lags(dataframe, input_dim)
df_generated

Unnamed: 0_level_0,Unnamed: 0,Município,Código IBGE,Populacao,Total 1ª Dose,Total 2ª Dose,Total Unica,Total Doses Aplicadas,População Vacinada dose1/População Total,População Vacinada/População Total,...,lag3,lag4,lag5,lag6,lag7,lag8,lag9,lag10,lag11,lag12
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-03-05,12,ADAMANTINA,3500105.0,35111.0,-1.228042,-1.385238,-0.250856,-1.295688,-1.228042,-1.371355,...,-0.857726,-0.075589,-1.796290,-1.014153,-1.483435,-1.170581,-0.701299,-0.232016,1.488685,-0.544871
2021-03-06,13,ADAMANTINA,3500105.0,35111.0,-1.173941,-1.352177,-0.250856,-1.247227,-1.173941,-1.338955,...,-0.388444,-0.857726,-0.075589,-1.796290,-1.014153,-1.483435,-1.170581,-0.701299,-0.232016,1.488685
2021-03-07,14,ADAMANTINA,3500105.0,35111.0,-1.173941,-1.352177,-0.250856,-1.247227,-1.173941,-1.338955,...,-1.170581,-0.388444,-0.857726,-0.075589,-1.796290,-1.014153,-1.483435,-1.170581,-0.701299,-0.232016
2021-03-08,15,ADAMANTINA,3500105.0,35111.0,-1.168974,-1.348829,-0.250856,-1.242684,-1.168974,-1.335674,...,-0.388444,-1.170581,-0.388444,-0.857726,-0.075589,-1.796290,-1.014153,-1.483435,-1.170581,-0.701299
2021-03-09,16,ADAMANTINA,3500105.0,35111.0,-1.153010,-1.347574,-0.250856,-1.230947,-1.153010,-1.334443,...,-0.388444,-0.388444,-1.170581,-0.388444,-0.857726,-0.075589,-1.796290,-1.014153,-1.483435,-1.170581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-15,144,ADAMANTINA,3500105.0,35111.0,2.018727,1.403590,4.468007,1.935050,2.018727,1.620987,...,-0.857726,-1.014153,0.080838,-0.701299,-0.388444,0.862975,-1.483435,-0.075589,-0.388444,-0.388444
2021-07-16,145,ADAMANTINA,3500105.0,35111.0,2.068926,1.429536,4.468007,1.978589,2.068926,1.646415,...,-0.701299,-0.857726,-1.014153,0.080838,-0.701299,-0.388444,0.862975,-1.483435,-0.075589,-0.388444
2021-07-17,146,ADAMANTINA,3500105.0,35111.0,2.102273,1.441673,4.468007,2.005975,2.102273,1.658309,...,-0.388444,-0.701299,-0.857726,-1.014153,0.080838,-0.701299,-0.388444,0.862975,-1.483435,-0.075589
2021-07-18,147,ADAMANTINA,3500105.0,35111.0,2.102273,1.441673,4.468007,2.005975,2.102273,1.658309,...,-0.857726,-0.388444,-0.701299,-0.857726,-1.014153,0.080838,-0.701299,-0.388444,0.862975,-1.483435


In [76]:
df_features = (
                dataframe
                .assign(hour = dataframe.index.hour)
                .assign(day = dataframe.index.day)
                .assign(month = dataframe.index.month)
                .assign(day_of_week = dataframe.index.dayofweek)
                .assign(week_of_year = dataframe.index.isocalendar().week)
              )

In [88]:
def onehot_encode_pd(df, col_name):
    dummies = pd.get_dummies(df[col_name], prefix=col_name)
    return pd.concat([df, dummies], axis=1).drop(columns=[col_name])

df_features = onehot_encode_pd(df_features, ['month','day','day_of_week','week_of_year'])

ValueError: Length of 'prefix' (4) did not match the length of the columns being encoded (0).

In [87]:
onehot_encoded

array([[0, 'ADAMANTINA', 3500105.0, ..., 2, 6, 7],
       [1, 'ADAMANTINA', 3500105.0, ..., 2, 0, 8],
       [2, 'ADAMANTINA', 3500105.0, ..., 2, 1, 8],
       ...,
       [146, 'ADAMANTINA', 3500105.0, ..., 7, 5, 28],
       [147, 'ADAMANTINA', 3500105.0, ..., 7, 6, 28],
       [148, 'ADAMANTINA', 3500105.0, ..., 7, 0, 29]], dtype=object)