In [32]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from collections import defaultdict

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Importe data

In [33]:
df = pd.read_csv('macapa-2000-2020.csv', encoding='iso8859-1')
df = df.drop(columns=['Unnamed: 0'])
df = df.rename( columns = { 'chuva' : 'precipitacao' })
df.head()

Unnamed: 0,data,hora,temperatura,umidade,pressao,vel_vento,direcao_vento,nebulosidade,insolacao,temp_max,temp_min,precipitacao
0,2000-01-01,12.0,27.2,80.0,1006.8,1.0,32.0,10.0,6.854922,32.108248,24.0,0.0
1,2000-01-01,18.0,29.6,64.0,1006.1,2.0,9.0,10.0,6.854922,32.108248,24.121414302836595,6.919249
2,2000-01-02,0.0,25.6,89.0,1005.6,0.0,0.0,10.0,7.2,30.8,24.121414302836595,6.919249
3,2000-01-02,12.0,25.2,90.0,1006.7,0.0,0.0,10.0,6.854922,32.108248,24.0,53.5
4,2000-01-02,18.0,30.2,68.0,1004.3,2.0,5.0,6.0,6.854922,32.108248,24.121414302836595,6.919249


In [34]:
df.data

0        2000-01-01
1        2000-01-01
2        2000-01-02
3        2000-01-02
4        2000-01-02
            ...    
23003    2021-01-05
23004    2021-01-05
23005    2021-01-06
23006    2021-01-06
23007    2021-01-06
Name: data, Length: 23008, dtype: object

# Feature Engineering

- Tratando dados faltantes

In [35]:
# Tabela de percentual de dados faltantes
def table_missing(df, size):
    
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False) * 100

    missing = pd.concat([total, percent], axis=1, join='outer', keys=['Total_faltantes', 'Percentual'])

    missing.index.name = 'Variaveis Numericas'
    
    return missing.head(size)

In [36]:
table_missing(df, 27)

Unnamed: 0_level_0,Total_faltantes,Percentual
Variaveis Numericas,Unnamed: 1_level_1,Unnamed: 2_level_1
precipitacao,134,0.582406
umidade,133,0.57806
pressao,133,0.57806
vel_vento,133,0.57806
direcao_vento,133,0.57806
nebulosidade,133,0.57806
insolacao,133,0.57806
temp_max,133,0.57806
temp_min,133,0.57806
temperatura,130,0.565021


In [37]:
df = df.dropna(axis=0)
df.isnull().sum()

data             0
hora             0
temperatura      0
umidade          0
pressao          0
vel_vento        0
direcao_vento    0
nebulosidade     0
insolacao        0
temp_max         0
temp_min         0
precipitacao     0
dtype: int64

- Transformando os tipos das features

In [38]:
'''
Como algumas variaveis numericas estavam como object 
não estava como float fazemos a transformação da mesmas
'''

df['temperatura'] = df['temperatura'].astype('float')
df['hora'] = df['hora'].astype('float')
df['temp_min'] = df['temp_min'].astype('float')

- Criando Variaveis Sazonais

In [39]:
df['data'] = pd.to_datetime(df['data'])

df["diadasemana"] = df["data"].dt.dayofweek
df["diadomes"] = df["data"].dt.day
df["trimestre"] = df["data"].dt.quarter
df["mes"] = df["data"].dt.month
df["semanadoano"] = df["data"].dt.weekofyear
df["ano"] = df["data"].dt.year

### Atrasos (lags) da série temporal

**LAG = Yt - 1**

Mudando a série [Math Processing Error] para trás, obtemos uma feature em que o valor atual da série temporal está alinhado com seu valor no tempo [Math Processing Error]. Se fizermos uma mudança de 1 lag e treinarmos um modelo com essa nova feature, o modelo poderá prever um passo à frente e observar o estado atual da série. Aumentar o atraso (lag), digamos, até 7, permitirá que o modelo faça previsões 28 passos à frente; no entanto, usará os dados observados nas 28 etapas para trás. Se algo mudar fundamentalmente a série durante esse período não observado, o modelo não capturará essas alterações e retornará previsões com um grande erro. Portanto, durante a seleção do atraso inicial, é preciso encontrar um equilíbrio entre a qualidade ideal da previsão e a duração de previsão.

In [40]:
for l in range(1, 7):
    df[f'lag_{l}'] = df['precipitacao'].shift(l)

### Diferença do Lag 

**DIFF_LAG = Yt-1 - Yt-2**

In [41]:
for l in range(1, 7):
    df[f'diff_lag_{l}'] = df['precipitacao'].diff(l)

# Criar labels para classificação

In [42]:
bins = [-1.0, 0.3, 2.5, 10, 50, 215.800000]

labels = ["sem_chuva", "fraca", "moderada", "forte", "violenta"]
classe = ["0", "1", "2", "3", "4"]

df['labels'] = pd.cut(df.precipitacao, bins = bins, labels = labels).astype('string')
df['classes'] = pd.cut(df.precipitacao, bins = bins, labels = classe).astype('float')

# Tratando dados ausentes das features de Lag e diff de lag

In [43]:
table_missing(df, len(df.columns)).query('Total_faltantes > 0')

Unnamed: 0_level_0,Total_faltantes,Percentual
Variaveis Numericas,Unnamed: 1_level_1,Unnamed: 2_level_1
diff_lag_6,6,0.026231
lag_6,6,0.026231
diff_lag_5,5,0.021859
lag_5,5,0.021859
diff_lag_4,4,0.017487
lag_4,4,0.017487
diff_lag_3,3,0.013115
lag_3,3,0.013115
diff_lag_2,2,0.008744
lag_2,2,0.008744


- Substituir valores ausentes pela media

In [44]:
columns = table_missing(df, len(df.columns)).query('Total_faltantes > 0').index
columns

Index(['diff_lag_6', 'lag_6', 'diff_lag_5', 'lag_5', 'diff_lag_4', 'lag_4',
       'diff_lag_3', 'lag_3', 'diff_lag_2', 'lag_2', 'diff_lag_1', 'lag_1'],
      dtype='object', name='Variaveis Numericas')

In [45]:
for column in columns:
    df[column] = df[column].fillna(df[column].mean())

# Treinamento de modelo

In [46]:
y = df["classes"]
X = df.drop(["labels", "classes", "data"], axis=1)

cv = KFold(n_splits=7, random_state=1, shuffle=True)

- Regressão Logistica

In [47]:
val_scores_lr = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=cv)
print(f'acuracia K_fold: {val_scores_lr}')
print(f'media acuracia {np.mean(val_scores_lr)} std acuracia {np.std(val_scores_lr)}')

acuracia K_fold: [0.95960832 0.95807834 0.95073439 0.94767442 0.94767442 0.95224977
 0.95194368]
media acuracia 0.9525661913555538 std acuracia 0.004337170861965945


- MLP

In [48]:
val_scores_mlp = cross_val_score( MLPClassifier(16), X, y, scoring='accuracy', cv=cv)
print(f'acuracia K_fold: {val_scores_mlp}')
print(f'media acuracia {np.mean(val_scores_mlp)} std acuracia {np.std(val_scores_mlp)}')

acuracia K_fold: [0.9874541  0.98408813 0.99112607 0.98041616 0.99143207 0.9684726
 0.98561371]
media acuracia 0.9840861202272129 std acuracia 0.007313892772189051


- Regressão Logistica

In [49]:
val_scores_tree = cross_val_score(DecisionTreeClassifier(), X, y, scoring='accuracy', cv=cv)
print(f'acuracia K_fold: {val_scores_tree}')
print(f'media acuracia {np.mean(val_scores_tree)} std acuracia {np.std(val_scores_tree)}')

acuracia K_fold: [0.999694   1.         1.         1.         1.         0.99938782
 1.        ]
media acuracia 0.999868831431088 std acuracia 0.00022295594555096998


- XGBOOST

In [53]:
val_scores_xgb = cross_val_score(XGBClassifier(), X, y, scoring='accuracy', cv=cv)
print(f'acuracia K_fold: {val_scores_xgb}')
print(f'media acuracia {np.mean(val_scores_xgb)} std acuracia {np.std(val_scores_xgb)}')

acuracia K_fold: [1. 1. 1. 1. 1. 1. 1.]
media acuracia 1.0 std acuracia 0.0


- Catboost

In [54]:
val_scores_cat = cross_val_score(CatBoostClassifier(), X, y, scoring='accuracy', cv=cv)
print(f'acuracia K_fold: {val_scores_cat}')
print(f'media acuracia {np.mean(val_scores_cat)} std acuracia {np.std(val_scores_cat)}')

acuracia K_fold: [0.999694   1.         1.         1.         1.         0.99938782
 1.        ]
media acuracia 0.999868831431088 std acuracia 0.00022295594555096998


- LGBM

In [52]:
val_scores_LGBM = cross_val_score(LGBMClassifier(), X, y, scoring='accuracy', cv=cv)
print(f'acuracia K_fold: {val_scores_LGBM}')
print(f'media acuracia {np.mean(val_scores_LGBM)} std acuracia {np.std(val_scores_LGBM)}')

acuracia K_fold: [0.999694   1.         1.         0.999694   1.         0.99938782
 1.        ]
media acuracia 0.9998251174950852 std acuracia 0.00022295069860324147
