In [1]:
import re
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as soup
from typing import List, Union, Dict

In [2]:
# Years 1957 through 2016 (inclusive), as strings.
periods = list(map(str, range(1957, 2017)))

In [3]:
import numpy as np
from KYOTO.libraries.dst import DstExtraction
from KYOTO.libraries.rw_processed_data import load_processed_data
# Maps each storm class to the array of integer values assigned to it.
# Each (start, stop) pair is expanded with a step of -1, so `start` is
# included and `stop` is excluded (e.g. 'fraca' covers -31 down to -50).
classification_rules = {
    label: np.array(range(start, stop, -1))
    for label, start, stop in (
        ('fraca', -31, -51),
        ('moderada', -51, -101),
        ('intensa', -101, -251),
        ('super_intensa', -251, -1001),
    )
}

# Disabled calls, kept for reference.
# NOTE(review): presumably the from-scratch extraction/classification that
# produced the processed data loaded below - confirm.
# dst = DstExtraction(periods=periods)
# dst.make_classification(classification_rules)

# Per the original author's note, the complete Dst dataset is stored in this file: dst_full_data
df_filtered = load_processed_data(name='dst_full_data')

In [27]:
import numpy as np
from KYOTO.libraries.kp import KpExtraction

# Build the Kp extractor for the `periods` list defined earlier in the notebook.
# NOTE(review): what KpExtraction does at construction time is not visible
# from this notebook - confirm before relying on it being cheap.
kp = KpExtraction(periods=periods)

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

In [8]:
def class_count(df: pd.DataFrame) -> None:
    """Print how many rows of ``df`` fall into each storm class.

    Args:
        df: Frame with a ``'classification'`` column whose values are the
            string labels ``'fraca'``, ``'moderada'``, ``'intensa'`` or
            ``'super_intensa'``.

    Raises:
        KeyError: If the column holds any value other than those four labels.
    """
    count = {"fraca": 0, "moderada": 0, "intensa": 0, "super_intensa": 0}
    # Iterate over every value. The previous index loop started at 1 and
    # therefore never counted the first row.
    for label in df['classification']:
        count[label] += 1
    print(count)

In [9]:
# Must run before the cell that label-encodes df_filtered['classification']:
# after that cell the column holds integer codes instead of the string labels
# that class_count uses as dictionary keys, so the lookup raises KeyError.
class_count(df_filtered)

{'fraca': 4479, 'moderada': 2703, 'intensa': 592, 'super_intensa': 54}


In [10]:
def accuracy_range(results):
    """Print a summary of accuracy scores.

    Prints the mean as a percentage, a band of mean +/- 2 standard
    deviations (not clipped, so it can extend past 0% or 100%), the raw
    scores, and a separator line.

    Args:
        results: Object exposing ``.mean()`` and ``.std()``, e.g. the array
            returned by ``cross_val_score``.
    """
    center = results.mean()
    spread = results.std()
    lower_pct = (center - 2*spread)*100
    upper_pct = (center + 2*spread)*100

    report = (
        'Acurácia média: {:.2f}%'.format(center*100),
        'Intervalo de acurácia: [{:.2f}% ~ {:.2f}%]'.format(lower_pct, upper_pct),
        results,
        '--------------------------------------------------------------',
    )
    for entry in report:
        print(entry)

In [11]:
def precision_range(results):
    """Print a summary of precision scores.

    Prints the mean as a percentage, a band of mean +/- 2 standard
    deviations (not clipped, so it can extend past 0% or 100%), the raw
    scores, and a separator line.

    Args:
        results: Object exposing ``.mean()`` and ``.std()``, e.g. the array
            returned by ``cross_val_score``.
    """
    center = results.mean()
    spread = results.std()
    lower_pct = (center - 2*spread)*100
    upper_pct = (center + 2*spread)*100

    report = (
        'Precisão média: {:.2f}%'.format(center*100),
        'Intervalo de precisão: [{:.2f}% ~ {:.2f}%]'.format(lower_pct, upper_pct),
        results,
        '--------------------------------------------------------------',
    )
    for entry in report:
        print(entry)

In [149]:
def recall_range(results):
    """Print a summary of recall scores.

    Prints the mean as a percentage, a band of mean +/- 2 standard
    deviations (not clipped, so it can extend past 0% or 100%), the raw
    scores, and a separator line.

    Args:
        results: Object exposing ``.mean()`` and ``.std()``, e.g. the array
            returned by ``cross_val_score``.
    """
    center = results.mean()
    spread = results.std()
    lower_pct = (center - 2*spread)*100
    upper_pct = (center + 2*spread)*100

    report = (
        'Recall médio: {:.2f}%'.format(center*100),
        'Intervalo de recall: [{:.2f}% ~ {:.2f}%]'.format(lower_pct, upper_pct),
        results,
        '--------------------------------------------------------------',
    )
    for entry in report:
        print(entry)

In [161]:
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [12]:
# Encode the target class as integer labels.
# NOTE: this overwrites df_filtered['classification'] in place, so the string
# labels are no longer available from the frame after this cell runs.
le = LabelEncoder()
df_filtered['classification'] = le.fit_transform(df_filtered['classification'])

# Fix the seed BEFORE any randomised step. Previously it was set only after
# train_test_split, so the train/test partition changed on every run.
SEED = 10
np.random.seed(SEED)

# Train/test split of the original dataframe (70/30, stratified by class).
X = df_filtered['dst_min'].values.reshape(-1, 1)
y = df_filtered['classification']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=SEED
)

print('O dataset de treino possui {} tempestades e o de teste {} tempestades.'.format(X_train.shape[0], X_test.shape[0]))
print('--------------------------------------------------------------')

# Model and cross-validation scheme.
# An explicit random_state makes every cv.split() call yield the same folds.
# Without it, each cross_val_score call below reshuffled the data, so
# accuracy, precision and recall were measured on different partitions.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

model = GaussianNB()

# Cross-validated evaluation on the training partition.
accuracy = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
precision = cross_val_score(model, X_train, y_train, cv=cv, scoring='precision_macro')
recall = cross_val_score(model, X_train, y_train, cv=cv, scoring='recall_macro')

accuracy_range(accuracy)
precision_range(precision)
recall_range(recall)

# NOTE: the triple-quoted block below is a bare string literal, not executable
# code. Python evaluates it and discards the value, so the cross-validated
# confusion-matrix plot it contains is currently disabled.
'''
#Relatório de classificação -> Desempenho por classes | Modelo em treinamento (Partição de treino do dataset)
#print('Relatório de classificação:\n', classification_report(y_train, y_pred, digits=4))

#Matriz de confusão
y_pred = cross_val_predict(model, X_train, y_train, cv = cv)
fig, ax = plt.subplots()
matrix = confusion_matrix(y_train, y_pred)
sns.heatmap(matrix, annot=True, ax=ax, fmt='d', cmap='Reds')

ax.set_title("Matriz de Confusão", fontsize=18)
ax.set_ylabel("True label")
ax.set_xlabel("Predicted Label")

plt.tight_layout()
'''

# Classification report -> per-class performance of the final model,
# evaluated on the held-out test partition.
np.random.seed(SEED)
final_model = GaussianNB().fit(X_train, y_train)
y_prob = final_model.predict_proba(X_test)
y_pred = final_model.predict(X_test)
print("\nRelatório de Classificação do modelo final:\n\n", classification_report(y_test, y_pred, digits=4))

# Confusion matrix of the final model on the test partition.
final_matrix = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(final_matrix, annot=True, fmt='d', cmap='Reds', ax=ax)

ax.set_title("Matriz de Confusão", fontsize=18)
ax.set(ylabel="True label", xlabel="Predicted Label")

plt.tight_layout()


O dataset de treino possui 490 tempestades e o de teste 211 tempestades.
Acurácia média: 98.16%
Intervalo de acurácia: [95.78% ~ 100.54%]
