In [165]:
import os
from datetime import datetime
from hashlib import sha256

import pandas as pd
import numpy as np

# -------------------------
# model libs
from pycaret.classification import *
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from nimbusml.ensemble import LightGbmClassifier # Módulo de ML da Microsoft
from catboost import CatBoostClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier

# -------------------------
# model validation and hyperparameter tunning libs
from sklearn.model_selection import (cross_val_score, RepeatedStratifiedKFold,
                                     RandomizedSearchCV, GridSearchCV, train_test_split)

from sklearn.metrics import (precision_recall_curve, average_precision_score, classification_report)

# -------------------------
# graph libs
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

os.chdir("../src/")
from utils.data_describe import DataDescribe as dd

basepath = "../data/external/"
path_x_train = "../data/external/train_values.csv"
path_y_train = "../data/external/train_labels.csv"
path_x_test = "../data/external/test_values.csv"

path_x_selected = "../data/interim/"
path_processed = "../data/processed/"
path_model = "../models/"

In [15]:
# List all files in a directory using scandir()
print("-"*30)
print(f"Arquivos em {path_processed}:\n")

with os.scandir(path_processed) as entries:
    for entry in entries:
        if entry.is_file() and (("csv" in entry.name) or ("pqt" in entry.name)):
            print(entry.name)

print("-"*30)

------------------------------
Arquivos em ../data/processed/:

df_train.pqt
X_test_encoded.csv
X_test_encoded.pqt
y_test.csv
------------------------------


# Carregando dataframe da preparação de dados

In [26]:
df_train = pd.read_parquet(path_processed+"df_train.pqt")
print(f"Dimensão do dataset de treinamento: {df_train.shape}.")
df_train.head()

Dimensão do dataset de treinamento: (260601, 39).


Unnamed: 0_level_0,land_surface_condition,roof_type,legal_ownership_status,other_floor_type,position,foundation_type,ground_floor_type,plan_configuration,geo_level_1_id,geo_level_2_id,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,-0.374641,0.181631,0.108199,0.458191,1.991107,0.409977,0.359917,0.121091,-0.983414,-0.518705,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,3
28830,4.640181,0.181631,0.108199,0.458191,-0.25591,0.409977,0.063218,0.121091,-0.734459,0.481998,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,2
94947,-0.374641,0.181631,0.108199,0.349335,1.991107,0.409977,0.359917,0.121091,0.883744,-0.819158,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,3
590882,-0.374641,0.181631,0.108199,0.349335,-0.25591,0.409977,0.359917,0.121091,1.008221,-0.685893,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,2
201944,-0.374641,0.181631,0.108199,0.349335,-0.25591,0.409977,0.359917,0.121091,-0.361028,-1.381296,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,3


# Testando modelos

Nessa parte, estou usando a biblioteca [PyCaret](https://pycaret.org/) para avaliar vários modelos para esse problema de classificação múltipla. Depois dessa avaliação inicial, pegaremos um ou alguns dos modelos para seguir em frente (fazer tunning dos hiperparâmetros e testar com sets diferentes de dados obtidos após seleção de variáveis).

In [4]:
clf1 = setup(data = df_train, target = "damage_grade")
best = compare_models(sort = 'F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Extreme Gradient Boosting,0.724,0.0,0.6299,0.7245,0.7158,0.4724,0.4815,93.3758
1,CatBoost Classifier,0.7205,0.0,0.624,0.7214,0.7117,0.4641,0.4741,337.9511
2,Light Gradient Boosting Machine,0.7107,0.0,0.6106,0.7119,0.7005,0.4428,0.4543,15.5176
3,Extra Trees Classifier,0.6945,0.0,0.6181,0.6908,0.6895,0.4288,0.4321,28.9393
4,Random Forest Classifier,0.6879,0.0,0.6145,0.6852,0.6815,0.4137,0.4189,2.2636
5,Gradient Boosting Classifier,0.681,0.0,0.5669,0.6855,0.6637,0.3723,0.3922,1676.2733
6,Decision Tree Classifier,0.6459,0.0,0.5965,0.6471,0.6465,0.3648,0.3648,2.9573
7,K Neighbors Classifier,0.6406,0.0,0.5701,0.6357,0.6363,0.3347,0.3363,38.5735
8,Ada Boost Classifier,0.6456,0.0,0.517,0.6616,0.6112,0.2808,0.318,12.225
9,Logistic Regression,0.5883,0.0,0.4477,0.5772,0.524,0.1368,0.1731,10.85


## Resultado da seleção

Na tabela acima (e abaixo, adicionei uma figura para não perdermos o primeiro resultado, nosso baseline), vemos que os modelos de boosting trees se sairam melhor nesse problema.
Gostaria de testar os seguintes modelos:

1. ~~[Extreme Gradient Boosting](https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier) -> pois está em primeiro.~~ *veremos posteriormente, pois demora demais*
1. ~~[CatBoost Classifier](https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html) -> pois está em segundo.~~ *veremos posteriormente, pois demora demais*
1. [Light Gradient Boosting Machine](https://lightgbm.readthedocs.io/en/latest/Installation-Guide.html) -> pois está em terceiro.
1. ~~[SVM com kernel não linear](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC) -> pois NÃO está na lista, apenas seu kernel linear.~~ *veremos posteriormente, pois demora demais*
1. [Random Forest](https://scikit-learn.org/stable/modules/ensemble.html#forest) -> pois roda rápido.

<img src="./images/3_modeling_models_comparation_v1.JPG" alt="Tabela com o comparativo dos modelos.">



### Extreme Gradient Boosting (XGBoost)

Para os modelos Extreme Gradient Boosting, CatBoost Classifier e Light Gradient Boosting, usamos (muito) o texto do grande Jason Brownlee do [Machine Learning Mastery](https://machinelearningmastery.com/gradient-boosting-with-scikit-learn-xgboost-lightgbm-and-catboost/), além das respectivas documentações (links colocados anteriormente).

In [28]:
X = df_train.drop(["damage_grade"], axis=1)
y = df_train["damage_grade"]

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=random_state)

n_splits = 3
n_repeats = 3
random_state = 42
scoring = "f1_micro"

cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

In [5]:
# model_xgb = XGBClassifier()

# n_scores = cross_val_score(model_xgb, X, y, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')

# print(f"F1-micro: média: {n_scores.mean():.3f}  desvio padrão: {n_scores.std():.3f}")

In [6]:
print("""F1-micro: 0.729 (0.002) <- Esse é o valor que obtivemos com o XGboost com:
        n_splits = 10
        n_repeats = 3""")

F1-micro: 0.729 (0.002) <- Esse é o valor que obtivemos com o XGboost com:
        n_splits = 10
        n_repeats = 3


### CatBoost Classifier

In [7]:
# model_catb = CatBoostClassifier(verbose=0, n_estimators=100)

# n_scores = cross_val_score(model_catb, X, y, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')

# print(f"F1-micro: média: {n_scores.mean():.3f}  desvio padrão: {n_scores.std():.3f}")

### Light Gradient Boosting (LightGBM)

In [29]:
model_lgbm = LGBMClassifier()

n_scores = cross_val_score(model_lgbm, X_train, y_train, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')

print(f"F1-micro: média: {n_scores.mean():.3f}  desvio padrão: {n_scores.std():.3f}")

F1-micro: média: 0.711  desvio padrão: 0.001


### SVM com kernel não linear

In [30]:
# model_svc = SVC(kernel='rbf', random_state=random_state)

# n_scores = cross_val_score(model_svc, X, y, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')

# print(f"F1-micro: média: {n_scores.mean():.3f}  desvio padrão: {n_scores.std():.3f}")

### Random forest

In [137]:
model_rf = RandomForestClassifier()

n_scores = cross_val_score(model_rf, X_train, y_train, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')

print(f"F1-micro: média: {n_scores.mean():.3f}  desvio padrão: {n_scores.std():.3f}")

F1-micro: média: 0.715  desvio padrão: 0.002


In [382]:
model_rf.fit(X_train, y_train)

y_pred = model_rf.predict(X_test)
y_pred_proba = model_rf.predict_proba(X_test)

classification_report_dict = classification_report(y_validation, y_pred, output_dict=True)

In [383]:
classification_metrics = classification_report_dict["macro avg"]
classification_metrics["accuracy"] = classification_report_dict["accuracy"]

model_name = str(model_rf)[:str(model_rf).find("(")]

hash_evaluation_metric = hash((model_name, classification_metrics["f1-score"]))

result = {hash_evaluation_metric: {"model": model_name, **classification_metrics,
          "params": str(model_rf.get_params()), "date": datetime.today().date()}}

result = pd.DataFrame.from_dict(result, orient="index")

In [384]:
try:
    result2 = pd.read_csv(path_processed+"model_evaluation.csv", index_col=0)
except:
    result2 = pd.DataFrame(columns=['model', 'precision', 'recall', 'f1-score', 'support', 'accuracy','params', 'date'])
    result2.to_csv(path_processed+"model_evaluation.csv")

if result.index in result2.index.tolist():
    date = result2.loc[int(result.index.values), "date"]
    print(f"O registro {int(result.index.values)} já foi salvo anteriormente no dia {date}.")
else:
    print(f"O registro {result.index} foi salvo na base model_evaluation.csv.")
    result2 = result2.append(result)
    result2.to_csv(path_processed+"model_evaluation.csv")

result2

O registro Int64Index([5000852473507659831], dtype='int64') foi salvo na base model_evaluation.csv.


Unnamed: 0,model,precision,recall,f1-score,support,accuracy,params,date
5000852473507659831,RandomForestClassifier,0.706899,0.65522,0.676186,52121,0.727499,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",2020-09-14


# Otimização de hiperparâmetros

## Tunning do Light Gradient Boosting

Inicialmente, faremos o tunning sobre o Light Gradient Boosting, pois é o que possui menor tempo de execução (para comparação, o Extreme Gradient Boosting levou 17x mais que o Light Gradient Boosting, cerca de 60 min contra 3,5 min do LightGBM).

In [None]:
# params = {"boosting_type":["gbdt", "dart", "goss"],
#           "learning_rate":[.001, .01, .1],
#           "n_estimators":[10,50,100,150]}

# grid_search = GridSearchCV(estimator=model_lgbm, param_grid=params,
#                            scoring=scoring,cv=5, n_jobs=-1)

# grid_search = grid_search.fit(X, y)
# grid_search.best_params_

In [15]:
# grid_search.score(X, y)

0.7280286721846808

### Usando os valores de hiperparâmetros após otimização

In [16]:
# model_lgbm = LGBMClassifier(**grid_search.best_params_)

# n_scores = cross_val_score(model_lgbm, X, y, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')

# print('F1-micro: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

F1-micro: 0.720 (0.002)


## Tunning do Random Forest

In [134]:
n_features_half = int(len(X_train.columns)/2)

params = {"n_estimators":[50, 100, 150],
          "criterion":["entropy", "gini"],
          "max_features":["sqrt", "log2", n_features_half]}

grid_search = GridSearchCV(estimator=model_rf, param_grid=params,
                           scoring=scoring, cv=3, n_jobs=1)

grid_search = grid_search.fit(X_train, y_train)
grid_search.best_params_

{'criterion': 'gini', 'max_features': 19, 'n_estimators': 150}

### Avaliando o modelo

In [376]:
grid_search.best_params_

{'criterion': 'gini', 'max_features': 19, 'n_estimators': 150}

In [385]:
model_rf = RandomForestClassifier(**grid_search.best_params_)
model_rf.fit(X_train, y_train)

y_pred = model_rf.predict(X_test)
y_pred_proba = model_rf.predict_proba(X_test)

classification_report_dict = classification_report(y_validation, y_pred, output_dict=True)

In [386]:
classification_metrics = classification_report_dict["macro avg"]
classification_metrics["accuracy"] = classification_report_dict["accuracy"]

model_name = str(model_rf)[:str(model_rf).find("(")]

hash_evaluation_metric = hash((model_name, classification_metrics["f1-score"]))

result = {hash_evaluation_metric: {"model": model_name, **classification_metrics,
          "params": str(model_rf.get_params()), "date": datetime.today().date()}}

result = pd.DataFrame.from_dict(result, orient="index")

In [387]:
try:
    result2 = pd.read_csv(path_processed+"model_evaluation.csv", index_col=0)
except:
    result2 = pd.DataFrame(columns=['model', 'precision', 'recall', 'f1-score', 'support', 'accuracy','params', 'date'])
    result2.to_csv(path_processed+"model_evaluation.csv")

if result.index in result2.index.tolist():
    date = result2.loc[int(result.index.values), "date"]
    print(f"O registro {int(result.index.values)} já foi salvo anteriormente no dia {date}.")
else:
    print(f"O registro {result.index} foi salvo na base model_evaluation.csv.")
    result2 = result2.append(result)
    result2.to_csv(path_processed+"model_evaluation.csv")

O registro Int64Index([4257942832718635178], dtype='int64') foi salvo na base model_evaluation.csv.


Unnamed: 0,model,precision,recall,f1-score,support,accuracy,params,date
5000852473507659831,RandomForestClassifier,0.706899,0.65522,0.676186,52121,0.727499,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",2020-09-14
4257942832718635178,RandomForestClassifier,0.704536,0.654066,0.674593,52121,0.726674,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",2020-09-14


### Salvando o modelo

Salvaremos o modelo em um arquivo pickle.

In [392]:
model_name = str(model_rf)[:str(model_rf).find("(")]
model_hash = str(result.index.values)[-6:-1]

print("Salvando modelo sob o nome model_"+model_name+"_"+model_hash+".pkl")

save_model(model_rf, path_model+"model_"+model_name+"_"+model_hash, verbose=0)

Salvando modelo sob o nome model_RandomForestClassifier_35178.pkl


NameError: name 'prep_pipe' is not defined

# Realizando a previsão



In [393]:
X_test_encoded = pd.read_parquet(path_processed+"X_test_encoded.pqt")
X_test_encoded.head()

Unnamed: 0_level_0,land_surface_condition,roof_type,legal_ownership_status,other_floor_type,position,foundation_type,ground_floor_type,plan_configuration,geo_level_1_id,geo_level_2_id,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,-0.092443,-1.693211,18.347691,0.255651,-0.330397,-1.440298,-1.715564,-1.314689,1348.12947,3438.423198,...,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731
99355,-0.092443,-1.693211,18.347691,0.255651,-0.330397,-1.440298,-1.715564,-1.314689,343.543525,803.557254,...,3.812247,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731
890251,-0.092443,-1.693211,18.347691,0.255651,-0.330397,-1.440298,-1.715564,-1.314689,1804.759445,97.065726,...,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731
745817,-0.092443,-1.694901,18.347691,-0.119327,-0.329306,-1.440298,-1.715749,-1.314689,2170.063425,212.88401,...,-0.262312,-0.186537,11.065671,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731
421793,-0.092443,-1.693089,18.347691,0.255651,-0.329306,-1.440298,-1.715564,-1.314689,1348.12947,1660.61255,...,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731


## Aplicando previsão

In [394]:
model_rf = RandomForestClassifier(**grid_search.best_params_)
model_rf.fit(X, y)

RandomForestClassifier(max_features=19, n_estimators=150)

In [395]:
y_test = model_rf.predict(X_test_encoded)

y_test = pd.DataFrame(data=y_test, columns=["damage_grade"], index=X_test_encoded.index)

y_test

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,2
99355,2
890251,2
745817,2
421793,2
...,...
310028,2
663567,2
1049160,2
442785,2


In [396]:
y_test.to_csv(path_processed+"y_test.csv")