In [86]:
import os

import pandas as pd
import numpy as np

from pycaret.classification import *
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from nimbusml.ensemble import LightGbmClassifier # Módulo de ML da Microsoft
from catboost import CatBoostClassifier
from sklearn.svm import LinearSVC
import pickle

from sklearn.model_selection import (cross_val_score, RepeatedStratifiedKFold,
                                     RandomizedSearchCV, GridSearchCV, train_test_split)

from sklearn.metrics import precision_recall_curve, average_precision_score

import matplotlib.pyplot as plt
import seaborn as sns

os.chdir("../src/")
from utils.data_prep import data_prep as dp

basepath = "../data/external/"
path_x_train = "../data/external/train_values.csv"
path_y_train = "../data/external/train_labels.csv"
path_x_test = "../data/external/test_values.csv"

path_x_selected = "../data/interim/"
path_processed = "../data/processed/"
path_model = "../models/"

In [87]:
# List all files in a directory using scandir()
print("-"*30)
print(f"""Arquivos em {path_processed}:""")

with os.scandir(path_processed) as entries:
    for entry in entries:
        if entry.is_file():
            print(entry.name)
print("-"*30)

------------------------------
Arquivos em ../data/processed/:
.gitkeep
df_train.pqt
X_test_encoded.csv
X_test_encoded.pqt
------------------------------


# Carregando dataframe da preparação de dados

In [88]:
df_train = pd.read_parquet(path_processed+"df_train.pqt")
df_train.head()

Unnamed: 0_level_0,land_surface_condition,roof_type,legal_ownership_status,other_floor_type,position,foundation_type,ground_floor_type,plan_configuration,geo_level_1_id,geo_level_2_id,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,-0.374641,0.181631,0.108199,0.458191,1.991107,0.409977,0.359917,0.121091,-0.983414,-0.518705,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,3
28830,4.640181,0.181631,0.108199,0.458191,-0.25591,0.409977,0.063218,0.121091,-0.734459,0.481998,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,2
94947,-0.374641,0.181631,0.108199,0.349335,1.991107,0.409977,0.359917,0.121091,0.883744,-0.819158,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,3
590882,-0.374641,0.181631,0.108199,0.349335,-0.25591,0.409977,0.359917,0.121091,1.008221,-0.685893,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,2
201944,-0.374641,0.181631,0.108199,0.349335,-0.25591,0.409977,0.359917,0.121091,-0.361028,-1.381296,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,3


# Testando modelos

Nessa parte, estou usando a biblioteca [PyCaret](https://pycaret.org/) para avaliar vários modelos para esse problema de classificação múltipla. Depois dessa avaliação inicial, pegaremos um ou alguns dos modelos para seguir em frente (fazer tunning dos hiperparâmetros e testar com sets diferentes de dados obtidos após seleção de variáveis).

In [12]:
clf1 = setup(data = df_train, target = "damage_grade")
best = compare_models(sort = 'F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Extreme Gradient Boosting,0.7286,0.0,0.6361,0.7295,0.7207,0.4815,0.4906,81.427
1,CatBoost Classifier,0.7244,0.0,0.6283,0.7253,0.716,0.4724,0.4819,144.9354
2,Light Gradient Boosting Machine,0.7125,0.0,0.6124,0.7136,0.7026,0.4465,0.4575,6.0235
3,Extra Trees Classifier,0.6995,0.0,0.624,0.6959,0.6947,0.4389,0.442,35.7377
4,Random Forest Classifier,0.6919,0.0,0.6208,0.6893,0.6859,0.4222,0.4272,2.9302
5,Gradient Boosting Classifier,0.6816,0.0,0.5682,0.6852,0.6651,0.3748,0.3934,160.6097
6,Decision Tree Classifier,0.6546,0.0,0.6074,0.6559,0.6552,0.3807,0.3808,3.4727
7,K Neighbors Classifier,0.6484,0.0,0.5795,0.6438,0.6444,0.3496,0.3512,63.8433
8,Ada Boost Classifier,0.6448,0.0,0.5164,0.6572,0.6119,0.2805,0.315,14.7907
9,Logistic Regression,0.5876,0.0,0.4462,0.5756,0.5229,0.1349,0.1709,13.6998


## Resultado da seleção

Na tabela acima (e abaixo, adicionei uma figura para não perdermos o primeiro resultado, nosso baseline), vemos que os modelos de boosting trees se sairam melhor nesse problema.
Gostaria de testar os seguintes modelos:

1. [Extreme Gradient Boosting](https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier) -> pois está em primeiro.
1. [CatBoost Classifier](https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html) -> pois está em segundo.
1. [Light Gradient Boosting Machine](https://lightgbm.readthedocs.io/en/latest/Installation-Guide.html) -> pois está em terceiro.
1. [SVM com kernel não linear](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC) -> pois NÃO está na lista, apenas seu kernel linear.

<img src="./images/3_modeling_models_comparation_v1.JPG" alt="Tabela com o comparativo dos modelos.">



### Extreme Gradient Boosting (XGBoost)

Para os modelos Extreme Gradient Boosting, CatBoost Classifier e Light Gradient Boosting, usamos (muito) o texto do grande Jason Brownlee do [Machine Learning Mastery](https://machinelearningmastery.com/gradient-boosting-with-scikit-learn-xgboost-lightgbm-and-catboost/), além das respectivas documentações (links colocados anteriormente).

In [89]:
n_splits = 10
n_repeats = 3
random_state = 42

X = df_train.drop(["damage_grade"], axis=1)
y = df_train["damage_grade"]

cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

In [27]:
model_xgb = XGBClassifier()

n_scores = cross_val_score(model_xgb, X, y, scoring='f1_micro', cv=cv, n_jobs=-1, error_score='raise')

print('F1-micro: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

In [26]:
print("""F1-micro: 0.729 (0.002) <- Esse é o valor que obtivemos com o XGboost com:
        n_splits = 10
        n_repeats = 3""")

F1-micro: 0.729 (0.002) <- Esse é o valor que obtivemos com o XGboost com:
        n_splits = 10
        n_repeats = 3


### CatBoost Classifier

In [66]:
model_catb = CatBoostClassifier(verbose=0, n_estimators=100)

n_scores = cross_val_score(model_catb, X, y, scoring='f1_micro', cv=cv, n_jobs=-1, error_score='raise')

print('F1-micro: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

F1-micro: 0.712 (0.002)


### Light Gradient Boosting (LightGBM)

In [70]:
model_lgbm = LGBMClassifier()

n_scores = cross_val_score(model_lgbm, X, y, scoring='f1_micro', cv=cv, n_jobs=-1, error_score='raise')

print('F1-micro: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

F1-micro: 0.713 (0.002)


### SVM com kernel não linear

# Otimização de hiperparâmetros

## Tunning do Light Gradient Boosting

Inicialmente, faremos o tunning sobre o Light Gradient Boosting, pois é o que possui menor tempo de execução (para comparação, o Extreme Gradient Boosting levou 17x mais que o Light Gradient Boosting, cerca de 60 min contra 3,5 min do LightGBM).

In [71]:
params = {"boosting_type":["gbdt", "dart", "goss"],
          "learning_rate":[.001, .01, .1],
          "n_estimators":[10,50,100,150]}

grid_search = GridSearchCV(estimator=model_lgbm, param_grid=params,
                           scoring="f1_micro",cv=5, n_jobs=-1)

grid_search = grid_search.fit(X, y)
grid_search.best_params_

{'boosting_type': 'goss', 'learning_rate': 0.1, 'n_estimators': 150}

In [72]:
grid_search.score(X, y)

0.7280286721846808

### Usando os valores de hiperparâmetros após otimização

In [83]:
model_lgbm = LGBMClassifier(**grid_search.best_params_)

n_scores = cross_val_score(model_lgbm, X, y, scoring='f1_micro', cv=cv, n_jobs=-1, error_score='raise')

print('F1-micro: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

F1-micro: 0.720 (0.002)


### Avaliando o modelo

### Salvando o modelo

Salvaremos o modelo em um arquivo pickle.

In [79]:
save_model(model_lgbm, path_model+"model_LGBMClassifier_v1", verbose=0)

INFO - Initializing save_model()
INFO - save_model(model=LGBMClassifier(boosting_type='goss', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=150, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0), model_name=../models/model_LGBMClassifier_v1, model_only=False, verbose=0)
INFO - Adding model into prep_pipe
INFO - ../models/model_LGBMClassifier_v1.pkl saved in current working directory
INFO - Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      ml_usecase='classification',
                             

# Realizando a previsão



In [81]:
X_test_encoded = pd.read_parquet(path_processed+"X_test_encoded.pqt")
X_test_encoded.head()

Unnamed: 0_level_0,land_surface_condition,roof_type,legal_ownership_status,other_floor_type,position,foundation_type,ground_floor_type,plan_configuration,geo_level_1_id,geo_level_2_id,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,-0.092443,-1.693211,18.347691,0.255651,-0.330397,-1.440298,-1.715564,-1.314689,1348.12947,3438.423198,...,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731
99355,-0.092443,-1.693211,18.347691,0.255651,-0.330397,-1.440298,-1.715564,-1.314689,343.543525,803.557254,...,3.812247,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731
890251,-0.092443,-1.693211,18.347691,0.255651,-0.330397,-1.440298,-1.715564,-1.314689,1804.759445,97.065726,...,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731
745817,-0.092443,-1.694901,18.347691,-0.119327,-0.329306,-1.440298,-1.715749,-1.314689,2170.063425,212.88401,...,-0.262312,-0.186537,11.065671,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731
421793,-0.092443,-1.693089,18.347691,0.255651,-0.329306,-1.440298,-1.715564,-1.314689,1348.12947,1660.61255,...,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731


## Aplicando previsão

In [90]:
model_lgbm = LGBMClassifier(**grid_search.best_params_)
model_lgbm.fit(X, y)

LGBMClassifier(boosting_type='goss', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=150, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [91]:
y_test = model_lgbm.predict(X_test_encoded)

y_test = pd.DataFrame(data=y_test, columns=["damage_grade"], index=X_test_encoded.index)

y_test.to_

y_test.head()

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [93]:
y_test.to_csv(path_processed+"y_test.csv")