In [1]:
import pandas as pd
import numpy as np
import datetime
from time import ctime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Cleaned training data with the engineered features already added
train = pd.read_csv('../data/to_train_01.csv', low_memory=False, lineterminator='\n')

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
train.info()
train.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 85 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   building_id                             260601 non-null  int64  
 1   damage_grade                            260601 non-null  int64  
 2   geo_level_1_id                          260601 non-null  int64  
 3   geo_level_2_id                          260601 non-null  int64  
 4   geo_level_3_id                          260601 non-null  int64  
 5   count_floors_pre_eq                     260601 non-null  int64  
 6   age                                     260601 non-null  int64  
 7   area_percentage                         260601 non-null  int64  
 8   height_percentage                       260601 non-null  int64  
 9   has_superstructure_adobe_mud            260601 non-null  bool   
 10  has_superstructure_mud_mortar_stone     2606

Unnamed: 0,building_id,damage_grade,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,...,legal_ownership_status_w,ground_floor_type_f,ground_floor_type_x,ground_floor_type_v,ground_floor_type_z,ground_floor_type_m,position_t,position_s,position_j,position_o
0,802906,3,6,487,12198,2,30,6,5,True,...,0,1,0,0,0,0,1,0,0,0


In [3]:
# Test data
test = pd.read_csv('../data/to_test_01.csv', low_memory=False, lineterminator='\n')

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
test.info()
test.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86868 entries, 0 to 86867
Data columns (total 84 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   building_id                             86868 non-null  int64  
 1   geo_level_1_id                          86868 non-null  int64  
 2   geo_level_2_id                          86868 non-null  int64  
 3   geo_level_3_id                          86868 non-null  int64  
 4   count_floors_pre_eq                     86868 non-null  int64  
 5   age                                     86868 non-null  int64  
 6   area_percentage                         86868 non-null  int64  
 7   height_percentage                       86868 non-null  int64  
 8   has_superstructure_adobe_mud            86868 non-null  int64  
 9   has_superstructure_mud_mortar_stone     86868 non-null  int64  
 10  has_superstructure_stone_flag           86868 non-null  in

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,altura_por_antiguedad,area_por_altura,producto_area_altura,area_al_cuadrado,altura_al_cuadrado,promedio_area_altura,cant_familias_por_pisos,cant_familias_por_area,cant_familias_por_altura,cant_materiales
0,300051,17,596,11307,3,20,7,6,0,1,...,0.297767,1.166667,42,49,36,6.5,0.333333,0.142857,0.166667,1


## Entrenamiento del modelo

In [4]:
# Columns left out of the feature matrix. 'damage_grade' is the target and is
# kept separately as Y.
features_to_remove = ['building_id', 'geo_level_2_id', 'geo_level_3_id', 'damage_grade']

# Feature matrix: every column of `train` except the ones listed above.
X = train.drop(columns=features_to_remove)
# Target vector.
Y = train['damage_grade']

In [5]:
from sklearn import ensemble
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 25% of the rows. random_state pins the shuffle so the same rows land
# in each split on every kernel restart (42 matches the seed used for the model).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 195450 entries, 185050 to 206687
Data columns (total 81 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   geo_level_1_id                          195450 non-null  int64  
 1   count_floors_pre_eq                     195450 non-null  int64  
 2   age                                     195450 non-null  int64  
 3   area_percentage                         195450 non-null  int64  
 4   height_percentage                       195450 non-null  int64  
 5   has_superstructure_adobe_mud            195450 non-null  bool   
 6   has_superstructure_mud_mortar_stone     195450 non-null  bool   
 7   has_superstructure_stone_flag           195450 non-null  bool   
 8   has_superstructure_cement_mortar_stone  195450 non-null  bool   
 9   has_superstructure_mud_mortar_brick     195450 non-null  bool   
 10  has_superstructure_cement_mortar_brick 

### Parameter tuning

> https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [9]:
# Hyper-parameter grid explored by GridSearchCV. Every value is a list of
# candidates; only max_depth currently has more than one option.
params_search = {
    'learning_rate': [0.1],
    'n_estimators': [1500],
    'max_depth': [4, 5],
    'min_child_weight': [1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'n_jobs': [-1],
    'objective': ['multi:softmax'],
    'num_class': [3],
    'seed': [42],
    'scale_pos_weight': [1],
    'booster': ['gbtree']
}

# Kept under its original name with the same content as the grid (built from
# it instead of repeating the literal). The values are lists, so they must be
# reduced to scalars before being unpacked into XGBClassifier(**...).
best_params = {name: list(candidates) for name, candidates in params_search.items()}

print('Hora antes de empezar a tunear/entrenar: {}'.format(ctime()))
# NOTE(review): both GridSearchCV and the estimator request n_jobs=-1; confirm
# this nesting does not oversubscribe the machine.
model = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=params_search,
    scoring='f1_micro',
    n_jobs=-1,
    cv=5,
    verbose=10,
)
# Fit on the training split only, so that X_test / y_test remain an unseen
# hold-out set; fitting on the full X, Y would leak the hold-out rows into
# the model and inflate any later evaluation on them.
model.fit(X_train, y_train)
print('Hora al finalizar de tunear/entrenar: {}'.format(ctime()))

Hora antes de empezar a tunear/entrenar: Mon Jul  5 22:45:53 2021
Fitting 5 folds for each of 8 candidates, totalling 40 fits


KeyboardInterrupt: 

In [None]:
# Report the best cross-validated score (rounded to 4 decimals) found by the
# grid search, then display the parameter combination that produced it.
best_cv_score = model.best_score_.round(4)
print('El mejor score de XGBBoost con GridSearchCV fue: {}'.format(best_cv_score))
print('Mejor parametros hasta el momento: ')
model.best_params_

### Feature importance

In [None]:
import matplotlib.pyplot as plt

# `model` is a GridSearchCV wrapper, which does not expose feature_importances_
# itself; the refit winner lives in best_estimator_. Fall back to `model` so the
# cell also works when a plain XGBClassifier was fitted instead.
fitted_clf = getattr(model, 'best_estimator_', model)

feature_cols = X.columns.tolist()
features = pd.Series(data=fitted_clf.feature_importances_, index=feature_cols, name='Feature importance')
# Ascending order puts the most important feature at the top of the barh plot.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
features = features.sort_values(ascending=True)
plt.figure(figsize=(10, 10))
features.plot(kind='barh')
plt.xlabel('Importancia')
plt.ylabel('Features')
plt.title('Importancia Features con XGBoost')
plt.show()

In [None]:
plt.rcParams["figure.figsize"] = (15, 15)
# xgb.plot_importance needs the fitted XGBoost model, not the GridSearchCV
# wrapper, so unwrap best_estimator_ when it is present.
xgb.plot_importance(getattr(model, 'best_estimator_', model), grid=True)
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred):
    """Plot the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Iterable of ground-truth class labels.
        y_pred: Iterable of predicted class labels, aligned with ``y_true``.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Note:
        Relies on ``confusion_matrix``, ``pd``, ``plt`` and ``sns`` (seaborn)
        being available in the notebook namespace.
        NOTE(review): no seaborn import is visible before this cell; confirm
        ``import seaborn as sns`` runs before the first call.
    """
    # Use the classes seen in either vector, so a class that is predicted but
    # never present in y_true still gets its own row/column instead of being
    # silently dropped from the matrix.
    names = sorted(set(y_true) | set(y_pred))
    # `labels` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    cm = confusion_matrix(y_true, y_pred, labels=names)
    df_cm = pd.DataFrame(cm, index=names, columns=names)

    plt.figure(dpi=100)
    plt.title("Matriz de confusion")
    sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='g', square=True)
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.show()