In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import squarify # pip install squarify
import matplotlib.pyplot as plt
import xgboost as xgb
%matplotlib inline

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [48]:
# Suppress scientific notation in displayed floats (fixed 2-decimal format
# with thousands separators).
pd.set_option('display.float_format', '{:20,.2f}'.format)

# Read the three CSV files, each indexed by its building_id column.
train_values, train_labels, test_values = (
    pd.read_csv(csv_name, index_col='building_id')
    for csv_name in ('train_values.csv', 'train_labels.csv', 'test_values.csv')
)

# Custom features

Notamos que, si utilizamos el CSV que exportamos del notebook de feature engineering, el modelo da un peor puntaje que cuando utilizamos el train_values.csv, por más que utilicemos los mismos features. Por lo tanto, con la esperanza de eliminar estos efectos negativos, calcularemos aquí mismo los features que vayamos a utilizar, copiando el código que escribimos en el notebook mencionado anteriormente.

In [49]:
def use(x):
    """Collapse the secondary-use flags of one row into a single integer code.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series)
        Must support ``x[column_name]`` for ``has_secondary_use`` and the
        ``has_secondary_use_*`` flags listed below.

    Returns
    -------
    int or None
        1 when ``has_secondary_use`` is falsy; otherwise the code of the first
        truthy flag, checked in the order listed below. ``None`` when
        ``has_secondary_use`` is truthy but none of the listed flags is set.
    """
    if not x['has_secondary_use']:
        # No secondary use at all.
        return 1

    # Checked in order: the first truthy flag wins.
    coded_flags = (
        ('has_secondary_use_agriculture', 2),   # agriculture
        ('has_secondary_use_hotel', 3),         # hotel
        ('has_secondary_use_rental', 4),        # rental
        ('has_secondary_use_institution', 5),   # institution
        ('has_secondary_use_school', 6),        # school
        ('has_secondary_use_industry', 7),      # industry
        ('has_secondary_use_health_post', 8),   # health post
        ('has_secondary_use_gov_office', 9),    # government office
        ('has_secondary_use_use_police', 10),   # police station
        ('has_secondary_use_other', 11),        # any other use
    )
    for flag_column, code in coded_flags:
        if x[flag_column]:
            return code

    # NOTE(review): no flag matched; None becomes a missing value in the
    # resulting column. Confirm this cannot happen in the data.
    return None
    
# Add the integer 'use' code to both frames, one row at a time.
for frame in (train_values, test_values):
    frame['use'] = frame.apply(use, axis=1)

In [50]:
# Ratio of the height column to the area column, added to both frames.
train_values['height per area'] = train_values['height_percentage'].div(train_values['area_percentage'])
test_values['height per area'] = test_values['height_percentage'].div(test_values['area_percentage'])

In [51]:
# Superstructure flag columns summed into 'cant_materials'. Defined once so
# train and test cannot drift apart (presumably 0/1 indicators, which would
# make the sum a count of materials -- confirm against the data).
superstructure_cols = [
    'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber', 'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered', 'has_superstructure_rc_engineered',
    'has_superstructure_other',
]

# Row-wise sum of the flags for the training frame.
subset = train_values[superstructure_cols]
train_values["cant_materials"] = subset.sum(axis=1)

# Same computation for the test frame.
subset_test = test_values[superstructure_cols]
test_values["cant_materials"] = subset_test.sum(axis=1)

In [52]:
# Extra column: 1 when the building has one of the floor counts that the
# authors observed to come with considerable damage, 0 otherwise.
def bad_cant_floor(x):
    """Return 1 if ``x`` is strictly between 0 and 6, or equal to 8; else 0."""
    in_low_range = 0 < x < 6
    return 1 if (in_low_range or x == 8) else 0

# Flag each building from its floor count. Only one column is needed, so map
# over that column instead of doing a row-wise apply across the whole frame.
train_values['bad_cant_floor'] = train_values['count_floors_pre_eq'].map(bad_cant_floor)
test_values['bad_cant_floor'] = test_values['count_floors_pre_eq'].map(bad_cant_floor)

In [28]:
# Feature columns used for the train/validation experiment.
split_features = [
    'foundation_type', 'area_percentage', 'height_percentage', 'count_floors_pre_eq',
    'land_surface_condition', 'has_superstructure_cement_mortar_stone', 'age',
    'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
    'height per area', 'cant_materials', 'bad_cant_floor',
]

# Feature matrix and target column.
X1 = train_values[split_features]
y1 = train_labels['damage_grade']

In [16]:
# One-hot encode the feature matrix (by default pd.get_dummies converts the
# object/category columns and passes the other columns through).
X1 = pd.get_dummies(X1)

In [17]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.25, random_state=12)

In [18]:
# Classifier for the hold-out experiment. L1 regularisation is passed as
# `reg_alpha`, the documented scikit-learn-API name; `alpha` only reaches the
# booster through the wrapper's **kwargs, which XGBoost documents as not
# guaranteed to interact properly with scikit-learn utilities.
xg_clas = XGBClassifier(colsample_bytree = 0.4,
                        learning_rate = 0.36,
                        max_depth = 5,
                        reg_alpha = 1,
                        n_estimators = 100)

In [19]:
# Fit on the training portion of the split only.
xg_clas.fit(X_train,y_train)





XGBClassifier(alpha=1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.36, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=1,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [20]:
# NOTE: despite the variable name, these are predictions for the held-out
# rows (X_test), not for the training rows.
predictions_train = xg_clas.predict(X_test)

In [21]:
# Micro-averaged F1 on the held-out rows (f1_score is imported in the
# notebook's import cell).
f1_score(y_test, predictions_train, average='micro')

0.7140489017820141

# Sin split

In [53]:
# Feature columns used when training on the full training set.
selected_features = ['foundation_type', 'area_percentage', 'height_percentage', 'count_floors_pre_eq',
                     'land_surface_condition', 'has_superstructure_cement_mortar_stone', 'age', 'geo_level_1_id',
                     'geo_level_2_id','geo_level_3_id', 'height per area', 'cant_materials', 'bad_cant_floor']

# One-hot encode each frame's selected columns.
train_values_subset = pd.get_dummies(train_values[selected_features])
test_values_subset = pd.get_dummies(test_values[selected_features])

# The two frames are encoded independently, so a category present in only one
# of them would yield different columns. Force the test frame onto the exact
# training columns (same set, same order); dummies absent from test become 0.
test_values_subset = test_values_subset.reindex(columns=train_values_subset.columns, fill_value=0)

In [54]:
# Fresh classifier for the full-data fit. L1 regularisation is passed as
# `reg_alpha`, the documented scikit-learn-API name; `alpha` only reaches the
# booster through the wrapper's **kwargs, which XGBoost documents as not
# guaranteed to interact properly with scikit-learn utilities.
xg_clas = XGBClassifier(colsample_bytree = 0.4,
                        learning_rate = 0.36,
                        max_depth = 5,
                        reg_alpha = 1,
                        n_estimators = 100)

In [55]:
# Flatten the label frame's values into a 1-D array, then fit on every
# training row (no hold-out).
full_labels = train_labels.values.ravel()
xg_clas.fit(train_values_subset, full_labels)



XGBClassifier(alpha=1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.36, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=1,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [56]:
# Predict on the same rows the model was fitted on, so any score computed from
# these predictions is a training-set score, not an estimate of generalisation.
predictions_train = xg_clas.predict(train_values_subset)

In [57]:
# Micro-averaged F1 of the predictions against the training labels
# (f1_score is imported in the notebook's import cell).
f1_score(train_labels, predictions_train, average='micro')

0.7194792038403536