In [1]:
# encoding=utf8

# importacion general de librerias y de visualizacion (matplotlib y seaborn)
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

%matplotlib inline

# Plot appearance: start from matplotlib's stock style, then apply seaborn's white grid.
plt.style.use('default')
#plt.rcParams['figure.figsize'] = (20, 10)
sns.set(style="whitegrid")

# Render floats in fixed-point with thousands separators instead of scientific notation.
pd.set_option('display.float_format', '{:20,.2f}'.format)

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

In [43]:
# Training feature table read from train_values.csv.
# Column dtypes are declared explicitly (grouped by dtype below); categorical columns are
# then one-hot encoded and the building_id identifier is dropped so it is not a feature.
data = (
    pd.read_csv(
        'data/train_values.csv',
        dtype={
            "building_id": "int32",
            **dict.fromkeys(
                [
                    "geo_level_1_id",
                    "count_floors_pre_eq",
                    "area_percentage",
                    "height_percentage",
                    "count_families",
                ],
                "int8",
            ),
            **dict.fromkeys(
                ["geo_level_2_id", "geo_level_3_id", "age"],
                "int16",
            ),
            **dict.fromkeys(
                [
                    "land_surface_condition",
                    "foundation_type",
                    "roof_type",
                    "ground_floor_type",
                    "other_floor_type",
                    "position",
                    "plan_configuration",
                    "legal_ownership_status",
                ],
                "category",
            ),
            **dict.fromkeys(
                [
                    "has_superstructure_adobe_mud",
                    "has_superstructure_mud_mortar_stone",
                    "has_superstructure_stone_flag",
                    "has_superstructure_cement_mortar_stone",
                    "has_superstructure_mud_mortar_brick",
                    "has_superstructure_cement_mortar_brick",
                    "has_superstructure_timber",
                    "has_superstructure_bamboo",
                    "has_superstructure_rc_non_engineered",
                    "has_superstructure_rc_engineered",
                    "has_superstructure_other",
                    "has_secondary_use",
                    "has_secondary_use_agriculture",
                    "has_secondary_use_hotel",
                    "has_secondary_use_rental",
                    "has_secondary_use_institution",
                    "has_secondary_use_school",
                    "has_secondary_use_industry",
                    "has_secondary_use_health_post",
                    "has_secondary_use_gov_office",
                    "has_secondary_use_use_police",
                    "has_secondary_use_other",
                ],
                "bool",
            ),
        },
    )
    .pipe(pd.get_dummies)
    .drop(columns="building_id")
)



In [52]:
# Target table read from train_labels.csv; only the damage_grade column is kept.
# NOTE(review): rows are presumably in the same order as train_values.csv, since the
# building_id key is discarded here and never used to join the two files - confirm.
label = pd.read_csv(
    'data/train_labels.csv',
    dtype={"building_id": "int32", "damage_grade": "int8"},
).drop(columns="building_id")

In [37]:
# Competition test features read from test_values.csv, using the same explicit dtypes
# as the training features (grouped by dtype below).
test = (
    pd.read_csv(
        'data/test_values.csv',
        dtype={
            "building_id": "int32",
            **dict.fromkeys(
                [
                    "geo_level_1_id",
                    "count_floors_pre_eq",
                    "area_percentage",
                    "height_percentage",
                    "count_families",
                ],
                "int8",
            ),
            **dict.fromkeys(
                ["geo_level_2_id", "geo_level_3_id", "age"],
                "int16",
            ),
            **dict.fromkeys(
                [
                    "land_surface_condition",
                    "foundation_type",
                    "roof_type",
                    "ground_floor_type",
                    "other_floor_type",
                    "position",
                    "plan_configuration",
                    "legal_ownership_status",
                ],
                "category",
            ),
            **dict.fromkeys(
                [
                    "has_superstructure_adobe_mud",
                    "has_superstructure_mud_mortar_stone",
                    "has_superstructure_stone_flag",
                    "has_superstructure_cement_mortar_stone",
                    "has_superstructure_mud_mortar_brick",
                    "has_superstructure_cement_mortar_brick",
                    "has_superstructure_timber",
                    "has_superstructure_bamboo",
                    "has_superstructure_rc_non_engineered",
                    "has_superstructure_rc_engineered",
                    "has_superstructure_other",
                    "has_secondary_use",
                    "has_secondary_use_agriculture",
                    "has_secondary_use_hotel",
                    "has_secondary_use_rental",
                    "has_secondary_use_institution",
                    "has_secondary_use_school",
                    "has_secondary_use_industry",
                    "has_secondary_use_health_post",
                    "has_secondary_use_gov_office",
                    "has_secondary_use_use_police",
                    "has_secondary_use_other",
                ],
                "bool",
            ),
        },
    )
    .pipe(pd.get_dummies)
    .drop(columns="building_id")
    # The train and test files are one-hot encoded independently, so a category level that
    # appears in only one of them would give the two frames different columns. Align to the
    # training columns (same set, same order): levels missing from the test file become
    # all-zero columns, and levels never seen in training are dropped.
    .reindex(columns=data.columns, fill_value=0)
)
test

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,17,596,11307,3,20,7,6,False,True,False,...,0,0,0,0,0,0,0,0,1,0
1,6,141,11987,2,25,13,5,False,True,False,...,0,0,0,0,0,0,0,0,1,0
2,22,19,10044,2,5,4,5,False,True,False,...,0,0,0,0,0,0,0,0,1,0
3,26,39,633,1,0,19,3,False,False,False,...,0,0,0,0,0,0,0,0,1,0
4,17,289,7970,3,15,8,7,False,True,False,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86863,4,605,3623,3,70,20,6,False,True,False,...,0,0,0,0,0,0,0,0,0,1
86864,10,1407,11907,3,25,6,7,True,True,True,...,0,0,0,0,0,0,0,0,1,0
86865,22,1136,7712,1,50,3,3,False,True,False,...,0,0,0,0,0,0,0,0,1,0
86866,6,1041,912,2,5,9,5,True,True,False,...,0,0,0,0,0,0,1,0,0,0


In [53]:
# Expose the feature table and the target table under the conventional X / y names.
X = data
y = label

y

Unnamed: 0,damage_grade
0,3
1,2
2,3
3,2
4,3
...,...
260596,2
260597,3
260598,3
260599,2


In [98]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)



# XGBoost

## Hiperparametros

- learning_rate: tasa de aprendizaje
- max_depth: máxima profundidad de cada árbol
- subsample: porcentaje de muestras usadas para cada árbol (valor muy bajo, posible underfitting)
- colsample_bytree: porcentaje de features usadas para cada árbol (valor muy alto, posible overfitting)
- n_estimators: cantidad de árboles a construir.
- objective: función de error a utilizar (algunas: reg:squarederror para regresión, reg:logistic o binary:logistic para clasificación)

Parámetros de regularización:

- gamma: umbral para hacer split basado en la reducción de error de hacer el nuevo split.
- alpha: regularización L1 sobre los pesos de las hojas. Un valor más alto genera una mayor regularización.
- lambda: regularización L2 sobre los pesos de las hojas; similar a alpha, pero penaliza los pesos de forma más suave.

In [55]:
# XGBoost regressor with a squared-error objective: the damage grade is modelled as a
# number here, not as a class label.
xg_reg = xgb.XGBRegressor(
    n_estimators=10,        # number of trees
    max_depth=5,            # maximum depth of each tree
    learning_rate=0.1,
    colsample_bytree=0.3,   # fraction of features sampled for each tree
    alpha=10,               # regularisation on the leaf weights
    objective='reg:squarederror',
)

In [56]:
# Fit on the training part of the split; the fitted estimator is the cell's output.
xg_reg.fit(X=X_train, y=y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=10, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [95]:
# Continuous predictions for the validation rows. Only the displayed value is rounded;
# `preds` itself keeps the raw regressor output.
preds = xg_reg.predict(X_test)
np.round(preds)


array([1., 2., 1., ..., 2., 2., 2.], dtype=float32)

In [104]:
# True damage grades of the validation rows as a float32 array (despite the name, these
# are the ground-truth labels from y_test, not model predictions).
preds_test = y_test["damage_grade"].to_numpy(dtype="float32")
preds_test

array([1., 2., 1., ..., 2., 2., 2.], dtype=float32)

In [106]:
# Micro-averaged F1 on the validation split.
# f1_score expects discrete class labels, so the regressor's continuous output is first
# rounded to the nearest integer and clipped to the range of grades present in the
# training labels; scoring the raw `preds` array directly raises an error.
grade_min = y_train["damage_grade"].min()
grade_max = y_train["damage_grade"].max()
preds_labels = np.rint(preds).clip(grade_min, grade_max).astype("int8")
f1_score(y_test["damage_grade"], preds_labels, average='micro')

## Predicción

In [115]:
# Predict the competition test set.
# The model is a regressor, so its output is continuous and unbounded: round to the nearest
# integer, then clip to the range of grades present in the training labels so that an
# over- or under-shooting prediction can never become a grade that does not occur in the
# labels. The result is still a float array; the cast to integer happens when the
# submission frame is built.
raw_predictions = xg_reg.predict(test)
predictions = raw_predictions.round().clip(
    label["damage_grade"].min(),
    label["damage_grade"].max(),
)

In [112]:
# Submission template, indexed by building_id; its index and columns are reused when the
# submission frame is built.
submission_format = pd.read_csv('data/submission_format.csv').set_index('building_id')

In [124]:
# Wrap the predictions in a frame that reuses the template's index and column names.
# Values are assigned by row position - assumes test_values.csv and submission_format.csv
# list the buildings in the same order (TODO confirm).
my_submission = pd.DataFrame(
    predictions,
    index=submission_format.index,
    columns=submission_format.columns,
)

In [125]:
# The rounded predictions are still floats; store the grade as a small integer.
my_submission = my_submission.astype({"damage_grade": "int8"})
my_submission

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,2
99355,2
890251,2
745817,1
421793,2
...,...
310028,2
663567,2
1049160,2
442785,2


In [126]:
# Write the submission file. The output directory is created first (including parents,
# no error if it already exists) so the cell also works on a fresh checkout where
# data/submissions/ has not been created yet.
submission_path = Path('data/submissions/xg_boost_submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
my_submission.to_csv(submission_path)