In [21]:
# encoding=utf8

# importacion general de librerias y de visualizacion (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn import tree
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

%matplotlib inline

plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # seteando tipo de grid en seaborn

pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs

import warnings
warnings.filterwarnings('ignore')

In [2]:
#Set de entrenamiento con el target Damage Grade includio
data = pd.read_csv('data/clean_Data.csv',dtype = {
    "building_id" : "int32",
    "geo_level_1_id" : "int8", 
    "geo_level_2_id" : "int16",
    "geo_level_3_id" : "int16",
    "count_floors_pre_eq" : "int8",
    "age" : "int16",
    "area_percentage" :"int8",
    "height_percentage" :"int8",
    "land_surface_condition" :"category",
    "foundation_type" :"category",
    "roof_type" :"category",
    "ground_floor_type" :"category",
    "other_floor_type" :"category",
    "position" :"category",
    "plan_configuration" :"category",
    "has_superstructure_adobe_mud" : "bool",
    "has_superstructure_mud_mortar_stone" : "bool",
    "has_superstructure_stone_flag" : "bool",
    "has_superstructure_cement_mortar_stone" : "bool",
    "has_superstructure_mud_mortar_brick" : "bool",
    "has_superstructure_cement_mortar_brick" : "bool",
    "has_superstructure_timber" : "bool",
    "has_superstructure_bamboo" : "bool",
    "has_superstructure_rc_non_engineered" : "bool",
    "has_superstructure_rc_engineered" : "bool",
    "has_superstructure_other" : "bool",
    "legal_ownership_status" :"category",
    "count_families" : "int8",
    "has_secondary_use" : "bool",
    "has_secondary_use_agriculture" : "bool",
    "has_secondary_use_hotel" : "bool",
    "has_secondary_use_rental" : "bool",
    "has_secondary_use_institution" : "bool",
    "has_secondary_use_school" : "bool",
    "has_secondary_use_industry" : "bool",
    "has_secondary_use_health_post" : "bool",
    "has_secondary_use_gov_office" : "bool",
    "has_secondary_use_use_police" : "bool",
    "has_secondary_use_other" : "bool",
    "damage_grade" : "int8" 
                    
})

In [3]:
#Set de Test
data_test = pd.read_csv('data/test_values.csv',dtype = {
    "building_id" : "int32",
    "geo_level_1_id" : "int8", 
    "geo_level_2_id" : "int16",
    "geo_level_3_id" : "int16",
    "count_floors_pre_eq" : "int8",
    "age" : "int16",
    "area_percentage" :"int8",
    "height_percentage" :"int8",
    "land_surface_condition" :"category",
    "foundation_type" :"category",
    "roof_type" :"category",
    "ground_floor_type" :"category",
    "other_floor_type" :"category",
    "position" :"category",
    "plan_configuration" :"category",
    "has_superstructure_adobe_mud" : "bool",
    "has_superstructure_mud_mortar_stone" : "bool",
    "has_superstructure_stone_flag" : "bool",
    "has_superstructure_cement_mortar_stone" : "bool",
    "has_superstructure_mud_mortar_brick" : "bool",
    "has_superstructure_cement_mortar_brick" : "bool",
    "has_superstructure_timber" : "bool",
    "has_superstructure_bamboo" : "bool",
    "has_superstructure_rc_non_engineered" : "bool",
    "has_superstructure_rc_engineered" : "bool",
    "has_superstructure_other" : "bool",
    "legal_ownership_status" :"category",
    "count_families" : "int8",
    "has_secondary_use" : "bool",
    "has_secondary_use_agriculture" : "bool",
    "has_secondary_use_hotel" : "bool",
    "has_secondary_use_rental" : "bool",
    "has_secondary_use_institution" : "bool",
    "has_secondary_use_school" : "bool",
    "has_secondary_use_industry" : "bool",
    "has_secondary_use_health_post" : "bool",
    "has_secondary_use_gov_office" : "bool",
    "has_secondary_use_use_police" : "bool",
    "has_secondary_use_other" : "bool",
})

 - Tras los analisis realizados en el proceso de feature engineering, se llego a la conclusion de que las siguientes columnas son las mas importantes:
     - count_floors_per_eq
     - area_percentage
     - foundation_type (r,w,u,i)
     - roof_type (x)
     - other_floor_type(s,q,j)
     - ground_floor_type(f,v)
     - superstructure_mud_mortar_stone
     - superstructure_cement_mortar_brick
     - superstructure_rc_non_engineered
     - superstructure_rc_engineered

In [4]:
data_train = pd.DataFrame()
data_train["count_floors_pre_eq"] = data["count_floors_pre_eq"]
data_train["area_percentage"] = data["area_percentage"]
data_train["has_superstructure_mud_mortar_stone"] = data["has_superstructure_mud_mortar_stone"]
data_train["has_superstructure_mud_mortar_brick"] = data["has_superstructure_mud_mortar_brick"]
data_train["has_superstructure_rc_non_engineered"] = data["has_superstructure_rc_non_engineered"]
data_train["has_superstructure_rc_engineered"] = data["has_superstructure_rc_engineered"]

foundation_dummies = pd.get_dummies(data["foundation_type"])
data_train["foundation_r"] = foundation_dummies["r"]
data_train["foundation_w"] = foundation_dummies["w"]
data_train["foundation_u"] = foundation_dummies["u"]
data_train["foundation_i"] = foundation_dummies["i"]

roof_dummies = pd.get_dummies(data["roof_type"])
data_train["roof_x"] = roof_dummies["x"]


pb_dummies = pd.get_dummies(data["ground_floor_type"])
data_train["ground_floor_f"] = pb_dummies["f"]
data_train["ground_floor_v"] = pb_dummies["v"]

of_dummies = pd.get_dummies(data["other_floor_type"])
data_train["other_floor_s"] = of_dummies["s"]
data_train["other_floor_q"] = of_dummies["q"]
data_train["other_floor_j"] = of_dummies["j"]

data_train["damage_grade"] = data["damage_grade"].astype("int8")

data_train

Unnamed: 0,count_floors_pre_eq,area_percentage,has_superstructure_mud_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,foundation_r,foundation_w,foundation_u,foundation_i,roof_x,ground_floor_f,ground_floor_v,other_floor_s,other_floor_q,other_floor_j,damage_grade
0,2,6,True,False,False,False,1,0,0,0,0,1,0,0,1,0,3
1,2,8,True,False,False,False,1,0,0,0,0,0,0,0,1,0,2
2,2,5,True,False,False,False,1,0,0,0,0,1,0,0,0,0,3
3,2,6,True,False,False,False,1,0,0,0,0,1,0,0,0,0,2
4,3,8,False,False,False,False,1,0,0,0,0,1,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,1,6,True,False,False,False,1,0,0,0,0,1,0,0,0,1,2
260597,2,6,True,False,False,False,1,0,0,0,0,1,0,0,1,0,3
260598,3,6,True,False,False,False,1,0,0,0,0,1,0,0,1,0,3
260599,2,14,False,False,False,False,1,0,0,0,1,0,1,1,0,0,2


# Modelo Inicial Machine Learning

In [5]:
#Defino la X e y, X son los features y es el target a predecir (Damage Grade) 
X, y = data_train.iloc[:,:-1],data_train.iloc[:,-1]

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

# Random Forest

Hiperparametros a considerar:
    
    n_estimators: cantidad de árboles a construir (100)
    max_depth: máxima profundidad de cada árbol
    min_samples_split: la cantidad mínima de datos requeridos para splitear un nodo interno (2)
    min_samples_leaf: cantidad mínima de datos requeridos para ser una hoja (1)
    max_features: la cantidad de features a considerar cuando se busca el mejor split (n)

In [7]:
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.559067


In [16]:
y_preds = pd.Series(preds)

In [18]:
f1_score(y_test, y_preds, average='micro')

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [22]:
pipe = make_pipeline(StandardScaler(), 
                     RandomForestClassifier(random_state=2018))
pipe

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=2018))])

In [23]:
param_grid = {'randomforestclassifier__n_estimators': [50, 100],
              'randomforestclassifier__min_samples_leaf': [1, 5]}
gs = GridSearchCV(pipe, param_grid, cv=5)

In [26]:
gs.fit(X, y.values.ravel())

KeyboardInterrupt: 

In [None]:
gs.best_params_