In [105]:
import pandas as pd
# Render floats with thousands separators and two decimals, padded to width 20.
pd.set_option('display.float_format', '{:20,.2f}'.format)
import matplotlib.pyplot as plt
import plotly.express as px

In [106]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score

In [107]:
# Training labels, indexed by building_id.
df_train_labels_original = pd.read_csv('train_labels.csv', index_col='building_id', low_memory=False)

# Explicit dtype for every feature column, grouped by target dtype.
train_dtypes = {}
train_dtypes.update(dict.fromkeys([
    'geo_level_1_id',
    'count_floors_pre_eq',
], 'uint8'))
train_dtypes.update(dict.fromkeys([
    'geo_level_2_id',
    'geo_level_3_id',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
], 'uint16'))
train_dtypes.update(dict.fromkeys([
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
], 'category'))
train_dtypes.update(dict.fromkeys([
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
], 'bool'))

# Training features, indexed by building_id and parsed with the dtypes above.
df_train_values_original = pd.read_csv(
    'train_values.csv',
    index_col='building_id',
    low_memory=False,
    dtype=train_dtypes,
)

In [108]:
# Display the dtype of each training feature column to check the overrides
# passed to read_csv were applied.
df_train_values_original.dtypes

geo_level_1_id                               uint8
geo_level_2_id                              uint16
geo_level_3_id                              uint16
count_floors_pre_eq                          uint8
age                                         uint16
area_percentage                             uint16
height_percentage                           uint16
land_surface_condition                    category
foundation_type                           category
roof_type                                 category
ground_floor_type                         category
other_floor_type                          category
position                                  category
plan_configuration                        category
has_superstructure_adobe_mud                  bool
has_superstructure_mud_mortar_stone           bool
has_superstructure_stone_flag                 bool
has_superstructure_cement_mortar_stone        bool
has_superstructure_mud_mortar_brick           bool
has_superstructure_cement_morta

In [109]:
# Explicit dtype for every test feature column (same mapping as used for the
# training features), expanded from (dtype, columns) groups.
test_dtypes = {
    column: dtype
    for dtype, columns in (
        ('uint8', (
            'geo_level_1_id',
            'count_floors_pre_eq',
        )),
        ('uint16', (
            'geo_level_2_id',
            'geo_level_3_id',
            'age',
            'area_percentage',
            'height_percentage',
            'count_families',
        )),
        ('category', (
            'land_surface_condition',
            'foundation_type',
            'roof_type',
            'ground_floor_type',
            'other_floor_type',
            'position',
            'plan_configuration',
            'legal_ownership_status',
        )),
        ('bool', (
            'has_superstructure_adobe_mud',
            'has_superstructure_mud_mortar_stone',
            'has_superstructure_stone_flag',
            'has_superstructure_cement_mortar_stone',
            'has_superstructure_mud_mortar_brick',
            'has_superstructure_cement_mortar_brick',
            'has_superstructure_timber',
            'has_superstructure_bamboo',
            'has_superstructure_rc_non_engineered',
            'has_superstructure_rc_engineered',
            'has_superstructure_other',
            'has_secondary_use',
            'has_secondary_use_agriculture',
            'has_secondary_use_hotel',
            'has_secondary_use_rental',
            'has_secondary_use_institution',
            'has_secondary_use_school',
            'has_secondary_use_industry',
            'has_secondary_use_health_post',
            'has_secondary_use_gov_office',
            'has_secondary_use_use_police',
            'has_secondary_use_other',
        )),
    )
    for column in columns
}

# Test features, indexed by building_id.
df_test_values_original = pd.read_csv(
    'test_values.csv',
    index_col='building_id',
    low_memory=False,
    dtype=test_dtypes,
)

In [110]:
# One-hot encode the pre-earthquake floor count, keeping only floors 1-4.
# The wanted levels are selected with reindex instead of dropping 5-9 by
# name: drop() raises KeyError when one of those labels is absent from the
# data, and would silently keep any other unexpected level. reindex always
# yields exactly these four columns (an absent level becomes an all-zero column).
floors_bool = (
    pd.get_dummies(df_train_values_original['count_floors_pre_eq'])
    .reindex(columns=[1, 2, 3, 4], fill_value=0)
)
floors_bool

Unnamed: 0_level_0,1,2,3,4
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
802906,0,1,0,0
28830,0,1,0,0
94947,0,1,0,0
590882,0,1,0,0
201944,0,0,1,0
...,...,...,...,...
688636,1,0,0,0
669485,0,1,0,0
602512,0,0,1,0
151409,0,1,0,0


In [128]:
# code for iterations 1-3
# selected_features = ['geo_level_3_id', 'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone', 'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick', 'has_superstructure_timber', 'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered', 'has_superstructure_rc_engineered', 'has_superstructure_other', 'area_percentage', 'height_percentage', 'legal_ownership_status']
# train_values_subset = df_train_values_original[selected_features]
# train_values_subset = pd.get_dummies(train_values_subset)
# train_values_subset = train_values_subset.drop(columns=['legal_ownership_status_v'])
# train_values_subset = pd.concat([train_values_subset, floors_bool], axis=1)
# train_values_subset.sample(3)

Unnamed: 0_level_0,geo_level_3_id,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,...,has_superstructure_other,area_percentage,height_percentage,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_w,1,2,3,4
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3354,6872,False,True,False,False,False,False,False,False,False,...,False,5,5,0,0,0,0,1,0,0
819032,7350,False,True,False,False,False,False,True,True,False,...,False,7,5,0,0,0,0,1,0,0
900564,7369,False,False,False,False,True,False,True,False,False,...,False,10,8,0,0,0,0,0,0,1


In [112]:
# iteration 4: run get_dummies over the whole training frame so every feature
# column is used, then show three random rows of the result.
train_values_subset =  pd.get_dummies(df_train_values_original)
train_values_subset.sample(3)


Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
544201,6,376,4080,2,15,8,4,False,True,False,...,0,0,0,0,0,0,0,0,1,0
12998,20,120,5039,2,20,5,7,False,True,False,...,0,0,0,0,0,0,0,0,1,0
962232,27,1106,6874,3,50,8,7,True,False,False,...,0,0,0,0,0,0,0,0,1,0


In [113]:
# Target column: the damage_grade label for each building.
train_labels_subset = df_train_labels_original['damage_grade']

In [114]:
# Random forest with a fixed seed, out-of-bag scoring and all cores in use.
forest = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=2018)

# Standardize the features, then feed them to the forest.
pipe = make_pipeline(StandardScaler(), forest)

In [115]:
# Hyperparameter candidates; the 'randomforestclassifier__' prefix routes each
# value to the forest step of the pipeline.
param_grid = dict(
    randomforestclassifier__n_estimators=[50, 200],
    randomforestclassifier__min_samples_leaf=[1, 10],
)

# Grid search over every combination using 5-fold cross-validation.
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

In [116]:
# Run the grid search on the full training set; the labels are passed as a
# flat 1-D array.
gs.fit(train_values_subset, train_labels_subset.values.ravel())

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(n_jobs=-1,
                                                               oob_score=True,
                                                               random_state=2018))]),
             param_grid={'randomforestclassifier__min_samples_leaf': [1, 10],
                         'randomforestclassifier__n_estimators': [50, 200]})

In [117]:
# Hyperparameter combination selected by the grid search.
gs.best_params_

{'randomforestclassifier__min_samples_leaf': 1,
 'randomforestclassifier__n_estimators': 200}

In [118]:
# In-sample check: the predictions are made on the same rows the grid search
# was fitted on, so this micro-averaged F1 measures fit to the training data
# rather than performance on unseen buildings.
in_sample_preds = gs.predict(train_values_subset)
f1_score(train_labels_subset, in_sample_preds, average='micro')

0.984297834620742

In [119]:
# One-hot encode the test features the same way as the training features.
test_values_subset = pd.get_dummies(df_test_values_original)

# get_dummies only creates columns for the category levels present in the
# frame it is given, so the test frame can end up with fewer, extra or
# differently ordered columns than the frame the model was fitted on.
# Align it to the training columns: levels missing from the test file become
# all-zero columns and levels unseen during training are dropped.
test_values_subset = test_values_subset.reindex(
    columns=train_values_subset.columns, fill_value=0
)

In [120]:
# Predict a damage grade for every row of the encoded test features.
predictions = gs.predict(test_values_subset)

In [121]:
# Template file that supplies the index and column layout for the submission.
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')

In [122]:
# Wrap the predictions in the template's layout (same index and columns).
# NOTE(review): this assumes the rows of test_values.csv are in the same
# order as submission_format.csv — TODO confirm, since predictions are
# attached to the template index by position.
my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [123]:
# Preview the first rows of the submission.
my_submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,2
99355,2
890251,2
745817,1
421793,3


In [126]:
# Write the submission (building_id index plus prediction column) to disk.
my_submission.to_csv('submission3.csv')

In [125]:
first iteration: 
    - sigo el tutorial y elijo los parámetros que parecen importantes según el informe realizado para la primera parte
    - selected_features = ['geo_level_1_id', 'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone', 'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick', 'has_superstructure_timber', 'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered', 'has_superstructure_rc_engineered', 'has_superstructure_other', 'area_percentage', 'height_percentage', 'legal_ownership_status']
    - param_grid = {'randomforestclassifier__n_estimators': [50, 200],
              'randomforestclassifier__min_samples_leaf': [1, 10]}
    - f1Score = 0.6830211702948186

SyntaxError: invalid syntax (<ipython-input-125-ab25fa539c34>, line 1)

In [None]:
second iteration: 
    - cambio geo_level_1_id por geo_level_3_id, que es más específico
    - selected_features = ['geo_level_3_id', 'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone', 'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick', 'has_superstructure_timber', 'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered', 'has_superstructure_rc_engineered', 'has_superstructure_other', 'area_percentage', 'height_percentage', 'legal_ownership_status']
    - param_grid = {'randomforestclassifier__n_estimators': [50, 200],
              'randomforestclassifier__min_samples_leaf': [1, 10]}
    - f1Score = 0.8886305117785427

In [None]:
third iteration:
    - igual que la segunda, solo que elimino legal_ownership_status_v y agrego los pisos 1, 2, 3 y 4 como columnas 1/0
    - f1Score = 0.8920533689433273

In [None]:
Realizo la primera entrega; conseguimos un score de 0.6520

In [None]:
fourth iteration:
    - agregué todas las columnas, convirtiendo las categóricas con get_dummies. Nos da un total de 68 columnas.
    - f1Score = 0.984297834620742
    - ¿puede ser un caso de overfitting?

In [127]:
Realizo la segunda entrega; conseguimos un score de 0.71.
Tener todos los features básicos mejora el score, aunque no por mucho.
Teniendo en cuenta que hay 3 veces más features, solo hubo un aumento de 0.06 en el F1 score.

SyntaxError: invalid syntax (<ipython-input-127-fec072e3d6fd>, line 1)

In [None]:
# 