In [1]:
# Directory holding the CSV inputs. Paths are built by plain string
# concatenation (filepath + 'name.csv'), so the trailing slash is required.
filepath = 'data/'
#Imports
import numpy as np
import pandas as pd
from category_encoders import OrdinalEncoder
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [2]:
def wrangle(fm_path, tv_path=None):
    """Read the feature CSV, optionally joined with a second CSV, indexed by building_id.

    Parameters
    ----------
    fm_path : str or path-like
        CSV file that contains a ``building_id`` column.
    tv_path : str or path-like, optional
        Second CSV to merge in. When given, the two frames are combined with
        ``pd.merge`` using its defaults (inner join on the columns the two
        files have in common) before ``building_id`` becomes the index.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``building_id``.
    """
    # Nothing to merge: let read_csv set the index directly.
    if not tv_path:
        return pd.read_csv(fm_path, index_col='building_id')

    left = pd.read_csv(fm_path)
    right = pd.read_csv(tv_path)
    combined = pd.merge(left, right)
    return combined.set_index('building_id')

In [3]:
# Load the training features joined with their labels, then preview the result.
train_values_path = filepath + 'train_values.csv'
train_labels_path = filepath + 'train_labels.csv'
df = wrangle(train_values_path, train_labels_path)
print(df.shape)
df.head()

(260601, 39)


Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3


In [4]:
# Separate the target vector from the feature matrix.
target = 'damage_grade'
y = df[target]
X = df.drop(columns=[target])
print(X.shape)
y.shape

(260601, 38)


(260601,)

In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible. (train_test_split is imported in the top import cell.)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
# Majority-class baseline: the accuracy obtained by always predicting the
# most frequent label in the training set.
baseline_acc = y_train.value_counts(normalize=True).max()
print('Baseline accuracy:', baseline_acc)

Baseline accuracy: 0.5697045280122793


In [7]:
# Untuned reference model: ordinal-encode the categorical columns, then fit a
# random forest with default hyperparameters. make_pipeline names the steps
# after their lowercased class names ('ordinalencoder', 'randomforestclassifier').
encoder = OrdinalEncoder()
forest = RandomForestClassifier(n_jobs=-1, random_state=42)
model_rf = make_pipeline(encoder, forest)

model_rf.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['land_surface_condition',
                                      'foundation_type', 'roof_type',
                                      'ground_floor_type', 'other_floor_type',
                                      'position', 'plan_configuration',
                                      'legal_ownership_status'],
                                mapping=[{'col': 'land_surface_condition',
                                          'data_type': dtype('O'),
                                          'mapping': n      1
o      2
t      3
NaN   -2
dtype: int64},
                                         {'col': 'foundation_type',
                                          'data_type': dtype('O...
dtype: int64},
                                         {'col': 'position',
                                          'data_type': dtype('O'),
                                          'mapping': s      1
t      2
j      3
o      4
Na

In [8]:
# Accuracy of the untuned forest on both splits; a large gap between the two
# indicates overfitting.
rf_train_acc = model_rf.score(X_train, y_train)
rf_val_acc = model_rf.score(X_val, y_val)
print('RF Training score before tuning: ', rf_train_acc)
print('RF Validation score before tuning: ', rf_val_acc)

RF Training score before tuning:  0.9867709132770529
RF Validation score before tuning:  0.7200168837896433


In [9]:
#Hyperparameter tuning
params_grid = {
    'randomforestclassifier__max_depth':range(29, 33, 1),
    'randomforestclassifier__n_estimators':range(130, 140, 2)
}

model = RandomizedSearchCV(
    model_rf, 
    param_distributions=params_grid,
    n_jobs=-1,
    n_iter=30,
    cv=5,
    verbose=1
    )
model.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  elif pd.api.types.is_categorical(cols):


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('ordinalencoder',
                                              OrdinalEncoder(cols=['land_surface_condition',
                                                                   'foundation_type',
                                                                   'roof_type',
                                                                   'ground_floor_type',
                                                                   'other_floor_type',
                                                                   'position',
                                                                   'plan_configuration',
                                                                   'legal_ownership_status'],
                                                             mapping=[{'col': 'land_surface_condition',
                                                                       'data_type': dtype('O'),
               

In [10]:
# Best pipeline found by the search (already refit by RandomizedSearchCV).
model_be = model.best_estimator_
# Label each number with the split it was actually computed on: the first is
# the training split, the second the held-out validation split.
print('RF hyperparameter tuned training score: ', model_be.score(X_train, y_train))
print('RF hyperparameter tuned validation score: ', model_be.score(X_val, y_val))
print(model.best_params_)

RF hyperparameter tuned test score:  0.93788852647736
RF hyperparameter tuned score:  0.9380288175591412
{'randomforestclassifier__n_estimators': 136, 'randomforestclassifier__max_depth': 30}


In [11]:
#permutaion_importance.
perm_imp = permutation_importance(model_be, X_val, y_val, random_state=42)
data = {'imp_mean':perm_imp['importances_mean'],
        'imp_std':perm_imp['importances_std']}
df_perm = pd.DataFrame(data, index=X_val.columns).sort_values('imp_mean')
df_perm

Unnamed: 0,imp_mean,imp_std
has_secondary_use_gov_office,0.0,0.0
has_secondary_use_institution,1.5e-05,8e-06
has_secondary_use_use_police,1.9e-05,0.0
has_secondary_use_health_post,3.8e-05,0.0
has_secondary_use_school,3.8e-05,0.0
has_secondary_use_industry,6.9e-05,9e-06
has_secondary_use_rental,0.000407,4.1e-05
has_secondary_use_other,0.000668,1.4e-05
has_superstructure_other,0.001999,6.7e-05
has_secondary_use_hotel,0.002471,0.000129


In [18]:
# Features whose mean permutation importance is at or below this cut-off are
# treated as noise and dropped before refitting.
IMPORTANCE_THRESHOLD = 0.000019
cols_to_remove = df_perm[df_perm['imp_mean'] <= IMPORTANCE_THRESHOLD].index

# Work on a copy of the tuned pipeline. model.best_estimator_ is the very same
# object as model_be, so calling .fit on it directly would silently replace the
# tuned model with one trained on fewer columns and break any later use of
# model_be on the full feature set.
model_bedc = clone(model.best_estimator_)
model_bedc.fit(X_train.drop(columns=cols_to_remove), y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['land_surface_condition',
                                      'foundation_type', 'roof_type',
                                      'ground_floor_type', 'other_floor_type',
                                      'position', 'plan_configuration',
                                      'legal_ownership_status'],
                                mapping=[{'col': 'land_surface_condition',
                                          'data_type': dtype('O'),
                                          'mapping': n      1
o      2
t      3
NaN   -2
dtype: int64},
                                         {'col': 'foundation_type',
                                          'data_type': dtype('O...
                                          'data_type': dtype('O'),
                                          'mapping': s      1
t      2
j      3
o      4
NaN   -2
dtype: int64},
                                         {'col': 'plan

In [19]:
# Score the reduced-feature model; the same columns must be dropped from both
# splits so the inputs match what the model was fit on.
X_train_reduced = X_train.drop(columns=cols_to_remove)
X_val_reduced = X_val.drop(columns=cols_to_remove)
reduced_train_acc = model_bedc.score(X_train_reduced, y_train)
reduced_val_acc = model_bedc.score(X_val_reduced, y_val)
print('RF tuned and dropped low importance train score: ', reduced_train_acc)
print('RF tuned and dropped low importance validation score: ', reduced_val_acc)

RF tuned and dropped low importance train score:  0.9476640445126631
RF tuned and dropped low importance validation score:  0.7265977245256231


In [None]:
#TODO: ROC curve (not implemented yet)