In [56]:
import pandas as pd
# Render floats right-aligned in a 20-character field with thousands separators
# and two decimal places.
pd.set_option('display.float_format', '{:20,.2f}'.format)

In [57]:
# Let pandas print up to 2000 items of a sequence (e.g. a long column Index)
# before it abbreviates the repr.
pd.set_option('display.max_seq_items', 2000)

In [58]:
from sklearn.feature_selection import SelectFromModel
from xgboost.sklearn import XGBClassifier

In [59]:
# Column dtypes shared by train_values.csv and test_values.csv.
# Both files are read with the same mapping so their schemas cannot drift apart.
VALUES_DTYPES = {
    # Numeric identifiers and measurements, stored as compact unsigned integers.
    'geo_level_1_id': 'uint8',
    'geo_level_2_id': 'uint16',
    'geo_level_3_id': 'uint16',
    'count_floors_pre_eq': 'uint8',
    'age': 'uint16',
    'area_percentage': 'uint16',
    'height_percentage': 'uint16',
    # Categorical descriptors.
    'land_surface_condition': 'category',
    'foundation_type': 'category',
    'roof_type': 'category',
    'ground_floor_type': 'category',
    'other_floor_type': 'category',
    'position': 'category',
    'plan_configuration': 'category',
    # Superstructure flags.
    'has_superstructure_adobe_mud': 'bool',
    'has_superstructure_mud_mortar_stone': 'bool',
    'has_superstructure_stone_flag': 'bool',
    'has_superstructure_cement_mortar_stone': 'bool',
    'has_superstructure_mud_mortar_brick': 'bool',
    'has_superstructure_cement_mortar_brick': 'bool',
    'has_superstructure_timber': 'bool',
    'has_superstructure_bamboo': 'bool',
    'has_superstructure_rc_non_engineered': 'bool',
    'has_superstructure_rc_engineered': 'bool',
    'has_superstructure_other': 'bool',
    'legal_ownership_status': 'category',
    'count_families': 'uint16',
    # Secondary-use flags.
    'has_secondary_use': 'bool',
    'has_secondary_use_agriculture': 'bool',
    'has_secondary_use_hotel': 'bool',
    'has_secondary_use_rental': 'bool',
    'has_secondary_use_institution': 'bool',
    'has_secondary_use_school': 'bool',
    'has_secondary_use_industry': 'bool',
    'has_secondary_use_health_post': 'bool',
    'has_secondary_use_gov_office': 'bool',
    'has_secondary_use_use_police': 'bool',
    'has_secondary_use_other': 'bool',
}

# Labels and feature tables are all indexed by building_id.
df_train_labels_original = pd.read_csv(
    'train_labels.csv', low_memory=False, index_col='building_id'
)
df_train_values_original = pd.read_csv(
    'train_values.csv', low_memory=False, index_col='building_id', dtype=VALUES_DTYPES
)
df_test_values_original = pd.read_csv(
    'test_values.csv', low_memory=False, index_col='building_id', dtype=VALUES_DTYPES
)

In [60]:
# Target column of the label table.
train_labels_subset = df_train_labels_original['damage_grade']

# Feature matrix with the categorical columns expanded into indicator columns.
train_values_subset = pd.get_dummies(df_train_values_original)

# Row count of the training features minus row count of the test features.
# Not referenced by the other cells visible in this part of the notebook.
validation_size = len(df_train_values_original.index) - len(df_test_values_original.index)

In [61]:
# One indicator column per distinct geo_level_1_id, labelled "geo1_<id>".
# The ids are read from the untouched df_train_values_original rather than from
# train_values_subset: a later cell overwrites train_values_subset with a frame
# that no longer has a geo_level_1_id column, so reading from it made this cell
# fail with a KeyError when re-run. The column values are the same in both frames.
geo1 = pd.get_dummies(df_train_values_original["geo_level_1_id"])
geo1Names = {level: 'geo1_' + str(level) for level in geo1.columns}
geo1 = geo1.rename(columns=geo1Names)

In [62]:
# One indicator column per distinct geo_level_2_id, labelled "geo2_<id>".
# The ids are read from the untouched df_train_values_original rather than from
# train_values_subset: a later cell overwrites train_values_subset with a frame
# that no longer has a geo_level_2_id column, so reading from it made this cell
# fail with a KeyError when re-run. The column values are the same in both frames.
geo2 = pd.get_dummies(df_train_values_original["geo_level_2_id"])
geo2Names = {level: 'geo2_' + str(level) for level in geo2.columns}
geo2 = geo2.rename(columns=geo2Names)

In [63]:
# Replace the feature matrix with the frames listed here, joined column-wise.
# Only the geo1 indicator columns are included at the moment; geo2 is built in
# the previous cell but is not part of this list.
# NOTE(review): this rebinds train_values_subset, so the original one-hot frame
# (including the raw geo_level_*_id columns) is no longer reachable under this name.
train_values_subset = pd.concat([geo1], axis=1)

In [64]:
# Positional hold-out split: the first SPLIT_ROW rows are used for fitting and
# every remaining row is kept for validation.
# The validation slices are open-ended on purpose: a stop of -1 would silently
# drop the last row of the data from the validation set.
SPLIT_ROW = 173733
train_values = train_values_subset.iloc[:SPLIT_ROW]
validation_values = train_values_subset.iloc[SPLIT_ROW:]
train_labels = train_labels_subset.iloc[:SPLIT_ROW]
validation_labels = train_labels_subset.iloc[SPLIT_ROW:]

In [68]:
# Fit an XGBoost classifier inside SelectFromModel and keep at most 20 columns.
# NOTE(review): the saved output lists only 8 columns, presumably because
# SelectFromModel's default importance threshold is applied in addition to
# max_features - confirm.
xgb_params = dict(
    eta=0.4,
    n_estimators=400,
    max_depth=6,
    min_child_weight=3,
    gamma=0.25,
    reg_alpha=5,
    reg_lambda=0.5,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=0.5,
)
xgb_estimator = XGBClassifier(**xgb_params)
sel = SelectFromModel(xgb_estimator, max_features=20)
sel.fit(train_values, train_labels)

# Boolean support mask -> names of the retained columns.
selected_feat = train_values_subset.columns[sel.get_support()]
selected_feat



Index(['geo1_8', 'geo1_13', 'geo1_17', 'geo1_18', 'geo1_20', 'geo1_21', 'geo1_26', 'geo1_27'], dtype='object')

In [66]:
# Fit an XGBoost classifier inside SelectFromModel and keep at most 10 columns.
# NOTE(review): the saved output lists only 8 columns, presumably because
# SelectFromModel's default importance threshold is applied in addition to
# max_features - confirm.
xgb_params = dict(
    eta=0.4,
    n_estimators=400,
    max_depth=6,
    min_child_weight=3,
    gamma=0.25,
    reg_alpha=5,
    reg_lambda=0.5,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=0.5,
)
xgb_estimator = XGBClassifier(**xgb_params)
sel = SelectFromModel(xgb_estimator, max_features=10)
sel.fit(train_values, train_labels)

# Boolean support mask -> names of the retained columns.
selected_feat = train_values_subset.columns[sel.get_support()]
selected_feat



Index(['geo1_8', 'geo1_13', 'geo1_17', 'geo1_18', 'geo1_20', 'geo1_21', 'geo1_26', 'geo1_27'], dtype='object')

In [67]:
# Fit an XGBoost classifier inside SelectFromModel and keep at most 15 columns.
# NOTE(review): the saved output lists only 8 columns, presumably because
# SelectFromModel's default importance threshold is applied in addition to
# max_features - confirm.
xgb_params = dict(
    eta=0.4,
    n_estimators=400,
    max_depth=6,
    min_child_weight=3,
    gamma=0.25,
    reg_alpha=5,
    reg_lambda=0.5,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=0.5,
)
xgb_estimator = XGBClassifier(**xgb_params)
sel = SelectFromModel(xgb_estimator, max_features=15)
sel.fit(train_values, train_labels)

# Boolean support mask -> names of the retained columns.
selected_feat = train_values_subset.columns[sel.get_support()]
selected_feat



Index(['geo1_8', 'geo1_13', 'geo1_17', 'geo1_18', 'geo1_20', 'geo1_21', 'geo1_26', 'geo1_27'], dtype='object')

In [52]:
# Fit an XGBoost classifier inside SelectFromModel and keep at most 5 columns.
# NOTE(review): the saved output under this cell lists 30 geo2_* columns, which
# matches neither max_features=5 nor a geo1-only feature matrix. The execution
# count (52) is lower than that of the cells above, so the output looks like it
# came from an earlier kernel state - confirm by re-running.
xgb_params = dict(
    eta=0.4,
    n_estimators=400,
    max_depth=6,
    min_child_weight=3,
    gamma=0.25,
    reg_alpha=5,
    reg_lambda=0.5,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=0.5,
)
xgb_estimator = XGBClassifier(**xgb_params)
sel = SelectFromModel(xgb_estimator, max_features=5)
sel.fit(train_values, train_labels)

# Boolean support mask -> names of the retained columns.
selected_feat = train_values_subset.columns[sel.get_support()]
selected_feat



Index(['geo2_36', 'geo2_39', 'geo2_105', 'geo2_142', 'geo2_173', 'geo2_229', 'geo2_233', 'geo2_303', 'geo2_323', 'geo2_399', 'geo2_421', 'geo2_477', 'geo2_617', 'geo2_641', 'geo2_682', 'geo2_811', 'geo2_817', 'geo2_819', 'geo2_886', 'geo2_896', 'geo2_937', 'geo2_977', 'geo2_1001', 'geo2_1050', 'geo2_1074', 'geo2_1080', 'geo2_1149', 'geo2_1183', 'geo2_1253', 'geo2_1401'], dtype='object')

In [53]:
# Fit an XGBoost classifier inside SelectFromModel and keep at most 5 columns.
# NOTE(review): the saved output lists geo2_* columns and the execution count
# (53) is lower than that of the cells above, so this output presumably comes
# from an earlier run in which the feature matrix held the geo2 dummies - confirm.
xgb_params = dict(
    eta=0.4,
    n_estimators=400,
    max_depth=6,
    min_child_weight=3,
    gamma=0.25,
    reg_alpha=5,
    reg_lambda=0.5,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=0.5,
)
xgb_estimator = XGBClassifier(**xgb_params)
sel = SelectFromModel(xgb_estimator, max_features=5)
sel.fit(train_values, train_labels)

# Boolean support mask -> names of the retained columns.
selected_feat = train_values_subset.columns[sel.get_support()]
selected_feat



Index(['geo2_36', 'geo2_39', 'geo2_421', 'geo2_617', 'geo2_1149'], dtype='object')

In [54]:
# Fit an XGBoost classifier inside SelectFromModel and keep at most 2 columns.
# NOTE(review): the saved output lists geo2_* columns and the execution count
# (54) is lower than that of the cells above, so this output presumably comes
# from an earlier run in which the feature matrix held the geo2 dummies - confirm.
xgb_params = dict(
    eta=0.4,
    n_estimators=400,
    max_depth=6,
    min_child_weight=3,
    gamma=0.25,
    reg_alpha=5,
    reg_lambda=0.5,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=0.5,
)
xgb_estimator = XGBClassifier(**xgb_params)
sel = SelectFromModel(xgb_estimator, max_features=2)
sel.fit(train_values, train_labels)

# Boolean support mask -> names of the retained columns.
selected_feat = train_values_subset.columns[sel.get_support()]
selected_feat



Index(['geo2_39', 'geo2_421'], dtype='object')

In [37]:
# Display the most recently computed selection.
# NOTE(review): the saved output under this cell mixes raw, geo1_* and geo2_*
# column names and carries execution count 37, so it appears to predate the
# selection cells above - re-run to refresh it.
selected_feat

Index(['geo_level_1_id', 'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag', 'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick', 'foundation_type_h', 'foundation_type_i', 'foundation_type_r', 'foundation_type_w', 'roof_type_x', 'ground_floor_type_v', 'other_floor_type_q', 'other_floor_type_s', 'plan_configuration_u', 'geo1_1', 'geo1_3', 'geo1_4', 'geo1_5', 'geo1_6', 'geo1_8', 'geo1_10', 'geo1_11', 'geo1_13', 'geo1_17', 'geo1_18', 'geo1_20', 'geo1_21', 'geo1_24', 'geo1_26', 'geo1_27', 'geo2_10', 'geo2_14', 'geo2_21', 'geo2_29', 'geo2_30', 'geo2_38', 'geo2_41', 'geo2_46', 'geo2_48', 'geo2_49', 'geo2_76', 'geo2_87', 'geo2_88', 'geo2_91', 'geo2_95', 'geo2_104', 'geo2_105', 'geo2_107', 'geo2_127', 'geo2_138', 'geo2_142', 'geo2_155', 'geo2_165', 'geo2_173', 'geo2_181', 'geo2_199', 'geo2_225', 'geo2_239', 'geo2_257', 'geo2_258', 'geo2_260', 'geo2_269', 'geo2_275', 'geo2_276', 'geo2_282', 'geo2_310', 'geo2_323',
 