In [118]:
import pandas as pd
pd.options.display.float_format = '{:20,.2f}'.format
import matplotlib.pyplot as plt
import numpy as np

In [119]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score
from sklearn.tree import export_graphviz

from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

In [120]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [121]:
pd.options.display.max_seq_items = 200


In [122]:
df_train_labels_original = pd.read_csv('train_labels.csv',low_memory=False,index_col='building_id')
df_train_values_original = pd.read_csv('train_values.csv',low_memory=False, index_col='building_id', dtype= {
'geo_level_1_id':'uint8', 'geo_level_2_id':'uint16', 'geo_level_3_id':'uint16', 'count_floors_pre_eq':'uint8','age':'uint16', 'area_percentage':'uint16', 'height_percentage':'uint16', 
'land_surface_condition':'category', 'foundation_type':'category', 'roof_type':'category', 'ground_floor_type':'category', 'other_floor_type':'category', 'position':'category','plan_configuration':'category', 
'has_superstructure_adobe_mud':'bool', 'has_superstructure_mud_mortar_stone':'bool','has_superstructure_stone_flag':'bool', 'has_superstructure_cement_mortar_stone':'bool', 'has_superstructure_mud_mortar_brick':'bool', 'has_superstructure_cement_mortar_brick':'bool', 'has_superstructure_timber':'bool', 'has_superstructure_bamboo':'bool', 'has_superstructure_rc_non_engineered':'bool', 'has_superstructure_rc_engineered':'bool', 'has_superstructure_other':'bool', 
'legal_ownership_status':'category', 'count_families':'uint16', 
'has_secondary_use':'bool', 'has_secondary_use_agriculture':'bool', 'has_secondary_use_hotel':'bool', 'has_secondary_use_rental':'bool', 'has_secondary_use_institution':'bool', 'has_secondary_use_school':'bool', 'has_secondary_use_industry':'bool', 'has_secondary_use_health_post':'bool', 'has_secondary_use_gov_office':'bool', 'has_secondary_use_use_police':'bool', 'has_secondary_use_other':'bool',})
df_test_values_original = pd.read_csv('test_values.csv',low_memory=False, index_col='building_id', dtype= {
'geo_level_1_id':'uint8', 'geo_level_2_id':'uint16', 'geo_level_3_id':'uint16', 'count_floors_pre_eq':'uint8','age':'uint16', 'area_percentage':'uint16', 'height_percentage':'uint16', 
'land_surface_condition':'category', 'foundation_type':'category', 'roof_type':'category', 'ground_floor_type':'category', 'other_floor_type':'category', 'position':'category','plan_configuration':'category', 
'has_superstructure_adobe_mud':'bool', 'has_superstructure_mud_mortar_stone':'bool','has_superstructure_stone_flag':'bool', 'has_superstructure_cement_mortar_stone':'bool', 'has_superstructure_mud_mortar_brick':'bool', 'has_superstructure_cement_mortar_brick':'bool', 'has_superstructure_timber':'bool', 'has_superstructure_bamboo':'bool', 'has_superstructure_rc_non_engineered':'bool', 'has_superstructure_rc_engineered':'bool', 'has_superstructure_other':'bool', 
'legal_ownership_status':'category', 'count_families':'uint16', 
'has_secondary_use':'bool', 'has_secondary_use_agriculture':'bool', 'has_secondary_use_hotel':'bool', 'has_secondary_use_rental':'bool', 'has_secondary_use_institution':'bool', 'has_secondary_use_school':'bool', 'has_secondary_use_industry':'bool', 'has_secondary_use_health_post':'bool', 'has_secondary_use_gov_office':'bool', 'has_secondary_use_use_police':'bool', 'has_secondary_use_other':'bool',})

In [123]:
train_values_subset = pd.get_dummies(df_train_values_original)
train_labels_subset = df_train_labels_original['damage_grade']

validation_size = df_train_values_original.index.size - df_test_values_original.index.size

In [124]:
test_values_subset = pd.get_dummies(df_test_values_original)
geo1Test = pd.get_dummies(test_values_subset["geo_level_1_id"])
geo1TestNames = {}
for x in geo1Test.columns: geo1TestNames[x] = ('geo1Test_'+ str(x))
geo1Test = geo1Test.rename(geo1TestNames, axis=1)
test_values_subset = pd.concat([test_values_subset, geo1Test], axis=1)

In [125]:
# normal features selected
#  train_values_subset = train_values_subset[['geo_level_1_id', 'has_superstructure_mud_mortar_stone',
#        'has_superstructure_cement_mortar_brick', 'foundation_type_i',
#        'foundation_type_r', 'foundation_type_w', 'roof_type_x',
#        'ground_floor_type_v', 'other_floor_type_q']]

In [126]:
geo1 = pd.get_dummies(train_values_subset["geo_level_1_id"])
geo1Names = {}
for x in geo1.columns: geo1Names[x] = ('geo1_'+ str(x))
geo1 = geo1.rename(geo1Names, axis=1)

In [127]:
# geo2 = pd.get_dummies(train_values_subset["geo_level_2_id"])
# geo2Names = {}
# for x in geo2.columns: geo2Names[x] = ('geo2_'+ str(x))
# geo2 = geo2.rename(geo2Names, axis=1)

In [128]:
train_values_subset = pd.concat([train_values_subset, geo1], axis=1)

In [129]:
# train_values_subset = train_values_subset[['has_superstructure_mud_mortar_stone',
#        'has_superstructure_cement_mortar_brick', 'foundation_type_i',
#        'foundation_type_r', 'roof_type_x', 'ground_floor_type_v',
#        'other_floor_type_q', 'geo1_1', 'geo1_4', 'geo1_5', 'geo1_8', 'geo1_10',
#        'geo1_11', 'geo1_13', 'geo1_17', 'geo1_18', 'geo1_20', 'geo1_21',
#        'geo1_22', 'geo1_24', 'geo1_25', 'geo1_26', 'geo1_27', 'geo1_30']]

In [13]:
train_values, validation_values = (train_values_subset.iloc[0:173733], train_values_subset.iloc[173733:-1])
train_labels, validation_labels = (train_labels_subset.iloc[0:173733], train_labels_subset.iloc[173733:-1])

In [139]:
# 'importance_gain': ['gain'],
# 'kearning_rate': [0.5],

param_grid = {'booster': ['gbtree'],
'eta': [0.37, 0.4, 0.43, 0.7],
'max_depth': [6, 7, 8],
'subsample': [1],
'base_score': [0.5],
'gpu_id': [-1],
'n_estimators': [200, 250, 300],
'n_jobs': [16],
'objective': ['multi:softprob'],
'random_state': [0],
'reg_alpha': [0],
'reg_lambda': [0],
'validate_parameters': [1]}
gs = GridSearchCV(XGBClassifier(), param_grid)

In [None]:
gs.fit(train_values, train_labels)

In [141]:
gs.best_params_

{'base_score': 0.5,
 'booster': 'gbtree',
 'eta': 0.43,
 'gpu_id': -1,
 'max_depth': 6,
 'n_estimators': 300,
 'n_jobs': 16,
 'objective': 'multi:softprob',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 0,
 'subsample': 1,
 'validate_parameters': 1}

In [142]:
modelo = xgb.XGBClassifier(eta=0.43, n_estimators=300)

In [144]:
modelo.fit(train_values, train_labels)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.43, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.430000007, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=16, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [145]:
predictions = modelo.predict(validation_values) 

In [146]:
f1_score(validation_labels, predictions, average='micro')
# 0.7237961481345045 normal features + geolevel 1 y 2 categorizado
# 0.7129174485132443 selected features del anterior
# 0.7275720354104551 new best, normal features. solo get dummys
# 0.6717856032785753 normal features, selected. muy pocas categorias
# 0.7248322147651006 normal features + geoLevel1 categorizado.
# 0.6693450907709487 normal features + geolevel1 categorizado selected (solo 24 features)
# 0.7398513436249285 score con todos los datos, con normal features + geolevel1 categorizado
# Score real con lo de arriba: 0.7249
# cambiar a dart no trae mejoras
# cambiar a gblinear empeora

# 0.7262596843450332 baseline 
# 0.738093867636732 eta 0.7
# 0.7415474230720526 eta 0.7, n_estimators = 200
# 0.7438152578079132 eta=0.43, n_estimators=300
# esta fue la ultima entrega

# usar randomized search y probar otros algoritmos

0.7438152578079132

In [134]:
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')
my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)
my_submission.to_csv('submission7.csv')

In [23]:
# train_values_subset = train_values_subset[['has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
#        'has_superstructure_stone_flag', 'has_superstructure_mud_mortar_brick',
#        'has_superstructure_cement_mortar_brick',
#        'has_superstructure_rc_engineered', 'has_superstructure_other',
#        'foundation_type_h', 'foundation_type_i', 'foundation_type_r',
#        'foundation_type_w', 'roof_type_q', 'roof_type_x',
#        'ground_floor_type_v', 'other_floor_type_j', 'other_floor_type_q',
#        'other_floor_type_s', 'plan_configuration_u', 'geo1_0', 'geo1_1',
#        'geo1_2', 'geo1_3', 'geo1_4', 'geo1_5', 'geo1_6', 'geo1_7', 'geo1_8',
#        'geo1_9', 'geo1_10', 'geo1_11', 'geo1_12', 'geo1_13', 'geo1_14',
#        'geo1_15', 'geo1_16', 'geo1_17', 'geo1_18', 'geo1_19', 'geo1_20',
#        'geo1_21', 'geo1_22', 'geo1_24', 'geo1_25', 'geo1_26', 'geo1_27',
#        'geo1_28', 'geo1_29', 'geo1_30', 'geo2_4', 'geo2_9', 'geo2_10',
#        'geo2_13', 'geo2_14', 'geo2_20', 'geo2_21', 'geo2_26', 'geo2_27',
#        'geo2_30', 'geo2_31', 'geo2_36', 'geo2_38', 'geo2_41', 'geo2_46',
#        'geo2_48', 'geo2_49', 'geo2_50', 'geo2_52', 'geo2_57', 'geo2_64',
#        'geo2_65', 'geo2_66', 'geo2_70', 'geo2_74', 'geo2_76', 'geo2_79',
#        'geo2_87', 'geo2_88', 'geo2_91', 'geo2_94', 'geo2_95', 'geo2_96',
#        'geo2_103', 'geo2_104', 'geo2_105', 'geo2_107', 'geo2_108', 'geo2_109',
#        'geo2_110', 'geo2_123', 'geo2_126', 'geo2_127', 'geo2_128', 'geo2_138',
#        'geo2_142', 'geo2_144', 'geo2_147', 'geo2_148', 'geo2_151', 'geo2_155',
#        'geo2_157', 'geo2_158', 'geo2_161', 'geo2_162', 'geo2_164', 'geo2_165',
#        'geo2_167', 'geo2_169', 'geo2_170', 'geo2_171', 'geo2_172', 'geo2_173',
#        'geo2_177', 'geo2_178', 'geo2_181', 'geo2_192', 'geo2_195', 'geo2_197',
#        'geo2_199', 'geo2_203', 'geo2_208', 'geo2_211', 'geo2_213', 'geo2_214',
#        'geo2_217', 'geo2_221', 'geo2_223', 'geo2_225', 'geo2_239', 'geo2_240',
#        'geo2_241', 'geo2_244', 'geo2_254', 'geo2_255', 'geo2_257', 'geo2_258',
#        'geo2_260', 'geo2_261', 'geo2_269', 'geo2_270', 'geo2_275', 'geo2_276',
#        'geo2_282', 'geo2_287', 'geo2_296', 'geo2_302', 'geo2_305', 'geo2_310',
#        'geo2_314', 'geo2_322', 'geo2_323', 'geo2_326', 'geo2_327', 'geo2_328',
#        'geo2_337', 'geo2_339', 'geo2_345', 'geo2_347', 'geo2_349', 'geo2_358',
#        'geo2_363', 'geo2_366', 'geo2_369', 'geo2_373', 'geo2_376', 'geo2_381',
#        'geo2_382', 'geo2_384', 'geo2_385', 'geo2_390', 'geo2_395', 'geo2_396',
#        'geo2_400', 'geo2_405', 'geo2_408', 'geo2_414', 'geo2_417', 'geo2_421',
#        'geo2_422', 'geo2_423', 'geo2_432', 'geo2_438', 'geo2_445', 'geo2_447',
#        'geo2_451', 'geo2_452', 'geo2_457', 'geo2_459', 'geo2_461', 'geo2_466',
#        'geo2_469', 'geo2_471', 'geo2_474', 'geo2_477', 'geo2_480', 'geo2_487',
#        'geo2_488', 'geo2_489', 'geo2_491', 'geo2_499', 'geo2_506', 'geo2_508',
#        'geo2_512', 'geo2_514', 'geo2_517', 'geo2_519', 'geo2_521', 'geo2_526',
#        'geo2_527', 'geo2_529', 'geo2_533', 'geo2_539', 'geo2_543', 'geo2_545',
#        'geo2_546', 'geo2_548', 'geo2_555', 'geo2_556', 'geo2_557', 'geo2_561',
#        'geo2_562', 'geo2_568', 'geo2_572', 'geo2_574', 'geo2_575', 'geo2_587',
#        'geo2_591', 'geo2_592', 'geo2_600', 'geo2_603', 'geo2_605', 'geo2_613',
#        'geo2_616', 'geo2_617', 'geo2_619', 'geo2_621', 'geo2_622', 'geo2_623',
#        'geo2_625', 'geo2_628', 'geo2_630', 'geo2_634', 'geo2_639', 'geo2_640',
#        'geo2_641', 'geo2_645', 'geo2_647', 'geo2_648', 'geo2_649', 'geo2_651',
#        'geo2_652', 'geo2_658', 'geo2_659', 'geo2_661', 'geo2_667', 'geo2_673',
#        'geo2_675', 'geo2_676', 'geo2_682', 'geo2_685', 'geo2_689', 'geo2_690',
#        'geo2_694', 'geo2_700', 'geo2_701', 'geo2_704', 'geo2_706', 'geo2_715',
#        'geo2_716', 'geo2_719', 'geo2_721', 'geo2_726', 'geo2_727', 'geo2_728',
#        'geo2_729', 'geo2_730', 'geo2_732', 'geo2_733', 'geo2_735', 'geo2_746',
#        'geo2_747', 'geo2_749', 'geo2_751', 'geo2_753', 'geo2_760', 'geo2_761',
#        'geo2_763', 'geo2_765', 'geo2_770', 'geo2_772', 'geo2_773', 'geo2_777',
#        'geo2_788', 'geo2_799', 'geo2_800', 'geo2_803', 'geo2_804', 'geo2_812',
#        'geo2_816', 'geo2_818', 'geo2_819', 'geo2_825', 'geo2_826', 'geo2_828',
#        'geo2_838', 'geo2_839', 'geo2_842', 'geo2_843', 'geo2_845', 'geo2_852',
#        'geo2_855', 'geo2_856', 'geo2_863', 'geo2_864', 'geo2_867', 'geo2_869',
#        'geo2_873', 'geo2_874', 'geo2_891', 'geo2_892', 'geo2_896', 'geo2_898',
#        'geo2_902', 'geo2_907', 'geo2_909', 'geo2_912', 'geo2_922', 'geo2_923',
#        'geo2_926', 'geo2_929', 'geo2_930', 'geo2_931', 'geo2_932', 'geo2_933',
#        'geo2_935', 'geo2_936', 'geo2_937', 'geo2_942', 'geo2_943', 'geo2_946',
#        'geo2_949', 'geo2_953', 'geo2_957', 'geo2_958', 'geo2_963', 'geo2_972',
#        'geo2_974', 'geo2_981', 'geo2_987', 'geo2_991', 'geo2_992', 'geo2_1005',
#        'geo2_1006', 'geo2_1007', 'geo2_1009', 'geo2_1012', 'geo2_1015',
#        'geo2_1019', 'geo2_1021', 'geo2_1023', 'geo2_1024', 'geo2_1031',
#        'geo2_1033', 'geo2_1035', 'geo2_1038', 'geo2_1039', 'geo2_1041',
#        'geo2_1046', 'geo2_1047', 'geo2_1049', 'geo2_1050', 'geo2_1051',
#        'geo2_1055', 'geo2_1056', 'geo2_1060', 'geo2_1062', 'geo2_1065',
#        'geo2_1073', 'geo2_1074', 'geo2_1076', 'geo2_1082', 'geo2_1085',
#        'geo2_1087', 'geo2_1088', 'geo2_1090', 'geo2_1091', 'geo2_1095',
#        'geo2_1101', 'geo2_1108', 'geo2_1114', 'geo2_1115', 'geo2_1126',
#        'geo2_1127', 'geo2_1128', 'geo2_1131', 'geo2_1133', 'geo2_1134',
#        'geo2_1137', 'geo2_1138', 'geo2_1140', 'geo2_1142', 'geo2_1147',
#        'geo2_1149', 'geo2_1154', 'geo2_1155', 'geo2_1156', 'geo2_1160',
#        'geo2_1161', 'geo2_1166', 'geo2_1168', 'geo2_1170', 'geo2_1181',
#        'geo2_1182', 'geo2_1183', 'geo2_1188', 'geo2_1194', 'geo2_1205',
#        'geo2_1206', 'geo2_1210', 'geo2_1211', 'geo2_1213', 'geo2_1215',
#        'geo2_1217', 'geo2_1218', 'geo2_1219', 'geo2_1221', 'geo2_1222',
#        'geo2_1223', 'geo2_1227', 'geo2_1228', 'geo2_1229', 'geo2_1234',
#        'geo2_1240', 'geo2_1241', 'geo2_1251', 'geo2_1253', 'geo2_1259',
#        'geo2_1265', 'geo2_1275', 'geo2_1277', 'geo2_1278', 'geo2_1282',
#        'geo2_1287', 'geo2_1289', 'geo2_1294', 'geo2_1297', 'geo2_1300',
#        'geo2_1308', 'geo2_1310', 'geo2_1313', 'geo2_1320', 'geo2_1322',
#        'geo2_1357', 'geo2_1359', 'geo2_1365', 'geo2_1366', 'geo2_1374',
#        'geo2_1376', 'geo2_1377', 'geo2_1378', 'geo2_1381', 'geo2_1382',
#        'geo2_1385', 'geo2_1386', 'geo2_1389', 'geo2_1394', 'geo2_1398',
#        'geo2_1399', 'geo2_1401', 'geo2_1402', 'geo2_1411', 'geo2_1415',
#        'geo2_1416', 'geo2_1417', 'geo2_1418', 'geo2_1421', 'geo2_1425',
#        'geo2_1426']]