In [56]:
import pandas as pd
pd.options.display.float_format = '{:20,.2f}'.format

In [57]:
# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import f1_score

In [58]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [59]:
df_train_labels_original = pd.read_csv('train_labels.csv',low_memory=False,index_col='building_id')
df_train_values_original = pd.read_csv('train_values.csv',low_memory=False, index_col='building_id', dtype= {
'geo_level_1_id':'uint8', 'geo_level_2_id':'uint16', 'geo_level_3_id':'uint16', 'count_floors_pre_eq':'uint8','age':'uint16', 'area_percentage':'uint16', 'height_percentage':'uint16', 
'land_surface_condition':'category', 'foundation_type':'category', 'roof_type':'category', 'ground_floor_type':'category', 'other_floor_type':'category', 'position':'category','plan_configuration':'category', 
'has_superstructure_adobe_mud':'bool', 'has_superstructure_mud_mortar_stone':'bool','has_superstructure_stone_flag':'bool', 'has_superstructure_cement_mortar_stone':'bool', 'has_superstructure_mud_mortar_brick':'bool', 'has_superstructure_cement_mortar_brick':'bool', 'has_superstructure_timber':'bool', 'has_superstructure_bamboo':'bool', 'has_superstructure_rc_non_engineered':'bool', 'has_superstructure_rc_engineered':'bool', 'has_superstructure_other':'bool', 
'legal_ownership_status':'category', 'count_families':'uint16', 
'has_secondary_use':'bool', 'has_secondary_use_agriculture':'bool', 'has_secondary_use_hotel':'bool', 'has_secondary_use_rental':'bool', 'has_secondary_use_institution':'bool', 'has_secondary_use_school':'bool', 'has_secondary_use_industry':'bool', 'has_secondary_use_health_post':'bool', 'has_secondary_use_gov_office':'bool', 'has_secondary_use_use_police':'bool', 'has_secondary_use_other':'bool',})
df_test_values_original = pd.read_csv('test_values.csv',low_memory=False, index_col='building_id', dtype= {
'geo_level_1_id':'uint8', 'geo_level_2_id':'uint16', 'geo_level_3_id':'uint16', 'count_floors_pre_eq':'uint8','age':'uint16', 'area_percentage':'uint16', 'height_percentage':'uint16', 
'land_surface_condition':'category', 'foundation_type':'category', 'roof_type':'category', 'ground_floor_type':'category', 'other_floor_type':'category', 'position':'category','plan_configuration':'category', 
'has_superstructure_adobe_mud':'bool', 'has_superstructure_mud_mortar_stone':'bool','has_superstructure_stone_flag':'bool', 'has_superstructure_cement_mortar_stone':'bool', 'has_superstructure_mud_mortar_brick':'bool', 'has_superstructure_cement_mortar_brick':'bool', 'has_superstructure_timber':'bool', 'has_superstructure_bamboo':'bool', 'has_superstructure_rc_non_engineered':'bool', 'has_superstructure_rc_engineered':'bool', 'has_superstructure_other':'bool', 
'legal_ownership_status':'category', 'count_families':'uint16', 
'has_secondary_use':'bool', 'has_secondary_use_agriculture':'bool', 'has_secondary_use_hotel':'bool', 'has_secondary_use_rental':'bool', 'has_secondary_use_institution':'bool', 'has_secondary_use_school':'bool', 'has_secondary_use_industry':'bool', 'has_secondary_use_health_post':'bool', 'has_secondary_use_gov_office':'bool', 'has_secondary_use_use_police':'bool', 'has_secondary_use_other':'bool',})

In [60]:
train_values_subset = pd.get_dummies(df_train_values_original)
train_labels_subset = df_train_labels_original['damage_grade']

validation_size = df_train_values_original.index.size - df_test_values_original.index.size

In [61]:
test_values_subset = pd.get_dummies(df_test_values_original)
geo1Test = pd.get_dummies(test_values_subset["geo_level_1_id"])
geo1TestNames = {}
for x in geo1Test.columns: geo1TestNames[x] = ('geo1Test_'+ str(x))
geo1Test = geo1Test.rename(geo1TestNames, axis=1)

test_values_subset = pd.concat([test_values_subset, geo1Test], axis=1)

mean_df = pd.read_csv('geolevel_2_id_mean.csv',low_memory=False)
test_values_subset = test_values_subset.reset_index().merge(mean_df,on='geo_level_2_id',how='left').set_index('building_id')
average_damage = pd.get_dummies(df_train_labels_original['damage_grade'][:int(len(df_train_labels_original)/2)]).mean().values
test_values_subset['0'] = test_values_subset['0'].fillna(average_damage[0])
test_values_subset['1'] = test_values_subset['1'].fillna(average_damage[1])
test_values_subset['2'] = test_values_subset['2'].fillna(average_damage[2])

In [62]:
geo1 = pd.get_dummies(train_values_subset["geo_level_1_id"])
geo1Names = {}
for x in geo1.columns: geo1Names[x] = ('geo1_'+ str(x))
geo1 = geo1.rename(geo1Names, axis=1)

In [63]:
train_values_subset = pd.concat([train_values_subset, geo1], axis=1)

In [64]:
mean_df = pd.read_csv('geolevel_2_id_mean.csv',low_memory=False)
train_values_subset = train_values_subset.reset_index().merge(mean_df,on='geo_level_2_id',how='left').set_index('building_id')
average_damage = pd.get_dummies(df_train_labels_original['damage_grade'][:int(len(df_train_labels_original)/2)]).mean().values
train_values_subset['0'] = train_values_subset['0'].fillna(average_damage[0])
train_values_subset['1'] = train_values_subset['1'].fillna(average_damage[1])
train_values_subset['2'] = train_values_subset['2'].fillna(average_damage[2])

In [65]:
train_values, validation_values = (train_values_subset.iloc[0:173733], train_values_subset.iloc[173733:-1])
train_labels, validation_labels = (train_labels_subset.iloc[0:173733], train_labels_subset.iloc[173733:-1])

In [66]:
param_grid = {
        'max_depth': [3, 5, 6, 7, 8, 10],
        'learning_rate': [0.1, 0.4, 0.43, 0.45, 0.5, 0.7],
        'subsample': [0.5, 0.75, 1.0],
        'colsample_bytree': [0.1, 0.5, 0.7, 0.8, 0.9, 1.0],
        'colsample_bylevel': [0.1, 0.5, 0.8, 1.0],
        'colsample_bynode': [0.1, 0.3, 0.5, 0.75, 0.9, 1.0],
        'min_child_weight': [0.1, 0.5, 1.0, 3.0, 5.0, 7.0, 10.0, 15, 20],
        'gamma': [0, 0.1, 0.15, 0.25, 0.5, 1],
        'reg_lambda': [0.1, 0.5, 1.0, 5.0, ],
        'reg_alpha': [0.1, 1.0, 5.0, 10.0],
        'n_estimators': [100, 200, 300, 400, 500]}

In [67]:
rs_clf = RandomizedSearchCV(XGBClassifier(), param_grid, verbose=2, scoring='f1_micro', cv=2, n_iter=50)

In [68]:
# rs_clf.fit(train_values, train_labels)

In [69]:
# rs_clf.best_params_

In [70]:
modelo = xgb.XGBClassifier(eta=0.4, n_estimators=400, subsample=1, 
                        reg_lambda=0.5, reg_alpha= 5, min_child_weight=3, max_depth=6, 
                        gamma=0.25, colsample_bytree=1, colsample_bylevel=1, colsample_bynode=0.5)

In [75]:
modelo.fit(train_values, train_labels)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=0.5, colsample_bytree=1, eta=0.4, gamma=0.25,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.400000006, max_delta_step=0, max_depth=6,
              min_child_weight=3, missing=nan, monotone_constraints='()',
              n_estimators=400, n_jobs=16, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=5,
              reg_lambda=0.5, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [76]:
predictions = modelo.predict(validation_values) 
predictionsProba = modelo.predict_proba(validation_values) 

In [77]:
f1_score(validation_labels, predictions, average='micro')

0.7467507799279358

Una parte del proceso de desarrollo, no es exaustivo ni completo, solo algunas cosas que fueron siendo anotadas aca

- 0.7237961481345045 normal features + geolevel 1 y 2 categorizado
- 0.7129174485132443 selected features del anterior
- 0.7275720354104551 new best, normal features. solo get dummys
- 0.6717856032785753 normal features, selected. muy pocas categorias
- 0.7248322147651006 normal features + geoLevel1 categorizado.
- 0.6693450907709487 normal features + geolevel1 categorizado selected (solo 24 features)
- 0.7398513436249285 score con todos los datos, con normal features + geolevel1 categorizado
- Score real con lo de arriba: 0.7249
- cambiar a dart no trae mejoras
- cambiar a gblinear empeora

- 0.7262596843450332 baseline 
- 0.738093867636732 eta 0.7
- 0.7415474230720526 eta 0.7, n_estimators = 200
- 0.7438152578079132 eta=0.43, n_estimators=300

- usar randomized search y probar otros algoritmos
- 0.744264220014505 params3
- 0.7459334384749099 con 50 geolevels
- 0.7453002866451012 ultima entrega con mean encoding 

In [54]:
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')
my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)
my_submission.to_csv('submissionXgBoost_meanEncoding_geo2.csv')

In [78]:
df_prediction_xgBoost = pd.DataFrame(predictionsProba, columns = ['xgb1', 'xgb2', 'xgb3'])
df_prediction_xgBoost.to_csv('xgBoostPredictionProbaTrain.csv')