In [72]:
import pandas as pd
pd.options.display.float_format = '{:20,.2f}'.format
import matplotlib.pyplot as plt
import numpy as np

In [73]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import f1_score

In [74]:
df_train_labels_original = pd.read_csv('train_labels.csv',low_memory=False,index_col='building_id')
df_train_values_original = pd.read_csv('train_values.csv',low_memory=False, index_col='building_id', dtype= {
'geo_level_1_id':'uint8', 'geo_level_2_id':'uint16', 'geo_level_3_id':'uint16', 'count_floors_pre_eq':'uint8','age':'uint16', 'area_percentage':'uint16', 'height_percentage':'uint16', 
'land_surface_condition':'category', 'foundation_type':'category', 'roof_type':'category', 'ground_floor_type':'category', 'other_floor_type':'category', 'position':'category','plan_configuration':'category', 
'has_superstructure_adobe_mud':'bool', 'has_superstructure_mud_mortar_stone':'bool','has_superstructure_stone_flag':'bool', 'has_superstructure_cement_mortar_stone':'bool', 'has_superstructure_mud_mortar_brick':'bool', 'has_superstructure_cement_mortar_brick':'bool', 'has_superstructure_timber':'bool', 'has_superstructure_bamboo':'bool', 'has_superstructure_rc_non_engineered':'bool', 'has_superstructure_rc_engineered':'bool', 'has_superstructure_other':'bool', 
'legal_ownership_status':'category', 'count_families':'uint16', 
'has_secondary_use':'bool', 'has_secondary_use_agriculture':'bool', 'has_secondary_use_hotel':'bool', 'has_secondary_use_rental':'bool', 'has_secondary_use_institution':'bool', 'has_secondary_use_school':'bool', 'has_secondary_use_industry':'bool', 'has_secondary_use_health_post':'bool', 'has_secondary_use_gov_office':'bool', 'has_secondary_use_use_police':'bool', 'has_secondary_use_other':'bool',})
df_test_values_original = pd.read_csv('test_values.csv',low_memory=False, index_col='building_id', dtype= {
'geo_level_1_id':'uint8', 'geo_level_2_id':'uint16', 'geo_level_3_id':'uint16', 'count_floors_pre_eq':'uint8','age':'uint16', 'area_percentage':'uint16', 'height_percentage':'uint16', 
'land_surface_condition':'category', 'foundation_type':'category', 'roof_type':'category', 'ground_floor_type':'category', 'other_floor_type':'category', 'position':'category','plan_configuration':'category', 
'has_superstructure_adobe_mud':'bool', 'has_superstructure_mud_mortar_stone':'bool','has_superstructure_stone_flag':'bool', 'has_superstructure_cement_mortar_stone':'bool', 'has_superstructure_mud_mortar_brick':'bool', 'has_superstructure_cement_mortar_brick':'bool', 'has_superstructure_timber':'bool', 'has_superstructure_bamboo':'bool', 'has_superstructure_rc_non_engineered':'bool', 'has_superstructure_rc_engineered':'bool', 'has_superstructure_other':'bool', 
'legal_ownership_status':'category', 'count_families':'uint16', 
'has_secondary_use':'bool', 'has_secondary_use_agriculture':'bool', 'has_secondary_use_hotel':'bool', 'has_secondary_use_rental':'bool', 'has_secondary_use_institution':'bool', 'has_secondary_use_school':'bool', 'has_secondary_use_industry':'bool', 'has_secondary_use_health_post':'bool', 'has_secondary_use_gov_office':'bool', 'has_secondary_use_use_police':'bool', 'has_secondary_use_other':'bool',})

In [75]:
train_values_subset = pd.get_dummies(df_train_values_original)
train_labels_subset = df_train_labels_original['damage_grade']

validation_size = df_train_values_original.index.size - df_test_values_original.index.size

In [76]:
test_values_subset = pd.get_dummies(df_test_values_original)
geo1Test = pd.get_dummies(test_values_subset["geo_level_1_id"])
geo1TestNames = {}
for x in geo1Test.columns: geo1TestNames[x] = ('geo1Test_'+ str(x))
geo1Test = geo1Test.rename(geo1TestNames, axis=1)
# test_values_subset = pd.concat([test_values_subset, geo1Test], axis=1)

geo2Test = pd.get_dummies(test_values_subset["geo_level_2_id"])
geo2TestNames = {}
for x in geo2Test.columns: geo2TestNames[x] = ('geo2_'+ str(x))
geo2Test = geo2Test.rename(geo2TestNames, axis=1)

geo2Test = geo2Test[['geo2_36', 'geo2_39', 'geo2_51', 'geo2_88', 'geo2_105', 'geo2_142',
       'geo2_158', 'geo2_173', 'geo2_229', 'geo2_233', 'geo2_258', 'geo2_303',
       'geo2_323', 'geo2_363', 'geo2_399', 'geo2_421', 'geo2_477', 'geo2_508',
       'geo2_533', 'geo2_566', 'geo2_582', 'geo2_617', 'geo2_641', 'geo2_673',
       'geo2_682', 'geo2_715', 'geo2_797', 'geo2_811', 'geo2_817', 'geo2_819',
       'geo2_839', 'geo2_856', 'geo2_864', 'geo2_886', 'geo2_896', 'geo2_937',
       'geo2_977', 'geo2_1001', 'geo2_1009', 'geo2_1023', 'geo2_1050',
       'geo2_1074', 'geo2_1080', 'geo2_1132', 'geo2_1149', 'geo2_1155',
       'geo2_1183', 'geo2_1253', 'geo2_1313', 'geo2_1401']]

test_values_subset = pd.concat([test_values_subset, geo1Test, geo2Test], axis=1)

In [77]:
geo1 = pd.get_dummies(train_values_subset["geo_level_1_id"])
geo1Names = {}
for x in geo1.columns: geo1Names[x] = ('geo1_'+ str(x))
geo1 = geo1.rename(geo1Names, axis=1)

In [78]:
geo2 = pd.get_dummies(train_values_subset["geo_level_2_id"])
geo2Names = {}
for x in geo2.columns: geo2Names[x] = ('geo2_'+ str(x))
geo2 = geo2.rename(geo2Names, axis=1)
geo2 = geo2[['geo2_36', 'geo2_39', 'geo2_51', 'geo2_88', 'geo2_105', 'geo2_142',
       'geo2_158', 'geo2_173', 'geo2_229', 'geo2_233', 'geo2_258', 'geo2_303',
       'geo2_323', 'geo2_363', 'geo2_399', 'geo2_421', 'geo2_477', 'geo2_508',
       'geo2_533', 'geo2_566', 'geo2_582', 'geo2_617', 'geo2_641', 'geo2_673',
       'geo2_682', 'geo2_715', 'geo2_797', 'geo2_811', 'geo2_817', 'geo2_819',
       'geo2_839', 'geo2_856', 'geo2_864', 'geo2_886', 'geo2_896', 'geo2_937',
       'geo2_977', 'geo2_1001', 'geo2_1009', 'geo2_1023', 'geo2_1050',
       'geo2_1074', 'geo2_1080', 'geo2_1132', 'geo2_1149', 'geo2_1155',
       'geo2_1183', 'geo2_1253', 'geo2_1313', 'geo2_1401']]

In [79]:
train_values_subset = pd.concat([train_values_subset, geo1, geo2], axis=1)

In [80]:
train_values, validation_values = (train_values_subset.iloc[0:173733], train_values_subset.iloc[173733:-1])
train_labels, validation_labels = (train_labels_subset.iloc[0:173733], train_labels_subset.iloc[173733:-1])

In [82]:
param_grid = {
        'n_estimators': [100, 200, 300, 400],
        'min_samples_split': [0.1, 0.5, 0.9],
        'min_samples_leaf': [0.1, 0.3, 0.5],
        'max_features': ['auto', 'log2', None],
        'n_jobs': [-1],
        'bootstrap':[True]}

In [85]:
rs_clf = GridSearchCV(RandomForestClassifier(), param_grid, verbose=2, scoring='f1_micro', cv=2)

In [None]:
rs_clf.fit(train_values, train_labels)

In [None]:
rs_clf.best_params_

In [105]:
rndfor = RandomForestClassifier(n_estimators=300, bootstrap=True, n_jobs=-1)

In [113]:
rndfor.fit(train_values, train_labels)

RandomForestClassifier(n_jobs=-1)

In [114]:
# predictions = rndfor.predict(test_values_subset) 
predictionsProba = rndfor.predict_proba(validation_values) 

In [107]:
predictions = rndfor.predict(validation_values) 

In [112]:
df_prediction_rndForest = pd.DataFrame(predictionsProba, columns = ['rndf1', 'rndf2', 'rndf3'])
df_prediction_rndForest.to_csv("randomForestPredictionProbaTrain.csv")

In [108]:
f1_score(validation_labels, predictions, average='micro')

0.7203195689962817

In [None]:
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')
my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)
my_submission.to_csv('submissionRandomForest.csv')