In [78]:
import pandas as pd
pd.options.display.float_format = '{:20,.2f}'.format
import matplotlib.pyplot as plt
import numpy as np

In [79]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import f1_score

In [81]:
# Column dtypes applied to both feature files (train_values.csv and
# test_values.csv). Grouped by dtype; the mapping is shared so the two files are
# always parsed with the same column types.
feature_dtypes = {
    **dict.fromkeys(['geo_level_1_id', 'count_floors_pre_eq'], 'uint8'),
    **dict.fromkeys(
        ['geo_level_2_id', 'geo_level_3_id', 'age', 'area_percentage',
         'height_percentage', 'count_families'],
        'uint16'),
    **dict.fromkeys(
        ['land_surface_condition', 'foundation_type', 'roof_type',
         'ground_floor_type', 'other_floor_type', 'position',
         'plan_configuration', 'legal_ownership_status'],
        'category'),
    **dict.fromkeys(
        ['has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
         'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone',
         'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick',
         'has_superstructure_timber', 'has_superstructure_bamboo',
         'has_superstructure_rc_non_engineered', 'has_superstructure_rc_engineered',
         'has_superstructure_other',
         'has_secondary_use', 'has_secondary_use_agriculture',
         'has_secondary_use_hotel', 'has_secondary_use_rental',
         'has_secondary_use_institution', 'has_secondary_use_school',
         'has_secondary_use_industry', 'has_secondary_use_health_post',
         'has_secondary_use_gov_office', 'has_secondary_use_use_police',
         'has_secondary_use_other'],
        'bool'),
}

# Labels and features are all indexed by building_id.
df_train_labels_original = pd.read_csv(
    'train_labels.csv', low_memory=False, index_col='building_id')
df_train_values_original = pd.read_csv(
    'train_values.csv', low_memory=False, index_col='building_id',
    dtype=feature_dtypes)
df_test_values_original = pd.read_csv(
    'test_values.csv', low_memory=False, index_col='building_id',
    dtype=feature_dtypes)

In [82]:
# One-hot encode the training features; the target is the 'damage_grade' column
# of the labels frame.
train_labels_subset = df_train_labels_original.loc[:, 'damage_grade']
train_values_subset = pd.get_dummies(data=df_train_values_original)

# Number of training rows minus number of test rows.
validation_size = len(df_train_values_original.index) - len(df_test_values_original.index)

In [83]:
# One-hot encode the test features, then append one indicator column per
# geo_level_1_id value. The indicator columns are labelled 'geo1Test_<id>'.
test_values_subset = pd.get_dummies(data=df_test_values_original)

geo1Test = pd.get_dummies(test_values_subset.loc[:, "geo_level_1_id"])
geo1TestNames = {level: 'geo1Test_' + str(level) for level in geo1Test.columns}
geo1Test = geo1Test.rename(columns=geo1TestNames)

# NOTE: this cell reassigns test_values_subset from itself; the first line
# rebuilds it from the raw frame, so re-running the whole cell is safe.
test_values_subset = pd.concat(objs=[test_values_subset, geo1Test], axis='columns')

In [84]:
# Indicator columns for geo_level_1_id on the training frame, labelled 'geo1_<id>'.
geo1 = pd.get_dummies(train_values_subset.loc[:, "geo_level_1_id"])
geo1Names = {level: 'geo1_' + str(level) for level in geo1.columns}
geo1 = geo1.rename(columns=geo1Names)

In [85]:
# Indicator columns for geo_level_2_id on the training frame, labelled 'geo2_<id>'.
# NOTE(review): geo2 is not referenced again in the cells visible here.
geo2 = pd.get_dummies(train_values_subset.loc[:, "geo_level_2_id"])
geo2Names = {level: 'geo2_' + str(level) for level in geo2.columns}
geo2 = geo2.rename(columns=geo2Names)

In [86]:
# Append the geo1 indicator columns to the training features.
# NOTE: self-referential assignment -- running this cell twice appends the
# geo1 columns a second time.
train_values_subset = pd.concat(objs=[train_values_subset, geo1], axis='columns')

In [87]:
# Positional train/validation split: the first `split_at` rows are used for
# fitting, every remaining row is held out for validation.
#
# The previous slice ended at `-1`, which silently dropped the last row from
# both the validation features and the validation labels; the end bound is now
# left open so the hold-out set covers the full tail.
split_at = 173733

train_values = train_values_subset.iloc[:split_at]
validation_values = train_values_subset.iloc[split_at:]

train_labels = train_labels_subset.iloc[:split_at]
validation_labels = train_labels_subset.iloc[split_at:]

# Features and labels must stay row-aligned, and no row may be lost.
assert len(train_values) == len(train_labels)
assert len(validation_values) == len(validation_labels)
assert len(train_values) + len(validation_values) == len(train_values_subset)

In [96]:
# Candidate hyperparameters sampled by the randomized search.
# Fractional min_samples_split / min_samples_leaf values are interpreted by
# scikit-learn as a fraction of the number of training samples.
param_grid = {
    'n_estimators': [100, 150, 200, 300, 400],
    'min_samples_split': [0.01, 0.1, 0.25, 0.5, 0.75, 0.99],
    'min_samples_leaf': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
    # 'sqrt' replaces the deprecated 'auto' alias, which meant sqrt(n_features)
    # for classifiers and is rejected by recent scikit-learn releases.
    'max_features': ['sqrt', 'log2', None],
    'n_jobs': [6],
    'bootstrap': [True],
}

In [97]:
# Randomized hyperparameter search: 50 sampled candidates, 2-fold CV, scored by
# micro-averaged F1. Both the candidate sampling and the forests themselves are
# seeded so that a Restart & Run All reproduces the same search result.
rs_clf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=50,
    scoring='f1_micro',
    cv=2,
    verbose=2,
    random_state=42,
)

In [None]:
# Run the search on the training split only (the validation rows stay held out).
rs_clf.fit(X=train_values, y=train_labels)

In [None]:
# Display the best parameter combination found by the fitted search.
rs_clf.best_params_

In [100]:
# Final model with hand-fixed hyperparameters.
# NOTE: no random_state is passed, so the fitted forest differs between runs.
rndfor = RandomForestClassifier(
    n_estimators=200,
    max_features=None,
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    bootstrap=True,
    n_jobs=2,
)

In [101]:
# Fit the final forest on the training split.
rndfor.fit(X=train_values, y=train_labels)

RandomForestClassifier(max_features=None, min_samples_leaf=0.01,
                       min_samples_split=0.01, n_estimators=200, n_jobs=2)

In [111]:
# Score the held-out validation rows: per-class probabilities and hard labels.
predictionsProba = rndfor.predict_proba(X=validation_values)
predictions = rndfor.predict(X=validation_values)

In [110]:
# Persist the validation-set class probabilities, one column per class.
# NOTE(review): no index is passed, so rows get a default positional index
# rather than building_id, and the file name says "Train" although the
# probabilities come from validation_values -- confirm what the consumer expects.
proba_column_names = ['rndf1', 'rndf2', 'rndf3']
df_prediction_rndForest = pd.DataFrame(data=predictionsProba, columns=proba_column_names)
df_prediction_rndForest.to_csv(path_or_buf="randomForestPredictionProbaTrain.csv")

In [112]:
# Micro-averaged F1 of the hard predictions on the validation split.
f1_score(y_true=validation_labels, y_pred=predictions, average='micro')

0.6645676724187551

In [None]:
# Build the competition submission from predictions on the TEST features.
#
# Previously the validation-set `predictions` were written under the test
# building_ids, so the submitted labels belonged to different buildings (and
# the row counts did not match the submission index).
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')


def _to_train_column_name(name):
    """Map a test-side 'geo1Test_<id>' column label to the train-side 'geo1_<id>'."""
    test_prefix = 'geo1Test_'
    if isinstance(name, str) and name.startswith(test_prefix):
        return 'geo1_' + name[len(test_prefix):]
    return name


# Give the test frame exactly the columns (and column order) the model was fit
# on: rename the geo1 indicators, add any indicator missing from the test data
# as all-zero, and drop columns the model never saw.
test_features = (
    test_values_subset
    .rename(columns=_to_train_column_name)
    .reindex(columns=train_values.columns, fill_value=0)
)
# Order rows like the submission template; .loc raises KeyError if a
# building_id required by the template is missing from the test features.
test_features = test_features.loc[submission_format.index]

test_predictions = rndfor.predict(test_features)

my_submission = pd.DataFrame(data=test_predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)
my_submission.to_csv('submissionRandomForest.csv')