In [2]:
import pandas as pd
pd.options.display.float_format = '{:20,.2f}'.format
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score

In [4]:
from treeinterpreter import treeinterpreter as ti

In [5]:
df_train_labels_original = pd.read_csv('train_labels.csv',low_memory=False,index_col='building_id')
df_train_values_original = pd.read_csv('train_values.csv',low_memory=False, index_col='building_id', dtype= {
'geo_level_1_id':'uint8', 'geo_level_2_id':'uint16', 'geo_level_3_id':'uint16', 'count_floors_pre_eq':'uint8','age':'uint16', 'area_percentage':'uint16', 'height_percentage':'uint16', 
'land_surface_condition':'category', 'foundation_type':'category', 'roof_type':'category', 'ground_floor_type':'category', 'other_floor_type':'category', 'position':'category','plan_configuration':'category', 
'has_superstructure_adobe_mud':'bool', 'has_superstructure_mud_mortar_stone':'bool','has_superstructure_stone_flag':'bool', 'has_superstructure_cement_mortar_stone':'bool', 'has_superstructure_mud_mortar_brick':'bool', 'has_superstructure_cement_mortar_brick':'bool', 'has_superstructure_timber':'bool', 'has_superstructure_bamboo':'bool', 'has_superstructure_rc_non_engineered':'bool', 'has_superstructure_rc_engineered':'bool', 'has_superstructure_other':'bool', 
'legal_ownership_status':'category', 'count_families':'uint16', 
'has_secondary_use':'bool', 'has_secondary_use_agriculture':'bool', 'has_secondary_use_hotel':'bool', 'has_secondary_use_rental':'bool', 'has_secondary_use_institution':'bool', 'has_secondary_use_school':'bool', 'has_secondary_use_industry':'bool', 'has_secondary_use_health_post':'bool', 'has_secondary_use_gov_office':'bool', 'has_secondary_use_use_police':'bool', 'has_secondary_use_other':'bool',})
df_test_values_original = pd.read_csv('test_values.csv',low_memory=False, index_col='building_id', dtype= {
'geo_level_1_id':'uint8', 'geo_level_2_id':'uint16', 'geo_level_3_id':'uint16', 'count_floors_pre_eq':'uint8','age':'uint16', 'area_percentage':'uint16', 'height_percentage':'uint16', 
'land_surface_condition':'category', 'foundation_type':'category', 'roof_type':'category', 'ground_floor_type':'category', 'other_floor_type':'category', 'position':'category','plan_configuration':'category', 
'has_superstructure_adobe_mud':'bool', 'has_superstructure_mud_mortar_stone':'bool','has_superstructure_stone_flag':'bool', 'has_superstructure_cement_mortar_stone':'bool', 'has_superstructure_mud_mortar_brick':'bool', 'has_superstructure_cement_mortar_brick':'bool', 'has_superstructure_timber':'bool', 'has_superstructure_bamboo':'bool', 'has_superstructure_rc_non_engineered':'bool', 'has_superstructure_rc_engineered':'bool', 'has_superstructure_other':'bool', 
'legal_ownership_status':'category', 'count_families':'uint16', 
'has_secondary_use':'bool', 'has_secondary_use_agriculture':'bool', 'has_secondary_use_hotel':'bool', 'has_secondary_use_rental':'bool', 'has_secondary_use_institution':'bool', 'has_secondary_use_school':'bool', 'has_secondary_use_industry':'bool', 'has_secondary_use_health_post':'bool', 'has_secondary_use_gov_office':'bool', 'has_secondary_use_use_police':'bool', 'has_secondary_use_other':'bool',})

In [6]:
train_values_subset = pd.get_dummies(df_train_values_original)

In [7]:
geo1 = pd.get_dummies(train_values_subset["geo_level_1_id"])
geo1Names = {}
for x in geo1.columns: geo1Names[x] = ('geo1_'+ str(x))
geo1 = geo1.rename(geo1Names, axis=1)

In [8]:
geo2 = pd.get_dummies(train_values_subset["geo_level_2_id"])
geo2Names = {}
for x in geo2.columns: geo2Names[x] = ('geo2_'+ str(x))
geo2 = geo2.rename(geo2Names, axis=1)

In [9]:
train_values_subset = train_values_subset.drop(["geo_level_2_id", "geo_level_3_id", "geo_level_1_id"], axis=1)
train_values_subset = pd.concat([train_values_subset, geo1, geo2], axis=1)
# train_values_subset.head()
train_values_subset.columns.size

1510

In [10]:
test_values_subset = pd.get_dummies(df_test_values_original)

In [11]:
geo1Test = pd.get_dummies(test_values_subset["geo_level_1_id"])
geo1TestNames = {}
for x in geo1Test.columns: geo1TestNames[x] = ('geo1_'+ str(x))
geo1Test = geo1Test.rename(geo1TestNames, axis=1)

In [12]:
geo2Test = pd.get_dummies(test_values_subset["geo_level_2_id"])
geo2TestNames = {}
for x in geo2Test.columns: geo2TestNames[x] = ('geo2_'+ str(x))
geo2Test = geo2Test.rename(geo2TestNames, axis=1)

In [13]:
test_values_subset = test_values_subset.drop(["geo_level_2_id", "geo_level_3_id", "geo_level_1_id"], axis=1)
test_values_subset = pd.concat([test_values_subset, geo1Test, geo2Test], axis=1)
# test_values_subset.head()
test_values_subset.columns.size

1460

In [14]:
for x in geo2Test.columns.difference(geo2.columns): train_values_subset[x] = 0

In [15]:
for x in geo2.columns.difference(geo2Test.columns): test_values_subset[x] = 0

In [16]:
test_values_subset.columns.size

1514

In [17]:
train_values_subset.columns.size

1514

In [18]:
train_labels_subset = df_train_labels_original['damage_grade']

In [19]:
train_values, validation_values = (train_values_subset.iloc[0:173733], train_values_subset.iloc[173733:-1])
train_labels, validation_labels = (train_labels_subset.iloc[0:173733], train_labels_subset.iloc[173733:-1])

In [20]:
rndfor = RandomForestClassifier(n_estimators=100, max_features=None, bootstrap=True, n_jobs=-1)    

In [21]:
rndfor.fit(train_values_subset, train_labels_subset)

RandomForestClassifier(max_features=None, n_estimators=1000, n_jobs=-1)

In [31]:
predictions = rndfor.predict(train_values_subset) 
# predictions

In [24]:
rndfor.score(train_values_subset, predictions)

1.0

In [25]:
predictions

array([3, 2, 3, ..., 3, 2, 3], dtype=int64)

In [32]:
# f1_score(train_values_subset, predictions, average='micro')
f1_score(train_labels_subset, predictions, average='micro')

0.9683232220904755

In [26]:
np.mean(predictions)

2.235628412784295

In [27]:
np.std(predictions)

0.6073983255995316

In [28]:
predictions = rndfor.predict(test_values_subset) 

In [29]:
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')
my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)
my_submission.to_csv('submission5.csv')