In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import squarify # pip install squarify
import matplotlib.pyplot as plt
import xgboost as xgb
%matplotlib inline

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier


In [24]:
# Suppress scientific notation in displayed outputs.
pd.set_option('display.float_format', '{:20,.2f}'.format)

# Read the three competition CSV files, each indexed by its building_id column.
train_values, train_labels, test_values = (
    pd.read_csv(csv_name, index_col='building_id')
    for csv_name in ('train_values.csv', 'train_labels.csv', 'test_values.csv')
)

In [25]:
# Derived feature: height_percentage divided by area_percentage, added to both
# the training and the test feature frames.
train_values['height per area'] = train_values['height_percentage'].div(train_values['area_percentage'])
test_values['height per area'] = test_values['height_percentage'].div(test_values['area_percentage'])

In [26]:
# Superstructure material columns (presumably 0/1 flags — TODO confirm). The list
# is defined once so the train and test selections below cannot drift apart.
SUPERSTRUCTURE_COLUMNS = [
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
]

# "cant_materials" is the row-wise sum of the superstructure columns for each building.
subset = train_values[SUPERSTRUCTURE_COLUMNS]
train_values["cant_materials"] = subset.sum(axis=1)

subset_test = test_values[SUPERSTRUCTURE_COLUMNS]
test_values["cant_materials"] = subset_test.sum(axis=1)

In [27]:
# Given the analysis above, we can add a column that is 1 when the building has
# a floor count that turned out to be associated with considerable damage.
def bad_cant_floor(x):
    """Return 1 if ``x`` is strictly between 0 and 6 or equal to 8, else 0.

    Parameters
    ----------
    x : number
        Floor count of a single building.

    Returns
    -------
    int
        1 for the flagged floor counts, 0 otherwise.
    """
    is_flagged = 0 < x < 6 or x == 8
    return 1 if is_flagged else 0

# Flag every building by mapping bad_cant_floor over the floor-count column.
# Mapping the single column gives the same 0/1 values as a row-wise
# DataFrame.apply, without constructing a Series for every row.
train_values['bad_cant_floor'] = train_values['count_floors_pre_eq'].map(bad_cant_floor)
test_values['bad_cant_floor'] = test_values['count_floors_pre_eq'].map(bad_cant_floor)

In [28]:
# Category codes that this notebook treats as "good" for each structural
# attribute; each entry yields a has_good_<column> 0/1 feature.
GOOD_TYPE_CODES = {
    'foundation_type': ['i', 'u', 'w'],
    'roof_type': ['x'],
    'ground_floor_type': ['v'],
    'other_floor_type': ['s'],
}


def add_good_type_flags(features):
    """Add the has_good_<column> indicator columns to ``features`` in place.

    For every column listed in GOOD_TYPE_CODES, the new column is 1 where the
    value is one of the listed codes and 0 otherwise (missing values give 0).

    Parameters
    ----------
    features : pandas.DataFrame
        Frame that still contains the raw (not yet one-hot encoded) type columns.
    """
    for column, good_codes in GOOD_TYPE_CODES.items():
        features['has_good_' + column] = features[column].isin(good_codes).astype('int64')


# Build the flags on BOTH frames, like the other engineered features, so the
# test features carry the same columns the model is trained on.
add_good_type_flags(train_values)
add_good_type_flags(test_values)

In [29]:
# One-hot encode the training features, dropping the first level of each
# encoded column.
# NOTE(review): test_values is not encoded in this cell — confirm it receives
# the same encoding (and ends up with the same columns) before predicting on it.
train_values = pd.get_dummies(data=train_values, drop_first=True)

In [30]:
# Features and labels used for modelling.
# NOTE(review): assumes train_values and train_labels rows are in the same
# building_id order — confirm.
X1 = train_values
y1 = train_labels

# Hold out 40% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.4,
    random_state=42,
)

In [31]:
# Decision tree configuration: all features are considered at every split, and
# the fixed random_state makes the fitted tree reproducible.
# NOTE(review): with min_samples_leaf=30 a node needs at least 60 samples to be
# split, so min_samples_split=3 is likely never the binding constraint — confirm.
dt = DecisionTreeClassifier(
    random_state=42,
    max_depth=45,
    max_features=None,
    min_samples_leaf=30,
    min_samples_split=3,
)

In [32]:
# Fit the tree on the training split; fit() returns the estimator itself, so
# `model` and `dt` refer to the same fitted object.
model = dt.fit(X=X_train, y=y_train)

In [33]:
# Predict labels for the held-out split. Despite its name, `predictions_train`
# holds the predictions for X_test, not for the training rows.
predictions_train = model.predict(X=X_test)

In [34]:
# Micro-averaged F1 of the held-out predictions. f1_score is already imported
# in the notebook's import cell, so it is not re-imported here.
f1_score(y_test, predictions_train, average='micro')

0.7098646405924732