In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import squarify # pip install squarify
import matplotlib.pyplot as plt
import xgboost as xgb
%matplotlib inline

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score


In [10]:
# Suppress scientific notation in displayed outputs: floats are rendered with
# thousands separators and two decimals.
float_formatter = '{:20,.2f}'.format
pd.options.display.float_format = float_formatter

# All three CSV files are indexed by the same key column.
index_column = 'building_id'
train_values = pd.read_csv('train_values.csv', index_col=index_column)
train_labels = pd.read_csv('train_labels.csv', index_col=index_column)
test_values = pd.read_csv('test_values.csv', index_col=index_column)

# Custom features

Notamos que, si utilizamos el CSV que exportamos del notebook de feature engineering, por más que utilicemos los mismos features, el modelo da un peor puntaje que cuando utilizamos el train_values.csv. Por lo tanto, para evitar estos efectos negativos, calcularemos aquí mismo los features que vayamos a utilizar, copiando el código que escribimos en el notebook mencionado anteriormente.

In [11]:
# Hard-coded cut-offs for the two boolean flags below.
# NOTE(review): the column names suggest these are dataset means of
# count_families and age — confirm where the numbers come from.
families_threshold = 1.07
# The original code compared train against 26.54 and test against 26.55;
# one shared value keeps the feature definition identical in both frames.
age_threshold = 26.54

for frame in (train_values, test_values):
    # Column-wise comparison yields the same booleans as a row-wise apply,
    # without calling a Python lambda once per row.
    frame['more_families_than_mean'] = frame['count_families'] > families_threshold
    frame['older_than_mean'] = frame['age'] > age_threshold

In [12]:
def use(x):
    """Map a building row to a single integer code describing its use.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series) indexed by column name.

    Returns
    -------
    1 when ``has_secondary_use`` is falsy (plain house); otherwise the code of
    the first truthy secondary-use flag, checked in the order listed below;
    ``None`` when ``has_secondary_use`` is truthy but no listed flag is set.
    """
    if not x['has_secondary_use']:  # plain house, no secondary use
        return 1

    # (flag column, code) pairs; the order is the lookup priority.
    flag_codes = (
        ('has_secondary_use_agriculture', 2),   # agriculture
        ('has_secondary_use_hotel', 3),         # hotel
        ('has_secondary_use_rental', 4),        # rental
        ('has_secondary_use_institution', 5),   # institution
        ('has_secondary_use_school', 6),        # school
        ('has_secondary_use_industry', 7),      # industry
        ('has_secondary_use_health_post', 8),   # health post
        ('has_secondary_use_gov_office', 9),    # government office
        ('has_secondary_use_use_police', 10),   # police station
        ('has_secondary_use_other', 11),        # any other use
    )
    for flag, code in flag_codes:
        if x[flag]:
            return code
    # No specific flag matched: same implicit None as falling off the end.
    return None
    
# Encode the building use as one integer column, row by row, in both frames.
for frame in (train_values, test_values):
    frame['use'] = frame.apply(use, axis=1)

In [13]:
# Ratio of the height column to the area column, computed the same way for
# train and test.
for frame in (train_values, test_values):
    frame['height per area'] = frame['height_percentage'] / frame['area_percentage']

In [14]:
# Superstructure flag columns whose row-wise sum is stored as "cant_materials".
superstructure_flags = [
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
]

# Sum of the flags per row, for the training frame...
subset = train_values[superstructure_flags]
train_values["cant_materials"] = subset.sum(axis=1)

# ...and the same sum for the test frame.
subset_test = test_values[superstructure_flags]
test_values["cant_materials"] = subset_test.sum(axis=1)

In [15]:
# Flag (1/0) the floor counts that showed considerable damage in the analysis.
def bad_cant_floor(x):
    """Return 1 when the floor count ``x`` is 1-5 or exactly 8, else 0."""
    is_bad_count = 0 < x < 6 or x == 8
    return 1 if is_bad_count else 0

# Apply the floor-count flag to the floor-count column of each frame.
for frame in (train_values, test_values):
    frame['bad_cant_floor'] = frame['count_floors_pre_eq'].apply(bad_cant_floor)

In [16]:
# new flag column -> (source categorical column, category codes flagged as 1)
good_categories = {
    'has_good_foundation_type': ('foundation_type', ['i', 'u', 'w']),
    'has_good_roof_type': ('roof_type', ['x']),
    'has_good_ground_floor_type': ('ground_floor_type', ['v']),
    'has_good_other_floor_type': ('other_floor_type', ['s']),
}

for frame in (train_values, test_values):
    for feature, (source, good_values) in good_categories.items():
        # 1 when the row's category is one of the listed codes, 0 otherwise.
        frame[feature] = frame[source].isin(good_values).astype('int64')

In [17]:
def value_of_region(x, l1, l2, l3):
    """Count how many geo levels of row ``x`` match the reference region.

    The levels are compared from coarsest to finest and counting stops at the
    first mismatch.

    Parameters
    ----------
    x : mapping-like row with ``geo_level_1_id``, ``geo_level_2_id`` and
        ``geo_level_3_id`` entries.
    l1, l2, l3 : reference ids for levels 1, 2 and 3.

    Returns
    -------
    0 if level 1 differs, 1 if only level 1 matches, 2 if levels 1 and 2
    match, 3 if all three match.
    """
    if x['geo_level_1_id'] != l1:
        return 0
    if x['geo_level_2_id'] != l2:
        return 1
    if x['geo_level_3_id'] != l3:
        return 2
    return 3
# Reference regions as (geo_level_1_id, geo_level_2_id, geo_level_3_id).
most_damaged_ids = (17, 363, 8236)
less_damaged_ids = (26, 39, 9133)

for frame in (train_values, test_values):
    # Each column holds the 0-3 match depth of the row against one region.
    frame['most_damaged_region'] = frame.apply(
        lambda row: value_of_region(row, *most_damaged_ids), axis=1)
    frame['less_damaged_region'] = frame.apply(
        lambda row: value_of_region(row, *less_damaged_ids), axis=1)

def has_the_most_damaged_combination(x):
    """Return 1 for plan configuration 'n' combined with a flagged material.

    The row ``x`` qualifies when ``plan_configuration`` equals ``'n'`` and at
    least one of the adobe/mud, stone flag, mud-mortar brick or "other"
    superstructure flags is truthy; every other row gets 0.
    """
    if x['plan_configuration'] != 'n':
        return 0
    flagged_materials = (
        'has_superstructure_adobe_mud',
        'has_superstructure_stone_flag',
        'has_superstructure_mud_mortar_brick',
        'has_superstructure_other',
    )
    # any() stops at the first truthy flag, like the original `or` chain.
    return 1 if any(x[material] for material in flagged_materials) else 0
# Row-wise 1/0 flag for the plan-configuration / material combination.
for frame in (train_values, test_values):
    frame['has_most_damaged_construct_combination'] = frame.apply(
        has_the_most_damaged_combination, axis=1)

# Superstructure flags that compete for "most used" / "least used" material.
# NOTE(review): 'has_superstructure_adobe_mud' is not in this list although it
# is part of the cant_materials sum — confirm the omission is intentional.
material_flags = [
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
]

# Number of buildings that set each flag, per frame.
material_counts_train = train_values[material_flags].sum()
material_counts_test = test_values[material_flags].sum()

# idxmax / idxmin return the label of the largest / smallest count; on a tie
# the first column in material_flags wins.
most_used_train_material = material_counts_train.idxmax()
least_used_train_material = material_counts_train.idxmin()

# The test-set picks are still computed so the existing names remain
# available (e.g. to compare against the train picks), but they no longer
# drive any feature.
most_used_test_material = material_counts_test.idxmax()
least_used_test_material = material_counts_test.idxmin()

# Both frames use the material chosen on the TRAINING data. Previously the
# test columns were built from the test-set pick, so whenever the two picks
# differed the same feature name described different materials in train and
# test.
train_values['has_most_used_material'] = train_values[most_used_train_material]
test_values['has_most_used_material'] = test_values[most_used_train_material]

train_values['has_least_used_material'] = train_values[least_used_train_material]
test_values['has_least_used_material'] = test_values[least_used_train_material]

In [18]:
# Candidate reduced feature set. It is currently NOT applied: every column of
# train_values / test_values goes into the model. To use it, index both frames
# with [selected_features] before the get_dummies calls below.
selected_features = ['foundation_type', 'area_percentage', 'height_percentage', 'count_floors_pre_eq',
                     'land_surface_condition', 'has_superstructure_cement_mortar_stone', 'age', 'geo_level_1_id',
                     'geo_level_2_id', 'geo_level_3_id']

# One-hot encode the categorical columns of each frame.
train_values_subset = pd.get_dummies(train_values)
test_values_subset = pd.get_dummies(test_values)

# get_dummies only creates columns for categories present in the frame it is
# given, so train and test can end up with different columns. Reindexing test
# to the train columns adds any missing dummy as all-zero, drops dummies that
# train never had, and fixes the column order, so predict() receives exactly
# the feature layout used in fit().
test_values_subset = test_values_subset.reindex(columns=train_values_subset.columns, fill_value=0)

In [19]:
def ensamblador(estimadores,X, y):
    """Build a soft-voting ensemble and grid-search its voting weights.

    Parameters
    ----------
    estimadores : indexable of four estimators, used positionally as
        'xgb', 'rf', 'knn' and 'dt'.
    X, y : training features and labels passed to ``GridSearchCV.fit``.

    Returns
    -------
    The ``best_estimator_`` found by the grid search.
    """
    labels = ('xgb', 'rf', 'knn', 'dt')
    named_estimators = [(label, estimadores[position]) for position, label in enumerate(labels)]

    # The weights given here are only the constructor value; the grid below
    # lists the weight vectors that are actually compared.
    voter = VotingClassifier(
        estimators=named_estimators,
        voting='soft',
        weights=[0.5, 1, 0.4, 0.7],
        flatten_transform=True,
    )

    weight_grid = {'weights': [[1, 0.5, 1, 0.4], [1, 1, 1, 1]]}

    # 4-fold cross-validation scored with micro-averaged F1, on 4 workers.
    search = GridSearchCV(
        voter,
        param_grid=weight_grid,
        cv=4,
        scoring="f1_micro",
        n_jobs=4,
        verbose=1,
    )
    print('Voting Classifier')
    search.fit(X, y)

    return search.best_estimator_

In [20]:
# Base learners with fixed hyper-parameters (no per-model grid search here).
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.36,
    colsample_bytree=0.4,
    alpha=1,
)
rf_model = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=5,
    random_state=2018,
)
#mlp_model = MLPClassifier()  # this one did not work very well for us
knn_model = KNeighborsClassifier(n_neighbors=10)
dt_model = DecisionTreeClassifier(
    max_depth=45,
    max_features=None,
    min_samples_split=3,
    min_samples_leaf=30,
    random_state=42,
)

# The list order must match the positions ensamblador expects: xgb, rf, knn, dt.
base_models = [xgb_model, rf_model, knn_model, dt_model]
# Labels are flattened to 1-D before being handed to the ensemble.
flat_train_labels = train_labels.values.ravel()
ensamble = ensamblador(base_models, train_values_subset, flat_train_labels)

Voting Classifier
Fitting 4 folds for each of 2 candidates, totalling 8 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   8 out of   8 | elapsed:  6.7min finished




In [21]:
# Fit the selected voting ensemble on the full training matrix.
# NOTE(review): ensamblador builds its GridSearchCV without passing `refit`,
# and scikit-learn's default (refit=True) already refits best_estimator_ on
# the whole data given to fit() — this call presumably repeats that work on
# the same data; confirm whether it is still needed.
ensamble.fit(train_values_subset, train_labels.values.ravel())





VotingClassifier(estimators=[('xgb',
                              XGBClassifier(alpha=1, base_score=None,
                                            booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.4, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=0.36,
                                            max_delta_step=None, max_depth=5,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimator...
                                            reg_lambda=None,
                                            scale_pos_

In [22]:
# Predict on the same rows the ensemble was fitted on; these predictions are
# used for the training-set F1 score computed in the next cell.
predictions_train = ensamble.predict(train_values_subset)

In [23]:
# f1_score is also imported in the notebook's first cell; the import is
# repeated here so this cell can run on its own.
from sklearn.metrics import f1_score

# Micro-averaged F1 of the ensemble on the TRAINING data. Because the model
# has already seen these rows, this is an in-sample score, not an estimate of
# performance on unseen buildings.
f1_score(y_true=train_labels, y_pred=predictions_train, average='micro')

0.7766010107405574

In [24]:
# The submission template supplies the index and column layout of the output.
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')

predictions = ensamble.predict(test_values_subset)

# NOTE(review): predictions follow the row order of test_values_subset while
# the index comes from submission_format — assumes both files list buildings
# in the same order; TODO confirm.
my_submission = pd.DataFrame(
    data=predictions,
    index=submission_format.index,
    columns=submission_format.columns,
)
my_submission.to_csv('submission_27_7_E_6.csv')

Este último tuvo un score de 0.7412 en DrivenData y 0.7764 en F1 score (micro, calculado sobre el set de entrenamiento).
Se utilizaron todos los features; no se hizo grid search para los hiperparámetros de los modelos, pero sí para los pesos de los algoritmos.

Anteriormente se hizo otro.
Este tuvo un valor de 0.7368 en driven data y 0.7764 en f1.

Se utilizaron todos los features pero no se hizo grid search para los hiperparámetros de los modelos ni para los pesos de los algoritmos.