In [72]:
import pandas as pd
import tensorflow as tf
import keras_tuner as kt
import numpy as np
df_train_labels_original = pd.read_csv('train_labels.csv',low_memory=False, dtype= {
    'damage_grade':'uint8'
}).set_index('building_id').apply(lambda x: x-1)
df_train_values_original = pd.read_csv('train_values.csv',low_memory=False, dtype= {
    'geo_level_1_id':'category', 
    'geo_level_2_id':'int64',
    'geo_level_3_id':'int64', 
    'count_floors_pre_eq':'uint8',
    'age':'uint16',
    'area_percentage':'uint16', 
    'height_percentage':'uint16', 
    'land_surface_condition':'category', 
    'foundation_type':'category',
    'roof_type':'category',
    'ground_floor_type':'category',
    'other_floor_type':'category',
    'position':'category',
    'plan_configuration':'category', 
    'has_superstructure_adobe_mud':'uint8',
    'has_superstructure_mud_mortar_stone':'uint8',
    'has_superstructure_stone_flag':'uint8',
    'has_superstructure_cement_mortar_stone':'uint8', 
    'has_superstructure_mud_mortar_brick':'uint8', 
    'has_superstructure_cement_mortar_brick':'uint8', 
    'has_superstructure_timber':'uint8', 
    'has_superstructure_bamboo':'uint8',
    'has_superstructure_rc_non_engineered':'uint8',
    'has_superstructure_rc_engineered':'uint8',
    'has_superstructure_other':'uint8', 
    'legal_ownership_status':'category',
    'count_families':'uint16', 
    'has_secondary_use':'uint8', 
    'has_secondary_use_agriculture':'uint8', 
    'has_secondary_use_hotel':'uint8',
    'has_secondary_use_rental':'uint8',
    'has_secondary_use_institution':'uint8',
    'has_secondary_use_school':'uint8', 
    'has_secondary_use_industry':'uint8', 
    'has_secondary_use_health_post':'uint8', 
    'has_secondary_use_gov_office':'uint8', 
    'has_secondary_use_use_police':'uint8', 
    'has_secondary_use_other':'uint8',
}).set_index('building_id').drop(columns=['geo_level_3_id'])


pd.options.display.float_format = '{:20,.2f}'.format

In [73]:
df = df_train_values_original.join(df_train_labels_original,how="inner")
df

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,2,30,6,5,t,r,n,f,...,0,0,0,0,0,0,0,0,0,2
28830,8,900,2,10,8,7,o,r,n,x,...,0,0,0,0,0,0,0,0,0,1
94947,21,363,2,10,5,5,t,r,n,f,...,0,0,0,0,0,0,0,0,0,2
590882,22,418,2,10,6,5,t,r,n,f,...,0,0,0,0,0,0,0,0,0,1
201944,11,131,3,30,8,9,t,r,n,f,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688636,25,1335,1,55,6,3,n,r,n,f,...,0,0,0,0,0,0,0,0,0,1
669485,17,715,2,0,6,5,t,r,n,f,...,0,0,0,0,0,0,0,0,0,2
602512,17,51,3,55,6,7,t,r,q,f,...,0,0,0,0,0,0,0,0,0,2
151409,26,39,2,10,14,6,t,r,x,v,...,0,0,0,0,0,0,0,0,0,1


In [74]:
def mean_encode(dataframe, column_name):
    new_column_names = {
        0: column_name+'_0',
        1: column_name+'_1',
        2: column_name+'_2',
    }
    cross = pd.crosstab(dataframe[column_name], dataframe['damage_grade']).rename(columns=new_column_names)
    prob = cross.divide(cross.apply('sum',axis=1),axis=0).reset_index()
    return dataframe.reset_index().merge(prob,on=column_name).set_index('building_id').drop(columns=[column_name])

In [75]:
def one_hot_encode_data(dataframe, column_name):
    dummies = pd.get_dummies(dataframe[column_name])
    rename_columns = {}
    for column in dummies.columns.values:
        rename_columns[column] = column_name + '_' + column
    return dataframe.drop(columns=[column_name]).join(dummies.rename(columns=rename_columns))

In [76]:
#df = mean_encode(df, 'geo_level_2_id')
df = one_hot_encode_data(df,'land_surface_condition')
df = one_hot_encode_data(df,'foundation_type')
df = one_hot_encode_data(df,'roof_type')
df = one_hot_encode_data(df,'ground_floor_type')
df = one_hot_encode_data(df,'other_floor_type')
df = one_hot_encode_data(df,'position')
df = one_hot_encode_data(df,'plan_configuration')
df = one_hot_encode_data(df,'legal_ownership_status')
df = one_hot_encode_data(df,'geo_level_1_id')
df

Unnamed: 0_level_0,geo_level_2_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,...,geo_level_1_id_28,geo_level_1_id_29,geo_level_1_id_3,geo_level_1_id_30,geo_level_1_id_4,geo_level_1_id_5,geo_level_1_id_6,geo_level_1_id_7,geo_level_1_id_8,geo_level_1_id_9
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,487,2,30,6,5,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
28830,900,2,10,8,7,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
94947,363,2,10,5,5,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
590882,418,2,10,6,5,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
201944,131,3,30,8,9,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688636,1335,1,55,6,3,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
669485,715,2,0,6,5,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
602512,51,3,55,6,7,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
151409,39,2,10,14,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
train_df, target = (df.drop(columns=['damage_grade'])[:int(len(df)/2)], df['damage_grade'][:int(len(df)/2)])
test_df, test_target = (df.drop(columns=['damage_grade'])[int(len(df)/2):], df['damage_grade'][int(len(df)/2):])
train_df

Unnamed: 0_level_0,geo_level_2_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,...,geo_level_1_id_28,geo_level_1_id_29,geo_level_1_id_3,geo_level_1_id_30,geo_level_1_id_4,geo_level_1_id_5,geo_level_1_id_6,geo_level_1_id_7,geo_level_1_id_8,geo_level_1_id_9
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,487,2,30,6,5,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
28830,900,2,10,8,7,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
94947,363,2,10,5,5,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
590882,418,2,10,6,5,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
201944,131,3,30,8,9,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168990,1132,2,5,7,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
966340,36,1,5,7,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
842927,755,2,35,6,7,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
411281,352,3,55,23,6,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
test_df

Unnamed: 0_level_0,geo_level_2_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,...,geo_level_1_id_28,geo_level_1_id_29,geo_level_1_id_3,geo_level_1_id_30,geo_level_1_id_4,geo_level_1_id_5,geo_level_1_id_6,geo_level_1_id_7,geo_level_1_id_8,geo_level_1_id_9
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
666622,1023,1,10,10,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
194057,463,2,5,8,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
736347,1131,1,5,16,3,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69992,1030,1,5,4,4,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
133103,1115,2,15,10,5,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688636,1335,1,55,6,3,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
669485,715,2,0,6,5,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
602512,51,3,55,6,7,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
151409,39,2,10,14,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=4, shuffle=True).split(train_df, target)
generic_prob = pd.crosstab(train_df['geo_level_2_id'], target).applymap(lambda x: 1/3)
train_df_after = pd.DataFrame()
average_damage = pd.get_dummies(target).mean().values

for training_index, validation_index in folds:
    cross = pd.crosstab(train_df.iloc[training_index]['geo_level_2_id'], target.iloc[training_index])
    prob = cross.divide(cross.apply('sum',axis=1),axis=0).reset_index()
    result = train_df.iloc[validation_index].reset_index().merge(prob,on='geo_level_2_id',how='left')
    result[0] = result[0].fillna(average_damage[0])
    result[1] = result[1].fillna(average_damage[1])
    result[2] = result[2].fillna(average_damage[2])
    result = result.set_index('building_id')
    train_df_after = train_df_after.append(result)

alfa = 10
n_rows = train_df.groupby('geo_level_2_id').count()['height_percentage'].rename('n_rows')
train_df_after = train_df_after.reset_index().merge(n_rows,on='geo_level_2_id',how='left').set_index('building_id')
train_df_after[0] = train_df_after[0].multiply(train_df_after['n_rows'])
train_df_after[1] = train_df_after[1].multiply(train_df_after['n_rows'])
train_df_after[2] = train_df_after[2].multiply(train_df_after['n_rows'])
train_df_after['n_rows'] = train_df_after['n_rows'].add(10)
train_df_after[0] = train_df_after[0].add(alfa * average_damage[0])
train_df_after[1] = train_df_after[1].add(alfa * average_damage[1])
train_df_after[2] = train_df_after[2].add(alfa * average_damage[2])
train_df_after[0] = train_df_after[0].divide(train_df_after['n_rows'])
train_df_after[1] = train_df_after[1].divide(train_df_after['n_rows'])
train_df_after[2] = train_df_after[2].divide(train_df_after['n_rows'])
train_df_after = train_df_after.drop(columns=['n_rows'])
    
global_mean = train_df_after.groupby('geo_level_2_id').mean()[[0,1,2]].reset_index()
test_df_after = test_df.reset_index().merge(global_mean,on='geo_level_2_id',how='left')
test_df_after = test_df_after.drop(columns=['geo_level_2_id']).set_index('building_id')
test_df_after[0] = test_df_after[0].fillna(average_damage[0])
test_df_after[1] = test_df_after[1].fillna(average_damage[1])
test_df_after[2] = test_df_after[2].fillna(average_damage[2])
train_df_after = train_df_after.drop(columns=['geo_level_2_id'])
train_df_after

Unnamed: 0_level_0,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,...,geo_level_1_id_30,geo_level_1_id_4,geo_level_1_id_5,geo_level_1_id_6,geo_level_1_id_7,geo_level_1_id_8,geo_level_1_id_9,0,1,2
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
590882,2,10,6,5,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0.02,0.86,0.12
475515,2,0,8,6,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0.76,0.20,0.05
605134,2,25,7,6,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0.04,0.59,0.36
219578,2,0,6,5,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0.02,0.59,0.39
406413,2,35,11,5,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0.14,0.76,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191977,2,15,9,5,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0.23,0.60,0.17
703128,2,20,24,7,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0.12,0.80,0.08
701660,2,20,5,4,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0.04,0.74,0.22
481356,3,50,6,8,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0.07,0.55,0.38


In [86]:
cardinal_train = train_df.copy()
cardinal_test = test_df.copy()
train_df = train_df.drop(columns=['geo_level_2_id'])
test_df = test_df.drop(columns=['geo_level_2_id'])

In [87]:
dataset = tf.data.Dataset.from_tensor_slices((train_df.values, target.values))
dataset_after = tf.data.Dataset.from_tensor_slices((train_df_after.values, target.values))
for feat, targ in dataset.take(5):
  print ('Features: {}, Target: {}'.format(feat, targ))

Features: [ 2 30  6  5  1  1  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
  0  0  0  0  0  1  0  0  1  0  0  1  0  0  1  0  0  0  0  0  1  0  0  0
  0  0  1  0  0  1  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0], Target: 2
Features: [ 2 10  8  7  0  1  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
  0  0  0  0  1  0  0  0  1  0  0  1  0  0  0  0  0  1  0  0  1  0  0  0
  0  1  0  0  0  1  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0], Target: 1
Features: [ 2 10  5  5  0  1  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
  0  0  0  0  0  1  0  0  1  0  0  1  0  0  1  0  0  0  0  0  0  0  1  0
  0  0  1  0  0  1  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0], Target: 2
Features: [ 2 10  6  5  0  1  0  0  0  0  1  1  0  0  0  1

In [88]:
def compile_model(hp):
    layers = []
    l_amount = hp.Int('l_amount', min_value=3, max_value=9, step=1)
    l_size = hp.Int('l_size', min_value=200, max_value=1000, step=100)
    for x in range(l_amount):
        layers.append(tf.keras.layers.Dense(l_size, activation='relu'))
    layers.append(tf.keras.layers.Dense(units=3, activation='softmax'))
    model = tf.keras.Sequential(layers)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    opt = tf.keras.optimizers.Adam(clipnorm=1.0)
    model.compile(optimizer=opt,
                  loss=loss_fn,
                  metrics=['accuracy'])
    return model

In [89]:
tuner = kt.BayesianOptimization(compile_model,
                                objective='val_loss',
                                max_trials=25,
                                overwrite=True
                               )


In [None]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

tuner.search(
    train_df_after.to_numpy(), 
    target.to_numpy(), 
    epochs=25, 
    validation_data=(test_df_after, test_target),
    batch_size=128,
    callbacks=[stop_early],
)

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]


Trial 13 Complete [00h 02m 30s]
val_loss: 0.9098461866378784

Best val_loss So Far: 0.9003689885139465
Total elapsed time: 00h 33m 07s

Search: Running Trial #14

Hyperparameter    |Value             |Best Value So Far 
l_amount          |9                 |4                 
l_size            |1000              |700               

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25

In [None]:
best_hps.get('l_size')

In [None]:
best_hps.get('l_amount')

In [None]:
save_best = tf.keras.callbacks.ModelCheckpoint('modelos/NNModelBY_Best', save_best_only=True)

best_model = compile_model(best_hps)
best_model.fit(dataset_after.batch(128), epochs=200, validation_data=(test_df_after, test_target), callbacks=[save_best])

In [None]:
best_model.save('modelos/NNModelBY')

In [64]:
tf.keras.models.load_model('modelos/NNModelBY_Best').evaluate(test_df_after, test_target)



[0.6748371720314026, 0.6856278777122498]