In [1]:
import pandas as pd
import numpy as np
from time import ctime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Cleaned training data with the engineered features already added.
# NOTE: forcing lineterminator='\n' leaves a trailing '\r' on the last column
# header ('cant_materiales\r'); that header is renamed in a later cell.
train = pd.read_csv('../data/to_train_01.csv', low_memory=False, lineterminator='\n')

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
train.info()
train.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 86 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   building_id                             260601 non-null  int64  
 1   damage_grade                            260601 non-null  int64  
 2   count_floors_pre_eq                     260601 non-null  int64  
 3   age                                     260601 non-null  int64  
 4   area_percentage                         260601 non-null  int64  
 5   height_percentage                       260601 non-null  int64  
 6   land_surface_condition                  260601 non-null  float64
 7   foundation_type                         260601 non-null  float64
 8   roof_type                               260601 non-null  float64
 9   ground_floor_type                       260601 non-null  float64
 10  other_floor_type                        2606

Unnamed: 0,building_id,damage_grade,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,...,altura_por_antiguedad,area_por_altura,producto_area_altura,area_al_cuadrado,altura_al_cuadrado,promedio_area_altura,cant_familias_por_pisos,cant_familias_por_area,cant_familias_por_altura,cant_materiales\r
0,802906,3,2,30,6,5,2.234176,2.329448,2.269586,2.309002,...,0.16129,1.2,30,36,25,5.5,0.5,0.166667,0.2,2


In [3]:
# Test (submission) data; same layout as the training file minus damage_grade.
# NOTE: forcing lineterminator='\n' leaves a trailing '\r' on the last column
# header ('cant_materiales\r'); that header is renamed in a later cell.
test = pd.read_csv('../data/to_test_01.csv', low_memory=False, lineterminator='\n')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
test.info()
test.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86868 entries, 0 to 86867
Data columns (total 85 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   building_id                             86868 non-null  int64  
 1   count_floors_pre_eq                     86868 non-null  int64  
 2   age                                     86868 non-null  int64  
 3   area_percentage                         86868 non-null  int64  
 4   height_percentage                       86868 non-null  int64  
 5   land_surface_condition                  86868 non-null  float64
 6   foundation_type                         86868 non-null  float64
 7   roof_type                               86868 non-null  float64
 8   ground_floor_type                       86868 non-null  float64
 9   other_floor_type                        86868 non-null  float64
 10  position                                86868 non-null  fl

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,...,altura_por_antiguedad,area_por_altura,producto_area_altura,area_al_cuadrado,altura_al_cuadrado,promedio_area_altura,cant_familias_por_pisos,cant_familias_por_area,cant_familias_por_altura,cant_materiales\r
0,300051,3,20,7,6,2.234176,2.329448,2.269586,2.309002,2.315611,...,0.285714,1.166667,42,49,36,6.5,0.333333,0.142857,0.166667,1


## Entrenamiento del modelo

### Selección de features

In [4]:
def seleccion_de_features_to_remove(df_train):
    '''
    Build the list of column names to drop before training.

    The result always starts with 'building_id' and 'damage_grade'. After
    them come, for each active name pattern (in the order listed below),
    all columns of df_train whose name contains that pattern, in the order
    they appear in df_train.columns.

    To toggle a feature group, comment or uncomment its pattern.
    '''
    active_patterns = (
        # 'has_secondary',
        'cant_geolevel',
        'legal_ownership',
        'relacion_',
        # 'geo_level_embedded_',
        'proba_danio',
        # 'plan_configuration',
    )
    column_names = list(df_train.columns)
    features_to_remove = ['building_id', 'damage_grade']
    for pattern in active_patterns:
        features_to_remove.extend(name for name in column_names if pattern in name)
    return features_to_remove

In [5]:
#features_adicionales_a_remover = ['proba_danio_1_dado_geolevel3', 'proba_danio_2_dado_geolevel3', 'proba_danio_3_dado_geolevel3']
#otras_features_a_sacar = ['has_superstructure_stone_flag', 'has_superstructure_other', 'has_superstructure_bamboo', 'has_superstructure_cement_mortar_stone',\
#      'has_superstructure_adobe_mud']

In [6]:
# Strip the stray carriage return from the 'cant_materiales' column header
# in both frames (renamed in place).
nombre_corregido = {'cant_materiales\r': 'cant_materiales'}
for frame in (train, test):
    frame.rename(columns=nombre_corregido, inplace=True)

In [7]:
# Columns excluded from the feature matrix: ids, target and the selected feature groups.
features_to_remove = seleccion_de_features_to_remove(train)
# LightGBM expects labels in [0, n-1] (n = number of unique labels), so shift damage_grade down by one.
Y = train['damage_grade'].sub(1)
X = train.drop(columns=features_to_remove)

In [8]:
# Summary (columns, non-null counts, dtypes) of the feature matrix used for training.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 65 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   count_floors_pre_eq                     260601 non-null  int64  
 1   age                                     260601 non-null  int64  
 2   area_percentage                         260601 non-null  int64  
 3   height_percentage                       260601 non-null  int64  
 4   land_surface_condition                  260601 non-null  float64
 5   foundation_type                         260601 non-null  float64
 6   roof_type                               260601 non-null  float64
 7   ground_floor_type                       260601 non-null  float64
 8   other_floor_type                        260601 non-null  float64
 9   position                                260601 non-null  float64
 10  plan_configuration                      2606

### Entrenamiento de los modelos

In [9]:
import lightgbm as lgbm
from sklearn.metrics import f1_score 
from sklearn.model_selection import KFold

> https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

In [10]:
# Number of K-fold splits, and therefore of LightGBM models trained/loaded for the ensemble.
CANT_MODELOS = 5

In [11]:
# Hyper-parameters shared by every fold. They do not depend on the split, so
# they are defined once instead of being rebuilt on each iteration.
best_params = {
    'objective' : 'multiclass',
    'num_class':3,
    'metric' : 'multi_error',
    'boosting': 'gbdt',
    'max_depth' : -1,
    'num_leaves' : 30,
    'learning_rate' : 0.1,
    'feature_fraction' : 0.5,
    'min_data_in_leaf':20,
    'min_sum_hessian_in_leaf' : 0.1,
    'max_bin': 8192,
    'verbosity':-1,
    'n_jobs':-1,
    'random_state':1881
}

# Train one LightGBM model per fold, accumulate the micro-F1 of each fold and
# persist every model to disk so it can be reloaded without retraining.
k_fold = KFold(n_splits=CANT_MODELOS, shuffle=True, random_state=1881)
X_array = np.array(X)
suma_scores_test = 0
suma_scores_train = 0
models = []
print('Hora antes de empezar a entrenar: {}'.format(ctime()))
for index, (train_index, test_index) in enumerate(k_fold.split(X)):
    X_train, X_test = X_array[train_index], X_array[test_index]
    # KFold yields positional indices. Y[...] would look them up as index
    # labels, which only coincides with positions for a default RangeIndex;
    # .iloc selects by position regardless of the index.
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    train_lgb = lgbm.Dataset(X_train, label=y_train)
    test_lgb = lgbm.Dataset(X_test, label=y_test)
    # The held-out fold doubles as the early-stopping validation set, so the
    # "Testing Score" below is measured on data that chose the iteration count.
    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # were removed from lgbm.train in LightGBM 4.0 in favour of callbacks;
    # this call assumes an older release -- confirm the installed version.
    model = lgbm.train(best_params, train_lgb, 20000, valid_sets=[test_lgb], early_stopping_rounds=2000, verbose_eval=1000)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    # predict() returns one probability per class for every row. argmax gives
    # the position of the most likely class; adding 1 maps it back to the
    # original labels (1, 2 or 3).
    model_f1_micro_test = f1_score(y_test + 1, y_pred_test.argmax(axis=1) + 1, average='micro')
    model_f1_micro_train = f1_score(y_train + 1, y_pred_train.argmax(axis=1) + 1, average='micro')
    suma_scores_train += model_f1_micro_train
    suma_scores_test += model_f1_micro_test
    models.append(model)
    print('Testing Score: %.4f' % model_f1_micro_test)
    print('Training Score: %.4f' % model_f1_micro_train)
    model.save_model('./LGBM-model-ensamble-proba-{}.dat'.format(index))
print('Hora al finalizar de entrenar: {}'.format(ctime()))

Hora antes de empezar a entrenar: Thu Jul 22 21:33:14 2021
Training until validation scores don't improve for 2000 rounds
[1000]	valid_0's multi_error: 0.249707
[2000]	valid_0's multi_error: 0.247405
[3000]	valid_0's multi_error: 0.247923
[4000]	valid_0's multi_error: 0.248633
Early stopping, best iteration is:
[2163]	valid_0's multi_error: 0.247232
Testing Score: 0.7528
Training Score: 0.8148
Training until validation scores don't improve for 2000 rounds
[1000]	valid_0's multi_error: 0.25447
[2000]	valid_0's multi_error: 0.251439
[3000]	valid_0's multi_error: 0.251439
[4000]	valid_0's multi_error: 0.251708
Early stopping, best iteration is:
[2311]	valid_0's multi_error: 0.250556
Testing Score: 0.7494
Training Score: 0.8202
Training until validation scores don't improve for 2000 rounds
[1000]	valid_0's multi_error: 0.25543
[2000]	valid_0's multi_error: 0.252015
[3000]	valid_0's multi_error: 0.250403
[4000]	valid_0's multi_error: 0.25165
Early stopping, best iteration is:
[2891]	valid_0

In [11]:
# Cargamos los modelos si ya los tenemos guardados anteriormente
# Rebuild the ensemble from the model files written by the training cell,
# one Booster per fold.
rutas_modelos = ['./LGBM-model-ensamble-proba-{}.dat'.format(i) for i in range(CANT_MODELOS)]
models = [lgbm.Booster(model_file=ruta) for ruta in rutas_modelos]

In [12]:
# Illustration: element-wise sum of two per-class probability rows, the same
# accumulation the ensemble applies across models.
np.array([[0.05, 0.35, 0.6]]) + np.array([[0.10,0.10,0.8]])

array([[0.15, 0.45, 1.4 ]])

In [13]:
def ensamble(models, df_test):
    '''
    Soft-voting ensemble: add up the predictions of every model.

    Each model's predict(df_test) is called once and the results are summed
    element-wise. The function returns the sums; selecting the winning class
    (e.g. with argmax over the class axis) is left to the caller.

    Summing is not the same as taking the single highest probability. With
    per-model rows [0.05, 0.75, 0.20], [0.10, 0.65, 0.25] and
    [0.15, 0.05, 0.80], class 2 adds up to 1.45 and class 3 to 1.25, so
    class 2 wins even though the largest individual probability (0.80)
    belongs to class 3.

    Parameters
    ----------
    models : iterable of objects exposing predict(data)
        Must yield at least one model.
    df_test : object
        Passed unchanged to every model's predict.

    Returns
    -------
    numpy.ndarray
        Element-wise sum of all predictions, in a newly allocated array.

    Raises
    ------
    ValueError
        If models is empty.
    '''
    y_preds_probas = [model.predict(df_test) for model in models]
    if not y_preds_probas:
        raise ValueError('ensamble() needs at least one model')
    # np.sum allocates a fresh result; accumulating with += into the first
    # prediction would overwrite an array this function does not own.
    return np.sum(y_preds_probas, axis=0)

### Formateo para predicción de un submit

In [14]:
# Saco damage_grade de las features a remover ya que dicha feature NO esta en el set de test
features_to_remove.remove('damage_grade')
to_test = test.drop(features_to_remove, axis=1)

In [15]:
# Per-class probabilities summed over every model of the ensemble.
y_pred = ensamble(models, to_test)
# The column with the largest accumulated probability is the predicted class
# position (0, 1, 2); add 1 to undo the -1 shift applied to the labels for training.
y_pred_real = np.argmax(y_pred, axis=1) + 1

In [16]:
# Building identifiers of the test rows; used below as the index of the submission.
test_id = test['building_id']
test_id

0         300051
1          99355
2         890251
3         745817
4         421793
          ...   
86863     310028
86864     663567
86865    1049160
86866     442785
86867     501372
Name: building_id, Length: 86868, dtype: int64

In [17]:
# Pair each building_id with its predicted damage_grade, then index the
# result by building_id.
predicciones = pd.Series(y_pred_real, name='damage_grade')
submit = pd.concat([test_id, predicciones], axis=1).set_index('building_id')
submit.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,1
421793,3


In [15]:
# Mean training micro-F1 over the folds. Relies on suma_scores_train, which
# only exists after the training cell has run (not after merely loading
# saved models).
promedio_f1_score_train = suma_scores_train / CANT_MODELOS
promedio_f1_score_train

0.8285040885200157

In [16]:
# Scores noted for this run:
# Test = 0.7501
# DD = 0.7514  (presumably the DrivenData leaderboard score -- confirm)
# Mean held-out micro-F1 over the folds. Relies on suma_scores_test, which
# only exists after the training cell has run.
promedio_f1_score_test = suma_scores_test / CANT_MODELOS
promedio_f1_score_test

0.7501083938735567

In [17]:
# Report both averaged micro-F1 scores rounded to four decimals.
print('Training score: %.4f' % promedio_f1_score_train)
print('Testing score: %.4f' % promedio_f1_score_test)

Training score: 0.8285
Testing score: 0.7501


In [18]:
# Write the submission (building_id index plus damage_grade column) to CSV.
# 123 is a hard-coded suffix for the output file name.
submit.to_csv('../submits-csv/submit-ensamble-LightGBM-proba-{}.csv'.format(123))