In [79]:
import os

import pandas as pd
import numpy as np

from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from feature_engine import categorical_encoders as ce

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

import matplotlib.pyplot as plt
import seaborn as sns

# Switch the working directory to ../src/ before importing the project helper.
# Every relative path used after this point (e.g. "../data/external/") is
# resolved against the new working directory, not the notebook's folder.
# NOTE(review): presumably done so the project-local `utils` package becomes
# importable — confirm. The target is relative, so running this cell a second
# time applies the change again starting from the new directory.
os.chdir("../src/")
from utils.data_describe import DataDescribe as dd

# External data directory and the three CSV files read from it.
basepath = "../data/external/"
path_x_train = basepath + "train_values.csv"
path_y_train = basepath + "train_labels.csv"
path_x_test = basepath + "test_values.csv"

# Interim directory (feature lists are read from here) and processed
# directory (parquet outputs are written here).
path_x_selected = "../data/interim/"
path_processed = "../data/processed/"

# Lift the pandas display limits so that all rows/columns are shown in IPython.
for display_option in ("max_rows", "max_columns", "width", "max_colwidth"):
    pd.set_option("display." + display_option, None)

In [80]:
# Print the regular files found in the external and interim data directories,
# each listing preceded by a separator line and a heading.
for heading, directory in (("Arquivos em data/external:\n", basepath),
                           ("Arquivos em data/interim:\n", path_x_selected)):
    print("-"*30)
    print(heading)
    with os.scandir(directory) as dir_entries:
        # Sub-directories are skipped; only plain files are listed.
        for dir_entry in dir_entries:
            if entry_is_file := dir_entry.is_file():
                print(dir_entry.name)
# Closing separator after the last listing.
print("-"*30)

------------------------------
Arquivos em data/external:

submission_format.csv
train_values.csv
test_values.csv
train_labels.csv
.gitkeep
------------------------------
Arquivos em data/interim:

.gitkeep
lst_X_train_booleanas.csv
lst_X_train_categoricas.csv
lst_X_train_continuas.csv
lst_X_train_categoricas_ordinais.csv
lst_X_train_categoricas_nominais.csv
------------------------------


# Início da preparação

A preparação de dados inicial será:
1. One hot encoding dos categóricos nominais;
1. Standard scaling de todos os atributos.

In [86]:
# Load the training features and labels; the first CSV column becomes the index.
X_train = pd.read_csv(path_x_train, index_col=0)
y_train = pd.read_csv(path_y_train, index_col=0)

# Recover the list of nominal categorical features saved in the interim folder.
# The names are stored in the column labelled "0"; duplicates are dropped.
# (The variable is called "target_encoding", but the list is consumed by the
# one-hot encoder further down.)
nominal_features_file = path_x_selected+"lst_X_train_categoricas_nominais.csv"
lst_features_target_encoding = list(
    pd.read_csv(nominal_features_file, index_col=0)["0"].unique()
)

## Aplicando o one hot encoding nos atributos categóricos nominais

In [87]:
# One-hot encode the nominal categorical features. With top_categories=None
# and drop_last=True the encoder is asked for all-but-one dummy per feature.
# (The "target_encoding" suffix in the variable names is a leftover: what is
# applied here is one-hot encoding.)
ohe_hot_encoder = ce.OneHotCategoricalEncoder(top_categories=None,
                                              variables=lst_features_target_encoding,
                                              drop_last=True)

ohe_hot_encoder.fit(X_train)

X_train_target_encoding = ohe_hot_encoder.transform(X_train)

# Collect the columns produced by the encoder. The dummies are named
# "<feature>_<category>", so match on that prefix: a plain substring test
# (`item in coluna`) would also pick up any unrelated column whose name merely
# contains a feature name. Order follows the feature list, then column order.
lst_ohe_encoded_columns = [
    coluna
    for item in lst_features_target_encoding
    for coluna in X_train_target_encoding.columns
    if coluna.startswith(item + "_")
]

X_train_target_encoding[lst_ohe_encoded_columns].head()

Unnamed: 0_level_0,land_surface_condition_t,land_surface_condition_o,roof_type_n,roof_type_q,legal_ownership_status_v,legal_ownership_status_a,legal_ownership_status_r,other_floor_type_q,other_floor_type_x,other_floor_type_j,position_t,position_s,position_j,foundation_type_r,foundation_type_w,foundation_type_i,foundation_type_u,ground_floor_type_f,ground_floor_type_x,ground_floor_type_v,ground_floor_type_z,plan_configuration_d,plan_configuration_u,plan_configuration_s,plan_configuration_q,plan_configuration_m,plan_configuration_c,plan_configuration_a,plan_configuration_n,plan_configuration_f
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
802906,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
28830,0,1,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
94947,1,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
590882,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
201944,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0


## Aplicando Standard Scaler sobre todos os atributos

In [88]:
# Fit a standard scaler on every column of the encoded training frame
# (dummy columns included) and keep the fitted scaler for the test set.
scaler = StandardScaler()
scaler.fit(X_train_target_encoding)

# The scaler returns a bare array, so wrap it back into a DataFrame carrying
# the original column names and row index.
X_train_scaled = pd.DataFrame(
    data=scaler.transform(X_train_target_encoding),
    index=X_train_target_encoding.index,
    columns=X_train_target_encoding.columns,
)

# Dataframe para alimentar o modelo

In [89]:
# Put the scaled features and the labels side by side (aligned on the index),
# with the label column last.
df_train = pd.concat(objs=[X_train_scaled, y_train], axis="columns")

# Persist the modelling frame to the processed-data folder.
train_parquet_path = path_processed+"df_train.pqt"
df_train.to_parquet(train_parquet_path)

df_train.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,land_surface_condition_t,land_surface_condition_o,roof_type_n,roof_type_q,legal_ownership_status_v,legal_ownership_status_a,legal_ownership_status_r,other_floor_type_q,other_floor_type_x,other_floor_type_j,position_t,position_s,position_j,foundation_type_r,foundation_type_w,foundation_type_i,foundation_type_u,ground_floor_type_f,ground_floor_type_x,ground_floor_type_v,ground_floor_type_z,plan_configuration_d,plan_configuration_u,plan_configuration_s,plan_configuration_q,plan_configuration_m,plan_configuration_c,plan_configuration_a,plan_configuration_n,plan_configuration_f,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
802906,-0.983414,-0.518705,1.629055,-0.178274,0.0471,-0.45946,-0.226419,3.206391,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,-0.585029,-0.304811,-0.210914,-0.126945,-0.123339,0.038365,-0.354928,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,0.449747,-0.181556,0.652135,-0.556227,0.196223,-0.146997,-0.075395,0.759411,-0.447303,-0.424833,2.252816,-1.858462,-0.231741,0.43462,-0.248163,-0.205699,-0.240598,0.493166,-0.324861,-0.322807,-0.062189,0.205192,-0.119168,-0.036462,-0.149431,-0.013287,-0.035337,-0.031112,-0.012076,-0.009188,3
28830,-0.734459,0.481998,-0.945017,-0.178274,-0.224765,-0.00411,0.816109,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,-0.585029,-0.304811,-0.210914,-0.126945,-0.123339,0.038365,-0.354928,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,-2.223471,5.507931,0.652135,-0.556227,0.196223,-0.146997,-0.075395,0.759411,-0.447303,-0.424833,-0.443889,0.538079,-0.231741,0.43462,-0.248163,-0.205699,-0.240598,-2.027715,3.078243,-0.322807,-0.062189,0.205192,-0.119168,-0.036462,-0.149431,-0.013287,-0.035337,-0.031112,-0.012076,-0.009188,2
94947,0.883744,-0.819158,0.744612,-0.178274,-0.224765,-0.687135,-0.226419,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,-0.585029,-0.304811,-0.210914,-0.126945,-0.123339,0.038365,-0.354928,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,0.449747,-0.181556,0.652135,-0.556227,0.196223,-0.146997,-0.075395,-1.31681,2.23562,-0.424833,2.252816,-1.858462,-0.231741,0.43462,-0.248163,-0.205699,-0.240598,0.493166,-0.324861,-0.322807,-0.062189,0.205192,-0.119168,-0.036462,-0.149431,-0.013287,-0.035337,-0.031112,-0.012076,-0.009188,3
590882,1.008221,-0.685893,1.216589,-0.178274,-0.224765,-0.45946,-0.226419,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,1.709316,3.280725,-0.210914,-0.126945,-0.123339,0.038365,-0.354928,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,0.449747,-0.181556,0.652135,-0.556227,0.196223,-0.146997,-0.075395,-1.31681,2.23562,-0.424833,-0.443889,0.538079,-0.231741,0.43462,-0.248163,-0.205699,-0.240598,0.493166,-0.324861,-0.322807,-0.062189,0.205192,-0.119168,-0.036462,-0.149431,-0.013287,-0.035337,-0.031112,-0.012076,-0.009188,2
201944,-0.361028,-1.381296,-1.308119,1.195989,0.0471,-0.00411,1.858636,3.206391,-1.789003,-0.188554,-0.136284,-0.270442,-0.285298,-0.585029,-0.304811,-0.210914,-0.126945,-0.123339,0.038365,-0.354928,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,0.449747,-0.181556,0.652135,-0.556227,0.196223,-0.146997,-0.075395,-1.31681,2.23562,-0.424833,-0.443889,0.538079,-0.231741,0.43462,-0.248163,-0.205699,-0.240598,0.493166,-0.324861,-0.322807,-0.062189,0.205192,-0.119168,-0.036462,-0.149431,-0.013287,-0.035337,-0.031112,-0.012076,-0.009188,3


In [90]:
# Store the training column order so the test set can be aligned to it.
lst_features_train = list(df_train.columns)
# Drop the trailing label column; pop() returns the removed name, which is
# what this cell displays.
lst_features_train.pop(-1)

'damage_grade'

# Aplicando os encoders treinados no dataset de teste

In [97]:
# Load the test features; the first CSV column becomes the index.
X_test = pd.read_csv(filepath_or_buffer=path_x_test, index_col=0)
# Preview the first five rows.
X_test.head(n=5)

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
300051,17,596,11307,3,20,7,6,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
99355,6,141,11987,2,25,13,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,1,1,0,0,0,0,0,0,0,0,0
890251,22,19,10044,2,5,4,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
745817,26,39,633,1,0,19,3,t,r,x,v,j,t,d,0,0,0,0,0,1,0,0,0,0,0,v,2,1,0,0,1,0,0,0,0,0,0,0
421793,17,289,7970,3,15,8,7,t,r,q,f,q,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [98]:
# Apply the encoder fitted on the training data to a copy of the test set,
# leaving X_test itself untouched.
X_test_encoded = ohe_hot_encoder.transform(X_test.copy())

# Put the columns in the training order *before* scaling, so each column is
# scaled with the statistics that were fitted for it.
X_test_encoded_scaled = scaler.transform(X_test_encoded[lst_features_train])

# Wrap the scaled array back into a DataFrame. The previous version passed
# X_test_encoded (the unscaled frame) as `data`, so the "scaled" frame
# silently contained unscaled values.
X_test_encoded_scaled = pd.DataFrame(data=X_test_encoded_scaled,
                                     columns=lst_features_train,
                                     index=X_test_encoded.index)

In [99]:
# Persist the frame prepared for the model (X_test_encoded_scaled, whose
# columns follow the training feature order) rather than the intermediate
# encoder output, so the saved test set matches the saved training set.
# The file name is kept unchanged for downstream consumers.
X_test_encoded_scaled.to_parquet(path_processed+"X_test_encoded.pqt")