In [26]:
import os

import pandas as pd
import numpy as np

from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from feature_engine import categorical_encoders as ce

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

import matplotlib.pyplot as plt
import seaborn as sns

os.chdir("../src/")
from utils.data_describe import DataDescribe as dd

basepath = "../data/external/"
path_x_train = "../data/external/train_values.csv"
path_y_train = "../data/external/train_labels.csv"
path_x_test = "../data/external/test_values.csv"

path_x_selected = "../data/interim/"
path_processed = "../data/processed/"

# Configurações para mostrar todas as colunas no iPython
# pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', None)

In [27]:
# List all files in a directory using scandir()
print("-"*30)
print("Arquivos em data/external:\n")
with os.scandir(basepath) as entries:
    for entry in entries:
        if entry.is_file():
            print(entry.name)

print("-"*30)
print("Arquivos em data/interim:\n")

with os.scandir(path_x_selected) as entries:
    for entry in entries:
        if entry.is_file():
            print(entry.name)
print("-"*30)

------------------------------
Arquivos em data/external:

.gitkeep
submission_format.csv
test_values.csv
train_labels.csv
train_values.csv
------------------------------
Arquivos em data/interim:

.gitkeep
lst_X_train_booleanas.csv
lst_X_train_categoricas.csv
lst_X_train_categoricas_nominais.csv
lst_X_train_categoricas_ordinais.csv
lst_X_train_continuas.csv
lst_selected_features.csv
------------------------------


# Início da preparação

A preparação de dados inicial será:

<p class="aligncenter">
    <img src="./images/2_data_preparation_schema.svg" alt="Soma do quadrado dos resíduos" width="800">
</p>
<p style="text-align:center"> Figura 1 - Esquema mostrando as tranformações executadas em cada subconjunto de dados.</p>


In [28]:
# Carregando dados de treino
X_train = pd.read_csv(path_x_train, index_col=0)
y_train = pd.read_csv(path_y_train, index_col=0)
X_test = pd.read_csv(path_x_test, index_col=0)

## Aplicando o Rare Label Encoding

In [29]:
lst_features_rare_label_encoding = pd.read_csv(path_x_selected+"lst_X_train_categoricas_ordinais.csv", index_col=0)
lst_features_rare_label_encoding = pd.concat([lst_features_rare_label_encoding, pd.read_csv(path_x_selected+"lst_X_train_categoricas_nominais.csv", index_col=0)], axis=0)
lst_features_rare_label_encoding = list(lst_features_rare_label_encoding["0"].unique())

In [30]:
# A conversão dos valores em strings deve-se ao fato de que a função (ce.RareLabelCategoricalEncoder) somente aceita strings.
for coluna in X_train[lst_features_rare_label_encoding].columns:
    X_train.loc[:, coluna] = X_train.loc[:, coluna].astype(str)

rare_label_encoder = ce.RareLabelCategoricalEncoder(tol=0.025, n_categories=5,
                                         variables=lst_features_rare_label_encoding,
                                         replace_with='Rare')

X_train = rare_label_encoder.fit_transform(X_train)

X_train[lst_features_rare_label_encoding].head()

Unnamed: 0_level_0,count_floors_pre_eq,count_families,land_surface_condition,roof_type,legal_ownership_status,other_floor_type,position,foundation_type,ground_floor_type,plan_configuration
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
802906,2,1,t,n,v,q,t,r,f,d
28830,2,1,o,n,v,q,s,r,x,d
94947,2,1,t,n,v,x,t,r,f,d
590882,2,1,t,n,v,x,s,r,f,d
201944,3,1,t,n,v,x,s,r,f,d


## Transformando a classe 'Rare' em número inteiro

In [31]:
lst_X_train_categoricas_ordinais = pd.read_csv(path_x_selected+"lst_X_train_categoricas_ordinais.csv", index_col=0)
lst_X_train_categoricas_ordinais = list(lst_X_train_categoricas_ordinais["0"].unique())

In [32]:
# Essa célula, apenans transforma todas as classes dos atributos categoricos ordinais em números inteiros e troca o label 'Rare' para o próximo número inteiro da lista.

def transform_rare_int(df, lst_colunas):
    dct_features = {}

    for feature in lst_colunas:
        lst_features = sorted(df[feature].unique())
        dct_features[feature] = {}
        for feature_in in lst_features:
            if 'Rare' in feature_in:
                dct_features[feature][feature_in] = int(max(dct_features[feature]))+1
            else:
                dct_features[feature][feature_in] = int(feature_in)
        df[feature].replace(dct_features[feature], inplace=True)

    return df

In [34]:
try:
    X_train = transform_rare_int(X_train, lst_X_train_categoricas_ordinais)
except:
    print("Já tinha executado a transformação.")
X_train[lst_features_rare_label_encoding].head()

Já tinha executado a transformação.


Unnamed: 0_level_0,count_floors_pre_eq,count_families,land_surface_condition,roof_type,legal_ownership_status,other_floor_type,position,foundation_type,ground_floor_type,plan_configuration
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
802906,2,1,t,n,v,q,t,r,f,d
28830,2,1,o,n,v,q,s,r,x,d
94947,2,1,t,n,v,x,t,r,f,d
590882,2,1,t,n,v,x,s,r,f,d
201944,3,1,t,n,v,x,s,r,f,d


## Aplicando o one hot encoding nos atributos categóricos nominais

In [36]:
lst_X_train_categoricas_nominais = pd.read_csv(path_x_selected+"lst_X_train_categoricas_nominais.csv", index_col=0)
lst_X_train_categoricas_nominais = list(lst_X_train_categoricas_nominais["0"].unique())

In [39]:
ohe_hot_encoder = ce.OneHotCategoricalEncoder(top_categories=None,
                                              variables=lst_X_train_categoricas_nominais,
                                              drop_last=True)

ohe_hot_encoder.fit(X_train)

X_train = ohe_hot_encoder.transform(X_train)

# Criando lista com colunas que passaram pelo OHE:
lst_ohe_encoded_columns = []
for item in lst_X_train_categoricas_nominais:
    lst_ohe_encoded_columns.extend([coluna for coluna in X_train.columns if item in coluna])

X_train[lst_ohe_encoded_columns].head()

Unnamed: 0_level_0,land_surface_condition_t,land_surface_condition_o,roof_type_n,roof_type_q,legal_ownership_status_v,legal_ownership_status_a,legal_ownership_status_r,other_floor_type_q,other_floor_type_x,other_floor_type_j,position_t,position_s,position_j,foundation_type_r,foundation_type_w,foundation_type_i,foundation_type_u,ground_floor_type_f,ground_floor_type_x,ground_floor_type_v,ground_floor_type_z,plan_configuration_d
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
802906,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1
28830,0,1,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1
94947,1,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1
590882,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1
201944,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1


# Dataframe para alimentar o modelo

In [41]:
df_train = pd.concat([X_train, y_train], axis=1)

df_train.to_parquet(path_processed+"df_train.pqt")

df_train.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,land_surface_condition_t,land_surface_condition_o,roof_type_n,roof_type_q,legal_ownership_status_v,legal_ownership_status_a,legal_ownership_status_r,other_floor_type_q,other_floor_type_x,other_floor_type_j,position_t,position_s,position_j,foundation_type_r,foundation_type_w,foundation_type_i,foundation_type_u,ground_floor_type_f,ground_floor_type_x,ground_floor_type_v,ground_floor_type_z,plan_configuration_d,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1
802906,6,487,12198,2,30,6,5,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,3
28830,8,900,2812,2,10,8,7,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,2
94947,21,363,8973,2,10,5,5,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,3
590882,22,418,10694,2,10,6,5,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,2
201944,11,131,1488,3,30,8,9,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,3


In [42]:
# Armazenando ordem dos atributos:
lst_features_train = df_train.columns.tolist()
lst_features_train.pop()

'damage_grade'

# Aplicando os encoders treinados no dataset de teste

In [43]:
X_test.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
300051,17,596,11307,3,20,7,6,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
99355,6,141,11987,2,25,13,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,1,1,0,0,0,0,0,0,0,0,0
890251,22,19,10044,2,5,4,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
745817,26,39,633,1,0,19,3,t,r,x,v,j,t,d,0,0,0,0,0,1,0,0,0,0,0,v,2,1,0,0,1,0,0,0,0,0,0,0
421793,17,289,7970,3,15,8,7,t,r,q,f,q,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [45]:
X_test_encoded = X_test.copy()

X_test_encoded = rare_label_encoder.transform(X_test_encoded)
X_test_encoded = transform_rare_int(X_test_encoded, lst_X_train_categoricas_ordinais)
X_test_encoded = ohe_hot_encoder.transform(X_test_encoded)

X_test_encoded.to_parquet(path_processed+"X_test_encoded.pqt")