In [140]:
import os

import pandas as pd
import numpy as np

from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

import matplotlib.pyplot as plt
import seaborn as sns

# Switch the working directory to ../src/ before importing the project-local
# `utils` package (presumably so that the import below can be resolved --
# confirm against the repository layout). Every relative path used after this
# point is interpreted relative to that directory.
os.chdir("../src/")
from utils.data_prep import data_prep as dp

# Data folders, expressed relative to the current working directory.
basepath = "../data/external/"         # external input CSV files
path_x_selected = "../data/interim/"   # saved feature-name lists
path_processed = "../data/processed/"  # destination of the prepared dataset

# Training features and labels are read from the external-data folder.
path_x_train = basepath + "train_values.csv"
path_y_train = basepath + "train_labels.csv"

In [131]:
def _print_file_names(directory):
    """Print the name of every regular file found directly inside `directory`."""
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file():
                print(entry.name)


separator = "-" * 30

# Show the files available in each data folder, framed by separator lines.
for heading, directory in (
    ("Arquivos em data/external:", basepath),
    ("Arquivos em data/interim:", path_x_selected),
):
    print(separator)
    print(heading)
    _print_file_names(directory)
print(separator)

------------------------------
Arquivos em data/external:
submission_format.csv
train_values.csv
test_values.csv
train_labels.csv
.gitkeep
------------------------------
Arquivos em data/interim:
.gitkeep
lst_X_train_booleanas.csv
lst_X_train_categoricas.csv
lst_X_train_continuas.csv
lst_X_train_categoricas_ordinais.csv
lst_X_train_categoricas_nominais.csv
------------------------------


# Início da preparação

A preparação de dados inicial será:
1. Target encoding dos categóricos nominais;
1. Standard scaling de todos os atributos.

In [132]:
# Load the training features and labels; the first CSV column becomes the index.
X_train = pd.read_csv(path_x_train, index_col=0)
y_train = pd.read_csv(path_y_train, index_col=0)

# Recover the saved list of nominal categorical feature names, which is stored
# in the column labelled "0" of the interim CSV file.
nominal_names_path = path_x_selected + "lst_X_train_categoricas_nominais.csv"
nominal_names_frame = pd.read_csv(nominal_names_path, index_col=0)
lst_features_target_encoding = list(nominal_names_frame["0"].unique())

## Aplicando o Target encoding nos atributos categóricos nominais

In [133]:
# Move the columns selected for target encoding out of X_train into a frame of
# their own; X_train keeps only the remaining columns.
nominal_columns = X_train.loc[:, lst_features_target_encoding].copy()
X_train = X_train.drop(columns=lst_features_target_encoding)

# Append the labels after the selected columns so that features and target
# travel together in a single frame.
X_train_target_encoding = pd.concat([nominal_columns, y_train], axis=1)
X_train_target_encoding.head()

Unnamed: 0_level_0,land_surface_condition,roof_type,legal_ownership_status,other_floor_type,position,foundation_type,ground_floor_type,plan_configuration,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
802906,t,n,v,q,t,r,f,d,3
28830,o,n,v,q,s,r,x,d,2
94947,t,n,v,x,t,r,f,d,3
590882,t,n,v,x,s,r,f,d,2
201944,t,n,v,x,s,r,f,d,3


In [134]:
# Positional split of the combined frame: the last column is used as the
# target, every column before it as a feature to encode.
feature_columns = X_train_target_encoding.iloc[:, :-1]
target_column = X_train_target_encoding.iloc[:, -1]

# Fit the encoder and replace the combined frame with the encoded features.
target_encoder = TargetEncoder()
X_train_target_encoding = target_encoder.fit_transform(feature_columns, target_column)

X_train_target_encoding.head()

Unnamed: 0_level_0,land_surface_condition,roof_type,legal_ownership_status,other_floor_type,position,foundation_type,ground_floor_type,plan_configuration
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
802906,2.23417,2.269637,2.244063,2.315751,2.309399,2.329573,2.309104,2.243646
28830,2.289081,2.269637,2.244063,2.315751,2.229131,2.329573,2.250714,2.243646
94947,2.23417,2.269637,2.244063,2.297344,2.309399,2.329573,2.309104,2.243646
590882,2.23417,2.269637,2.244063,2.297344,2.229131,2.329573,2.309104,2.243646
201944,2.23417,2.269637,2.244063,2.297344,2.229131,2.329573,2.309104,2.243646


**IMPORTANTE**

**Problema identificado:** A função Target Encoder do pacote [category_encoders](http://contrib.scikit-learn.org/category_encoders/index.html) **NÃO** retorna a saída correta para um problema de classificação multiclasse.

**Ação:** Transformar cada classe da resposta em um problema de classificação binário e, portanto, cada classe da resposta gera um atributo novo para cada atributo categórico.

*Referência: ["Target Encoding For Multi-Class Classification"](https://towardsdatascience.com/target-encoding-for-multi-class-classification-c9a7bcb1a53) de Nishant Mohan.*

In [135]:
# TODO: test later.
#
# Disabled draft of a multiclass target encoder. As written, it one-hot encodes
# the target, fits one TargetEncoder per one-hot class column on the
# object-dtype features, and appends each encoded set (column names suffixed
# with the class column name) to the non-object columns of X.
#
# NOTE(review): by this point X_train_target_encoding has already been
# overwritten with the encoder output of the previous cell, which no longer
# contains the target column, so the example call at the bottom would need the
# original categorical frame and y_train instead -- confirm before enabling.

# def target_encode_multiclass(X,y): #X,y are pandas df and series
#     import category_encoders as ce
    
#     y=y.astype(str)   #convert to string to onehot encode
#     enc=ce.OneHotEncoder().fit(y)
#     y_onehot=enc.transform(y)
#     class_names=y_onehot.columns  #names of onehot encoded columns
#     X_obj=X.select_dtypes('object') #separate categorical columns
#     X=X.select_dtypes(exclude='object') 
#     for class_ in class_names:
      
#         enc=ce.TargetEncoder()
#         enc.fit(X_obj,y_onehot[class_]) #convert all categorical 
#         temp=enc.transform(X_obj)       #columns for class_
#         temp.columns=[str(x)+'_'+str(class_) for x in temp.columns]
#         X=pd.concat([X,temp],axis=1)    #add to original dataset
      
#     return X

# target_encode_multiclass(X_train_target_encoding.iloc[:,:-1], X_train_target_encoding.iloc[:,-1])

## Aplicando Standard Scaler sobre todos os atributos

In [142]:
# Place the encoded columns in front of the columns that remained in X_train.
X_train = pd.concat([X_train_target_encoding, X_train], axis=1)

# Standardize every column; the scaler returns a plain array, so the original
# row index and column labels are re-attached afterwards.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X_train)

X_train_scaled = pd.DataFrame(
    scaled_values,
    index=X_train.index,
    columns=X_train.columns,
)

Unnamed: 0_level_0,land_surface_condition,roof_type,legal_ownership_status,other_floor_type,position,foundation_type,ground_floor_type,plan_configuration,geo_level_1_id,geo_level_2_id,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,-0.374641,0.181631,0.108199,0.458191,1.991107,0.409977,0.359917,0.121091,-0.983414,-0.518705,...,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731
28830,4.640181,0.181631,0.108199,0.458191,-0.25591,0.409977,0.063218,0.121091,-0.734459,0.481998,...,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731
94947,-0.374641,0.181631,0.108199,0.349335,1.991107,0.409977,0.359917,0.121091,0.883744,-0.819158,...,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731
590882,-0.374641,0.181631,0.108199,0.349335,-0.25591,0.409977,0.359917,0.121091,1.008221,-0.685893,...,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731
201944,-0.374641,0.181631,0.108199,0.349335,-0.25591,0.409977,0.359917,0.121091,-0.361028,-1.381296,...,-0.262312,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731


# Dataframe para alimentar o modelo

In [146]:
# Attach the labels to the scaled features to form the final training table.
df_train = pd.concat([X_train_scaled, y_train], axis=1)

# Persist the table as a parquet file inside the processed-data folder.
df_train.to_parquet(f"{path_processed}df_train.pqt")

df_train.head()

Unnamed: 0_level_0,land_surface_condition,roof_type,legal_ownership_status,other_floor_type,position,foundation_type,ground_floor_type,plan_configuration,geo_level_1_id,geo_level_2_id,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,-0.374641,0.181631,0.108199,0.458191,1.991107,0.409977,0.359917,0.121091,-0.983414,-0.518705,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,3
28830,4.640181,0.181631,0.108199,0.458191,-0.25591,0.409977,0.063218,0.121091,-0.734459,0.481998,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,2
94947,-0.374641,0.181631,0.108199,0.349335,1.991107,0.409977,0.359917,0.121091,0.883744,-0.819158,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,3
590882,-0.374641,0.181631,0.108199,0.349335,-0.25591,0.409977,0.359917,0.121091,1.008221,-0.685893,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,2
201944,-0.374641,0.181631,0.108199,0.349335,-0.25591,0.409977,0.359917,0.121091,-0.361028,-1.381296,...,-0.186537,-0.09037,-0.030676,-0.018996,-0.032738,-0.013714,-0.012076,-0.009395,-0.071731,3
