# 2_data_preparation_suto

In [1]:
import os
import warnings

import pandas as pd
from pandas.api.types import is_string_dtype
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from feature_engine.encoding import RareLabelEncoder, CountFrequencyEncoder

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import (
    cross_val_score, RepeatedStratifiedKFold, RandomizedSearchCV, GridSearchCV, train_test_split)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

# Relative directories: raw input data is read from `path_data`, and the artifacts
# produced by this notebook are written to `path_interim_data`.
path_data = "../data/raw/"
path_interim_data = "../data/interim/"

# Aesthetic parameters for the plots:
set_palette = "Paired"
axes_style = "ticks"

# Separator string made of 50 dashes.
linha = 50*"-"

In [2]:
def non_numeric_to_nan(df: pd.DataFrame, lst_numeric: list) -> pd.DataFrame:
    """Coerce text-typed numeric columns to float64, turning unparseable entries into NaN.

    Only the columns listed in ``lst_numeric`` that currently hold text (string or
    object dtype) are converted; columns that are already numeric are left untouched.
    Values that parse as numbers (including decimals such as "95.5" and negatives
    such as "-3") are kept, anything else (e.g. "?") becomes ``np.nan``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified; a copy is returned.
    lst_numeric : list
        Names of the columns that are expected to be numeric.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` where the text-typed columns of ``lst_numeric`` are float64.
    """
    df_transformed = df.copy()
    for feature in lst_numeric:
        column = df_transformed[feature]
        # Object columns mixing strings with missing values are not always reported
        # as string dtype, so they are handled explicitly as well.
        if is_string_dtype(column) or column.dtype == object:
            # `str.isnumeric()` is only true for digit-only strings, so it would
            # discard valid decimals and negatives; `to_numeric` parses them properly.
            df_transformed[feature] = pd.to_numeric(column, errors="coerce").astype("float64")

    return df_transformed

class KNNImputerDataframe(BaseEstimator, TransformerMixin):
    """DataFrame-preserving wrapper around scikit-learn's ``KNNImputer``.

    The imputer is fitted on, and applied to, only the columns listed in
    ``lst_numeric``. ``transform`` returns a DataFrame with the same index and the
    same column order as its input, with the selected columns replaced by their
    imputed values.

    Parameters
    ----------
    lst_numeric : list
        Names of the columns to impute. A single non-list value is wrapped in a list.
    n_neighbors : int, default=2
        Forwarded to ``KNNImputer(n_neighbors=...)``.
    weights : str, default='distance'
        Forwarded to ``KNNImputer(weights=...)``.
    """

    def __init__(self, lst_numeric: list, n_neighbors: int = 2, weights: str = 'distance') -> None:
        if not isinstance(lst_numeric, list):
            self.lst_numeric = [lst_numeric]
        else:
            self.lst_numeric = lst_numeric
        self.n_neighbors = n_neighbors
        self.weights = weights


    def fit(self, X: pd.DataFrame, y=None):
        """Fit the underlying ``KNNImputer`` on the selected columns of ``X``.

        ``y`` is accepted and ignored so the transformer can be used as a step of a
        scikit-learn ``Pipeline`` that is fitted with a target.
        """
        model = KNNImputer(n_neighbors=self.n_neighbors, weights=self.weights)
        model.fit(X[self.lst_numeric])
        # Keep the fitted imputer so that `transform` can reuse it.
        self.model = model

        return self


    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` whose selected columns have been imputed.

        The input frame is left untouched. An ``AttributeError`` is raised if the
        transformer has not been fitted (``self.model`` does not exist yet).
        """
        lst_columns = X.columns.tolist()

        imputed = pd.DataFrame(
            self.model.transform(X[self.lst_numeric]),
            columns=self.lst_numeric,
            index=X.index,
        )

        # Non in-place drop: the caller's DataFrame must not lose its columns.
        untouched = X.drop(columns=self.lst_numeric)
        df_transformed = pd.concat([untouched, imputed], axis=1)

        # Restore the original column order.
        return df_transformed[lst_columns]


def get_car_brand(string):
    """
    Extract the car brand, i.e. the characters that come before the first space.

    When the string contains no space, the whole string is returned (a plain
    ``string[:string.find(" ")]`` would silently cut off the last character in
    that case, because ``find`` returns -1).

    Inputs: string
    Output: slice of the string
    """
    brand, _, _ = string.partition(" ")
    return brand

In [3]:
# Load auto-mpg.csv from the raw-data directory.
df_data = pd.read_csv(path_data+"auto-mpg.csv")

# Report the (rows, columns) shape of the loaded frame.
print(f"Dimensões do dataset: {df_data.shape}")

# Preview the first rows as the cell output.
df_data.head()

Dimensões do dataset: (398, 9)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


## Separação em conjunto de treino e validação

In [4]:
# Separate the features from the target column `mpg`.
X = df_data.drop(['mpg'], axis=1)
y = df_data['mpg']

# Hold out 10% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Print the shapes of the two resulting feature sets.
print(f"""Dimensão do conjunto de treino: {X_train.shape}
Dimensão do conjunto de validação: {X_test.shape}""")

Dimensão do conjunto de treino: (358, 8)
Dimensão do conjunto de validação: (40, 8)


## Etapas da preparação:

Conforme concluímos na etapa anterior, a preparação de dados passará pelos seguintes processos:
 
 1. substituição dos valores faltantes dos fatores contínuos usando KNN;
 2. criação do fator "car brand" contendo o nome do fabricante; e
 3. transformação dos dados categóricos em numéricos para serem consumidos pelo modelo.


 ### 1. substituir os valores faltantes dos fatores contínuos usando KNN

In [5]:
# Columns treated as continuous features in the imputation step.
lst_continuas = ['acceleration', 'displacement', 'horsepower']

# Convert non-numeric entries of those columns into NaN.
X_train = non_numeric_to_nan(X_train, lst_continuas)

# Remember which rows contain at least one NaN so they can be inspected
# again after the imputation.
selected_indexes = X_train.loc[X_train.isna().any(axis=1)].index.tolist()
print(f"Índices que possuem NaNs: {selected_indexes}\n")

# Show the rows that have missing values.
X_train.loc[selected_indexes, :]

Índices que possuem NaNs: [336, 374, 32, 354, 330]



Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
336,4,140.0,,2905,14.3,80,1,ford mustang cobra
374,4,151.0,,3035,20.5,82,1,amc concord dl
32,4,98.0,,2046,19.0,71,1,ford pinto
354,4,100.0,,2320,15.8,81,2,renault 18i
330,4,85.0,,1835,17.3,80,2,renault lecar deluxe


In [61]:
# Fit the KNN imputer on the training set only, then fill its missing values.
model = KNNImputerDataframe(lst_continuas)
model.fit(X_train)
X_train = model.transform(X_train)

# Show the rows that previously contained NaNs to check the imputed values.
X_train.loc[selected_indexes, :]

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
336,4,140.0,82.769231,2905,14.3,80,1,ford mustang cobra
374,4,151.0,90.0,3035,20.5,82,1,amc concord dl
32,4,98.0,65.333333,2046,19.0,71,1,ford pinto
354,4,100.0,78.371799,2320,15.8,81,2,renault 18i
330,4,85.0,70.0,1835,17.3,80,2,renault lecar deluxe


### 2. criação do fator "car brand" contendo o nome do fabricante

In [66]:
# Derive the manufacturer from the first word of the car name and normalise the
# known misspellings/aliases. The result is assigned back explicitly: calling
# `.replace(..., inplace=True)` on a column selection is a chained in-place update,
# which pandas deprecates and which does not modify the DataFrame under Copy-on-Write.
X_train["car brand"] = X_train["car name"].apply(get_car_brand).replace(
    {"chevroelt":"chevrolet", "maxda":"mazda", "mercedes-benz":"mercedes", "subar":"subaru",
    "toyouta":"toyota", "vokswagen":"volkswagen","vw":"volkswagen"})

X_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car brand
307,6,173.0,115.0,2700,12.9,79,1,oldsmobile omega brougham,oldsmobile
55,4,97.0,60.0,1834,19.0,71,2,volkswagen model 111,volkswagen
76,4,121.0,112.0,2933,14.5,72,2,volvo 145e (sw),volvo
25,8,360.0,215.0,4615,14.0,70,1,ford f250,ford
82,4,120.0,97.0,2506,14.5,72,3,toyouta corona mark ii (sw),toyota


### 3. transformaremos os dados categóricos em numéricos para serem consumidos pelo modelo

In [72]:
# Two-step encoding of the 'car brand' column:
#   1. RareLabelEncoder, configured with tol=0.05, n_categories=4 and replace_with='Rare'
#      (presumably groups brands rarer than 5% under the label 'Rare' — confirm against
#      the feature_engine documentation);
#   2. CountFrequencyEncoder with encoding_method='frequency', which turns each remaining
#      label into a number (the output below shows fractions between 0 and 1).
pipe = Pipeline([('RareLabel', RareLabelEncoder(tol=0.05, n_categories=4, variables=['car brand'], replace_with='Rare'))
                 , ('CountFrequency', CountFrequencyEncoder(encoding_method='frequency', variables=['car brand']))])

# Fit both encoders on the training set and encode it.
X_train_encoded = pipe.fit_transform(X_train)

X_train_encoded.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car brand
307,6,173.0,115.0,2700,12.9,79,1,oldsmobile omega brougham,0.365922
55,4,97.0,60.0,1834,19.0,71,2,volkswagen model 111,0.053073
76,4,121.0,112.0,2933,14.5,72,2,volvo 145e (sw),0.365922
25,8,360.0,215.0,4615,14.0,70,1,ford f250,0.125698
82,4,120.0,97.0,2506,14.5,72,3,toyouta corona mark ii (sw),0.067039


### Criação do pipeline

Por fim, é importante a criação de um pipeline para agregar todas as transformações necessárias e aplicar sobre o dataset de validação.

## Extração para a próxima etapa

In [76]:
# `X_test_encoded` is not defined anywhere earlier in the notebook, so build it here by
# applying to the hold-out set the same steps used on the training set. Only the
# already-fitted objects are reused (`model`, `pipe`); nothing is re-fitted on X_test.
# `non_numeric_to_nan` returns a copy, so `X_test` itself is left untouched.
X_test_prepared = model.transform(non_numeric_to_nan(X_test, lst_continuas))
X_test_prepared["car brand"] = X_test_prepared["car name"].apply(get_car_brand).replace(
    {"chevroelt":"chevrolet", "maxda":"mazda", "mercedes-benz":"mercedes", "subar":"subaru",
    "toyouta":"toyota", "vokswagen":"volkswagen","vw":"volkswagen"})
X_test_encoded = pipe.transform(X_test_prepared)

# Persist both encoded sets for the next step.
X_train_encoded.to_parquet(path_interim_data+"X_train_encoded_step_2.pqt")
X_test_encoded.to_parquet(path_interim_data+"X_test_encoded_2_output.pqt")

In [77]:
# ls_continuous_features = df_data.select_dtypes(include=[float]).columns
# ls_continuous_features

# df_data_step_2_output = df_data.copy()
# df_data_step_2_output = df_data_step_2_output[ls_continuous_features]

# df_data_step_2_output.to_parquet(path_interim_data+"df_data_step_2_output.pqt")