## Estudando Pipelines

In [71]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [72]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV, cross_validate, StratifiedKFold
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTENC
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.compose import ColumnTransformer

Pipeline é uma classe do sklearn que permite aplicar uma sequência de transformações em um estimador final.

Para isso, os passos intermendiários devem ter implementados métodos de fit e transform enquanto o estimador final só precisa ter o fit implmentado. O propósito do pipeline é:
##### Reunir várias etapas para serem validadas de forma cruzada (cv) ao definir parâmetros diferentes.
##### Ajuda a criar códigos que possuam um padrão que possa ser facilmetne entendido e compartilhado entre times de cientistas de dados.

- Transformer : Um transformador se refere à um objeto de uma classe que possuem os métodos fit() e transform() e que nos ajudam a transformar o dado na forma que queremos. OneHotEncoder, SimpleImputer e MinMaxScaler são exemplos de transformers.

- Estimator : Um estimador se refere à um algoritmo de ML. Ele é um objeto de uma classe que possui os métodos fit() e predict().

In [73]:
# Dataset
# Vamos importar o dataset
df = sns.load_dataset('tips')

# Add missing values -> pra dar uma graça no dataset
df.iloc[[1, 2, 4, 12], [2]] = np.nan
df.iloc[[10, 20, 40, 120, 222], [1]] = np.nan
df.iloc[[61, 27, 145, 212], [3]] = np.nan
df.iloc[[143, 237, 48, 102, 20], [4]] = np.nan
df.iloc[[71, 172, 194, 182], [5]] = np.nan
df.iloc[[83, 90, 33, 228], [6]] = np.nan
df

# Target : tip | features : [total_bill, sex, smoker, day, time, size of the party]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2.0
1,10.34,1.66,,No,Sun,Dinner,3.0
2,21.01,3.50,,No,Sun,Dinner,3.0
3,23.68,3.31,Male,No,Sun,Dinner,2.0
4,24.59,3.61,,No,Sun,Dinner,4.0
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2.0
242,17.82,1.75,Male,No,Sat,Dinner,2.0


In [74]:
df.shape

(244, 7)

In [75]:
df.info()
# 244 entradas

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         239 non-null    float64 
 2   sex         240 non-null    category
 3   smoker      240 non-null    category
 4   day         239 non-null    category
 5   time        240 non-null    category
 6   size        240 non-null    float64 
dtypes: category(4), float64(3)
memory usage: 7.4 KB


In [76]:
df.isnull().sum()
# Geralmente, em ML supervisionado, o target nunca é nulo

total_bill    0
tip           5
sex           4
smoker        4
day           5
time          4
size          4
dtype: int64

In [77]:
# Para as features numéricas vamos: 1) padronizar as features; 2) adicionar a mediana nos valores nulos e 3) treinar o algoritmo ML.
# Precisamos "treinar" esses passos SOMENTE na base de treino e depois aplicar na base de validação
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = ["total_bill"]),
                                                   df['total_bill'],
                                                   test_size = .2,
                                                   random_state = 42)

In [78]:
# Para os processamentos das features, faremos uma Pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer # Preencher os NaNs com a mediana
from sklearn.preprocessing import StandardScaler # Padronização das features
from sklearn.neighbors import KNeighborsRegressor # algoritmo de ML
from sklearn.metrics import mean_squared_error, mean_absolute_error # métricas de avaliação

O Pipeline é construído com uma lista de pares (key, value) nos quais a key é uma string que contém um nome para o step escolhido e o valor é o objeto da classe.

O Pipeline segue o mesmo framwork do sklearn e por isso temos os métodos .fit(), .fit_transform() e .transform() para os transformers e .fit() e predict() quando temos estimadores definidos dentro da sequência do pipeline.

In [79]:
# Vamos separar variáveis numéricas das categóricas
num_cols = X_train.select_dtypes('number').columns
cat_cols = X_train.select_dtypes(exclude = 'number').columns

# Criaremos uma Pipeline com SimpleImputer, StandardScaler e KNeighborsRegressor
pipe_knn = Pipeline([
    ('imputer', SimpleImputer(strategy = "median")),
    ('scaler', StandardScaler()),
    ('model', KNeighborsRegressor(n_neighbors = 7))
], verbose = True)

In [82]:
impt = SimpleImputer(strategy = "median").fit_transform(X_train[num_cols])

In [85]:
impt = pd.DataFrame(impt)
impt.head()

Unnamed: 0,0,1
0,2.72,2.0
1,2.03,2.0
2,4.0,2.0
3,4.5,4.0
4,2.03,2.0


In [84]:
X_train[num_cols].head()

Unnamed: 0,tip,size
228,2.72,
208,2.03,2.0
96,4.0,2.0
167,4.5,4.0
84,2.03,2.0


In [86]:
pipe_knn.get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer(strategy='median')),
  ('scaler', StandardScaler()),
  ('model', KNeighborsRegressor(n_neighbors=7))],
 'verbose': True,
 'imputer': SimpleImputer(strategy='median'),
 'scaler': StandardScaler(),
 'model': KNeighborsRegressor(n_neighbors=7),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__missing_values': nan,
 'imputer__strategy': 'median',
 'imputer__verbose': 0,
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 7,
 'model__p': 2,
 'model__weights': 'uniform'}

In [87]:
# Melhor visualização do Pipeline
from sklearn import set_config
set_config(display = "diagram")
pipe_knn

In [88]:
# Vamos rodas, agora, somente as colunas numéricas
pipe_knn.fit(X_train[num_cols], y_train)

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] ............. (step 3 of 3) Processing model, total=   0.0s


In [89]:
y_pred = pipe_knn.predict(X_test[num_cols])
y_pred

array([23.74428571, 12.94428571, 25.39285714, 32.95571429, 12.94428571,
       12.94428571, 14.05714286, 14.19142857, 22.42714286, 20.07      ,
       20.07      , 12.40857143, 14.96428571, 12.94428571, 11.73428571,
       24.08428571, 24.81      , 18.56714286, 13.48857143, 30.34571429,
       13.48857143, 24.59571429, 18.56714286, 12.40857143, 25.13857143,
       20.07      , 20.07      , 21.24571429, 25.39285714, 29.3       ,
       24.81      , 12.40857143, 25.39285714, 23.74428571, 25.13857143,
       27.28428571, 14.96428571, 33.14285714, 16.70428571, 12.94428571,
       14.21428571, 12.94428571, 14.19142857, 24.08428571, 12.94428571,
       14.05571429, 12.94285714, 25.12      , 12.40857143])

In [91]:
# Avaliando o modelo com y_test
print(f"MSE: {mean_squared_error(y_test, y_pred):.1f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.1f}")

MSE: 49.3
MAE: 5.8


In [92]:
# Avaliando o modelo com y_test - Não importa a ordem, um vai ao quadrado, outro tem o valor absoluto
print(f"MSE: {mean_squared_error(y_pred, y_test):.1f}")
print(f"MAE: {mean_absolute_error(y_pred, y_test):.1f}")

MSE: 49.3
MAE: 5.8


In [93]:
from sklearn.model_selection import cross_validate

cross_validate(pipe_knn, X = X_train[num_cols], y = y_train, cv = 3, verbose = True, scoring = "neg_mean_absolute_error")

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] ............. (step 3 of 3) Processing model, total=   0.0s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] ............. (step 3 of 3) Processing model, total=   0.0s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] ............. (step 3 of 3) Processing model, total=   0.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


{'fit_time': array([0.00752163, 0.00501704, 0.00595641]),
 'score_time': array([0.00303411, 0.00300217, 0.00299215]),
 'test_score': array([-4.34206593, -4.92586813, -5.39676923])}

In [95]:
# Tunando os hiperparâmetros
pipe_knn.get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer(strategy='median')),
  ('scaler', StandardScaler()),
  ('model', KNeighborsRegressor(n_neighbors=7))],
 'verbose': True,
 'imputer': SimpleImputer(strategy='median'),
 'scaler': StandardScaler(),
 'model': KNeighborsRegressor(n_neighbors=7),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__missing_values': nan,
 'imputer__strategy': 'median',
 'imputer__verbose': 0,
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 7,
 'model__p': 2,
 'model__weights': 'uniform'}

In [102]:
# Usaremos KFold e GridSsearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

params_grid = {
    'model__n_neighbors': [3,5,7,9],
    'model__p': [1,2],
    'model__weights': ['uniform', 'distance']
}

fold = KFold(n_splits = 5, shuffle = True, random_state = 42)
grid = GridSearchCV(pipe_knn, param_grid = params_grid, cv = fold, verbose = 2, scoring = 'neg_mean_absolute_error')

grid.fit(X_train[num_cols], y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] ............. (step 3 of 3) Processing model, total=   0.0s
[CV] END model__n_neighbors=3, model__p=1, model__weights=uniform; total time=   0.0s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] ............. (step 3 of 3) Processing model, total=   0.0s
[CV] END model__n_neighbors=3, model__p=1, model__weights=uniform; total time=   0.0s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] ............. (step 3 of 3) Processing model, total=   0.0s
[CV] END model__n_neighbors=3, model__p=1, model__weights=uniform; total time=   0.0s
[Pipeline] ........... (step 1 of 3) Proce

In [103]:
grid.best_score_

-4.654472934472933

In [104]:
grid.best_params_

{'model__n_neighbors': 9, 'model__p': 1, 'model__weights': 'uniform'}

In [105]:
grid.best_estimator_

In [106]:
y_pred = grid.best_estimator_.predict(X_test[num_cols])
y_pred

array([22.00777778, 13.41222222, 26.08555556, 31.99555556, 13.41222222,
       13.41222222, 13.66888889, 14.25333333, 23.09      , 22.12      ,
       22.12      , 13.69      , 13.69      , 13.41222222, 11.68666667,
       22.60444444, 24.97555556, 18.66888889, 14.25333333, 30.87555556,
       14.25333333, 23.68444444, 18.66888889, 13.69      , 23.61333333,
       22.12      , 22.12      , 21.47111111, 26.08555556, 30.72444444,
       24.97555556, 13.66111111, 26.08555556, 23.09      , 23.61333333,
       26.34222222, 13.69      , 31.99555556, 16.74      , 13.41222222,
       13.22444444, 13.41222222, 14.25333333, 22.60444444, 13.41222222,
       12.95333333, 12.28222222, 22.60444444, 13.66111111])

In [108]:
print(f"MSE : {mean_squared_error(y_test, y_pred):.2f}")
print(f"MAE : {mean_absolute_error(y_test, y_pred):.2f}")

MSE : 48.16
MAE : 5.67


## Function transformer

Adcionamos funções que não possuem, os métodos .fit() e .transform(). A função criada deve retornar um DataFrame pandas ou um array numpy

## ColumnTransform()

Serve para especificarmos em quais colunas a transformação deve ser aplicada. 

In [131]:
# Vamos importar o dataset
df = sns.load_dataset('tips')

# Add missing values -> pra dar uma graça no dataset
df.iloc[[1, 2, 4, 12], [2]] = np.nan
df.iloc[[10, 20, 40, 120, 222], [1]] = np.nan
df.iloc[[61, 27, 145, 212], [3]] = np.nan
df.iloc[[143, 237, 48, 102, 20], [4]] = np.nan
df.iloc[[71, 172, 194, 182], [5]] = np.nan
df.iloc[[83, 90, 33, 228], [6]] = np.nan
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         239 non-null    float64 
 2   sex         240 non-null    category
 3   smoker      240 non-null    category
 4   day         239 non-null    category
 5   time        240 non-null    category
 6   size        240 non-null    float64 
dtypes: category(4), float64(3)
memory usage: 7.4 KB


In [124]:
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer # imputar valores NaNs
from sklearn.model_selection import KFold, GridSearchCV, train_test_split # model_selection para cross validation, grid_search e separar treino e teste
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler # preprocessamento de variáveis categóricas
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor # algortimo de Ml

In [164]:
# Separar treino e teste
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['total_bill']), 
                                                    df['total_bill'], 
                                                    test_size=.2, 
                                                    random_state=42)

# Separar features categóricas das numéricas
num_cols = X_train.select_dtypes('number').columns
cat_cols = X_train.select_dtypes(exclude = "number").columns

# Invetar uma função só para usar Function Transform
def somar_10(df):
    return df+10
soma_10 = FunctionTransformer(somar_10, validate = False)

#Pipeline numérico
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('log', FunctionTransformer(np.log)),
    ('soma_10', soma_10),
    ('scaler', StandardScaler())
], verbose = True)

# Pipeline categórico, Imputer e OneHotEncoder
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, drop='first', handle_unknown = 'ignore'))
], verbose = True)

# Concaternar pipelines categóricas e numéricas e aplica os transforms dos pipelines
preprocessor = ColumnTransformer([
    ('num_pipe', num_pipe, num_cols),
    ('cat_cols', cat_pipe, cat_cols)
])

# Pipeline final com o pré procesamento e o estimador (KNeighborsRegressor)
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KNeighborsRegressor())
])

# param_grid
param_grid = {
   'model__n_neighbors': [3,4,5],
    'model__p': [1,2],
    'model__weights': ['uniform', 'distance']
}

fold = KFold(n_splits = 3, shuffle = True, random_state = 42)
grid = GridSearchCV(pipe, param_grid = param_grid, cv = fold, n_jobs= -1, scoring = 'neg_mean_absolute_error', return_train_score = True)

In [165]:
grid.fit(X_train, y_train)

[Pipeline] ........... (step 1 of 4) Processing imputer, total=   0.0s
[Pipeline] ............... (step 2 of 4) Processing log, total=   0.0s
[Pipeline] ........... (step 3 of 4) Processing soma_10, total=   0.0s
[Pipeline] ............ (step 4 of 4) Processing scaler, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing ohe, total=   0.0s


In [166]:
y_pred = grid.predict(X_test)
y_pred

array([15.3275, 15.685 , 18.855 , 36.185 , 14.535 , 20.795 , 16.1075,
       14.6025, 18.7875, 16.5475, 18.7875, 13.43  , 11.2025, 20.795 ,
       11.4525,  9.975 , 24.22  , 19.835 , 17.61  , 31.5325, 16.125 ,
       19.4425, 22.9725, 13.43  , 28.0775, 19.4675, 18.7875, 22.7025,
       18.855 , 26.44  , 19.77  , 17.755 , 21.235 , 17.4375, 22.4675,
       23.62  , 13.29  , 24.7575, 18.1075, 15.385 , 18.185 , 12.03  ,
       14.1825, 15.895 , 16.1075,  9.625 , 15.8675, 18.9325, 12.6025])

In [167]:
# sem best params
print(f"MAE : {mean_absolute_error(y_pred, y_test):.2f}")

MAE : 5.69
