### Pipelines

Muito úteis para organização do código e evitar vazamento de dados!

In [4]:
from sklearn.datasets import load_boston, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from joblib import dump, load

X, y = load_breast_cancer(return_X_y=True, as_frame=True)

In [5]:
X = X.iloc[: , :3]
X.columns = ['mean_perimeter', 'mean_radius', 'mean_texture']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2021)

Unnamed: 0,mean_perimeter,mean_radius,mean_texture
258,15.660,23.20,110.20
132,16.160,21.54,106.20
108,22.270,19.67,152.80
297,11.760,18.14,75.00
274,17.930,24.48,115.20
...,...,...,...
44,13.170,21.81,85.42
128,15.100,16.39,99.58
57,14.710,21.59,95.55
341,9.606,16.84,61.64


In [21]:
union = FeatureUnion([("pca", PCA(n_components=1)),
                      ("svd", TruncatedSVD(n_components=1))])

pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('reduce_dim', union),
        ('classifier', LogisticRegression())
        ])

pipe

Pipeline(steps=[('scaler', StandardScaler()),
                ('reduce_dim',
                 FeatureUnion(transformer_list=[('pca', PCA(n_components=1)),
                                                ('svd',
                                                 TruncatedSVD(n_components=1))])),
                ('classifier', LogisticRegression())])

In [22]:
pipe = pipe.fit(X_train, y_train)
print('Testing score: ', pipe.score(X_test, y_test))

Testing score:  0.8951048951048951


In [23]:
# Salvando o pipeline
dump(pipe, 'pipe.joblib')

['pipe.joblib']

In [24]:
pipe_load = load('pipe.joblib')
pipe_load.predict(X_test)[0]

1

#### Feature Union

Feature Union permite realizar redução de dimensionalidade do dataset

In [9]:
union = FeatureUnion([("pca", PCA(n_components=1))])
X_union = [[0., 1., 3], [2., 2., 5]]
union.fit_transform(X_union)

array([[ 1.5],
       [-1.5]])

In [8]:
X_union

[[0.0, 1.0, 3], [2.0, 2.0, 5]]

#### Fit/ Transform

3 tipos de operações:
- fit: apenas para fazer o fit da operação (ex., calcular a média e o desvio padrão de um dataset). Usada no training set.
- transform: aplica a transformação calculada por um fit anterior. Usado no test/ validation set. 
- fit_transform: faz o fit e já aplica a transformação. Equivalente a .fit() seguido de .transform()


In [20]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
data = pd.DataFrame([[1, 0], [0, -1], [1, 1], [1, 1]])
scaler = StandardScaler()
scaler.fit_transform(data)

array([[ 0.57735027, -0.30151134],
       [-1.73205081, -1.50755672],
       [ 0.57735027,  0.90453403],
       [ 0.57735027,  0.90453403]])

In [14]:
data

Unnamed: 0,0,1
0,1,0
1,0,-1
2,1,1
3,1,1


In [16]:
scaler.mean_

array([0.75, 0.25])

In [17]:
scaler.var_

array([0.1875, 0.6875])

In [18]:
print(scaler.transform([[12, 1]]))

[[25.98076211  0.90453403]]
