[Walkthrough link](https://web.archive.org/web/20210507043615/https://iaml.it/blog/optimizing-sklearn-pipelines)

In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
data = load_boston()
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'])


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [2]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

In [3]:
# manually implement pipelines without dedicated module
# see how many repetitive activities are necessary 
scaler=StandardScaler()
pca=PCA()
ridge=Ridge()

In [5]:
# chain different components, manually passing training dataset
X_train = scaler.fit_transform(X_train)
X_train = pca.fit_transform(X_train)
ridge.fit(X_train, y_train)

In [6]:
# NOW USING PIPELINE, list of ordered elements - with a name and object instance 
from sklearn.pipeline import Pipeline
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('reduce_dim', PCA()),
        ('regressor', Ridge())
        ])

In [7]:
# can be trained and tested quickly:
pipe = pipe.fit(X_train, y_train)
print('Test score: ', pipe.score(X_test, y_test))

Test score:  -10763.02647201185


In [8]:
# can be indexed to see single values e.g. for PCA
print(pipe.steps[1][1].explained_variance_)

[1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455
 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455]


In [9]:
# fit_transforms during training
# predict/transform during test 

#### Pipeline Tuning for Hyperparameters

In [10]:
# how accuracy varies with different number of components
import numpy as np
n_features_to_test = np.arange(1,11)

In [11]:
# exponential range of values for regularization factor
alpha_to_test = 2.0**np.arange(-6,+6)

In [12]:
# can combine similar to gridsearch
# to see optimal number of features from PCA and in regularization factor
params = {'reduce_dim__n_components': n_features_to_test,\
              'regressor__alpha': alpha_to_test}

In [13]:
from sklearn.model_selection import GridSearchCV
gridsearch = GridSearchCV(pipe, params, verbose=1).fit(X_train, y_train)
print('Final score is: ', gridsearch.score(X_test, y_test))

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Final score is:  -8468.70041026658


In [14]:
gridsearch.best_params_

{'reduce_dim__n_components': 10, 'regressor__alpha': 16.0}

In [None]:
# note when naming parameters:
# namePipelineStep - double underscore - name of parameters in that step 
# e.g. above: reduce_dim__n_components, regressor__alpha

Pipeline Tuning (Advanced)
* to decide which algo to use, performing data normalization
* adding as hyperparameter in the grid test 

In [15]:
scalers_to_test = [StandardScaler(), RobustScaler(), QuantileTransformer()]

params = {'scaler': scalers_to_test,
        'reduce_dim__n_components': n_features_to_test,\
        'regressor__alpha': alpha_to_test}

In [16]:
#can apply to dimensionality reduction steps - but can take on differetn parameters
# e.g. PCA vs SelectKBest
params = [
        {'scaler': scalers_to_test,
         'reduce_dim': [PCA()],
         'reduce_dim__n_components': n_features_to_test,\
         'regressor__alpha': alpha_to_test},

        {'scaler': scalers_to_test,
         'reduce_dim': [SelectKBest(f_regression)],
         'reduce_dim__k': n_features_to_test,\
         'regressor__alpha': alpha_to_test}
        ]


In [19]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
gridsearch = GridSearchCV(pipe, params, verbose=1).fit(X_train, y_train)
print('Final score is: ', gridsearch.score(X_test, y_test))

Fitting 5 folds for each of 720 candidates, totalling 3600 fits
Final score is:  -10692.46965053082


In [21]:
gridsearch.best_params_

{'reduce_dim': SelectKBest(score_func=<function f_regression at 0x1272e2dc0>),
 'reduce_dim__k': 10,
 'regressor__alpha': 4.0,
 'scaler': RobustScaler()}

In [None]:
# note that this is a simple example, and more complex will take longer
# RANDOMIZED grid search may be better suited 

Feature Unions
* doing two different things in parallel 
* e.g. doing PCA and SelectK best concurrently

In [1]:
from sklearn.svm import SVC
from sklearn.datasets import load_iris

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

In [2]:
iris = load_iris()

X, y = iris["data"], iris["target"]

In [3]:
pca = PCA(n_components=2)

selection = SelectKBest(k=3)

In [4]:
# builds a transformer for PCA and univariate k-best
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

In [5]:
svm = SVC(kernel='linear')

In [9]:
pipeline = Pipeline([("features", combined_features),
                    ("svm", svm)])

param_grid = {"features__pca__n_components": [1,2,3],
                "features__univ_select__k":[1,2,3],
                "svm__C":[0.1,1,10]}

grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10, refit=True)

In [10]:
grid_search.fit(X,y)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5; 1/27] START features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1
[CV 1/5; 1/27] END features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1;, score=0.933 total time=   0.0s
[CV 2/5; 1/27] START features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1
[CV 2/5; 1/27] END features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1;, score=0.933 total time=   0.0s
[CV 3/5; 1/27] START features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1
[CV 3/5; 1/27] END features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1;, score=0.867 total time=   0.0s
[CV 4/5; 1/27] START features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1
[CV 4/5; 1/27] END features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1;, score=0.933 total time=   0.0s
[CV 5/5; 1/27] START features__pca__n_components=1, features__univ_select__k=1, svm__C

In [11]:
print(grid_search.best_params_)

{'features__pca__n_components': 2, 'features__univ_select__k': 3, 'svm__C': 1}
