In [None]:
%matplotlib nbagg
import matplotlib.pyplot as plt
import numpy as np

Preprocessing and Pipelines
=============================

<img src="figures/pipeline.svg" width=60%>

In [None]:
from sklearn.datasets import load_digits
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import train_test_split

digits = load_digits()
# Hold out part of the digits as a test set. Fixing random_state makes the
# split identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0)

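When cross-validating a model that includes scaling, we need to estimate the mean and standard deviation separately for each fold, so that no information from the held-out part of the fold leaks into the preprocessing.
To do that, we build a pipeline.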

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [None]:
# Explicit form: we choose the step names ourselves. The names "scaler" and
# "svm" are what the "<step>__<param>" keys of a parameter grid refer to.
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm", SVC()),
])
# Shorthand: make_pipeline builds the same chain but derives the step names
# automatically from the estimator classes (displayed here, not stored).
make_pipeline(StandardScaler(), SVC())

In [None]:
# Fitting the pipeline fits the scaler on the training data, then fits the
# SVM on the scaled output.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# The already-fitted scaler is applied to the test data before the SVM predicts.
pipeline.predict(X=X_test)

<img src="figures/pipeline_cross_validation.svg" width=40%>

Cross-validation with a pipeline
---------------------------------

In [None]:
# cross_val_score lives in sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score

# The whole pipeline is refit on each fold's training part, so the scaler's
# mean and standard deviation never see that fold's validation data.
cross_val_score(pipeline, X_train, y_train)

Grid Search with a pipeline
===========================

In [None]:
# GridSearchCV lives in sklearn.model_selection; the former
# sklearn.grid_search module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV

# Search both SVM hyper-parameters over the same logarithmic range
# (10**-3 ... 10**2). Keys follow the "<step name>__<parameter>" convention.
param_grid = {
    name: np.power(10., np.arange(-3, 3))
    for name in ('svm__C', 'svm__gamma')
}

# Wrap the pipeline in a cross-validated grid search; n_jobs=-1 asks for all
# available cores to be used for the candidate fits.
grid_pipeline = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    n_jobs=-1,
)

In [None]:
# Runs the cross-validated search over every combination in param_grid.
grid_pipeline.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the search result on the held-out test set, which played no part
# in choosing the parameters.
grid_pipeline.score(X=X_test, y=y_test)

# Exercises
Add random features to the iris dataset using ``np.random.uniform`` and ``np.hstack``.

Build a pipeline on the iris dataset that combines ``SelectKBest`` univariate feature selection (from the ``sklearn.feature_selection`` module) with a ``LinearSVC`` classifier.

Use GridSearchCV to adjust C and the number of features selected in SelectKBest.

In [None]:
# %load solutions/pipeline_iris.py