In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import Imputer

# Pipeline

### Pipeline: обучение, предсказание и сохранение модели

In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Fixed seed: without it train_test_split shuffles differently on every run,
# so the split (and every number derived from it) changes after
# Restart Kernel -> Run All.
RANDOM_STATE = 42

# Diabetes dataset shipped with scikit-learn: feature matrix and target vector.
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

# Hold out part of the data for evaluation (default split proportions).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE
)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Two-stage pipeline: standardise the features, then fit the estimator on the
# scaled values. Calling fit() on the pipeline fits both stages in order.
#
# NOTE(review): scikit-learn documents load_diabetes as a regression dataset
# (continuous target), so LogisticRegression presumably treats every distinct
# target value as its own class here -- confirm this is intended for the demo
# or swap in a regressor.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression()),
    ]
)

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [4]:
# Run the fitted pipeline (scaling + estimator) on the held-out split.
preds = pipeline.predict(X_test)

# Show only the first five predictions instead of dumping the whole array.
print(preds[:5])

[ 71.  71. 258. 292.  72.]


In [7]:
# The copy of joblib vendored inside scikit-learn (sklearn.externals.joblib)
# was deprecated in 0.21 and removed in 0.23. Prefer the standalone package
# and fall back to the vendored module only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

filename = 'model.sav'

# Persist the whole fitted pipeline (scaler + estimator) as a single artifact.
joblib.dump(pipeline, filename)

# NOTE: joblib.load unpickles arbitrary Python objects -- only load files
# that come from a trusted source.
loaded_model = joblib.load(filename)

preds_ = loaded_model.predict(X_test)

# Sanity check for the round trip: the reloaded pipeline must reproduce the
# in-memory pipeline's predictions exactly.
assert np.array_equal(preds_, pipeline.predict(X_test)), \
    "reloaded pipeline predicts differently from the original"

print(preds_[:5])

[ 71.  71. 258. 292.  72.]


### Pipeline + GridSearch: распознавание цифр.

In [8]:
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [9]:
# Load the digits dataset bundled with scikit-learn and unpack the feature
# matrix and the label vector in one step.
digits = datasets.load_digits()
X_digits, y_digits = digits.data, digits.target

In [10]:
# Display the raw feature vector of the first sample.
X_digits[0]

array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.])

In [11]:
# Building blocks of the search pipeline: PCA feeds its projected features
# into a logistic-regression classifier. The step names ('pca', 'logistic')
# are the prefixes used to address each step's parameters in a grid search.
pca = PCA()
regr = LogisticRegression()
pipe = Pipeline(
    steps=[
        ('pca', pca),
        ('logistic', regr),
    ]
)

In [12]:
# Candidate hyper-parameters, addressed as "<step name>__<parameter>" so the
# search can route each value to the right pipeline step.
param_grid = {
    'pca__n_components': [5, 20, 30, 40, 50, 64],
    'logistic__C': [0.01, 0.1, 1, 10],
}

# Exhaustive search over every combination above with 5-fold cross-validation;
# training-fold scores are not recorded.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
)
search.fit(X_digits, y_digits)

# Report the best cross-validated score and the combination that achieved it.
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.922):
{'logistic__C': 10, 'pca__n_components': 40}


In [13]:
# Predict labels for the same samples the search was fitted on; these are
# training-set predictions, not a held-out evaluation.
search.predict(X_digits)

array([0, 1, 2, ..., 8, 9, 8])

### Пример для текстов: [A Deep Dive Into Sklearn Pipelines](https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines)