# Sklearn Pipeline Example

In [24]:
#Importing Packages
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [16]:
# Prepare data: load the iris data/target arrays and hold out 20% for testing.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,  # fixed seed so the split is the same on every run
)

### Creating Pipeline

In [19]:
# Method 1: make_pipeline names each step after its (lowercased) class name.
scaler_step = StandardScaler()
forest_step = RandomForestClassifier(random_state=123)
pipe = make_pipeline(scaler_step, forest_step)
print(pipe)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=123))])


In [20]:
# Method 2: build the Pipeline directly, giving every step an explicit name.
# NOTE(review): unlike Method 1, this classifier is created without a
# random_state, so the two pipelines are not configured identically.
pipe2 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clsf", RandomForestClassifier()),
    ]
)
print(pipe2)

Pipeline(steps=[('scaler', StandardScaler()),
                ('clsf', RandomForestClassifier())])


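The two methods differ in one way: with `make_pipeline` (method 1) the step names are generated automatically from the class names, whereas with `Pipeline` (method 2) we assign the step names manually. (In this example, method 1 also sets `random_state=123` on the classifier, while method 2 does not.)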

### Applying To The Data (We will continue with method 1)

In [21]:
# Fit every step of the pipeline using the training split only.
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=123))])

In [25]:
# Predict labels for the held-out test split with the fitted pipeline.
pred = pipe.predict(X=X_test)

In [26]:
# Accuracy of the pipeline's predictions on the test split (displayed as cell output).
accuracy_score(y_true=y_test, y_pred=pred)

0.9666666666666667

So, that's it. It is simple. 

### Hyperparameter Tuning Of Pipeline With GridSearch

In [40]:
from sklearn.model_selection import GridSearchCV

In [41]:
# Pipeline parameters are addressed as "<step name>__<parameter name>";
# the step name used here is the one make_pipeline generated for `pipe`.
param_grid = {
    "randomforestclassifier__min_samples_split": [2, 3, 4, 5, 6],
    "randomforestclassifier__max_depth": [3, 4, 5, 6],
}
grid = GridSearchCV(estimator=pipe, param_grid=param_grid)

In [44]:
# Run the grid search on the training split and keep the fitted search object.
model_grid = grid.fit(X=X_train, y=y_train)

In [45]:
# Predict labels for the held-out test split with the fitted grid search.
pred_grid = model_grid.predict(X=X_test)

In [46]:
# Report the winning parameter combination and how its predictions score.
# NOTE(review): the second line is labelled "Best score" but the value is the
# accuracy on the held-out test split, not the search's own `best_score_`.
best_params = model_grid.best_params_
test_accuracy = accuracy_score(y_test, pred_grid)
print("Best parameters of grid search cv: {}".format(best_params))
print("Best score of grid search cv: {}".format(test_accuracy))

Best parameters of grid search cv: {'randomforestclassifier__max_depth': 3, 'randomforestclassifier__min_samples_split': 3}
Best score of grid search cv: 0.9666666666666667


### Saving The Model With Pickle

In [39]:
import pickle

In [47]:
# Serialize the fitted grid search to disk. The `with` block guarantees the
# file handle is flushed and closed even if pickling raises; the original
# passed a bare open(...) that was never closed.
with open("iris_pipeline.pkl", "wb") as model_file:
    pickle.dump(model_grid, model_file)

**model_grid** saved successfully! We can load it back as follows: <br/>
`pickle.load(open("iris_pipeline.pkl", "rb"))`