# Pipelines in sklearn

In [55]:
# importing all the required libraries

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [56]:
# Loading the inbuilt dataset provided in seaborn library
iris_df = load_iris()


In [57]:
iris_df.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [58]:
iris_df.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [59]:
X_train , X_test , y_train , y_test = train_test_split(iris_df.data,iris_df.target,test_size=0.3,random_state=0)

### Steps to be followed in Pipeline Creation

1. DataPreprocessing by using StandardScaler
2. Reduce Dimension using PCA
3. Apply classifier

In [60]:
# Pipeline 1
pipeline_lr = Pipeline([('scalar1',StandardScaler()),
                        ('pca1',PCA(n_components=2)),
                        ('lr_classifier',LogisticRegression(random_state=0))])

In [61]:
# Pipeline 2
pipeline_dt = Pipeline([('scaler2',StandardScaler()),
                        ('pca2',PCA(n_components=2)),
                        ('dt_classifier',DecisionTreeClassifier())])

In [62]:
# Pipeline 3
pipeline_randomforest = Pipeline([('scaler3',StandardScaler()),
                        ('pca3',PCA(n_components=2)),
                        ('rf_classifier',RandomForestClassifier())])

In [63]:
# encapsulating in list 
pipelines = [pipeline_lr,pipeline_dt,pipeline_randomforest]

In [64]:
for p in pipelines:
    print(p)

Pipeline(steps=[('scalar1', StandardScaler()), ('pca1', PCA(n_components=2)),
                ('lr_classifier', LogisticRegression(random_state=0))])
Pipeline(steps=[('scaler2', StandardScaler()), ('pca2', PCA(n_components=2)),
                ('dt_classifier', DecisionTreeClassifier())])
Pipeline(steps=[('scaler3', StandardScaler()), ('pca3', PCA(n_components=2)),
                ('rf_classifier', RandomForestClassifier())])


In [65]:
# Making a dictionary of pipelines and classifier for easy access
pipe_dict = {0: 'Logistic Regression' , 1 : 'Decision Tree' , 2 :'Random Forest'}

In [66]:
# fitting the pipelines
for p in pipelines :
    p.fit(X_train,y_train)


In [67]:
for i,model in enumerate(pipelines):
    print("{} Model Test Accuracy is : {}".format(pipe_dict[i],model.score(X_test,y_test)))

Logistic Regression Model Test Accuracy is : 0.8666666666666667
Decision Tree Model Test Accuracy is : 0.9111111111111111
Random Forest Model Test Accuracy is : 0.9333333333333333


# Piplines Perform Hyperparameter tuning using GridSearchCV

In [68]:
import numpy as np
from sklearn.model_selection import GridSearchCV

In [73]:
# Create Pipeline
pipe = Pipeline=([ ("classifier", RandomForestClassifier()) ])

# Create Dictionary with candidate learning learning algorithims and their hyperparameters
grid_param = [{
                    "classifier" : [LogisticRegression()],
                    "classifier__penalty" : ['l2','l1'],
                    "classifier__C" : np.logspace(0,4,10)
                },
    
                {
                    "classifier" : [LogisticRegression()],
                    "classifier__penalty" : ['l2'],
                    "classifier__C" : np.logspace(0,4,10),
                    "classifier__solver" : ['newton-cg', 'saga', 'sag', 'liblinear'] # These Solvers dont allow L1 penalty
                },
    
                {
                    "classifier" : [RandomForestClassifier()],
                    "classifier__n_estimators" : [10,100,1000],
                    "classifier_max_depth" : [5,10,15,25,30],
                    "classifier_max_samples_leaf" : [1,2,3,4,56,20,100],
                    "classifier__max_leaf_nodes" : [2,5,10]
                }]

# Creating a gridsearch of the pipeline and  then fitting the best model

gd = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1)




In [74]:
best_model = gd.fit(X_train,y_train)

TypeError: estimator should be an estimator implementing 'fit' method, [('classifier', RandomForestClassifier())] was passed

In [72]:
# Create a pipeline
pipe = Pipeline=([ ("classifier", RandomForestClassifier()) ])
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2','l1'],
                 "classifier__C": np.logspace(0, 4, 10)
                 },
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2'],
                 "classifier__C": np.logspace(0, 4, 10),
                 "classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_depth":[5,8,15,25,30,None],
                 "classifier__min_samples_leaf":[1,2,5,10,15,100],
                 "classifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

TypeError: estimator should be an estimator implementing 'fit' method, [('classifier', RandomForestClassifier())] was passed