In [1]:
# Loading libraries

import pandas as pd
import numpy as np

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
iris = load_iris()

In [None]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [None]:
# Missing Value
# Oultier Treatment
# Encoding
# Normally we do above methods, before X and y split. We can also bring the above methods in to pipeline.

In [3]:
X=iris.data
y=iris.target

In [4]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=10)


In [None]:
# Now we are going to create ML piplines
# 1. Data preprocressing using Standard Scaler
# 2. Redeuce Dimension using PCA
# 3. Apply Classifier Algo

In [5]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
pca = PCA(n_components=2)
pca_scaled = pca.fit_transform(X_train_scaled)

In [None]:
random_clf = RandomForestClassifier()
random_clf.fit(pca_scaled,y_train)

RandomForestClassifier()

In [None]:
random_clf.score(pca_scaled,y_train)

1.0

In [None]:
pca_test = pca.transform(X_test_scaled)

In [None]:
random_clf.score(pca_test,y_test)

0.9555555555555556

In [None]:
# Now, we are converting above steps to a ML pipeline.

# Pipeline Method

In [7]:
pipline_lr = Pipeline([('scaler1',StandardScaler()),
                      ('pca1',PCA(n_components=2)),
                      ('lr_clf',LogisticRegression())])

# Inside Pipeline object, there is a list of tuples. One tuple will have one name for the object created and the object itself.
# When creating a Pipeline, always make sure to define the model last, as pipeline works as synchronous way, so scaling and pca must be done first, then only it can be given to the model.
# Scaler and pca will do fit transform and model will do fit.
pipline_dt = Pipeline([('scaler2',StandardScaler()),
                      ('pca2',PCA(n_components=2)),
                      ('dt_clf',DecisionTreeClassifier())])

pipline_rf = Pipeline([('scaler3',StandardScaler()),
                      ('pca3',PCA(n_components=2)),
                      ('rf_clf',RandomForestClassifier())])

In [8]:
pipelines=[pipline_lr,pipline_dt,pipline_rf]
for pipe in pipelines:
  pipe.fit(X_train,y_train)            

# Only doing fit is necessary for pipelines, and it will automatically do fit transfrom to scaing and pca, but only do fit in the model. Its an advantage of Pipeline.

In [None]:
# Enumerate Function

a=[100,200,300,400]
for i in a:
  print('The a value :', i)

for index,i in enumerate(a):
  print(f'The a value {i} for index {index}')

# Enumerate is used in for loops for printing both index and value.

The a value : 100
The a value : 200
The a value : 300
The a value : 400
The a value 100 for index 0
The a value 200 for index 1
The a value 300 for index 2
The a value 400 for index 3


In [None]:
pipe_dict = {0:'Logistic Regression',1:'Decision Tree',2:'Random Forest'}

In [None]:
for i,model in enumerate(pipelines):
  print(f" The test Accuracy for the {pipe_dict[i]} is {model.score(X_test,y_test)}")

 The test Accuracy for the Logistic Regression is 0.9333333333333333
 The test Accuracy for the Decision Tree is 0.9555555555555556
 The test Accuracy for the Random Forest is 0.9333333333333333


# Pipeline and Grid Search CV

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [11]:
clf = make_pipeline(StandardScaler(),GridSearchCV(LogisticRegression(),param_grid={'solver':['liblinear'],'penalty':['l1','l2']},cv=10))


In [13]:
clf.fit(X_train,y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gridsearchcv',
                 GridSearchCV(cv=10, estimator=LogisticRegression(),
                              param_grid={'penalty': ['l1', 'l2'],
                                          'solver': ['liblinear']}))])

In [14]:
clf.score(X_test,y_test)

0.9333333333333333

In [20]:
clf.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gridsearchcv',
                 GridSearchCV(cv=10, estimator=LogisticRegression(),
                              param_grid={'penalty': ['l1', 'l2'],
                                          'solver': ['liblinear']}))])>

In [None]:
param_grid_list = {'lr_params':[{'penalty': ['l1', 'l2'],'solver': ['liblinear']}],
                   'rf_params':[{'n_estimators': [4, 6], 'max_features': ['log2', 'sqrt','auto'], 'criterion': ['entropy', 'gini'],'max_depth': [2,10]}],
                   'dt_params':[{'criterion':["gini","entropy"], 'splitter':['best','random'],'max_depth':[3,4,5],'min_samples_split':[2,3,4],'max_features':["auto","sqrt","log2"]}],
                  }

In [None]:
pipeline_1=['piplines_lr','piplines_rf','piplines_dt']

In [None]:
param_grid = ['lr_params','rf_params','dt_params']

In [None]:
model_1 = [LogisticRegression(),RandomForestClassifier(),DecisionTreeClassifier()]

In [None]:
pipe_models = {0:'Logistic Regression',1:'Random Forest',2:'Decision Tree'}

In [None]:
for i,model in enumerate(pipeline_1):
  model = make_pipeline(StandardScaler(), GridSearchCV(model_1[i],param_grid= param_grid_list[param_grid[i]],cv=10))
  model.fit(X_train,y_train)
  print(f'Test Accuracy Score of {pipe_models[i]} is {model.score(X_test,y_test)}')
  print("Best Params for this model",model.get_params)
  print("=====================================")

Test Accuracy Score of Logistic Regression is 0.9333333333333333
Best Params for this model <bound method Pipeline.get_params of Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gridsearchcv',
                 GridSearchCV(cv=10, estimator=LogisticRegression(),
                              param_grid=[{'penalty': ['l1', 'l2'],
                                           'solver': ['liblinear']}]))])>
Test Accuracy Score of Random Forest is 0.9333333333333333
Best Params for this model <bound method Pipeline.get_params of Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gridsearchcv',
                 GridSearchCV(cv=10, estimator=RandomForestClassifier(),
                              param_grid=[{'criterion': ['entropy', 'gini'],
                                           'max_depth': [2, 10],
                                           'max_features': ['log2', 'sqrt',
                                                            'auto'],
   