In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler,StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
import pickle

In [2]:
# Load the iris dataset bundle; later cells read its .data and .target arrays.
iris = datasets.load_iris()

In [3]:
# Split features/labels into train and test parts (test_size=0.2).
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=2,
)

In [4]:
# Logistic-regression candidate: standardise the features, project them onto
# two principal components, then classify.
pipeline_lr = Pipeline(
    steps=[
        ('scaler1', StandardScaler()),
        ('pca1', PCA(n_components=2)),
        ('lr_classifier', LogisticRegression()),
    ]
)

In [5]:
# Decision-tree candidate: project onto two principal components, then fit a tree.
# random_state is pinned (same value as the train/test split) so that a
# Restart & Run All reproduces the same tree and the same score; without it,
# ties between equally good splits can be broken differently on each run.
pipeline_dt = Pipeline([
    ('pca2', PCA(n_components=2)),
    ('dt_classifier', DecisionTreeClassifier(random_state=2)),
])


In [6]:
# Random-forest candidate: project onto two principal components, then fit a forest.
# random_state is pinned (same value as the train/test split) because the
# forest's bootstrap sampling and feature selection are random; unseeded, the
# reported accuracy can change from one run of the notebook to the next.
pipeline_rf = Pipeline([
    ('pca3', PCA(n_components=2)),
    ('rf_classifier', RandomForestClassifier(random_state=2)),
])

In [7]:
# Candidate pipelines; pipe_dict maps each list position to a display name.
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf]
pipe_dict = dict(enumerate(['Logistic Regression', 'Decision Tree', 'RandomForest']))

# "Best so far" bookkeeping, overwritten by the model-selection loop below.
best_accuracy = 0.0
best_classifier = 0
best_pipeline = ""


In [8]:
# Train every candidate pipeline on the training split.
# Each pipeline object is fitted in place and reused by the scoring cells below.
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [9]:
# Report each fitted pipeline's score on the held-out test split.
for i, model in enumerate(pipelines):
    test_accuracy = model.score(X_test, y_test)
    print('Model_name={} Scoring={}'.format(pipe_dict[i], test_accuracy))

Model_name=Logistic Regression Scoring=0.8666666666666667
Model_name=Decision Tree Scoring=0.9333333333333333
Model_name=RandomForest Scoring=0.9333333333333333


In [10]:
# Pick the pipeline with the highest test-split score.
# The strict '>' means the earliest pipeline wins when scores are tied.
# NOTE(review): selecting on the test split makes the reported accuracy
# optimistic; consider a separate validation split or cross-validation.
for i, model in enumerate(pipelines):
    # Score once per pipeline; the original re-scored the winner a second time.
    accuracy = model.score(X_test, y_test)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pipeline = model
        best_classifier = i
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))



Classifier with best accuracy:Decision Tree


### Hyperparameter Tuning

In [13]:
# Pipeline to be tuned by the grid search: PCA to two components, then a random forest.
# random_state is pinned (same value as the train/test split) so the search
# picks the same winning hyper-parameters on every run; unseeded, the winner
# can change between runs, while a later cell hard-codes the winning values.
pipe = Pipeline([
    ('pca', PCA(n_components=2)),
    ('classifier', RandomForestClassifier(random_state=2)),
])

In [14]:
# Search space for the 'classifier' step of the pipeline.
# 4 * 4 * 6 * 3 = 288 parameter combinations in total.
grid_param = {
    'classifier__n_estimators': [10, 50, 100, 500],
    'classifier__max_depth': [3, 5, 7, 10],
    'classifier__min_samples_leaf': [1, 2, 5, 10, 15, 100],
    'classifier__max_leaf_nodes': [2, 5, 10],
}
          

In [15]:
# 5-fold cross-validated grid search over grid_param, run in parallel (n_jobs=-1).
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=grid_param,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
# best_model holds whatever fit() returns; the next cell reads .best_estimator_ from it.
best_model = gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  1.3min finished


In [16]:
# Display the best pipeline found by the grid search.
best_model.best_estimator_

Pipeline(steps=[('pca', PCA(n_components=2)),
                ('classifier',
                 RandomForestClassifier(max_depth=5, max_leaf_nodes=10,
                                        min_samples_leaf=2, n_estimators=10))])

In [17]:
# Final model, built with the hyper-parameters reported by the grid search above.
# NOTE(review): this pipeline adds a StandardScaler step that the searched
# pipeline did not have, so the tuned values were selected without scaling —
# confirm the extra step is intended.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', RandomForestClassifier(
        max_depth=5,
        max_leaf_nodes=10,
        min_samples_leaf=2,
        n_estimators=10,
        # Pin the forest (same value as the train/test split) so the saved
        # model and its test score are reproducible across runs.
        random_state=2,
    )),
])

In [18]:
# Fit the final pipeline on the training split; clf is the value returned by fit().
clf = pipe.fit(X_train, y_train)

In [19]:
# Score the final pipeline on the held-out test split.
clf.score(X_test,y_test)

0.9333333333333333

In [20]:
# Persist the fitted pipeline to classifier.pkl.
# The context manager closes (and flushes) the file even if pickle.dump raises,
# which the manual open()/close() pair did not guarantee.
with open('classifier.pkl', 'wb') as pickle_out:
    pickle.dump(clf, pickle_out)

In [21]:
# Reload the pipeline from classifier.pkl.
# The context manager closes the file handle, which was previously left open.
# NOTE: pickle.load can execute arbitrary code — only load files you created or trust.
with open('classifier.pkl', 'rb') as pickle_in:
    classifier = pickle.load(pickle_in)

In [23]:
# One hand-written sample with four feature values, laid out as a single row
# (shape (1, 4)) because predict expects a 2-D array.
a = np.array([[1, 2, 3, 4]])
out = classifier.predict(a)

In [24]:
# Display the predicted class index for the sample above.
out

array([1])

In [27]:
if out==0:
    print('Setosa')