In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np,warnings
warnings.filterwarnings('ignore')

In [None]:
# NOTE(review): this whole cell is commented-out code and is never executed.
# If it is ever revived, the following problems visible below need fixing first:
#   * the features are assigned to lowercase `x`, but the scaler is fed uppercase `X`;
#   * `df.cols` is used for column access -- presumably `df.columns` was intended;
#   * `data np.hstack(...)` is missing the `=` of the assignment;
#   * `valied` is passed on the validation line -- presumably a typo for `valid`;
#   * `RandomOverSampler` is not among the imports visible at the top of the notebook;
#   * `train`, `valid` and `test` are not defined in any visible cell.
# def scale_dataset(df,oversample=False):
#     x = df[df.cols[:-1]].values
#     y = df[df.cols[-1]].values
#     scaler = StandardScaler()
#     X = scaler.fit_transform(X)
#     if oversample:
#         ros = RandomOverSampler()
#         X,y = ros.fit_resample(X,y)
        
#     data np.hstack((X,np.reshape(y,(-1,1))))
    
#     return data,X,y
# train, X_train, y_train = scale_dataset(train,oversample=True)
# valid, X_valid, y_valid = scale_dataset(valied,oversample=True)
# test, X_test, y_test = scale_dataset(test,oversample=False)

In [2]:
# Load the Iris dataset. Despite the `_df` suffix, this is simply whatever
# `load_iris()` returns; only its `.data` (features) and `.target` (labels)
# attributes are used below.
iris_df = load_iris()

# Hold out 30% of the samples for testing. The fixed `random_state` keeps the
# split identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    iris_df.data,
    iris_df.target,
    test_size=0.3,
    random_state=0,
)

## Pipeline Creation
1. Preprocess the data with `StandardScaler`
2. Reduce dimensionality with PCA
3. Apply a classifier

In [3]:
# Three parallel pipelines sharing the same shape:
#   standardize features -> project onto 2 principal components -> classify.
# Step names (including the 'scalar' spelling) are left untouched because they
# are the prefixes used to address step parameters (e.g. 'scalar1__with_mean').
# Every classifier is seeded with random_state=0 so that re-running the
# notebook reproduces the same scores; previously only the logistic regression
# was seeded while the tree and the forest were not.
pipeline_lr = Pipeline([
    ('scalar1', StandardScaler()),
    ('pca1', PCA(n_components=2)),
    ('lr_classifier', LogisticRegression(random_state=0)),
])
pipeline_dt = Pipeline([
    ('scalar2', StandardScaler()),
    ('pca2', PCA(n_components=2)),
    ('dt_classifier', DecisionTreeClassifier(random_state=0)),
])
pipeline_randomforest = Pipeline([
    ('scalar3', StandardScaler()),
    ('pca3', PCA(n_components=2)),
    ('rf_classifier', RandomForestClassifier(random_state=0)),
])

In [4]:
# Candidate pipelines, plus a lookup from list position to a display name.
pipelines = [
    pipeline_lr,
    pipeline_dt,
    pipeline_randomforest,
]
pipe_dict = {
    0: 'Logistic Regression',
    1: 'Decision Tree',
    2: 'RandomForest',
}

# Train every pipeline on the training split first ...
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# ... then report each one's accuracy on the held-out test split.
for i, model in enumerate(pipelines):
    label = pipe_dict[i]
    accuracy = model.score(X_test, y_test)
    print("{} Test Accuracy: {}".format(label, accuracy))

Logistic Regression Test Accuracy: 0.8666666666666667
Decision Tree Test Accuracy: 0.9111111111111111
RandomForest Test Accuracy: 0.9111111111111111


In [5]:
# Running "best so far" trackers consumed by the selection loop in the next cell:
# the top accuracy seen, the index of that pipeline in `pipelines`, and the
# pipeline object itself (an empty string until a winner is recorded).
best_accuracy, best_classifier, best_pipeline = 0.0, 0, ""

In [6]:
# Pick the pipeline with the highest accuracy on the held-out split.
# The comparison is strict, so when two pipelines tie the earlier one in
# `pipelines` is kept.
# NOTE(review): the winner is chosen on the test split itself, so the reported
# accuracy is an optimistic estimate -- consider selecting via cross-validation
# on the training data instead.
for i, model in enumerate(pipelines):
    # Score once per pipeline and reuse the value (it used to be computed twice).
    accuracy = model.score(X_test, y_test)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pipeline = model
        best_classifier = i
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy:Decision Tree


In [7]:
# Single-step pipeline. The 'classifier' step is only a placeholder: every grid
# below replaces it with the estimator being tuned.
pipe = Pipeline([("classifier", RandomForestClassifier())])

# Each dict is an independent grid; GridSearchCV explores all of them.
# Estimators are seeded so the search is reproducible across runs.
grid_param = [
    # Logistic regression with L1 or L2 regularisation. Only 'liblinear' and
    # 'saga' support the L1 penalty, so the solver is pinned to those two.
    # (Leaving the solver at its default made every L1 candidate fail to fit.)
    {
        "classifier": [LogisticRegression(random_state=0)],
        "classifier__penalty": ['l2', 'l1'],
        "classifier__C": np.logspace(0, 4, 10),
        "classifier__solver": ['liblinear', 'saga'],
    },
    # Logistic regression with the solvers that only support the L2 penalty.
    {
        "classifier": [LogisticRegression(random_state=0)],
        "classifier__penalty": ['l2'],
        "classifier__C": np.logspace(0, 4, 10),
        "classifier__solver": ['lbfgs', 'newton-cg', 'sag'],
    },
    # Random forest: ensemble size and tree-shape constraints.
    {
        "classifier": [RandomForestClassifier(random_state=0)],
        "classifier__n_estimators": [10, 100, 1000],
        "classifier__max_depth": [5, 8, 15, 25, 30, None],
        "classifier__min_samples_leaf": [1, 2, 5, 10, 15, 100],
        "classifier__max_leaf_nodes": [2, 5, 10],
    },
]

In [None]:
# Configure a 5-fold cross-validated search over every grid in `grid_param`,
# then fit it on the training split. `best_model` holds whatever `fit` returns
# and is what the following cell inspects.
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=grid_param,
    cv=5,
    verbose=0,
    n_jobs=-1,
)
best_model = gridsearch.fit(X_train, y_train)

In [None]:
# Show the winning estimator found by the search, followed by its score on the
# held-out test split.
winning_estimator = best_model.best_estimator_
print(winning_estimator)

test_score = best_model.score(X_test, y_test)
print("The mean accuracy of the model is:", test_score)

# Building the pipeline with `make_pipeline`

In [None]:
# Same random-forest search as before, but built with `make_pipeline`, which
# names the step after the lowercased class name ('randomforestclassifier')
# -- that is why the grid keys use that prefix.
# The forest is seeded so repeated runs give the same search result.
pipe = make_pipeline(RandomForestClassifier(random_state=0))
grid_param = [
    {
        # Kept so the estimator still appears among the searched parameters;
        # it must be seeded too, since it replaces the pipeline's own step.
        "randomforestclassifier": [RandomForestClassifier(random_state=0)],
        "randomforestclassifier__n_estimators": [10, 100, 1000],
        "randomforestclassifier__max_depth": [5, 8, 15, 25, 30, None],
        "randomforestclassifier__min_samples_leaf": [1, 2, 5, 10, 15, 100],
        "randomforestclassifier__max_leaf_nodes": [2, 5, 10],
    },
]

# 5-fold cross-validated search fitted on the training split.
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1)
best_model = gridsearch.fit(X_train, y_train)

# Last expression of the cell: score of the refit best model on the test split.
best_model.score(X_test, y_test)