<h2>Pipelines</h2>

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Load the iris dataset; its .data (features) and .target (labels) fields feed the train/test split.
iris_data = load_iris()

In [8]:
# Hold out 30% of the samples for evaluation; random_state=0 pins the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    iris_data.data,
    iris_data.target,
    test_size=0.3,
    random_state=0,
)

In [9]:
# Logistic-regression pipeline: standardise features -> project onto
# 2 principal components -> fit the classifier.
pipeline_lr = Pipeline(
    steps=[
        ("scaler1", StandardScaler()),
        ("pca1", PCA(n_components=2)),
        ("lr", LogisticRegression()),
    ]
)

In [11]:
# Random-forest pipeline: standardise features -> project onto
# 2 principal components -> fit the classifier.
# random_state is fixed so the forest (and therefore the reported test score)
# is reproducible across kernel restarts; 0 matches the seed used for the split.
pipeline_rf = Pipeline(
    steps=[
        ("scaler2", StandardScaler()),
        ("pca2", PCA(n_components=2)),
        ("rf", RandomForestClassifier(random_state=0)),
    ]
)

In [12]:
# Order matters: positions in this list are used as lookup keys for display names.
pipelines = [
    pipeline_lr,
    pipeline_rf,
]

In [13]:
# Train every pipeline on the training split. The return value of fit() is
# discarded; the same pipeline objects are scored afterwards.
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [15]:
# Display names keyed by each pipeline's position in `pipelines`.
model_dict = {
    0: "Logistic Regression",
    1: "Random Forest",
}

In [16]:
# Report each fitted pipeline's score on the held-out split, labelled via model_dict.
for i, model in enumerate(pipelines):
    label = model_dict[i]
    test_score = model.score(X_test, y_test)
    print(f"{label} -- {test_score}")

Logistic Regression -- 0.9111111111111111
Random Forest -- 0.9111111111111111


<h3>GridSearch</h3>

In [20]:
import numpy as np
from sklearn.model_selection import GridSearchCV

In [22]:
# Single-step pipeline whose "classifier" step is itself a search dimension:
# each grid below swaps in a different estimator together with its own
# hyperparameters. The RandomForestClassifier here is only a placeholder.
pipe = Pipeline([("classifier", RandomForestClassifier())])

# Candidate learning algorithms and their hyperparameters.
# Logistic-regression solvers are grouped by the penalties they support so
# that no invalid (solver, penalty) pair is generated: the original grid
# paired 'l1' with the default lbfgs solver, which only handles L2, so those
# candidates could never be fitted.
# NOTE(review): sag/saga are run on unscaled features here; consider adding a
# scaling step if convergence warnings appear.
# NOTE(review): recent scikit-learn releases appear to deprecate liblinear for
# multiclass targets -- confirm against the installed version.
grid_param = [
    # Solvers that support the L2 penalty only.
    {
        "classifier": [LogisticRegression(max_iter=200)],
        "classifier__penalty": ['l2'],
        "classifier__C": np.logspace(0, 4, 10),
        "classifier__solver": ['lbfgs', 'newton-cg', 'sag'],
    },
    # Solvers that support both L1 and L2 penalties.
    {
        "classifier": [LogisticRegression(max_iter=200)],
        "classifier__penalty": ['l1', 'l2'],
        "classifier__C": np.logspace(0, 4, 10),
        "classifier__solver": ['liblinear', 'saga'],
    },
    # Random forest; random_state is fixed so the search result is reproducible.
    {
        "classifier": [RandomForestClassifier(random_state=0)],
        "classifier__n_estimators": [10, 100, 1000],
        "classifier__max_depth": [5, 8, 15, 25, 30, None],
        "classifier__min_samples_leaf": [1, 2, 5, 10, 15, 100],
        "classifier__max_leaf_nodes": [2, 5, 10],
    },
]

# Create a grid search over the pipeline (5-fold CV, all cores), then fit it.
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1)
# fit() returns the fitted GridSearchCV object itself, so `best_model` is the
# search object (exposing best_params_ etc.), not a bare estimator.
best_model = gridsearch.fit(X_train, y_train)



In [23]:
# Display the hyperparameter combination selected by the grid search.
best_model.best_params_

{'classifier': LogisticRegression(max_iter=200, solver='saga'),
 'classifier__C': 1.0,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}