In [1]:
#Another simple yet powerful technique we can pair with pipelines
#to improve performance is grid search, which attempts to optimize
#model hyperparameter combinations.

In [8]:
# Import modules.
# GridSearchCV is imported from sklearn.model_selection: the legacy
# sklearn.grid_search module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases.
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn import tree

In [3]:
# Load the iris dataset and hold out 20% of the samples as a test set
# (fixed random_state so the split is reproducible)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)


In [4]:
# Build the pipeline: standardize features -> project onto 2 principal
# components -> classify with a decision tree (seeded for reproducibility)
pipe = Pipeline(
    steps=[
        ('scl', StandardScaler()),
        ('pca', PCA(n_components=2)),
        ('clf', tree.DecisionTreeClassifier(random_state=42)),
    ]
)


Since our model uses a decision tree estimator, we will use grid search to optimize the following hyperparameters:

1.criterion - This is the function used to evaluate the quality of the split; we will use both options available in Scikit-learn: Gini impurity and information gain (entropy)

2.min_samples_leaf - This is the minimum number of samples required for a valid leaf node; we will use the integer range 1 to 5

3.max_depth - This is the maximum depth of the tree; we will use the integer range 1 to 5

4.min_samples_split - This is the minimum number of samples required in order to split a non-leaf node; we will use the integer range 2 to 5 (scikit-learn requires this value to be at least 2)

5.presort - This indicates whether or not to presort the data in order to speed up the location of best splits during fitting; this does not have any effect on the resulting model accuracy (only on training times), but has been included for the benefit of using a True/False hyperparameter in our grid search model (fun, right?!?)


In [10]:
# Candidate integer values (1 through 5) shared by several hyperparameters
param_range = list(range(1, 6))

In [13]:
# Hyperparameter grid for the 'clf' pipeline step; the 'clf__' prefix routes
# each setting to the decision tree inside the pipeline.
# min_samples_split skips the first value of param_range (1) because an
# integer min_samples_split must be at least 2.
# NOTE(review): 'presort' was deprecated in scikit-learn 0.22 and removed in
# 0.24; this grid only works on older releases -- confirm the target version.
grid_params = [
    dict(
        clf__criterion=['gini', 'entropy'],
        clf__min_samples_leaf=param_range,
        clf__max_depth=param_range,
        clf__min_samples_split=param_range[1:],
        clf__presort=[True, False],
    )
]

In [14]:
# Set up an exhaustive search over grid_params, scoring each candidate
# by accuracy under 10-fold cross-validation
gs = GridSearchCV(
    pipe,
    grid_params,
    cv=10,
    scoring='accuracy',
)

In [15]:
# Run the grid search on the training split only; the test split stays unseen
gs.fit(X=X_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max...        min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'clf__criterion': ['gini', 'entropy'], 'clf__min_samples_leaf': [1, 2, 3, 4, 5], 'clf__max_depth': [1, 2, 3, 4, 5], 'clf__min_samples_split': [2, 3, 4, 5], 'clf__presort': [True, False]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [16]:
# Report the best mean cross-validated accuracy found by the search
print('Best accuracy: %.3f' % (gs.best_score_,))


Best accuracy: 0.925


In [17]:
# Show the hyperparameter combination that achieved the best score
print(
    '\nBest params:\n',
    gs.best_params_,
)


Best params:
 {'clf__criterion': 'gini', 'clf__max_depth': 2, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__presort': True}


Before applying GridSearchCV, the accuracy was 0.867 (if you fit the pipeline in the normal way, without grid search); after applying GridSearchCV, the accuracy changed to 0.925. This difference in our simple example should be evidence enough to suggest that Scikit-learn defaults should not be followed blindly.