<a href="https://colab.research.google.com/github/DJ-Manjaray/Data_Science_Practices/blob/main/Comparing_randomized_search_and_grid_search_for_hyperparameter_estimation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from time import time

import numpy as np
import scipy.stats as stats

from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
# Load the digits dataset restricted to 3 classes, as a (features, labels) pair.
X, y = load_digits(return_X_y=True, n_class=3)

# Linear classifier trained with SGD: hinge loss with an elastic-net penalty.
# random_state is fixed because SGD shuffles the training data; without a
# seed, the cross-validation scores reported by the searches change on every run.
clf = SGDClassifier(
    loss="hinge", penalty="elasticnet", fit_intercept=True, random_state=0
)

In [3]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Search results indexed by the keys ``"rank_test_score"``,
        ``"mean_test_score"``, ``"std_test_score"`` and ``"params"``
        (the notebook passes a fitted search object's ``cv_results_``).
        ``"rank_test_score"`` is compared element-wise against each rank.
    n_top : int, default=3
        Highest rank to print; ranks ``1..n_top`` are visited in order.

    Notes
    -----
    Every candidate whose rank equals the current rank is printed, so ties
    produce several entries for one rank, and a rank that no candidate holds
    (e.g. rank 2 after a tie for rank 1) prints nothing.
    """
    score_template = "Mean validation score: {0:.3f} (std: {1:.3f})"
    for rank in range(1, n_top + 1):
        # Indices of all candidates sharing this rank.
        tied = np.flatnonzero(results["rank_test_score"] == rank)
        for idx in tied:
            print("Model with rank: {0}".format(rank))
            mean_score = results["mean_test_score"][idx]
            std_score = results["std_test_score"][idx]
            print(score_template.format(mean_score, std_score))
            print("Parameters: {0}".format(results["params"][idx]))
            print("")

In [4]:
# Search space for the randomized search: a list is sampled uniformly from
# its entries, a frozen scipy.stats distribution is sampled via its rvs().
param_dist = {
    # whether to use averaged SGD weights
    "average": [True, False],
    # elastic-net mixing parameter, uniform on [0, 1]
    "l1_ratio": stats.uniform(loc=0, scale=1),
    # regularization strength, log-uniform between 1e-2 and 1e0
    "alpha": stats.loguniform(a=1e-2, b=1e0),
}

In [5]:
# run randomized search: draw n_iter_search candidates from param_dist
n_iter_search = 15
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    # Seed the candidate sampling so the same 15 parameter settings are
    # evaluated on every run; otherwise the results cannot be reproduced.
    random_state=0,
)

In [6]:
# Fit the randomized search and measure the wall-clock time of the fit only.
start = time()
random_search.fit(X, y)
elapsed = time() - start

# Report the number of candidates actually evaluated (read from cv_results_,
# as the grid-search cell does) rather than the requested n_iter_search.
print(
    "RandomizedSearchCV took %.2f seconds for %d candidate parameter settings."
    % (elapsed, len(random_search.cv_results_["params"]))
)
report(random_search.cv_results_)

RandomizedSearchCV took 1.37 seconds for 15 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.989 (std: 0.007)
Parameters: {'alpha': 0.02796719275904662, 'average': False, 'l1_ratio': 0.32926885577193055}

Model with rank: 1
Mean validation score: 0.989 (std: 0.015)
Parameters: {'alpha': 0.02312119697425407, 'average': False, 'l1_ratio': 0.21912139452497004}

Model with rank: 3
Mean validation score: 0.987 (std: 0.007)
Parameters: {'alpha': 0.14646129330815819, 'average': False, 'l1_ratio': 0.06890152076660849}



In [7]:
# Exhaustive grid over the same three hyper-parameters:
# 2 (average) x 10 (l1_ratio) x 3 (alpha) combinations.
param_grid = {
    "average": [True, False],
    # 10 evenly spaced values covering [0, 1], endpoints included
    "l1_ratio": np.linspace(0.0, 1.0, 10),
    # powers of ten: 1e-2, 1e-1, 1e0
    "alpha": np.logspace(-2, 0, num=3),
}

In [8]:
# Exhaustive search: evaluate every parameter combination in param_grid.
grid_search = GridSearchCV(clf, param_grid=param_grid)

# Time only the fit itself.
start = time()
grid_search.fit(X, y)
elapsed = time() - start

# One entry in cv_results_["params"] per evaluated candidate.
n_candidates = len(grid_search.cv_results_["params"])
print(
    "GridSearchCV took %.2f seconds for %d candidate parameter settings."
    % (elapsed, n_candidates)
)
report(grid_search.cv_results_)

GridSearchCV took 2.76 seconds for 60 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.991 (std: 0.008)
Parameters: {'alpha': 0.01, 'average': False, 'l1_ratio': 0.1111111111111111}

Model with rank: 2
Mean validation score: 0.991 (std: 0.006)
Parameters: {'alpha': 0.01, 'average': False, 'l1_ratio': 0.0}

Model with rank: 3
Mean validation score: 0.991 (std: 0.010)
Parameters: {'alpha': 0.1, 'average': False, 'l1_ratio': 0.1111111111111111}

Model with rank: 3
Mean validation score: 0.991 (std: 0.010)
Parameters: {'alpha': 0.1, 'average': False, 'l1_ratio': 0.2222222222222222}

