## Purpose: ##
1. Use  the GridSearchCV from sklearn.model_selection 
2. use the RandomizedSearchCV from sklearn.model_selection
Data used: Abalone data

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import pandas as pd

In [27]:
import pandas as pd
import numpy as np
from pathlib import Path

columns = ["Sex", "Length", "Diameter", "Height", "Whole", "Shucked", "Viscera", "Shell", "Rings"]
sourcepath= Path("D:/1000_ML_projects/1000_Github_ML/Abalone/Data/abalone.data")
data = pd.read_csv(sourcepath, names=columns)
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole,Shucked,Viscera,Shell,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [28]:
# split off targets:
targets = data.iloc[:,-1]
data = data.iloc[:,:-1]

# one-hot-encode Sex:
data = pd.get_dummies(data)
data.head()

Unnamed: 0,Length,Diameter,Height,Whole,Shucked,Viscera,Shell,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,1,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0,1,0


In [29]:
from sklearn.model_selection import train_test_split

random_seed = 42
X_train, X_test, y_train, y_test = train_test_split(data.to_numpy(), targets.to_numpy(), test_size= 0.2, random_state=random_seed)

In [30]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3341, 10), (836, 10), (3341,), (836,))

### Full Grid Search 

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

pipe_lin_reg = Pipeline([
    ("pca", PCA()),
    ("ssc", StandardScaler()),
    ("kmeans", KMeans()),
    ("lin_reg", LinearRegression())
])

params = {
    "pca__n_components" : np.arange(10),
    "kmeans__n_clusters" : np.arange(10, 150, 10),
    "kmeans__max_iter" : np.arange(500, 800, 100),
}

clf = GridSearchCV(pipe_lin_reg, params, n_jobs=-1, cv=10)

clf.fit(X_train, y_train)

print(f"Best parameters: {clf.best_params_}")
print(f"Best training-score: {clf.best_score_:.3f}")
print(f"Score on test data: {clf.score(X_test, y_test):.3f}")


### Gridsearch Results:

params = { <br>
    "pca__n_components" : np.arange(10),<br>
    "kmeans__n_clusters" : np.arange(10, 150, 10),<br>
    "kmeans__max_iter" : np.arange(500, 800, 100),<br>
}<br>
<br>
clf = GridSearchCV(pipe_lin_reg, params, n_jobs=-1, cv=10)<br>
<br>
Run-Time: 1m24.3s<br>
Best parameters: {'kmeans__max_iter': 700, 'kmeans__n_clusters': 130, 'pca__n_components': 8}<br>
Best training-score: 0.579<br>
Score on test data: 0.569<br>

### Randomized Grid Search

In [None]:
import scipy as sp

params = {
    "pca__n_components" : np.floor(sp.stats.uniform(loc=1, scale=9).rvs(size=10)).astype(int),
    "kmeans__n_clusters" : np.arange(10, 150, 10),
    "kmeans__max_iter" : np.arange(500, 800, 100),
}


from sklearn.model_selection import RandomizedSearchCV

rand = RandomizedSearchCV(pipe_lin_reg, params, n_iter=10, cv=5, scoring="accuracy", random_state=1)

rand.fit(X_train, y_train)

print(f"Best parameters: {clf.best_params_}")
print(f"Best training-score: {clf.best_score_:.3f}")
print(f"Score on test data: {clf.score(X_test, y_test):.3f}")


In [53]:
pd.DataFrame(rand.cv_results_).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_pca__n_components,param_kmeans__n_clusters,param_kmeans__max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.484508,0.005573,0.0016,0.0004902129,8,100,700,"{'pca__n_components': 8, 'kmeans__n_clusters':...",,,,,,,,1
1,0.301068,0.002608,0.0012,0.0004000664,3,30,600,"{'pca__n_components': 3, 'kmeans__n_clusters':...",,,,,,,,2
2,0.265059,0.000633,0.001,1.507891e-07,1,20,500,"{'pca__n_components': 1, 'kmeans__n_clusters':...",,,,,,,,3
3,0.320472,0.002728,0.0012,0.0004000903,3,40,600,"{'pca__n_components': 3, 'kmeans__n_clusters':...",,,,,,,,4
4,0.26606,0.003522,0.001,2.780415e-07,4,10,500,"{'pca__n_components': 4, 'kmeans__n_clusters':...",,,,,,,,5


### Randomized Grid Search Results:

 RandomizedSearchCV(pipe_lin_reg, params, n_iter=10, cv=5, scoring="accuracy", random_state=1)<br>
 <br>
Run-Time: 19.8s <br>
Best parameters: {'kmeans__max_iter': 700, 'kmeans__n_clusters': 120, 'pca__n_components': 8} <br>
Best training-score: 0.575 <br>
Score on test data: 0.568 <br>

## Summary
The results found by the randomized grid search are almost equal in score on the test data. <br>
Randomized gridsearch is about 6-times faster.

# Example from Scikit #
(https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py)

In [1]:
import numpy as np

from time import time
import scipy.stats as stats
from sklearn.utils.fixes import loguniform

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier

# get some data
X, y = load_digits(return_X_y=True, n_class=3)

# build a classifier
clf = SGDClassifier(loss="hinge", penalty="elasticnet", fit_intercept=True)


# Utility function to report the n_top best scores:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results["rank_test_score"] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print(
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results["mean_test_score"][candidate],
                    results["std_test_score"][candidate],
                )
            )
            print("Parameters: {0}".format(results["params"][candidate]))
            print("")


# specify parameters of the underlying classifier (here SVM) and distributions to sample these parameters from:
param_dist = {
    "average": [True, False],
    "l1_ratio": stats.uniform(0, 1),
    "alpha": loguniform(1e-2, 1e0),
}

# run randomized search
n_iter_search = 15
random_search = RandomizedSearchCV(
    clf, param_distributions=param_dist, n_iter=n_iter_search
)

start = time()
random_search.fit(X, y)
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_iter_search)
)
report(random_search.cv_results_)

# use a full grid over all parameters
param_grid = {
    "average": [True, False],
    "l1_ratio": np.linspace(0, 1, num=10),
    "alpha": np.power(10, np.arange(-2, 1, dtype=float)),
}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print(
    "GridSearchCV took %.2f seconds for %d candidate parameter settings."
    % (time() - start, len(grid_search.cv_results_["params"]))
)
report(grid_search.cv_results_)

RandomizedSearchCV took 0.47 seconds for 15 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.993 (std: 0.007)
Parameters: {'alpha': 0.046934044280212736, 'average': False, 'l1_ratio': 0.19629351658525807}

Model with rank: 2
Mean validation score: 0.985 (std: 0.010)
Parameters: {'alpha': 0.028418283193819404, 'average': False, 'l1_ratio': 0.16962065911451396}

Model with rank: 3
Mean validation score: 0.983 (std: 0.018)
Parameters: {'alpha': 0.020479448070335967, 'average': False, 'l1_ratio': 0.4415834151838487}

GridSearchCV took 2.14 seconds for 60 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.993 (std: 0.015)
Parameters: {'alpha': 0.01, 'average': False, 'l1_ratio': 0.7777777777777777}

Model with rank: 2
Mean validation score: 0.993 (std: 0.007)
Parameters: {'alpha': 0.01, 'average': False, 'l1_ratio': 0.2222222222222222}

Model with rank: 3
Mean validation score: 0.991 (std: 0.014)
Parameters: {'alpha': 0.01, 'average': False, 