# Hyperparameter tuning(GridSearchCv)
The process of choosing the optimal parameter for a model is called hypertuning.

In [17]:
from sklearn import svm, datasets
from sklearn import linear_model
iris = datasets.load_iris()

In [5]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
import pandas as pd
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["flower"] = iris.target
df["flower"] = df["flower"].apply(lambda x: iris.target_names[x])
df[47:52]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
47,4.6,3.2,1.4,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor


In [8]:
#Using tradition approach of train test split
# I cant rely on this method because my score changes base on my samples
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    test_size=0.3)

In [9]:
# Using SVM model
model = svm.SVC(kernel="rbf", C=30, gamma="auto")
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9555555555555556

In [10]:
#That is why we use KFold cross validation
# Tried cross_val_score for 5 folds
from sklearn.model_selection import cross_val_score

In [11]:
cross_val_score(svm.SVC(kernel="linear", C=10, gamma="auto"), 
                iris.data, iris.target, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [12]:
cross_val_score(svm.SVC(kernel="rbf", C=10, gamma="auto"), iris.data, 
                iris.target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [13]:
cross_val_score(svm.SVC(kernel="rbf", C=20, gamma="auto"), iris.data, 
                iris.target, cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [14]:
import numpy as np

In [15]:
# repeating the same process as above using a for loop
kernels = ["rbf", "linear"]
C = [1, 10, 20]
avg_scores = {}
for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(svm.SVC(kernel=kval, C=cval, 
                                            gamma="auto"), iris.data, 
                                            iris.target, cv=5)
        avg_scores[kval + "_"+ str(cval)] = np.average(cv_scores)

avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

In [19]:
linear_model.LogisticRegression()

LogisticRegression()

In [20]:
# Making the above step simpler with GridsearchCV(Gridsearch still uses CV)
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(svm.SVC(gamma="scale"),
                  {"C": [1.0, 10, 20],
                  "kernel": ["rbf", "linear"]},
                  cv=5, return_train_score=False)

clf.fit(iris.data, iris.target)
# GridSearch Cv are not easy to view
clf.cv_results_

{'mean_fit_time': array([0.00098605, 0.00243082, 0.0025372 , 0.0010314 , 0.00118127,
        0.00084333]),
 'std_fit_time': array([0.00085073, 0.00169037, 0.00442825, 0.00052536, 0.00020314,
        0.000429  ]),
 'mean_score_time': array([0.00091443, 0.00072832, 0.00067215, 0.00027657, 0.00091157,
        0.00019965]),
 'std_score_time': array([0.00036804, 0.0007359 , 0.00063108, 0.00055313, 0.00013809,
        0.0003993 ]),
 'param_C': masked_array(data=[1.0, 1.0, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1.0, 'kernel': 'rbf'},
  {'C': 1.0, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'lin

In [21]:
# We export tp pandas DataFrame
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000986,0.000851,0.000914,0.000368,1.0,rbf,"{'C': 1.0, 'kernel': 'rbf'}",0.966667,0.966667,0.966667,0.933333,1.0,0.966667,0.021082,5
1,0.002431,0.00169,0.000728,0.000736,1.0,linear,"{'C': 1.0, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.002537,0.004428,0.000672,0.000631,10.0,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.001031,0.000525,0.000277,0.000553,10.0,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.001181,0.000203,0.000912,0.000138,20.0,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
5,0.000843,0.000429,0.0002,0.000399,20.0,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,5


In [13]:
# Getting the parameter values and mean score
df[["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1.0,rbf,0.966667
1,1.0,linear,0.98
2,10.0,rbf,0.98
3,10.0,linear,0.973333
4,20.0,rbf,0.98
5,20.0,linear,0.966667


In [14]:
# Viewing the other properties of the classifier
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs',
 'n_splits

In [22]:
clf.best_score_

0.9800000000000001

In [23]:
clf.best_params_

{'C': 1.0, 'kernel': 'linear'}

## Randomizied SearchVC
randomized searched Cv will not try every combination of the parameter values.. it will only try random combination of these parameter values.

In [17]:
# We asked the randomSearchcv to do only two combination
from sklearn.model_selection import RandomizedSearchCV
rs = RandomizedSearchCV(svm.SVC(gamma="scale"),
                       {"C": [1, 10, 20],
                       "kernel": ["rbf", "linear"]},
                       cv=5,
                       return_train_score=False,
                       n_iter=2)
rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.98
1,1,rbf,0.966667


### Choosing the best model for a Given problem

In [18]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [19]:
# Defining the parameter grid
model_params = {
    "svm":{
        "model": svm.SVC(gamma="scale"),
        "params": {"C":[1, 10, 20], "kernel":["rbf", "linear"]}
    },
    "random_forest": {
        "model": RandomForestClassifier(),
        "params": {"n_estimators": [1, 5, 10]
        }
    },
    "logistic_regression": {
        "model": LogisticRegression(solver="liblinear", multi_class="auto"),
        "params": { "C": [1,5, 10 ]
            
        }
    }
}

In [20]:
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp["model"], mp["params"], cv=5, return_train_score=False)
    clf.fit(iris.data, iris.target)
    scores.append({
        "model": model_name,
        "best_score": clf.best_score_,
        "best_params": clf.best_params_
    })

In [21]:
df = pd.DataFrame(scores, columns=["model", "best_score", "best_params"])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.966667,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}


### Exercise: Machine Learning Finding Optimal Model and Hyperparameters

For digits dataset in sklearn.dataset, please try the following classifiers and find out the one that gives best performance. Also find the optimal parameters for that classifier.

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier