# Using GridSearchCV to find the best model and tune its hyperparameters

In [1]:
from sklearn import datasets, svm
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

# Load the Iris dataset that ships with scikit-learn; the cells below read
# its feature matrix from `iris.data` and its class labels from `iris.target`.
iris = datasets.load_iris()

# 1 - Approach without GridSearch

In [2]:
# Hold out 20% of the samples for a single train/test evaluation.
# random_state pins the shuffle: without it every run draws a different split,
# so the score shown under this cell could not be reproduced after
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

# `kernel` accepts 'linear', 'poly', 'rbf', 'sigmoid' or 'precomputed';
# scikit-learn's default is 'rbf', spelled out here for clarity.
model_svm = svm.SVC(kernel='rbf', C=30, gamma='auto')
model_svm.fit(X_train, y_train)

# Score on the held-out split; last expression, so the notebook renders it.
model_svm.score(X_test, y_test)

0.9666666666666667

# 2 - Approach with Cross Validation

In [3]:
import numpy as np

# Three hand-picked (kernel, C) settings, each summarised by the average of
# its 5-fold cross-validation scores over the whole dataset.
for kernel_name, c_value in (('rbf', 30), ('linear', 1), ('poly', 10)):
    fold_scores = cross_val_score(
        svm.SVC(kernel=kernel_name, C=c_value, gamma='auto'),
        iris.data,
        iris.target,
        cv=5,
    )
    print(np.average(fold_scores))

0.96
0.9800000000000001
0.9666666666666666


In [4]:
kernels = ['rbf', 'linear']
c = [1, 10, 30]

# Manual grid: average 5-fold cross-validation score for every kernel/C
# pairing, keyed by a (kernel label, C label) tuple.
avg_score = {}
for kval in kernels:
    for cval in c:
        label = (f'Kernel: {kval}', f' c_value: {cval}')
        candidate = svm.SVC(kernel=kval, C=cval, gamma='auto')
        cv_scores = cross_val_score(candidate, iris.data, iris.target, cv=5)
        avg_score[label] = np.average(cv_scores)

avg_score

{('Kernel: rbf', ' c_value: 1'): 0.9800000000000001,
 ('Kernel: rbf', ' c_value: 10'): 0.9800000000000001,
 ('Kernel: rbf', ' c_value: 30'): 0.96,
 ('Kernel: linear', ' c_value: 1'): 0.9800000000000001,
 ('Kernel: linear', ' c_value: 10'): 0.9733333333333334,
 ('Kernel: linear', ' c_value: 30'): 0.96}

# 3 - Approach using GridSearch

In [23]:
# Candidate values to search exhaustively: 4 values of C x 2 kernels.
svc_param_grid = {
    'C': [1, 10, 30, 100],
    'kernel': ['rbf', 'linear'],
}

# Each combination is evaluated with 5-fold cross-validation; training-set
# scores are not recorded.
gs = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid=svc_param_grid,
    cv=5,
    return_train_score=False,
)

gs.fit(iris.data, iris.target)

In [12]:
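# gs.cv_results_
# NOTE(review): the output kept below lists only 3 candidates and carries an
# earlier execution count than the 8-candidate fit above, so it appears to be
# stale output from a previous run -- re-run or clear this cell.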

{'mean_fit_time': array([0.00157709, 0.00238905, 0.00161099]),
 'std_fit_time': array([0.00135718, 0.00219252, 0.00252445]),
 'mean_score_time': array([0.00098572, 0.00068712, 0.00031986]),
 'std_score_time': array([0.00086858, 0.00056323, 0.00063972]),
 'param_kernel': masked_array(data=['rbf', 'linear', 'linear'],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_C': masked_array(data=[10, 10, 100],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'kernel': 'rbf', 'C': 10},
  {'kernel': 'linear', 'C': 10},
  {'kernel': 'linear', 'C': 100}],
 'split0_test_score': array([0.96666667, 1.        , 1.        ]),
 'split1_test_score': array([1., 1., 1.]),
 'split2_test_score': array([0.96666667, 0.9       , 0.9       ]),
 'split3_test_score': array([0.96666667, 0.96666667, 0.93333333]),
 'split4_test_score': array([1., 1., 1.]),
 'mean_test_score': array([0.98      , 0.97333333, 0.9

In [24]:
# Show results: load the raw `cv_results_` dict into a DataFrame.
# Only the last expression of a cell is rendered, so the bare
# `gs.cv_results_` and `df_gs` lines that used to sit in the middle of this
# cell displayed nothing and have been dropped.
df_gs = pd.DataFrame(gs.cv_results_)

# Keep just the tuned parameters and the mean cross-validated test score.
df_gs[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,30,rbf,0.96
5,30,linear,0.96
6,100,rbf,0.96
7,100,linear,0.966667


In [25]:
print(gs.best_params_)
print(gs.best_score_)

{'C': 1, 'kernel': 'rbf'}
0.9800000000000001


# Randomized Search (RandomizedSearchCV)

In [26]:
# Randomized search over a C/kernel grid: instead of evaluating every
# combination, only `n_iter` randomly sampled ones are cross-validated.
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'C': [1, 10, 30, 100],
        'kernel': ['rbf', 'linear'],
    },
    n_iter=3,
    cv=5,
    return_train_score=False,
    # Pin which combinations get sampled; without a seed the table below
    # changes on every run.
    random_state=42,
)
rs.fit(iris.data, iris.target)

# Show results. Only the last expression of a cell is rendered, so the bare
# `rs.cv_results_` and `df_rs` lines that used to sit mid-cell were no-ops
# and have been dropped.
df_rs = pd.DataFrame(rs.cv_results_)
df_rs[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,100,linear,0.966667
2,30,rbf,0.96


# How about different models

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


# Search space for the model comparison. Each entry maps a display name to:
#   'model' -- the estimator instance handed to GridSearchCV
#   'param' -- the parameter grid to search for that estimator
model_param_all = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'param': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear'],
        },
    },

    'random_forest': {
        # Seeded: an unseeded forest gives different scores (and possibly a
        # different best n_estimators) on every run, which makes the
        # comparison table irreproducible.
        'model': RandomForestClassifier(random_state=42),
        'param': {
            'n_estimators': [1, 5, 10],
        },
    },

    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear'),
        'param': {
            'C': [1, 5, 10],
        },
    },
}

In [28]:
# Run one 5-fold grid search per candidate model and record each winner's
# best score and best parameter combination.
scores = []
for model_name, spec in model_param_all.items():
    multigrid = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['param'],
        cv=5,
        return_train_score=False,
    ).fit(iris.data, iris.target)  # fit() returns the fitted search object

    scores.append(dict(
        model=model_name,
        best_score=multigrid.best_score_,
        best_params=multigrid.best_params_,
    ))

# One row per model, with the columns in a fixed order.
df_multi_grid = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df_multi_grid

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.966667,{'n_estimators': 10}
2,logistic_regression,0.966667,{'C': 5}
