# finding the best model and hyper parameter tuning using GridSearchCV

In [1]:
import pandas as pd
import numpy as np

In [3]:
from sklearn import datasets
iris=datasets.load_iris()

In [6]:
df=pd.DataFrame(data=iris.data,columns=iris.feature_names)

In [7]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [11]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [17]:
df["flowers"]=iris.target

In [20]:
df["flowers"]=df["flowers"].apply(lambda x: iris.target_names[x])

In [21]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flowers
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


## Approach 1: Use train_test_split and manually tune parameters by trial and error

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(iris.data,iris.target,test_size=0.2)

In [31]:
from sklearn.svm import SVC
model=SVC(kernel='rbf',C=30,gamma='auto')
model.fit(X_train,y_train)

SVC(C=30, gamma='auto')

In [32]:
model.predict(X_test)

array([2, 2, 0, 2, 0, 0, 1, 0, 0, 0, 2, 1, 1, 2, 1, 0, 0, 0, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 0, 1, 1])

In [33]:
model.score(X_test,y_test)

0.9666666666666667

## Approach 2: Use K Fold Cross validation

In [34]:
from sklearn.model_selection import cross_val_score


In [36]:
cross_val_score(SVC(kernel="linear",gamma="auto"),iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [39]:
cross_val_score(SVC(kernel='rbf',C=10,gamma='auto'),iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [41]:
cross_val_score(SVC(kernel='rbf',C=20,gamma='auto'),iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

### Above approach is tiresome and very manual. We can use for loop as an alternative

In [54]:

kernels = ['rbf', 'linear']
C = [1,10,20]
avg_scores = {}
for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(SVC(kernel=kval,C=cval,gamma='auto'),iris.data, iris.target, cv=5)
        avg_scores[kval + '_' + str(cval)] = np.average(cv_scores)

avg_scores




{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

From above results we can say that rbf with C=1 or 10 or linear with C=1 will give best performance

# Approach 3: Use GridSearchCV

GridSearchCV does exactly same thing as for loop above but in a single line of code

In [60]:
from sklearn.model_selection import GridSearchCV
model=GridSearchCV(SVC(gamma="auto"),{'C': [1,10,20],'kernel': ['rbf','linear']},cv=5,return_train_score=False)
model.fit(iris.data,iris.target)
model.cv_results_

{'mean_fit_time': array([0.0020082 , 0.00160456, 0.00160055, 0.00079865, 0.        ,
        0.        ]),
 'std_fit_time': array([0.00160672, 0.00196518, 0.00196027, 0.00159731, 0.        ,
        0.        ]),
 'mean_score_time': array([0.00061746, 0.00080056, 0.        , 0.00160065, 0.00160041,
        0.00080018]),
 'std_score_time': array([0.00050421, 0.00160112, 0.        , 0.00196038, 0.00196009,
        0.00160036]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


In [62]:
df = pd.DataFrame(model.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002008,0.001607,0.000617,0.000504,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001605,0.001965,0.000801,0.001601,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.001601,0.00196,0.0,0.0,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.000799,0.001597,0.001601,0.00196,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.0,0.0,0.0016,0.00196,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.0,0.0,0.0008,0.0016,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [63]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [65]:
model.best_params_

{'C': 1, 'kernel': 'rbf'}

In [67]:
model.best_score_

0.9800000000000001

In [68]:
dir(model)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_

###### Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation



In [69]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(SVC(gamma='auto'), {'C': [1,10,20],'kernel': ['rbf','linear']}, cv=5, return_train_score=False, n_iter=2)

rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,rbf,0.966667
1,10,linear,0.973333


## How about different models with different hyperparameters?

In [75]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

model_params = {
    'svm': {
        'algo': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'algo': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'algo': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [77]:
scores = []

for model_name, mp in model_params.items():
    model =  GridSearchCV(mp['algo'], mp['params'], cv=5, return_train_score=False)
    model.fit(iris.data, iris.target)
    scores.append({
        'algo': model_name,
        'best_score': model.best_score_,
        'best_params': model.best_params_
    })
    
df = pd.DataFrame(scores,columns=['algo','best_score','best_params'])

In [78]:
df

Unnamed: 0,algo,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.96,{'n_estimators': 10}
2,logistic_regression,0.966667,{'C': 5}


## Based on above, I can conclude that SVM with C=1 and kernel='rbf' is the best model for solving my problem of iris flower classification