In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [5]:
from sklearn import datasets
# Load scikit-learn's bundled iris dataset. The cells below pass iris.data as
# the feature matrix and iris.target as the class labels.
iris = datasets.load_iris()

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


`cv` is the number of cross-validation folds; `cross_val_score` returns one score per fold.

In [25]:
# Baseline 1: logistic regression (liblinear solver, one-vs-rest), scored with
# 4-fold cross-validation. The cell displays one score per fold.
log_reg = LogisticRegression(solver='liblinear', multi_class='ovr')
cross_val_score(log_reg, iris.data, iris.target, cv=4)

array([1.        , 0.94871795, 0.86111111, 1.        ])

In [26]:
# Baseline 2: support vector classifier with gamma='auto', scored with
# 3-fold cross-validation. The cell displays one score per fold.
svm_model = SVC(gamma='auto')
cross_val_score(svm_model, iris.data, iris.target, cv=3)

array([0.98039216, 0.96078431, 0.97916667])

In [28]:
# Baseline 3: random forest with 40 trees, scored with 3-fold cross-validation.
# A random forest is a stochastic estimator, so random_state is pinned to make
# the displayed fold scores identical on every Restart & Run All.
forest = RandomForestClassifier(n_estimators=40, random_state=0)
cross_val_score(forest, iris.data, iris.target, cv=3)

array([0.98039216, 0.92156863, 1.        ])

Hyperparameter Tuning using GridSearchCV

In [29]:
from sklearn.model_selection import GridSearchCV

In [39]:
# Exhaustive grid search over the SVM hyperparameters, each candidate scored
# with 5-fold cross-validation.
#   - C is the trade-off parameter; it is inversely proportional to the
#     regularization strength.
#   - kernel is a method of using a linear classifier to solve a non-linear
#     problem.
clf = GridSearchCV(
    SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
)
clf.fit(iris.data, iris.target)

# One row per (C, kernel) candidate. Only the last expression of a cell is
# rendered, so the raw cv_results_ dict is wrapped in a DataFrame and shown.
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000804,0.0004014853,0.000302,0.000247,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.000703,0.0002458875,0.000302,0.000246,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.000502,7.893059e-07,0.000301,0.000246,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.000302,0.000246294,0.000201,0.000246,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.000502,2.431402e-07,0.000301,0.000246,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.000302,0.0002463332,0.000301,0.000246,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,5


In [41]:
# Parameter combination selected by the grid search held in `clf`.
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [42]:
# Mean cross-validated score of the selected parameter combination.
clf.best_score_

0.98

Randomized Search CV
The problem with GridSearchCV is that it tries every combination of parameters, so the computation cost can be high. This is where RandomizedSearchCV comes into play: it evaluates only a random sample of the combinations (`n_iter` of them), which reduces the cost of computation.

In [44]:
from sklearn.model_selection import RandomizedSearchCV

In [62]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized search: instead of trying all 6 (C, kernel) combinations, sample
# n_iter=3 of them and score each with 5-fold cross-validation.
# random_state is pinned so the same candidates are sampled on every run;
# without it the table below changes each time the cell is executed.
rs = RandomizedSearchCV(
    SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
    n_iter=3,
    random_state=0,
)
rs.fit(iris.data, iris.target)

# Keep the full results frame available, but display only the columns needed
# to compare the sampled candidates.
df1 = pd.DataFrame(rs.cv_results_)
df1[['param_C' , 'param_kernel' , 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,linear,0.973333
1,20,linear,0.966667
2,1,linear,0.98


Now for Different Models and Different Parameters

In [66]:
# Search space for the model comparison: one entry per candidate model, each
# holding the estimator to tune ('model') and the hyperparameter grid to
# search ('params').
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['linear', 'rbf'],
        },
    },
    'random_forest': {
        # random_state is pinned so the forest's scores, and therefore the
        # model ranking, are the same on every run.
        'model': RandomForestClassifier(random_state=0),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'C': [1, 5, 10],
        },
    },
}



In [69]:
# Run a 5-fold grid search for every entry of model_params and record, per
# model, the best mean cross-validated score and the parameters that produced it.
# NOTE: this cell rebinds `clf` and `df`, which earlier cells also assign.
scores = []
for model_name, mp in model_params.items():
    estimator = mp['model']
    param_grid = mp['params']

    clf = GridSearchCV(estimator, param_grid, cv=5, return_train_score=False)
    clf.fit(iris.data, iris.target)

    best = {
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    }
    scores.append(best)

# One row per model, columns in a fixed order.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.966667,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}


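From the table above, SVM is the best-performing model (mean cross-validated score of 0.98), and the best parameters reported for it are C=1 with a linear kernel. Note that in the earlier grid search the rbf kernel reached the same 0.98 at C=1, so the two kernels are tied; which one is reported depends on the order in which the candidates are listed.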