### Finding the best model and tuning hyperparameters using GridSearchCV
For the iris flower dataset in the sklearn library, we are going to find the best model and the best hyperparameters using GridSearchCV

In [15]:
from sklearn import svm, datasets
# Load the bundled iris dataset; later cells read iris.data, iris.target,
# iris.feature_names and iris.target_names from this object.
iris = datasets.load_iris()

In [16]:
import pandas as pd

# Tabular view of the iris measurements, one column per feature.
# The species name for every row is looked up in a single vectorized step by
# indexing iris.target_names with the integer class codes in iris.target,
# rather than storing the codes first and mapping them row by row with apply().
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    flower=iris.target_names[iris.target]
)
df[5:10]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


### Approach 1: Use train_test_split and manually tune parameters by trial and error 

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. random_state pins the shuffle so the
# split (and any score computed from it) is identical on every
# Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42
)

In [5]:
# RBF-kernel SVM with a hand-picked C. fit() returns the estimator itself, so
# construction and training are chained; the last line reports test accuracy.
model = svm.SVC(C=30, kernel='rbf', gamma='auto').fit(X_train, y_train)
model.score(X_test, y_test)

1.0

### Approach 2: Use K Fold Cross validation
Manually try supplying models with different parameters to the cross_val_score function with 5-fold cross-validation

In [6]:
from sklearn.model_selection import cross_val_score

In [7]:
# Per-fold accuracy of a linear-kernel SVM with C=10 under 5-fold CV.
svc_linear_c10 = svm.SVC(kernel='linear', C=10, gamma='auto')
cross_val_score(svc_linear_c10, iris.data, iris.target, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [8]:
# Per-fold accuracy of an RBF-kernel SVM with C=10 under 5-fold CV.
svc_rbf_c10 = svm.SVC(kernel='rbf', C=10, gamma='auto')
cross_val_score(svc_rbf_c10, iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [9]:
# Per-fold accuracy of an RBF-kernel SVM with C=20 under 5-fold CV.
svc_rbf_c20 = svm.SVC(kernel='rbf', C=20, gamma='auto')
cross_val_score(svc_rbf_c20, iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [11]:
import numpy as np

# Cross-validate every kernel/C combination (kernel varies slowest) and keep
# the mean of the five fold scores under a "<kernel>_<C>" key.
kernals = ['linear', 'rbf']
C = [1, 10, 20]
avg_score = {
    f'{kernel}_{c_value}': np.average(
        cross_val_score(
            svm.SVC(kernel=kernel, C=c_value, gamma='auto'),
            iris.data,
            iris.target,
            cv=5,
        )
    )
    for kernel in kernals
    for c_value in C
}

avg_score

{'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666,
 'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668}

### Grid Search CV 

In [12]:
from sklearn.model_selection import GridSearchCV

# Exhaustively cross-validate every C/kernel pair in the grid with 5 folds,
# then show the raw results dictionary.
svc_param_grid = {
    'C': [1, 10, 20],
    'kernel': ['linear', 'rbf'],
}
clf = GridSearchCV(
    svm.SVC(gamma='auto'),
    svc_param_grid,
    cv=5,
    return_train_score=False,
)
clf.fit(iris.data, iris.target)
clf.cv_results_

{'mean_fit_time': array([0.00119996, 0.00119553, 0.00100479, 0.00080705, 0.0009984 ,
        0.00040059]),
 'std_fit_time': array([3.97951620e-04, 7.53440171e-04, 1.02076570e-05, 4.03821647e-04,
        2.01942648e-05, 4.90622625e-04]),
 'mean_score_time': array([0.00060639, 0.00080137, 0.00019922, 0.00040021, 0.        ,
        0.00060048]),
 'std_score_time': array([0.00049538, 0.00040072, 0.00039845, 0.00049016, 0.        ,
        0.00049029]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'},
  {'C':

In [13]:
# Load the grid-search results dict into a DataFrame so it can be read as a
# table instead of a raw dictionary of arrays.
# NOTE: the name `cv_resuls` (sic) is read by a later cell, so it is kept as-is.
cv_resuls=pd.DataFrame(clf.cv_results_)
cv_resuls

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0012,0.000398,0.000606,0.000495,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001196,0.000753,0.000801,0.000401,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.001005,1e-05,0.000199,0.000398,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
3,0.000807,0.000404,0.0004,0.00049,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
4,0.000998,2e-05,0.0,0.0,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
5,0.000401,0.000491,0.0006,0.00049,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5


In [14]:
# Narrow the results table to the tuned parameters and their mean CV score.
summary_columns = ['param_C', 'param_kernel', 'mean_test_score']
cv_resuls[summary_columns]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.98
1,1,rbf,0.98
2,10,linear,0.973333
3,10,rbf,0.98
4,20,linear,0.966667
5,20,rbf,0.966667


### Randomized Search CV

In [18]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over the C/kernel space: only n_iter=2 of the six possible
# combinations are sampled and cross-validated.
# random_state fixes which combinations get drawn, so the resulting table is
# reproducible on a fresh kernel rather than changing on every run.
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['linear', 'rbf'],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    random_state=42,
)
rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,linear,0.973333
1,1,linear,0.98


### Choosing best Model 

In [19]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Candidate estimators and the hyperparameter grid to search for each one.
# Each entry holds an unfitted 'model' and the 'params' grid handed to the
# search; the outer key is the label used when results are reported.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        # Forests are grown from random bootstrap samples and feature subsets;
        # a fixed seed keeps the best score/params stable across runs.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        # multi_class is left at its default ('auto'), the same value that was
        # previously passed explicitly. The argument is deprecated since
        # scikit-learn 1.5, so spelling it out only triggers a FutureWarning.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
        },
    },
}

In [22]:
# Grid-search every candidate on the iris data and record its best
# cross-validated score together with the parameters that produced it.
scores = []
for name, spec in model_params.items():
    clf = GridSearchCV(spec['model'], spec['params'], cv=5, return_train_score=False)
    clf.fit(iris.data, iris.target)
    scores.append(dict(
        model=name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

In [23]:
# Show the collected per-model results (model, best_score, best_params) as a table.
pd.DataFrame(scores)

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.966667,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}


# Exercise

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.model_selection import GridSearchCV
import pandas as pd

In [25]:
from sklearn.datasets import load_digits
# Load the digits dataset; the model search in this exercise fits on
# digits.data and digits.target.
digits = load_digits()

In [26]:
# List the attributes available on the loaded dataset object.
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

### Choosing best model

In [27]:
# Candidate estimators for the digits exercise and the hyperparameter grid to
# search for each. Each entry holds an unfitted 'model' and its 'params' grid;
# an empty grid means the estimator is cross-validated with its defaults only.
model_params = {
    'logistic_regression': {
        # multi_class is left at its default ('auto'), the same value that was
        # previously passed explicitly. The argument is deprecated since
        # scikit-learn 1.5, so spelling it out only triggers a FutureWarning.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
        },
    },
    'DecisionTreeClassifier': {
        # Tree construction is stochastic (random splitter, and feature
        # permutation when choosing splits); seed it so scores are repeatable.
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
        },
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['linear', 'poly', 'rbf'],
            'gamma': ['auto', 'scale'],
        },
    },
    'RandomForestClassifier': {
        # Forests are grown from random bootstrap samples and feature subsets;
        # a fixed seed keeps the best score/params stable across runs.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [10, 100],
            'criterion': ['gini', 'entropy'],
        },
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {},
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'params': {},
    },
}


In [28]:
# Grid-search every candidate on the digits data, record each one's best
# cross-validated score and winning parameters, then show them as a table.
scores = []
for name, spec in model_params.items():
    clf = GridSearchCV(spec['model'], spec['params'], cv=5, return_train_score=False)
    clf.fit(digits.data, digits.target)
    scores.append(dict(
        model=name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))
pd.DataFrame(scores)

Unnamed: 0,model,best_score,best_params
0,logistic_regression,0.922114,{'C': 1}
1,DecisionTreeClassifier,0.816383,"{'criterion': 'entropy', 'splitter': 'best'}"
2,SVC,0.97385,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}"
3,RandomForestClassifier,0.941589,"{'criterion': 'entropy', 'n_estimators': 100}"
4,GaussianNB,0.806928,{}
5,MultinomialNB,0.87035,{}
