## Hyperparameter Tuning

In [1]:
from sklearn.datasets import load_digits
# Load the digits dataset bundled with scikit-learn.
digits = load_digits()
# Display the returned container; its repr lists entries such as 'data',
# 'target', 'frame' and 'feature_names'.
digits

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'frame': None,
 'feature_names': ['pixel_0_0',
  'pixel_0_1',
  'pixel_0_2',
  'pixel_0_3',
  'pixel_0_4',
  'pixel_0_5',
  'pixel_0_6',
  'pixel_0_7',
  'pixel_1_0',
  'pixel_1_1',
  'pixel_1_2',
  'pixel_1_3',
  'pixel_1_4',
  'pixel_1_5',
  'pixel_1_6',
  'pixel_1_7',
  'pixel_2_0',
  'pixel_2_1',
  'pixel_2_2',
  'pixel_2_3',
  'pixel_2_4',
  'pixel_2_5',
  'pixel_2_6',
  'pixel_2_7',
  'pixel_3_0',
  'pixel_3_1',
  'pixel_3_2',
  'pixel_3_3',
  'pixel_3_4',
  'pixel_3_5',
  'pixel_3_6',
  'pixel_3_7',
  'pixel_4_0',
  'pixel_4_1',
  'pixel_4_2',
  'pixel_4_3',
  'pixel_4_4',
  'pixel_4_5',
  'pixel_4_6',
  'pixel_4_7',
  'pixel_5_0',
  'pixel_5_1',
 

In [2]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
import pandas as pd
# Wrap the feature matrix in a DataFrame so every column carries its pixel name.
df = pd.DataFrame(digits.data, columns=digits.feature_names)
# Preview only the first rows rather than the whole frame.
df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [4]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing. A fixed random_state makes the
# shuffle deterministic, so every hold-out score computed from this split is
# reproducible after a kernel restart instead of changing on each run.
x_train, x_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.3, random_state=42
)

### Calculating scores manually (hold-out split and hand-written cross-validation loops)

#### SVC (with and without K-fold cross-validation)

In [5]:
# Fit an RBF-kernel SVC on the training split and score it on the held-out split.
# NOTE(review): the score recorded for this cell is far below the linear-kernel
# cross-validation scores shown further down; gamma='auto' on unscaled pixel
# values is a plausible cause — confirm (e.g. compare against gamma='scale').
model = SVC(kernel='rbf', gamma='auto', C=30)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.3888888888888889

In [6]:
import numpy as np
from sklearn.model_selection import cross_val_score
# Manual grid over kernel x C: store the mean 5-fold score of each combination
# under a "<kernel>_<C>" key.
kernels = ['rbf', 'linear']
C = [1, 10, 20, 30]
avg_scores = {}
for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(
            SVC(kernel=kval, C=cval, gamma='auto'),
            digits.data,
            digits.target,
            cv=5,
        )
        avg_scores[f'{kval}_{cval}'] = cv_scores.mean()
avg_scores

{'rbf_1': 0.448545341999381,
 'rbf_10': 0.47636645001547506,
 'rbf_20': 0.47636645001547506,
 'rbf_30': 0.47636645001547506,
 'linear_1': 0.9476973073351903,
 'linear_10': 0.9476973073351903,
 'linear_20': 0.9476973073351903,
 'linear_30': 0.9476973073351903}

#### RandomForestClassifier (with and without K-fold cross-validation)

In [7]:
# Train a 20-tree random forest on the training split (fit returns the fitted
# estimator), then score it on the held-out split.
model = RandomForestClassifier(n_estimators=20).fit(x_train, y_train)
model.score(x_test, y_test)

0.9685185185185186

In [8]:
# Mean 5-fold score for each forest size, keyed as "n_estimator_<n>".
n_estimators = [10, 20, 30]
avg_scores = {}
for n in n_estimators:
    cv_scores = cross_val_score(
        RandomForestClassifier(n_estimators=n),
        digits.data,
        digits.target,
        cv=5,
    )
    avg_scores[f'n_estimator_{n}'] = cv_scores.mean()
avg_scores

{'n_estimator_10': 0.9109811203961622,
 'n_estimator_20': 0.9271154441349427,
 'n_estimator_30': 0.9337991333952337}

#### LogisticRegression (with and without K-fold cross-validation)

In [9]:
import warnings
# Build and fit the classifier with all warnings silenced for the duration of
# the block; scoring happens after the previous warning filters are restored.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    model = LogisticRegression().fit(x_train, y_train)
model.score(x_test, y_test)

0.9481481481481482

In [10]:
# Manual grid over solver x C for logistic regression; only the cross-validation
# itself needs to run with warnings silenced.
solver = ['liblinear', 'saga']
C = [1, 10, 20, 30]
avg_scores = {}
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for s in solver:
        for cval in C:
            cv_scores = cross_val_score(
                LogisticRegression(solver=s, C=cval),
                digits.data,
                digits.target,
                cv=5,
            )
            avg_scores[f'{s}_{cval}'] = cv_scores.mean()
avg_scores

{'liblinear_1': 0.9221138966264315,
 'liblinear_10': 0.9182234602290311,
 'liblinear_20': 0.9182203652120086,
 'liblinear_30': 0.9165521510368306,
 'saga_1': 0.9198901268956978,
 'saga_10': 0.9182219127205199,
 'saga_20': 0.9187759207675642,
 'saga_30': 0.9193345713401422}

#### Naive Bayes GaussianNB (with and without K-fold cross-validation)

In [11]:
# Fit Gaussian naive Bayes with default settings and score it on the held-out split.
model = GaussianNB().fit(x_train, y_train)
model.score(x_test, y_test)

0.8518518518518519

In [12]:
# Mean 5-fold score for each var_smoothing value, keyed as "var_smoothing-><value>".
var_smoothing = [10, 0.1, 0.001, 1e-5, 1e-7, 1e-9]
avg_scores = {}
for v in var_smoothing:
    cv_scores = cross_val_score(
        GaussianNB(var_smoothing=v),
        digits.data,
        digits.target,
        cv=5,
    )
    avg_scores[f'var_smoothing->{v}'] = cv_scores.mean()
avg_scores

{'var_smoothing->10': 0.8608666047663263,
 'var_smoothing->0.1': 0.8870504487774683,
 'var_smoothing->0.001': 0.8631151346332405,
 'var_smoothing->1e-05': 0.8475410089755494,
 'var_smoothing->1e-07': 0.8325177963478799,
 'var_smoothing->1e-09': 0.8069281956050759}

#### Naive Bayes MultinomialNB (with and without K-fold cross-validation)

In [13]:
# Fit multinomial naive Bayes with default settings and score it on the held-out split.
model = MultinomialNB().fit(x_train, y_train)
model.score(x_test, y_test)

0.9018518518518519

In [14]:
# Mean 5-fold score for each smoothing strength, keyed as "alpha-><value>".
alpha = [1, 10, 20, 30]
avg_scores = {}
for a in alpha:
    cv_scores = cross_val_score(
        MultinomialNB(alpha=a),
        digits.data,
        digits.target,
        cv=5,
    )
    avg_scores[f'alpha->{a}'] = cv_scores.mean()
avg_scores

{'alpha->1': 0.8703497369235531,
 'alpha->10': 0.8742463633549985,
 'alpha->20': 0.8736892602909316,
 'alpha->30': 0.8770287836583102}

#### DecisionTreeClassifier (with and without K-fold cross-validation)

In [15]:
# Fit a decision tree with default settings and score it on the held-out split.
model = DecisionTreeClassifier().fit(x_train, y_train)
model.score(x_test, y_test)

0.8314814814814815

In [16]:
# Mean 5-fold score for every (max_depth, min_samples_split) pair.
# The result dict is initialised exactly once, before the search starts.
avg_scores={}
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    max_depths = [10,30,50,70,90]
    min_samples_split = [5,10]
    for md in max_depths:
        for ms in min_samples_split:
            cv_scores = cross_val_score(DecisionTreeClassifier(max_depth=md, min_samples_split=ms), digits.data, digits.target, cv=5)
            # Key encodes both hyperparameters so each combination is distinguishable.
            avg_scores['max_depths->'+str(md)+'_min_samples_split->'+str(ms)]=np.average(cv_scores)
avg_scores

{'max_depths->10_min_samples_split->5': 0.7796796657381615,
 'max_depths->10_min_samples_split->10': 0.7702259362426493,
 'max_depths->30_min_samples_split->5': 0.7841380377592077,
 'max_depths->30_min_samples_split->10': 0.7713494274218509,
 'max_depths->50_min_samples_split->5': 0.7874620860414732,
 'max_depths->50_min_samples_split->10': 0.7752305787681832,
 'max_depths->70_min_samples_split->5': 0.7880191891055401,
 'max_depths->70_min_samples_split->10': 0.7680099040544723,
 'max_depths->90_min_samples_split->5': 0.7757969668833179,
 'max_depths->90_min_samples_split->10': 0.7780037140204271}

### Using GridSearchCV to evaluate parameter combinations

#### SVC

In [17]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold search over C and kernel for an SVC with gamma='auto'.
clf = GridSearchCV(
    estimator=SVC(gamma='auto'),
    param_grid={
        'C': [1, 10, 20, 30],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
)
clf.fit(digits.data, digits.target)

# Tabulate the search results and show only the searched parameters and mean score.
df = pd.DataFrame(clf.cv_results_)
df[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.448545
1,1,linear,0.947697
2,10,rbf,0.476366
3,10,linear,0.947697
4,20,rbf,0.476366
5,20,linear,0.947697
6,30,rbf,0.476366
7,30,linear,0.947697


#### RandomForestClassifier

In [18]:
# 5-fold grid search over the number of trees in a random forest.
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid={'n_estimators': [10, 20, 30]},
    cv=5,
    return_train_score=False,
)
clf.fit(digits.data, digits.target)

# Tabulate the search results and show only the searched parameter and mean score.
df = pd.DataFrame(clf.cv_results_)
df[['param_n_estimators', 'mean_test_score']]

Unnamed: 0,param_n_estimators,mean_test_score
0,10,0.90598
1,20,0.926561
2,30,0.931572


#### LogisticRegression

In [19]:
# 5-fold grid search over solver and C for logistic regression, run with
# warnings silenced; the summary table is displayed after the block.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clf = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid={
            'solver': ['liblinear', 'saga'],
            'C': [1, 10, 20, 30],
        },
        cv=5,
        return_train_score=False,
    )
    clf.fit(digits.data, digits.target)
    df = pd.DataFrame(clf.cv_results_)
df[['param_solver', 'param_C', 'mean_test_score']]

Unnamed: 0,param_solver,param_C,mean_test_score
0,liblinear,1,0.922114
1,saga,1,0.918777
2,liblinear,10,0.918223
3,saga,10,0.91822
4,liblinear,20,0.91822
5,saga,20,0.917665
6,liblinear,30,0.916552
7,saga,30,0.918777


#### GaussianNB

In [20]:
# 5-fold grid search over var_smoothing for Gaussian naive Bayes.
clf = GridSearchCV(
    estimator=GaussianNB(),
    param_grid={'var_smoothing': [10, 0.1, 0.001, 1e-5, 1e-7, 1e-9]},
    cv=5,
    return_train_score=False,
)
clf.fit(digits.data, digits.target)

# Tabulate the search results and show only the searched parameter and mean score.
df = pd.DataFrame(clf.cv_results_)
df[['param_var_smoothing', 'mean_test_score']]

Unnamed: 0,param_var_smoothing,mean_test_score
0,10.0,0.860867
1,0.1,0.88705
2,0.001,0.863115
3,1e-05,0.847541
4,0.0,0.832518
5,0.0,0.806928


#### MultinomialNB

In [21]:
# 5-fold grid search over alpha for multinomial naive Bayes.
clf = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid={'alpha': [1, 10, 20, 30]},
    cv=5,
    return_train_score=False,
)
clf.fit(digits.data, digits.target)

# Tabulate the search results and show only the searched parameter and mean score.
df = pd.DataFrame(clf.cv_results_)
df[['param_alpha', 'mean_test_score']]

Unnamed: 0,param_alpha,mean_test_score
0,1,0.87035
1,10,0.874246
2,20,0.873689
3,30,0.877029


#### DecisionTreeClassifier

In [22]:
# 5-fold grid search over max_depth and min_samples_split for a decision tree,
# run with warnings silenced; the summary table is displayed after the block.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clf = GridSearchCV(
        estimator=DecisionTreeClassifier(),
        param_grid={
            'max_depth': [10, 30, 50, 70, 90],
            'min_samples_split': [5, 10],
        },
        cv=5,
        return_train_score=False,
    )
    clf.fit(digits.data, digits.target)
    df = pd.DataFrame(clf.cv_results_)
df[['param_max_depth', 'param_min_samples_split', 'mean_test_score']]

Unnamed: 0,param_max_depth,param_min_samples_split,mean_test_score
0,10,5,0.776905
1,10,10,0.773565
2,30,5,0.776911
3,30,10,0.774121
4,50,5,0.77968
5,50,10,0.77357
6,70,5,0.781357
7,70,10,0.770783
8,90,5,0.785238
9,90,10,0.771342


##### Using RandomizedSearchCV for DecisionTreeClassifier (the other models work the same way)

In [23]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized search: evaluate only n_iter=2 sampled combinations from the grid
# instead of all of them, each with 5-fold cross-validation.
# The fit runs inside the warnings context because the search work (and any
# warnings it raises) happens in fit, not when the search object is constructed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    rs = RandomizedSearchCV(
        DecisionTreeClassifier(),
        {
            'max_depth' : [10,30,50,70,90],
            'min_samples_split' : [5,10]
        },
        cv=5,
        return_train_score=False,
        n_iter=2
    )
    rs.fit(digits.data, digits.target)
# Show the sampled parameter combinations alongside their mean score.
pd.DataFrame(rs.cv_results_)[['param_max_depth','param_min_samples_split', 'mean_test_score']]

Unnamed: 0,param_max_depth,param_min_samples_split,mean_test_score
0,50,5,0.774127
1,90,10,0.773007


# Choosing the best model for classification

In [24]:
# Search space for the model comparison: one entry per candidate, holding the
# estimator instance ('model') and the grid of hyperparameters to try ('params').
model_params = {
    'SVC':{
        'model':SVC(gamma='auto'),
        'params':{
            'C':[1,10,20,30],
            'kernel':['rbf', 'linear']
        }
    },
    'RandomForestClassifier':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators':[10,20,30]
        }
    },
    'LogisticRegression':{
        # No multi_class argument: 'auto' is already the default behaviour, and the
        # parameter is deprecated in recent scikit-learn releases, so passing it
        # only triggers a warning (and will eventually fail).
        'model':LogisticRegression(),
        'params':{
            'solver':['liblinear', 'saga'],
            'C':[1,10,20,30]
        }
    },
    'Naive Bayes GaussianNB':{
        'model':GaussianNB(),
        'params':{
            'var_smoothing':[10,0.1,0.001,1e-5,1e-7,1e-9]
        }
    },
    'Naive Bayes MultinomialNB':{
        'model':MultinomialNB(),
        'params':{
            'alpha':[1,10,20,30]
        }
    },
    'DecisionTreeClassifier':{
        'model':DecisionTreeClassifier(),
        'params':{
            'max_depth' : [10,30,50,70,90],
            'min_samples_split' : [5,10]
        }
    }
}

In [25]:
# Run a 5-fold grid search for every candidate in model_params and collect one
# summary record (name, best mean score, best parameters) per model.
scores = []
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for model_name, mp in model_params.items():
        clf = GridSearchCV(
            estimator=mp['model'],
            param_grid=mp['params'],
            cv=5,
            return_train_score=False,
        )
        clf.fit(digits.data, digits.target)
        scores.append(
            {
                'model': model_name,
                'best score': clf.best_score_,
                'best parameters': clf.best_params_,
            }
        )
# Build the comparison table once, from the full list of records.
pd.DataFrame(scores)

Unnamed: 0,model,best score,best parameters
0,SVC,0.947697,"{'C': 1, 'kernel': 'linear'}"
1,RandomForestClassifier,0.935475,{'n_estimators': 30}
2,LogisticRegression,0.922114,"{'C': 1, 'solver': 'liblinear'}"
3,Naive Bayes GaussianNB,0.88705,{'var_smoothing': 0.1}
4,Naive Bayes MultinomialNB,0.877029,{'alpha': 30}
5,DecisionTreeClassifier,0.785799,"{'max_depth': 90, 'min_samples_split': 5}"
