In [7]:
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter would also hide any solver/convergence
# warnings raised by the estimators fitted below — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Load scikit-learn's bundled digits dataset; `digits.data` holds the features
# and `digits.target` the labels used throughout this notebook.
digits = load_digits()

In [65]:
# Inspect which attributes the loaded dataset object exposes.
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

# 1. Train Test Split

In [3]:
from sklearn.model_selection import train_test_split

In [41]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.3,
    random_state=10,
)

In [42]:
# Fit logistic regression on the training split (fit() returns the estimator itself),
# then report its mean accuracy on the held-out split.
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.9592592592592593

In [43]:
# Support vector classifier with an RBF kernel.
# Accepted kernel values: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}
svm = SVC(kernel='rbf').fit(X_train, y_train)
svm.score(X_test, y_test)

0.9833333333333333

In [44]:
# Random forest built from 30 trees.
# NOTE(review): no random_state is passed, so this score can vary between runs.
rf = RandomForestClassifier(n_estimators=30).fit(X_train, y_train)
rf.score(X_test, y_test)

0.9611111111111111

# 2. K-fold Cross Validation 

In [67]:
from sklearn.model_selection import KFold, StratifiedKFold

In [163]:
# 10-fold splitter used by the manual scoring loop over the digits data below.
# Only n_splits is set; shuffle and random_state keep their defaults.
kf=KFold(n_splits=10)
kf

KFold(n_splits=10, random_state=None, shuffle=False)

In [138]:
# 3-fold stratified splitter; its split() is called with both the features and
# the labels in the stratified scoring loop further down.
folds=StratifiedKFold(n_splits=3)
folds

StratifiedKFold(n_splits=3, random_state=None, shuffle=False)

In [62]:
# Tiny 9-element toy sequence used only to illustrate which indices a splitter yields.
data=[1,2,3,4,5,6,7,8,9]

In [63]:
# Illustrate how KFold partitions indices, using a dedicated 3-fold splitter.
# The shared `kf` is configured with n_splits=10, which is more folds than the
# 9 toy samples and makes split() raise ValueError on a fresh top-to-bottom run.
for train_index, test_index in KFold(n_splits=3).split(data):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [164]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and return its score on the test split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        The estimator to train; it is fitted in place.
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels passed to ``score``.

    Returns
    -------
    Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

In [165]:
# Per-fold accuracies for each classifier, appended in fold order.
# These names held the fitted models in section 1; from here on they are score lists.
lr, svm, rf = [], [], []

for train_index, test_index in kf.split(digits.data):
    # Slice features and labels with this fold's index arrays.
    X_train, X_test = digits.data[train_index], digits.data[test_index]
    y_train, y_test = digits.target[train_index], digits.target[test_index]
    # A fresh, unfitted estimator is created for every fold.
    lr.append(get_score(LogisticRegression(), X_train, X_test, y_train, y_test))
    svm.append(get_score(SVC(), X_train, X_test, y_train, y_test))
    rf.append(get_score(RandomForestClassifier(), X_train, X_test, y_train, y_test))

In [166]:
lr  # LogisticRegression accuracy on each fold of the KFold loop (one entry per fold)

[0.9055555555555556,
 0.9777777777777777,
 0.8777777777777778,
 0.9555555555555556,
 0.9444444444444444,
 0.9666666666666667,
 0.9555555555555556,
 0.9329608938547486,
 0.8770949720670391,
 0.9217877094972067]

In [167]:
# SVC accuracy on each fold of the KFold loop (one entry per fold).
svm

[0.9444444444444444,
 1.0,
 0.9333333333333333,
 0.9833333333333333,
 0.9833333333333333,
 0.9888888888888889,
 0.9888888888888889,
 0.994413407821229,
 0.9664804469273743,
 0.9497206703910615]

In [168]:
# RandomForestClassifier accuracy on each fold of the KFold loop (one entry per fold).
rf

[0.9,
 0.9777777777777777,
 0.9388888888888889,
 0.9555555555555556,
 0.9555555555555556,
 0.9722222222222222,
 0.9722222222222222,
 0.9608938547486033,
 0.9385474860335196,
 0.9441340782122905]

In [169]:
from sklearn.model_selection import cross_val_score

# cross_val_score wraps the split / fit / score loop written by hand above; cv=10 asks for 10 folds.
# NOTE: given an integer cv and a classifier, scikit-learn splits with StratifiedKFold rather than
# plain KFold, so these scores are not expected to match the manual KFold loop exactly.
cross_val_score(LogisticRegression(), digits.data, digits.target, cv=10)

array([0.90555556, 0.96111111, 0.87777778, 0.92777778, 0.94444444,
       0.96666667, 0.95      , 0.93854749, 0.87150838, 0.93854749])

In [146]:
# Per-fold accuracies for each classifier under the stratified splitter.
lr_stratified, svm_stratified, rf_stratified = [], [], []

# The stratified splitter is given the labels as well as the features.
for train_index, test_index in folds.split(digits.data, digits.target):
    X_train, X_test = digits.data[train_index], digits.data[test_index]
    y_train, y_test = digits.target[train_index], digits.target[test_index]
    # A fresh, unfitted estimator is created for every fold.
    lr_stratified.append(get_score(LogisticRegression(), X_train, X_test, y_train, y_test))
    svm_stratified.append(get_score(SVC(), X_train, X_test, y_train, y_test))
    rf_stratified.append(get_score(RandomForestClassifier(), X_train, X_test, y_train, y_test))

In [175]:
# LogisticRegression accuracy on each fold of the StratifiedKFold loop.
lr_stratified

[0.9215358931552587, 0.9415692821368948, 0.9165275459098498]

In [148]:
# SVC accuracy on each fold of the StratifiedKFold loop.
svm_stratified

[0.9649415692821369, 0.9799666110183639, 0.9649415692821369]

In [150]:
# RandomForestClassifier accuracy on each fold of the StratifiedKFold loop.
rf_stratified

[0.9465776293823038, 0.9465776293823038, 0.9248747913188647]

## Method 2: `cross_val_score` with an explicit `KFold` splitter

In [172]:
# Short aliases for the feature matrix and label vector used by the cross_val_score calls below.
X, y = digits.data, digits.target

In [176]:
# Shuffle the rows before cutting them into 5 folds.
# random_state only takes effect when shuffle=True; passing a seed together with
# shuffle=False makes KFold raise a ValueError. A fixed seed (the same value used
# for train_test_split above) keeps the fold assignment, and therefore the
# scores, repeatable across runs instead of changing on every execution.
kfold = KFold(n_splits=5, shuffle=True, random_state=10)
cv_scores = cross_val_score(estimator=LogisticRegression(), X=X, y=y, cv=kfold)

In [178]:
# Summarise the cross-validation run: the raw per-fold scores, then their mean and spread.
print(f'CV Scores          :{cv_scores}')
print(f'Mean Accuracy      : {cv_scores.mean()}')
print(f'Standard Deviation : {cv_scores.std()}')

CV Scores          :[0.975      0.96944444 0.97493036 0.96935933 0.94428969]
Mean Accuracy      : 0.9666047663262148
Standard Deviation : 0.011431617258855572


# 3. Leave One Out Cross Validation (LOOCV)

In [179]:
from sklearn.model_selection import LeaveOneOut

In [180]:
# Leave-one-out: every sample is the test set exactly once, so the model is
# refitted once per sample (slow) and each score comes from a single prediction.
loocv = LeaveOneOut()
# Rebinds cv_scores, replacing the k-fold scores computed earlier.
cv_scores = cross_val_score(estimator=LogisticRegression(), X=X, y=y, cv=loocv)

# Same summary as for k-fold: raw scores, their mean, and their standard deviation.
print(f'CV Scores          :{cv_scores}')
print(f'Mean Accuracy      : {cv_scores.mean()}')
print(f'Standard Deviation : {cv_scores.std()}')

CV Scores          :[1. 1. 1. ... 1. 1. 1.]
Mean Accuracy      : 0.9649415692821369
Standard Deviation : 0.18392753234212622


## Standard deviation is high in LOOCV because each fold scores a single sample, so every per-fold accuracy is either 0 or 1