In [None]:
# While building machine learning models, we randomly split the dataset into training and test sets where a maximum percentage of
# the data is taken into the training set. Though the test dataset is small, there is still some chance that we left some
# important data in there that might have improved the model. And there is a problem of high variance in the training set. 
# To solve this, problems we use the idea of K-fold cross-validation.

In [None]:
# Cross-validation is a technique that is used to evaluate machine learning models by resampling the training data for improving
# performance.

In [None]:
# In K-fold Cross-Validation, the training set is randomly split into K(usually between 5 to 10) subsets known as folds. 
# Where K-1 folds are used to train the model and the other fold is used to test the model. This technique improves the high 
# variance problem in a dataset as we are randomly selecting the training and test folds.

#### The steps required to perform K-fold cross-validation are given below-

In [None]:
 # Step 1: Split the entire data randomly in k folds(usually between 5 to 10). The higher number of splits leads to less biased
 #   model.

# Step 2: Then fit the model with k-1 folds and test it with the remaining Kth fold. Record the performance metric.

# Step 3: Repeat step 2 until every k-fold serves as the test set.

# Step 4: Take the average of all the recorded scores. This will serve as the final performance metric of your model.

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_digits

  return f(*args, **kwds)


In [3]:
digit = load_digits()

In [7]:
digit

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'target_names': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 'images': array([[[ 0.,  0.,  5., ...,  1.,  0.,  0.],
         [ 0.,  0., 13., ..., 15.,  5.,  0.],
         [ 0.,  3., 15., ..., 11.,  8.,  0.],
         ...,
         [ 0.,  4., 11., ..., 12.,  7.,  0.],
         [ 0.,  2., 14., ..., 12.,  0.,  0.],
         [ 0.,  0.,  6., ...,  0.,  0.,  0.]],
 
        [[ 0.,  0.,  0., ...,  5.,  0.,  0.],
         [ 0.,  0.,  0., ...,  9.,  0.,  0.],
         [ 0.,  0.,  3., ...,  6.,  0.,  0.],
         ...,
         [ 0.,  0.,  1., ...,  6.,  0.,  0.],
         [ 0.,  0.,  1., ...,  6.,  0.,  0.],
         [ 0.,  0.,  0., ..., 10.,  0.,  0.]],
 
        [[ 0

In [8]:
from sklearn.model_selection import train_test_split


In [12]:
x_train,x_test,y_train,y_test=train_test_split(digit.data,digit.target,test_size=0.30)


### Logistic Regression

In [13]:
model = LogisticRegression(solver='liblinear',multi_class='ovr')
model.fit(x_train,y_train)
model.score(x_test,y_test)*100

95.92592592592592

### support vector machine

In [16]:
SVM = SVC(kernel='linear')
SVM.fit(x_train,y_train)
SVM.score(x_test,y_test)*100


97.5925925925926

### Random forest classifire

In [17]:
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
rf.score(x_test,y_test)*100



93.88888888888889

### K-Fold

In [18]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=4)
kf

KFold(n_splits=4, random_state=None, shuffle=False)

In [20]:
for train_index,test_index in kf.split([0,1,2,3,4,5,6,8,9]):
    print(train_index,test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 5 6 7 8] [3 4]
[0 1 2 3 4 7 8] [5 6]
[0 1 2 3 4 5 6] [7 8]


### Lest create model function

In [21]:
def get_model(model,x_train,x_test,y_train,y_test):
    model.fit(x_train,y_train)
    return model.score(x_test,y_test)*100

In [22]:
print('Accuracy score of logostic regreesion: ',get_model(LogisticRegression(),x_train,x_test,y_train,y_test))
print('Accureacy score of SVM: ',get_model(SVC(),x_train,x_test,y_train,y_test))
print('Accuracy score of Random forest: ',get_model(RandomForestClassifier(),x_train,x_test,y_train,y_test))




Accuracy score of logostic regreesion:  95.92592592592592




Accureacy score of SVM:  39.62962962962963
Accuracy score of Random forest:  93.88888888888889




### Now Apply the same method using KFold

In [23]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=4)

In [24]:
scoreof_Logistic=[]
scoreof_SVM=[]
scoreof_rf=[]


for train_index,test_index in kf.split(digit.data):
    X_train,X_test,y_train,y_test= digit.data[train_index],digit.data[test_index],digit.target[train_index],digit.target[test_index]
   
    scoreof_Logistic.append(get_model(LogisticRegression(),X_train,X_test,y_train,y_test))
    scoreof_SVM.append(get_model(SVC(),X_train,X_test,y_train,y_test))
    scoreof_rf.append(get_model(RandomForestClassifier(),X_train,X_test,y_train,y_test))



In [26]:
scoreof_Logistic

[92.66666666666666, 92.65033407572383, 95.7683741648107, 90.20044543429844]

In [27]:
scoreof_SVM

[44.666666666666664,
 39.643652561247215,
 43.207126948775056,
 51.224944320712694]

In [28]:
scoreof_rf

[90.0, 88.41870824053451, 93.98663697104676, 89.75501113585747]

### Magic of ML Sarts from here :-

In [29]:
from sklearn.model_selection import cross_val_score

In [35]:
cross_val_score(SVM,digit.data,digit.target,cv=4)*100

array([95.59471366, 94.23503326, 96.86800895, 92.80898876])

In [34]:
cross_val_score(model,digit.data,digit.target,cv=4)*100

array([93.6123348 , 92.01773836, 96.42058166, 90.33707865])

In [37]:
cross_val_score(rf,digit.data,digit.target,cv=4)*100

array([92.2907489 , 89.13525499, 95.30201342, 88.98876404])