# **Machine Learning: K-Fold Cross-Validation**
-------
-------

### Loading the Dataset

In [1]:
from sklearn.datasets import load_digits 


In [2]:
# Load the digits dataset; the returned object exposes the feature matrix as
# `.data` and the labels as `.target`, both of which are used below.
digits = load_digits()

### Data Preparation

In [3]:
# Features and labels, pulled out of the dataset object in a single step.
X, y = digits.data, digits.target

# Show the feature matrix (NumPy elides the middle of large arrays).
X

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [4]:
# Display the target array that pairs with X (NumPy elides the middle of it).
y

array([0, 1, 2, ..., 8, 9, 8])

### Splitting the Dataset

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 20% of the samples as a test set.
# random_state pins the shuffle so the split -- and every hold-out score
# computed from it -- is identical after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Model Creation 
- ### Applying **logistic** regression

In [7]:
from sklearn.linear_model import LogisticRegression

# Logistic regression scored on the single hold-out split.
# max_iter=3000 gives the lbfgs solver a generous iteration budget.
# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression(max_iter=3000, solver='lbfgs').fit(X_train, y_train)

print(lr.score(X_test, y_test))


0.9583333333333334


- ### Applying **SVM** algorithm

In [8]:
from sklearn.svm import SVC

# Support-vector classifier with all-default hyperparameters, trained on the
# training partition and scored on the held-out partition.
svm = SVC().fit(X_train, y_train)

print(svm.score(X_test, y_test))


0.9888888888888889


- ### Applying **Random forest** algorithm

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 40 trees, scored on the single hold-out split.
# random_state seeds the forest's internal randomness so the fitted model and
# its score are reproducible from run to run.
rf = RandomForestClassifier(n_estimators=40, random_state=42)

rf.fit(X_train, y_train)

print(rf.score(X_test, y_test))


0.9666666666666667


- ### Applying **K-fold cross-validation**

In [10]:
from sklearn.model_selection import KFold 

# Splitter that partitions the sample indices into 3 folds. No shuffle or seed
# is passed, so the library defaults apply.
kf = KFold(n_splits = 3) 

# Echo the splitter so its full configuration is visible in the cell output.
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [11]:
# Toy demonstration on nine items (1..9): print the training and test index
# arrays produced for each of the splitter's folds.
for train_index, test_index in kf.split(list(range(1, 10))):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


### **Logistic regression** model performance using **cross_val_score**

In [12]:
from sklearn.model_selection import cross_val_score 


In [13]:
# Logistic regression evaluated with 3-fold cross-validation on the full
# dataset; cross_val_score returns one score per fold.
a = LogisticRegression(max_iter=5000, solver='lbfgs')
scores1 = cross_val_score(
    a,
    digits.data,
    digits.target,
    cv=3,
)

scores1

array([0.91986644, 0.94323873, 0.91652755])

### **SVM** model performance using **cross_val_score** 

In [14]:
# Default-configured SVC evaluated with 3-fold cross-validation on the full
# dataset; the result holds one score per fold.
b = SVC()
scores2 = cross_val_score(estimator=b, X=digits.data, y=digits.target, cv=3)

scores2

array([0.96494157, 0.97996661, 0.96494157])

###  **Random forest** model performance using **cross_val_score** 

In [15]:
# Random forest evaluated with 3-fold cross-validation on the full dataset.
# n_estimators is left at the library default here (the other random-forest
# cells in this notebook use 40 trees). random_state seeds the forest so the
# per-fold scores are reproducible instead of changing on every run.
c = RandomForestClassifier(random_state=42)

scores3 = cross_val_score(c, digits.data, digits.target, cv=3)

scores3

array([0.94991653, 0.95325543, 0.92988314])

## Checking average of all model scores

In [16]:
import numpy as np 

In [17]:
# Candidate models, compared on identical 3-fold cross-validation.
a = LogisticRegression(solver='lbfgs', max_iter=5000)
b = SVC()
# Seed the forest so its average score is reproducible across runs.
c = RandomForestClassifier(n_estimators=40, random_state=42)

# One array of per-fold scores per model, in the same order as (a, b, c).
scores1, scores2, scores3 = (
    cross_val_score(model, digits.data, digits.target, cv=3)
    for model in (a, b, c)
)

# Mean score per model, printed in the order: logistic regression, SVM,
# random forest.
for fold_scores in (scores1, scores2, scores3):
    print(np.average(fold_scores))


0.9265442404006677
0.9699499165275459
0.9393433500278241
