# Machine Learning Tutorial Python 12 - K Fold Cross Validation

Tutorial made from 'Machine Learning Tutorial Python 12 - K Fold Cross Validation' (codebasics) by Marcus Mariano

Link -> https://www.youtube.com/watch?v=gJo0uNL-5Qw&t=423s

|       `Option 1`     |   `Option 2`  |  `Option 3`  |     
| :---:      |    :----:   |         :---:          |    
|Use all available data for training and test on same dataset|Split available dataset into training and test sets|K fold cross validation|    
|    wrong way   |  good way  |   very good way   |    
   

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_digits

# Load the digits dataset that ships with scikit-learn.
digits = load_digits()

## Dataset Digits

In [2]:
digits

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'target_names': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 'images': array([[[ 0.,  0.,  5., ...,  1.,  0.,  0.],
         [ 0.,  0., 13., ..., 15.,  5.,  0.],
         [ 0.,  3., 15., ..., 11.,  8.,  0.],
         ...,
         [ 0.,  4., 11., ..., 12.,  7.,  0.],
         [ 0.,  2., 14., ..., 12.,  0.,  0.],
         [ 0.,  0.,  6., ...,  0.,  0.,  0.]],
 
        [[ 0.,  0.,  0., ...,  5.,  0.,  0.],
         [ 0.,  0.,  0., ...,  9.,  0.,  0.],
         [ 0.,  0.,  3., ...,  6.,  0.,  0.],
         ...,
         [ 0.,  0.,  1., ...,  6.,  0.,  0.],
         [ 0.,  0.,  1., ...,  6.,  0.,  0.],
         [ 0.,  0.,  0., ..., 10.,  0.,  0.]],
 
        [[ 0

In [3]:
digits['data']

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [4]:
digits['target']

array([0, 1, 2, ..., 8, 9, 8])

In [5]:
len(digits['target'])

1797

## Option 2: split the dataset into training and test sets

### Comparing three machine learning algorithms to find the best accuracy


In [7]:
# No random_state is passed, so every run of this cell produces a different
# random train/test split (and therefore different scores in the cells below).
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the remaining 70% are used for training.
X_train, X_test, y_train, y_test = train_test_split(digits.data, 
                                                    digits.target, 
                                                    test_size=0.3)

#### Logistic Regression

In [10]:
# Logistic regression using the liblinear solver in one-vs-rest mode.
# Alternative kept for reference (the author recorded a score of 0.9722222222222222 for it):
# lr = LogisticRegression()
# NOTE(review): newer scikit-learn releases deprecate the multi_class argument --
# confirm against the installed version.
lr = LogisticRegression(solver='liblinear', multi_class='ovr')

# Fit on the training split, then report the score on the held-out split.
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.9537037037037037

#### SVM

In [11]:
# Support vector classifier with gamma='auto'.
# NOTE(review): the score below is far lower than for the other two models --
# presumably because gamma='auto' is a poor fit for the unscaled pixel features;
# gamma='scale' or feature scaling may help. TODO confirm.
svm = SVC(gamma='auto')

# Fit on the training split, then report the score on the held-out split.
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.3277777777777778

#### Random Forest

In [13]:
# Random forest with 100 trees. No random_state is set, so the score can vary
# slightly between fits even on the same split.
rf = RandomForestClassifier(n_estimators=100)

# Fit on the training split, then report the score on the held-out split.
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.975925925925926

## Option 3: K fold cross validation

### K fold cross validation

#### Basic example

In [14]:
# Build a 3-fold splitter and display its configuration.
from sklearn.model_selection import KFold

kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [15]:
# Split a toy dataset of 9 items into 3 folds. On each iteration one fold
# (3 items, ~33%) is the test set and the other two folds (6 items, ~67%) form
# the training set. split() yields index arrays, not the items themselves.
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


#### Function to get the score from different models

In [17]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Train ``model`` on the training data and return its score on the test data.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        Fitted in place by this call.
    X_train, y_train
        Samples and labels passed to ``model.fit``.
    X_test, y_test
        Samples and labels passed to ``model.score``.

    Returns
    -------
    Whatever ``model.score(X_test, y_test)`` returns.
    """
    # Fit first: the score must come from the model trained on this split.
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

In [18]:
# Refit `lr` on the current train/test split and show its test score.
get_score(lr, X_train, X_test, y_train, y_test)

0.9537037037037037

In [19]:
# Refit `svm` on the current train/test split and show its test score.
get_score(svm, X_train, X_test, y_train, y_test)

0.3277777777777778

In [20]:
# Refit `rf` on the current train/test split and show its test score. The forest
# was created without a random_state, so this can differ from the earlier fit.
get_score(rf, X_train, X_test, y_train, y_test)

0.9703703703703703

#### Use StratifiedKFold for our digits example

In [21]:
from sklearn.model_selection import StratifiedKFold

# Three stratified folds over the full digits dataset.
folds = StratifiedKFold(n_splits=3)

# One score per fold is collected for each model.
scores_logistic, scores_svm, scores_rf = [], [], []

for train_index, test_index in folds.split(digits.data, digits.target):
    # Slice out the features and labels belonging to this fold.
    X_train = digits.data[train_index]
    X_test = digits.data[test_index]
    y_train = digits.target[train_index]
    y_test = digits.target[test_index]

    # The lr / svm / rf estimators created earlier are refit on every fold.
    # Fresh instances could be passed instead, e.g.
    # LogisticRegression(solver='liblinear', multi_class='ovr'), SVC(gamma='auto')
    # or RandomForestClassifier(n_estimators=40).
    scores_logistic.append(get_score(lr, X_train, X_test, y_train, y_test))
    scores_svm.append(get_score(svm, X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(rf, X_train, X_test, y_train, y_test))


In [22]:
scores_logistic

[0.8953488372093024, 0.9499165275459098, 0.9093959731543624]

In [23]:
scores_svm

[0.39368770764119604, 0.41068447412353926, 0.4597315436241611]

In [24]:
scores_rf

[0.9335548172757475, 0.9582637729549248, 0.9312080536912751]

In [25]:
# Average the per-fold scores of each model: (logistic regression, SVM, random forest).
np.mean(scores_logistic), np.mean(scores_svm), np.mean(scores_rf) 

(0.9182204459698582, 0.4213679084629655, 0.9410088813073157)

### Use cross_val_score function
For classifiers, `cross_val_score` uses stratified k-fold by default when `cv` is an integer.

In [27]:
from sklearn.model_selection import cross_val_score

#### Logistic regression model performance using cross_val_score


In [53]:
# 3-fold cross-validation of a freshly constructed logistic regression model;
# the result holds one score per fold.
cross_val_score(LogisticRegression(solver='liblinear', 
                                   multi_class='ovr'),                 
                digits.data, 
                digits.target, 
                cv=3)

array([0.89534884, 0.94991653, 0.90939597])

In [29]:
# Same evaluation, passing the `lr` estimator defined earlier instead of a new instance.
cross_val_score(lr, 
                digits.data, 
                digits.target, 
                cv=3)

array([0.89534884, 0.94991653, 0.90939597])

#### SVM model performance using cross_val_score

In [50]:
# 3-fold cross-validation of a freshly constructed SVC(gamma='auto').
cross_val_score(SVC(gamma='auto'), 
                digits.data, 
                digits.target,
                cv=3)

array([0.39368771, 0.41068447, 0.45973154])

In [30]:
# Same evaluation, passing the `svm` estimator defined earlier instead of a new instance.
cross_val_score(svm, 
                digits.data, 
                digits.target,
                cv=3)

array([0.39368771, 0.41068447, 0.45973154])

#### Random Forest model performance using cross_val_score

In [31]:
# 3-fold cross-validation of a new random forest with 40 trees.
cross_val_score(RandomForestClassifier(n_estimators=40),
                digits.data, 
                digits.target,
                cv=3)

array([0.92358804, 0.95325543, 0.92449664])

In [32]:
# Same call with the `rf` estimator defined earlier. That one was created with
# n_estimators=100 (not 40), and no random_state is set, so the scores are not
# expected to match the previous cell.
cross_val_score(rf,
                digits.data, 
                digits.target,
                cv=3)

array([0.93521595, 0.95492487, 0.92449664])

### Parameter tuning using k fold cross validation

In [33]:
# 10-fold cross-validation of a random forest with 5 trees; show the mean fold score.
scores1 = cross_val_score(RandomForestClassifier(n_estimators=5),
                          digits.data, 
                          digits.target, 
                          cv=10)
np.average(scores1)

0.8831249465509396

In [34]:
# 10-fold cross-validation of a random forest with 20 trees; show the mean fold score.
scores2 = cross_val_score(RandomForestClassifier(n_estimators=20),
                          digits.data, 
                          digits.target, 
                          cv=10)
np.average(scores2)

0.9331662913338473

In [35]:
# 10-fold cross-validation of a random forest with 30 trees; show the mean fold score.
scores3 = cross_val_score(RandomForestClassifier(n_estimators=30),
                          digits.data, 
                          digits.target, 
                          cv=10)
np.average(scores3)

0.9421546229652138

In [36]:
# 10-fold cross-validation of a random forest with 60 trees; show the mean fold score.
scores4 = cross_val_score(RandomForestClassifier(n_estimators=60),
                          digits.data, 
                          digits.target, 
                          cv=10)
np.average(scores4)

0.9516527762190501