# Advanced Cross Validation 

Performing model evaluation within scikit-learn

In [1]:
# Import pandas and NumPy (scikit-learn modules are imported in the sections that use them)
import pandas as pd
import numpy as np

## Generate Some Data

In [2]:
# Number of synthetic samples (rows) to generate
m = 10

In [3]:
# Seed NumPy's global generator so the synthetic data is identical on every
# Restart & Run All; without a seed every run produces different numbers.
np.random.seed(1278)
# m rows, 3 feature columns of uniform values in [0, 1)
X_values = np.random.random((m, 3))

In [4]:
# Display the generated feature matrix
X_values

array([[0.76376126, 0.81010685, 0.24485174],
       [0.43245336, 0.33573722, 0.52347221],
       [0.42831417, 0.70186685, 0.38315087],
       [0.46080431, 0.4395558 , 0.9629351 ],
       [0.48836881, 0.76805298, 0.32944367],
       [0.48004637, 0.28876473, 0.00781829],
       [0.2208972 , 0.20737684, 0.38909493],
       [0.07644328, 0.06373335, 0.3774626 ],
       [0.37051296, 0.67151102, 0.97431857],
       [0.99654278, 0.55389955, 0.5608341 ]])

In [5]:
# Random targets stored as a column vector with one row per sample
y = np.random.random(size=(m, 1))

In [6]:
# Check which container type holds the targets
type(y)

numpy.ndarray

In [7]:
# Wrap the raw feature array in a DataFrame (default integer row and column labels)
X = pd.DataFrame(data=X_values)

## Train Test Split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1278,
)

In [10]:
# Report the feature/target shapes of each partition
print(f"Training data: {X_train.shape} {y_train.shape}")
print(f"Test data: {X_test.shape} {y_test.shape}")

Training data: (7, 3) (7, 1)
Test data: (3, 3) (3, 1)


In [11]:
# Display the training features; the row labels are the original sample indices
X_train

Unnamed: 0,0,1,2
3,0.460804,0.439556,0.962935
7,0.076443,0.063733,0.377463
8,0.370513,0.671511,0.974319
1,0.432453,0.335737,0.523472
4,0.488369,0.768053,0.329444
0,0.763761,0.810107,0.244852
5,0.480046,0.288765,0.007818


In [12]:
# Display the held-out test features
X_test

Unnamed: 0,0,1,2
6,0.220897,0.207377,0.389095
9,0.996543,0.5539,0.560834
2,0.428314,0.701867,0.383151


In [13]:
# Display the training targets
y_train

array([[0.78412679],
       [0.26812062],
       [0.12318572],
       [0.74884122],
       [0.12961291],
       [0.80877192],
       [0.30511004]])

In [14]:
# Display the held-out test targets
y_test

array([[0.98576997],
       [0.13619804],
       [0.17189158]])

## K-Fold Cross Validation

In [15]:
from sklearn.model_selection import KFold, cross_val_score

### Trivial Case to Show Split

In [16]:
# Use a dedicated name for the toy samples so the feature DataFrame `X`
# built earlier in the notebook is not overwritten by this demonstration.
toy_samples = ["a", "a", "a", "b", "b", "c", "c", "c", "c", "c"]
k_fold = KFold(n_splits=5)
# Each iteration yields the index arrays of one fold's training and test rows
for train_indices, test_indices in k_fold.split(toy_samples):
    print(f"Train: {train_indices} | test: {test_indices}")

Train: [2 3 4 5 6 7 8 9] | test: [0 1]
Train: [0 1 4 5 6 7 8 9] | test: [2 3]
Train: [0 1 2 3 6 7 8 9] | test: [4 5]
Train: [0 1 2 3 4 5 8 9] | test: [6 7]
Train: [0 1 2 3 4 5 6 7] | test: [8 9]


### Training a Support Vector Classifier to classify the Digits Dataset

In [17]:
# Load the digits dataset bundled with scikit-learn
from sklearn import datasets, svm
digits = datasets.load_digits()

In [18]:
# Separate the feature matrix from the target labels
X_digits, y_digits = digits.data, digits.target

In [19]:
# Inspect the dimensions of the digits feature matrix
X_digits.shape

(1797, 64)

In [20]:
# Support vector classifier with a linear kernel and regularisation parameter C=1
svc = svm.SVC(kernel='linear', C=1)

In [21]:
# Fit on every row except the final 100, which are kept back for scoring
model = svc.fit(
    X_digits[:-100],
    y_digits[:-100],
)

In [22]:
# Score the fitted model on the 100 held-back rows at the end of the dataset
model.score(
    X_digits[-100:],
    y_digits[-100:],
)

0.98

In [23]:
# Five folds taken in dataset order (shuffle=False is the KFold default)
k_fold = KFold(n_splits=5, shuffle=False)

In [24]:
# Loop over the validation folds, fitting and scoring a fresh copy of the
# classifier on each one.
# `model` was assigned from `svc.fit(...)`, and fit returns the estimator
# itself, so refitting `svc` here would silently change `model` too; cloning
# leaves the earlier fit intact.
from sklearn.base import clone

fold_scores = []
for train, test in k_fold.split(X_digits):
    fold_model = clone(svc).fit(X_digits[train], y_digits[train])
    # Keep the score: a bare expression inside a loop body is evaluated and
    # then thrown away, so it would never be displayed.
    score = fold_model.score(X_digits[test], y_digits[test])
    fold_scores.append(score)
    print(train.shape, test.shape, score)

(1437,) (360,)
(1437,) (360,)
(1438,) (359,)
(1438,) (359,)
(1438,) (359,)


In [25]:
# Let scikit-learn run the same K-fold evaluation; n_jobs=-1 uses all available cores
res = cross_val_score(
    svc,
    X_digits,
    y_digits,
    cv=k_fold,
    n_jobs=-1,
)

In [26]:
# Show the score obtained on each fold
print(res)

[0.96388889 0.92222222 0.9637883  0.9637883  0.93036212]


In [27]:
# Average the per-fold scores into a single cross-validated estimate
np.mean(res)

0.9488099659548128