# Aim

- look at different types of cross validation in sklearn

# Notes

- KFold cross-validators are objects that have a method .split()
- the method .split() returns a generator that yields the indices of the train and test sets for each fold.

## Make example data

## KFold

In [107]:
import numpy as np
# Toy dataset: four samples (rows) with two features each.
X = np.array([
    [1, 2],
    [3, 4],
    [1, 3],
    [3, 5],
])
# One target value per sample, in the same order as the rows of X.
y = np.array([1, 2, 3, 4])

In [108]:
X

array([[1, 2],
       [3, 4],
       [1, 3],
       [3, 5]])

In [109]:
y

array([1, 2, 3, 4])

In [110]:
from sklearn.model_selection import KFold
# Instantiate the KFold class into a splitter object with a set number of splits.
kf = KFold(n_splits=2)

# get_n_splits() reports how many train/test iterations the splitter will produce.
print(kf.get_n_splits(X))

# Printing the object shows its full configuration, including the default settings.
print(kf)

2
KFold(n_splits=2, random_state=None, shuffle=False)


Here we print the indices of the train and test splits. The generator yields one pair of (train, test) index arrays per fold.

In [111]:
# Iterate over the folds: each iteration yields the train and test index
# arrays for one fold.
# NOTE: train_index / test_index (and the *_train / *_test arrays) keep the
# values of the last fold once the loop has finished.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Index the arrays with the fold's indices to get the actual subsets.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [2 3] TEST: [0 1]
TRAIN: [0 1] TEST: [2 3]


In [112]:
X[train_index]

array([[1, 2],
       [3, 4]])

In [113]:
y[train_index]

array([1, 2])

In [114]:
X[test_index]

array([[1, 3],
       [3, 5]])

In [115]:
y[test_index]

array([3, 4])

The cells above show the X and y values of the training and testing sets for the final fold only, because the loop variables keep the values from the last iteration. We now extend the loop to print the data for every fold:

In [116]:
# Walk through every fold and show both the indices and the data they select.
for train_index, test_index in kf.split(X):
    print("TRAIN INDEX:", train_index, "TEST INDEX:", test_index)
    # Index the arrays with the fold's indices to get the actual subsets.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print('----')
    print('TRAINNG DATA:')
    print(X_train)
    print('y=',y_train)
    print('Testing DATA:')
    print(X_test)
    print('y=',y_test)
    # Visual separator between folds.
    print('---------------'*3)

TRAIN INDEX: [2 3] TEST INDEX: [0 1]
----
TRAINNG DATA:
[[1 3]
 [3 5]]
y= [3 4]
Testing DATA:
[[1 2]
 [3 4]]
y= [1 2]
---------------------------------------------
TRAIN INDEX: [0 1] TEST INDEX: [2 3]
----
TRAINNG DATA:
[[1 2]
 [3 4]]
y= [1 2]
Testing DATA:
[[1 3]
 [3 5]]
y= [3 4]
---------------------------------------------


In [118]:
X

array([[1, 2],
       [3, 4],
       [1, 3],
       [3, 5]])

In [117]:
y

array([1, 2, 3, 4])

# Stratified Kfold

This splits the data taking into account the class of each sample. Each fold *preserves the percentage* of samples from each class found in the full dataset.

In [119]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
# Same four samples as before, but y now holds class labels:
# two samples of class 21 and two samples of class 11.
X = np.array([
    [1, 2],
    [3, 4],
    [1, 3],
    [3, 5],
])
y = np.array([21, 21, 11, 11])

# Stratified splitter with two folds.
skf = StratifiedKFold(n_splits=2)

# The arguments passed here do not matter: the call just reports the
# configured number of splits.
skf.get_n_splits(X, y)

2

When using the .split() method you must now also pass 'y', because the splitter needs the class labels to keep the class ratios the same in every fold.

In [120]:
# Walk through every stratified fold; y is passed to split() so the splitter
# can see the class labels.
for train_index, test_index in skf.split(X,y):
    print("TRAIN INDEX:", train_index, "TEST INDEX:", test_index)
    # Index the arrays with the fold's indices to get the actual subsets.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print('----')
    print('TRAINNG DATA:')
    print(X_train)
    print('y=',y_train)
    print('Testing DATA:')
    print(X_test)
    print('y=',y_test)
    # Visual separator between folds.
    print('---------------'*3)

TRAIN INDEX: [1 3] TEST INDEX: [0 2]
----
TRAINNG DATA:
[[3 4]
 [3 5]]
y= [21 11]
Testing DATA:
[[1 2]
 [1 3]]
y= [21 11]
---------------------------------------------
TRAIN INDEX: [0 2] TEST INDEX: [1 3]
----
TRAINNG DATA:
[[1 2]
 [1 3]]
y= [21 11]
Testing DATA:
[[3 4]
 [3 5]]
y= [21 11]
---------------------------------------------


# Time series splitting (Kfold preserving order of data)

In [121]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
# Six time-ordered samples with two features each, and one target per sample.
X = np.array([
    [1, 2],
    [3, 4],
    [1, 3],
    [3, 5],
    [1, 4],
    [3, 6],
])
y = np.array([10, 20, 30, 40, 50, 60])

# The number of folds (n_splits + 1) cannot be greater than the number of
# samples, so 5 is the largest n_splits allowed for these 6 samples.
tscv = TimeSeriesSplit(n_splits=5)

In [122]:
print(tscv)  

TimeSeriesSplit(max_train_size=None, n_splits=5)


In [123]:
# Show the index arrays for every time-series split, then materialise the
# matching training and testing subsets.
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Training subset for this split.
    X_train = X[train_index]
    y_train = y[train_index]
    # Testing subset for this split.
    X_test = X[test_index]
    y_test = y[test_index]

TRAIN: [0] TEST: [1]
TRAIN: [0 1] TEST: [2]
TRAIN: [0 1 2] TEST: [3]
TRAIN: [0 1 2 3] TEST: [4]
TRAIN: [0 1 2 3 4] TEST: [5]


In [124]:
# Walk through every time-series split and show both the indices and the
# data they select.
for train_index, test_index in tscv.split(X):
    print("TRAIN INDEX:", train_index, "TEST INDEX:", test_index)
    # Index the arrays with the split's indices to get the actual subsets.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print('----')
    print('TRAINNG DATA:')
    print(X_train)
    print('y=',y_train)
    print('Testing DATA:')
    print(X_test)
    print('y=',y_test)
    # Visual separator between splits.
    print('---------------'*3)

TRAIN INDEX: [0] TEST INDEX: [1]
----
TRAINNG DATA:
[[1 2]]
y= [10]
Testing DATA:
[[3 4]]
y= [20]
---------------------------------------------
TRAIN INDEX: [0 1] TEST INDEX: [2]
----
TRAINNG DATA:
[[1 2]
 [3 4]]
y= [10 20]
Testing DATA:
[[1 3]]
y= [30]
---------------------------------------------
TRAIN INDEX: [0 1 2] TEST INDEX: [3]
----
TRAINNG DATA:
[[1 2]
 [3 4]
 [1 3]]
y= [10 20 30]
Testing DATA:
[[3 5]]
y= [40]
---------------------------------------------
TRAIN INDEX: [0 1 2 3] TEST INDEX: [4]
----
TRAINNG DATA:
[[1 2]
 [3 4]
 [1 3]
 [3 5]]
y= [10 20 30 40]
Testing DATA:
[[1 4]]
y= [50]
---------------------------------------------
TRAIN INDEX: [0 1 2 3 4] TEST INDEX: [5]
----
TRAINNG DATA:
[[1 2]
 [3 4]
 [1 3]
 [3 5]
 [1 4]]
y= [10 20 30 40 50]
Testing DATA:
[[3 6]]
y= [60]
---------------------------------------------


# Passing method to GridSearchCV

- by default GridSearchCV uses KFold, or StratifiedKFold when the estimator is a classifier
- you can also set up your own CV with the methods above

Classification problem e.g.

In [133]:
# Toy classification data: four samples, two of class 21 and two of class 11.
X = np.array([
    [1, 2],
    [3, 4],
    [1, 3],
    [3, 5],
])
y = np.array([21, 21, 11, 11])

# Stratified 2-fold splitter that will be handed to GridSearchCV.
skf = StratifiedKFold(n_splits=2)

In [140]:
from sklearn import datasets  # required by load_iris() below
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# NOTE: iris is loaded but not used by this search; the fit below runs on the
# toy X, y arrays defined in the previous cell.
iris = datasets.load_iris()
# Parameter grid to search over: every kernel is combined with every C value.
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
# Classifier whose hyper-parameters are tuned.
svc = svm.SVC(gamma="scale")
# Set up the grid search with our own StratifiedKFold splitter as the CV strategy.
clf = GridSearchCV(svc, parameters, cv=skf)
clf.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [151]:
# Six time-ordered samples whose labels alternate between the classes 10 and 20.
X = np.array([
    [1, 2],
    [3, 4],
    [1, 3],
    [3, 5],
    [1, 4],
    [3, 6],
])
y = np.array([10, 20, 10, 20, 10, 20])

# Time-series splitter with two splits that will be handed to GridSearchCV.
tscv = TimeSeriesSplit(n_splits=2)

In [152]:
from sklearn import datasets  # required by load_iris() below

# NOTE: iris is loaded but not used by this search; the fit below runs on the
# toy X, y arrays defined in the previous cell.
iris = datasets.load_iris()
# Parameter grid to search over: every kernel is combined with every C value.
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
# Classifier whose hyper-parameters are tuned.
svc = svm.SVC(gamma="scale")
# Set up the grid search with the TimeSeriesSplit splitter as the CV strategy.
# NOTE(review): the train-score entries of cv_results_ read in later cells are
# only present when train scores are computed; on scikit-learn >= 0.21 that
# requires passing return_train_score=True here - confirm for your version.
clf = GridSearchCV(svc, parameters, cv=tscv)
clf.fit(X, y)

GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=2),
       error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

**Note that you might get problems with a time-series cross-validation split on a classification problem!!** (if not all categories are present in the splits!!) TimeSeriesSplit does not explicitly check that every class appears in each training set!

We can also get information for each of the CV splits from training:

In [153]:
sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_C',
 'param_kernel',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [147]:
clf.cv_results_['split0_test_score']

array([1., 1., 1., 1.])

In [148]:
clf.cv_results_['split0_train_score']



array([1., 1., 1., 1.])