In [27]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit


In [11]:
# return_X_y=True makes load_iris return a (data, target) pair, unpacked here
# into the feature matrix X and the label vector y.
X, y = datasets.load_iris(return_X_y=True)

In [12]:
# X is a 2-D array with 150 rows (samples) and 4 columns (features per sample)

X.shape

(150, 4)

In [13]:
# y is 1-D: one label per row of X, so its length matches X's first dimension.
y.shape

(150,)

In [14]:
# Hold out 20% of the samples as a test set.
# random_state controls the shuffling applied before the split; passing an int
# gives the same partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Shapes of the training portion (features, labels).
X_train.shape, y_train.shape

((120, 4), (120,))

In [15]:
# Shapes of the held-out test portion (features, labels).
X_test.shape, y_test.shape

((30, 4), (30,))

When evaluating different settings (“hyperparameters”) for estimators, such as the C setting that must be manually set for an SVM, there is still a risk of overfitting on the test set because the parameters can be tweaked until the estimator performs optimally. This way, knowledge about the test set can “leak” into the model and evaluation metrics no longer report on generalization performance. To solve this problem, yet another part of the dataset can be held out as a so-called “validation set”: training proceeds on the training set, after which evaluation is done on the validation set, and when the experiment seems to be successful, final evaluation can be done on the test set.

However, by partitioning the available data into three sets, we drastically reduce the number of samples which can be used for learning the model, and the results can depend on a particular random choice for the pair of (train, validation) sets.

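A solution to this problem is a procedure called cross-validation (CV for short). A test set should still be held out for final evaluation, but the validation set is no longer needed when doing CV. In the basic approach, called k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). For each of the k folds, a model is trained on the remaining k-1 folds and validated on the fold that was left out; the reported performance is the average of the k resulting scores.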

In [17]:
# Linear-kernel support vector classifier with C=1.
clf = svm.SVC(kernel="linear", C=1, random_state=42)

# 5-fold cross-validation; the result holds one score per fold.
# NOTE(review): this runs on the full (X, y), not (X_train, y_train), so the
# test split created earlier is also used during CV -- confirm this is intended.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)

In [18]:
scores

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [19]:
# Summarise the per-fold scores as mean and standard deviation.
mean_score = scores.mean()
score_spread = scores.std()
print("%0.2f accuracy with a standard deviation of %0.2f" % (mean_score, score_spread))

0.98 accuracy with a standard deviation of 0.02


In [3]:
# Toy dataset: the twelve letters 'a' through 'l', one list element each.
my_data = list("abcdefghijkl")

# 10-fold splitter with default settings (no shuffling, as the repr shows).
kf = KFold(n_splits=10)
kf

KFold(n_splits=10, random_state=None, shuffle=False)

In [26]:
# Each iteration yields a pair of index arrays for one fold: the positions used
# for training and the positions held out for testing.
for train, test in kf.split(my_data):
    print("%s %s" % (train, test))

[ 2  3  4  5  6  7  8  9 10 11] [0 1]
[ 0  1  4  5  6  7  8  9 10 11] [2 3]
[ 0  1  2  3  5  6  7  8  9 10 11] [4]
[ 0  1  2  3  4  6  7  8  9 10 11] [5]
[ 0  1  2  3  4  5  7  8  9 10 11] [6]
[ 0  1  2  3  4  5  6  8  9 10 11] [7]
[ 0  1  2  3  4  5  6  7  9 10 11] [8]
[ 0  1  2  3  4  5  6  7  8 10 11] [9]
[ 0  1  2  3  4  5  6  7  8  9 11] [10]
[ 0  1  2  3  4  5  6  7  8  9 10] [11]


In [28]:
# Splitter configured for 10 splits with test_size=0.20; random_state=0 fixes
# the seed so the generated splits are reproducible.
ss = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)

In [42]:
# Print the (train, test) index arrays produced by each ShuffleSplit iteration.
for train_index, test_index in ss.split(my_data):
    print("%s %s" % (train_index, test_index))


[10  2  8  1  7  9  3  0  5] [ 6 11  4]
[ 4  9  0 11  7  6  1 10  8] [5 2 3]
[ 2  7  5 11  0  3  4  9  8] [ 6  1 10]
[ 9  5  6  1  0  7 10 11  3] [2 4 8]
[ 5 11  2  8  6  3  7  4 10] [1 9 0]
[ 9  8  1 11  0  6 10  5  4] [3 7 2]
[ 2  5  3 11  9  7 10  1  8] [0 6 4]
[ 7  9  6  8 10  4  5  3  0] [ 1  2 11]
[ 1  0  6  5  2  3 11  8  4] [ 9  7 10]
[ 8  6  4  2  1 10  5  0  3] [11  9  7]


In [40]:
# Translate each split's index arrays back into the underlying items so the
# partition is shown in terms of the data rather than positions.
for train_index, test_index in ss.split(my_data):
    print([my_data[j] for j in train_index],
          [my_data[i] for i in test_index])
            

['k', 'c', 'i', 'b', 'h', 'j', 'd', 'a', 'f'] ['g', 'l', 'e']
['e', 'j', 'a', 'l', 'h', 'g', 'b', 'k', 'i'] ['f', 'c', 'd']
['c', 'h', 'f', 'l', 'a', 'd', 'e', 'j', 'i'] ['g', 'b', 'k']
['j', 'f', 'g', 'b', 'a', 'h', 'k', 'l', 'd'] ['c', 'e', 'i']
['f', 'l', 'c', 'i', 'g', 'd', 'h', 'e', 'k'] ['b', 'j', 'a']
['j', 'i', 'b', 'l', 'a', 'g', 'k', 'f', 'e'] ['d', 'h', 'c']
['c', 'f', 'd', 'l', 'j', 'h', 'k', 'b', 'i'] ['a', 'g', 'e']
['h', 'j', 'g', 'i', 'k', 'e', 'f', 'd', 'a'] ['b', 'c', 'l']
['b', 'a', 'g', 'f', 'c', 'd', 'l', 'i', 'e'] ['j', 'h', 'k']
['i', 'g', 'e', 'c', 'b', 'k', 'f', 'a', 'd'] ['l', 'j', 'h']


In [1]:
# KFold and cross_val_score are already imported in the notebook's import cell,
# so they are not re-imported here.
# 10-fold splitter with shuffling enabled; random_state=1 seeds the shuffle.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
print(cv)

KFold(n_splits=10, random_state=1, shuffle=True)


In [6]:
# Rebuild the 10-fold splitter and print each fold's index arrays with
# explicit TRAIN/TEST labels.
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(my_data):
    print("TRAIN:", train_index, "TEST:", test_index)


TRAIN: [ 2  3  4  5  6  7  8  9 10 11] TEST: [0 1]
TRAIN: [ 0  1  4  5  6  7  8  9 10 11] TEST: [2 3]
TRAIN: [ 0  1  2  3  5  6  7  8  9 10 11] TEST: [4]
TRAIN: [ 0  1  2  3  4  6  7  8  9 10 11] TEST: [5]
TRAIN: [ 0  1  2  3  4  5  7  8  9 10 11] TEST: [6]
TRAIN: [ 0  1  2  3  4  5  6  8  9 10 11] TEST: [7]
TRAIN: [ 0  1  2  3  4  5  6  7  9 10 11] TEST: [8]
TRAIN: [ 0  1  2  3  4  5  6  7  8 10 11] TEST: [9]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 11] TEST: [10]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10] TEST: [11]
