In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
# Show the shape of the feature matrix and of the label vector.
np.shape(iris.data), np.shape(iris.target)

((150, 4), (150,))

In [2]:
# Split features and labels into a training part and a held-out test part.
# test_size=0.4 reserves 40% of the samples for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.4,
    random_state=0,
)

# Shapes of the training features and training labels.
np.shape(X_train), np.shape(y_train)

((90, 4), (90,))

In [3]:
# Shapes of the held-out test features and test labels.
np.shape(X_test), np.shape(y_test)

((60, 4), (60,))

In [4]:
# Build a linear-kernel support vector classifier and fit it on the training split only.
clf = svm.SVC(C=1, kernel='linear')
clf.fit(X_train, y_train)
# Mean accuracy on the held-out split, i.e. on data the model did not see while fitting.
clf.score(X=X_test, y=y_test)

0.9666666666666667

In [5]:
# Mean accuracy on the same data the model was fit on; expected to be higher than the
# held-out score because the model has already seen (and can memorise) these samples.
clf.score(X=X_train, y=y_train)
# NOTE(review): the original remark said it is important to run this check first;
# presumably the train-vs-test gap is meant as an early overfitting check -- confirm.

0.9888888888888889

In [6]:
# SIMPLER CROSS VALIDATION
# cross-validation (CV) is a way to use the training data in order to get good
# estimates of how well our model will perform on data we haven't seen before
from sklearn.model_selection import cross_val_score

In [7]:
# cross_val_score splits the data into parts, trains the model on some parts and tests on
# the part left out, then rotates; with cv=2 that means: train on one half and test on the
# other, then swap the halves.
# The goal is a less biased estimate of how well the model will do on unseen data.
clf = svm.SVC(C=1, kernel='linear')
# cv sets the number of splits, and therefore how many scores come back
scores = cross_val_score(estimator=clf, X=iris.data, y=iris.target, cv=2)
scores

array([0.98666667, 0.94666667])

In [8]:
# Summarise the per-fold scores: their mean, and twice their standard deviation as the spread.
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

Accuracy: 0.97 (+/- 0.04)


In [9]:
# can choose a different scoring metric via the `scoring` parameter (here: macro-averaged F1)
from sklearn import metrics
# Five-fold cross-validation of the same classifier, scored with 'f1_macro'
# instead of the estimator's default score.
scores = cross_val_score(
    estimator=clf,
    X=iris.data,
    y=iris.target,
    scoring='f1_macro',
    cv=5,
)
scores

array([0.96658312, 1.        , 0.96658312, 0.96658312, 1.        ])

In [10]:
# can provide own cross-validation
# for example shuffle split, shuffle data
from sklearn.model_selection import ShuffleSplit
# Number of rows in the feature matrix.
# NOTE(review): n_samples is not passed to ShuffleSplit or cross_val_score below;
# kept because a later cell may still read it -- confirm before removing.
n_samples = len(iris.data)
# Three independent random splits, each holding out 30% of the samples for testing;
# random_state makes the splits repeatable.
cv = ShuffleSplit(
    n_splits=3,
    test_size=0.3,
    random_state=0,
)
# Pass the splitter object as cv instead of an integer number of folds.
cross_val_score(estimator=clf, X=iris.data, y=iris.target, cv=cv)

array([0.97777778, 0.97777778, 1.        ])

In [11]:
# finally, can also do cross validation predict, don't do this often
# will get unbiased prediction from estimator that you can use
from sklearn.model_selection import cross_val_predict
# For every sample, collect the prediction made by a model fit on the other folds (10 folds here).
predicted = cross_val_predict(estimator=clf, X=iris.data, y=iris.target, cv=10)
# One prediction per input sample.
np.shape(predicted)

(150,)

In [12]:
# Each prediction refers to a sample from the full dataset, but it was produced by a model
# that was not trained on that sample, so comparing against the true labels is not cheating.
# NOTE(review): the scikit-learn docs caution that a metric computed on pooled
# cross_val_predict output can differ from cross_val_score -- confirm this is the intended estimate.
metrics.accuracy_score(y_true=iris.target, y_pred=predicted)

0.9733333333333334

In [13]:
# MORE COMPLEX CROSS VALIDATIONS
# Cross validation iterators
from sklearn.model_selection import KFold

In [14]:
# Shuffled 4-fold splitter. random_state pins the shuffle so the printed folds are the
# same on every Restart & Run All (matching the random_state=0 used elsewhere in this notebook).
kf = KFold(n_splits=4, shuffle=True, random_state=0)
X = ["a", "b", "c", "d"]
# split() yields arrays of indices for the train and the test portion of X;
# use those indices to fit the model on the training rows and evaluate it on the test rows,
# so that everything gets tested in an unbiased manner
for train, test in kf.split(X):
    print("%s %s" % (train, test))

[0 2 3] [1]
[0 1 2] [3]
[1 2 3] [0]
[0 1 3] [2]


In [15]:
# Stratification
# folds preserve the percentage of samples of each class
# also good for class imbalances
from sklearn.model_selection import StratifiedKFold

In [16]:
# Toy data: ten identical feature values, labelled with four 0s followed by six 1s.
X = np.ones(10)
y = [0] * 4 + [1] * 6
skf = StratifiedKFold(n_splits=3)
# Print the train / test indices of each fold; both classes should show up on each side.
for train, test in skf.split(X=X, y=y):
    print("%s %s" % (train, test))

[2 3 6 7 8 9] [0 1 4 5]
[0 1 3 4 5 8 9] [2 6 7]
[0 1 2 4 5 6 7] [3 8 9]


In [17]:
# Grouped data: e.g. collecting data from patients, where several samples come from the same patient
# idea is that you would like to know if a model trained on a particular set of groups will generalize well to unseen groups
# don't want data from the same group (e.g. group 1) in both the train and the test set
# ex: taking heart rate from patients, 20 patients, each one gives you 10 samples
# don't want data from patient A used to predict the heart rate of patient A -- that's cheating
# use GroupKFold in this case
from sklearn.model_selection import GroupKFold
# Ten toy samples with their labels; `groups` gives the group id each sample belongs to.
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1] * 3 + [2] * 3 + [3] * 4

# Three group-aware folds: print the train / test indices of each one.
gkf = GroupKFold(n_splits=3)
for train, test in gkf.split(X=X, y=y, groups=groups):
    print("%s %s" % (train, test))

[0 1 2 3 4 5] [6 7 8 9]
[0 1 2 6 7 8 9] [3 4 5]
[3 4 5 6 7 8 9] [0 1 2]


In [18]:
# Time Series Split
# want to use past examples to predict future examples
from sklearn.model_selection import TimeSeriesSplit
# Six toy samples in time order: the rows alternate [1, 2] and [3, 4]; targets run 1..6.
X = np.array([[1, 2], [3, 4]] * 3)
y = np.arange(1, 7)
tscv = TimeSeriesSplit(n_splits=3)
# Show the splitter's configuration.
print(tscv)

# Print the train / test indices of each split.
for train, test in tscv.split(X=X):
    print("%s %s" % (train, test))

TimeSeriesSplit(max_train_size=None, n_splits=3)
[0 1 2] [3]
[0 1 2 3] [4]
[0 1 2 3 4] [5]
