# Splitting data into training and test subsets

In [0]:
#Import functions
import numpy as np
from sklearn.model_selection import train_test_split

In [0]:
# Build a toy dataset: X holds the features (5 samples x 2 columns), y the labels
X = np.arange(10).reshape(5, 2)
y = range(5)

In [0]:
# Display the feature matrix X
X

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [0]:
# Display the labels y; list() materialises the range so its values are shown
list(y)

[0, 1, 2, 3, 4]

In [0]:
# Randomised split into training and test subsets (roughly 2/3 train, 1/3 test);
# random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [0]:
# Display the training features X_train
X_train

array([[4, 5],
       [0, 1],
       [6, 7]])

In [0]:
# Display the training labels y_train
y_train

[2, 0, 3]

In [0]:
# Display the test features X_test
X_test

array([[2, 3],
       [8, 9]])

In [0]:
# Display the test labels y_test
y_test

[1, 4]

In [0]:
# Split only the labels y, using the default split sizes and no shuffling,
# so the original order is preserved (training part first, test part last)
train_test_split(y, shuffle=False)

[[0, 1, 2], [3, 4]]

**General approach**
Split the data into 80% training and 20% testing.
Set the random_state to a value of your choice. Fixing it makes the random shuffle reproducible, so every run produces the same split.

In [0]:
# 80% training / 20% test split; a fixed random_state keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Cross-validation

With cross-validation, a separate validation dataset is not needed. In **k-fold cross-validation** the training set is split into *k* smaller sets, called “folds”. For each of the *k* folds:

*   A model is trained using *k-1* of the folds as training data;
*   the resulting model is validated on the remaining part of the data.

In [0]:
#Import functions
import numpy as np
from sklearn.model_selection import KFold

In [0]:
# Toy data: four samples with two features each (X) and one label per sample (y)
X = np.asarray(
    [
        [1, 2],
        [3, 4],
        [1, 2],
        [3, 4],
    ]
)
y = np.asarray([1, 2, 3, 4])

In [0]:
# Create a 2-fold splitter: the samples are divided into two folds and each
# fold serves once as the test set
kf = KFold(n_splits=2)

In [0]:
# kf.split(X) yields one (train indices, test indices) pair per fold
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Index with the fold's indices to get the actual subsets; these names are
    # overwritten on every pass, so only the last fold remains afterwards
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

TRAIN: [2 3] TEST: [0 1]
TRAIN: [0 1] TEST: [2 3]


In [0]:
# Print the KFold object to inspect its parameters; shuffling is off by default
print(kf)

KFold(n_splits=2, random_state=None, shuffle=False)


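**ShuffleSplit** yields indices to split data into training and test sets. It is a random permutation cross-validator: each split is drawn independently, so, unlike with KFold, the test sets of different splits may overlap.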

In [0]:
#Import function
from sklearn.model_selection import ShuffleSplit

In [0]:
# Toy data: six samples with two features each (X) and one label per sample (y)
X = np.asarray(
    [
        [1, 2],
        [3, 4],
        [5, 6],
        [7, 8],
        [3, 4],
        [5, 6],
    ]
)
y = np.asarray([1, 2, 1, 2, 1, 2])

In [0]:
# Configure ShuffleSplit: 5 random splits, each using 25% of the data for testing
rs = ShuffleSplit(
    n_splits=5,
    test_size=0.25,
    random_state=0,
)

In [0]:
# rs.split(X) yields one (train indices, test indices) pair per random split
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [1 3 0 4] TEST: [5 2]
TRAIN: [4 0 2 5] TEST: [1 3]
TRAIN: [1 2 4 0] TEST: [3 5]
TRAIN: [3 4 1 0] TEST: [5 2]
TRAIN: [3 5 1 0] TEST: [2 4]


In [0]:
# ShuffleSplit with 5 splits, 50% training and 25% test data; the remaining
# samples are used in neither subset
rs = ShuffleSplit(
    n_splits=5,
    train_size=0.5,
    test_size=0.25,
    random_state=0,
)
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [1 3 0] TEST: [5 2]
TRAIN: [4 0 2] TEST: [1 3]
TRAIN: [1 2 4] TEST: [3 5]
TRAIN: [3 4 1] TEST: [5 2]
TRAIN: [3 5 1] TEST: [2 4]
