# A Deeper Dive into Data Partitioning!
- This is required for cross validation of a model's accuracy.
- Helps to mitigate researcher overfitting by including a test (holdout) set that is used only to evaluate the model's true accuracy.

In [74]:
# imports
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from torch.utils.data import random_split

## Creating Fake dataset for practice.
#### In this Project:
- Techniques to partition data into 3 sets (train, dev, test (aka holdout)) will be covered.
    1. Manually dividing up data.
    2. sklearn and built in functionality to divide up data.

- The dataset generated is far too small for this to be practical; it is for demonstration purposes only.

In [75]:
# Build a 10x4 toy matrix via broadcasting: row k (k = 1..10) is 10*k + [1, 2, 3, 4],
# so every row is easy to recognise after it has been shuffled into a partition.
fake_data = 10 * np.arange(1, 11)[:, np.newaxis] + np.arange(1, 5)

# Boolean labels: the first five rows are False, the last five are True.
fake_labels = np.arange(10) >= 5

# printing data and labels
print(fake_data, "\n", fake_labels)

[[ 11  12  13  14]
 [ 21  22  23  24]
 [ 31  32  33  34]
 [ 41  42  43  44]
 [ 51  52  53  54]
 [ 61  62  63  64]
 [ 71  72  73  74]
 [ 81  82  83  84]
 [ 91  92  93  94]
 [101 102 103 104]] 
 [False False False False False  True  True  True  True  True]


## Partitioning Data Using `train_test_split`
- This function only splits the dataset into 2 sets - training and testing.
- There will be extra steps involved to divide testing into 2 sets - testing and devset (aka developer set).

In [76]:
# Partition ratios, in the order train / dev / test. They must sum to 1.
train_ratio = 0.8
dev_ratio = 0.1
test_ratio = 0.1

partitions_array = [train_ratio, dev_ratio, test_ratio]
assert abs(sum(partitions_array) - 1.0) < 1e-9, "partition ratios must sum to 1"

# First split: carve off the training set; everything else goes into a temporary pool.
# train_test_split shuffles by default, so the rows differ from run to run
# (pass random_state=<int> to both calls if a repeatable split is wanted).
train_data, temp_split_data, train_labels, temp_split_labels = train_test_split(
    fake_data, fake_labels, train_size=partitions_array[0]
)

# Second split: divide the temporary pool into dev and test sets.
# The dev share is derived from the ratios instead of a hard-coded 0.5, so unequal
# dev/test ratios are honoured. This relies on the train ratio being at index 0.
dev_share = partitions_array[1] / sum(partitions_array[1:])
devset_data, test_data, devset_labels, test_labels = train_test_split(
    temp_split_data, temp_split_labels, train_size=dev_share
)

# Sanity check: the three partitions together cover every sample exactly once.
assert len(train_data) + len(devset_data) + len(test_data) == len(fake_data)

# printing resulting sets
print("Fake dataset\n", fake_data, fake_labels, "\n")
print("Train set\n", train_data, train_labels, "\n")
print("Devset", devset_data, devset_labels, "\n")
print("Test set", test_data, test_labels, "\n")

Fake dataset
 [[ 11  12  13  14]
 [ 21  22  23  24]
 [ 31  32  33  34]
 [ 41  42  43  44]
 [ 51  52  53  54]
 [ 61  62  63  64]
 [ 71  72  73  74]
 [ 81  82  83  84]
 [ 91  92  93  94]
 [101 102 103 104]] [False False False False False  True  True  True  True  True] 

Train set
 [[21 22 23 24]
 [31 32 33 34]
 [41 42 43 44]
 [81 82 83 84]
 [91 92 93 94]
 [61 62 63 64]
 [71 72 73 74]
 [11 12 13 14]] [False False False  True  True  True  True False] 

Devset [[51 52 53 54]] [False] 

Test set [[101 102 103 104]] [ True] 



## Partitioning Data with `random_split`
- This function is from `torch.utils.data`.
- The dataset must be converted to Tensors for things to work correctly.
- I really like this way.

In [85]:
# Turn both numpy arrays into torch tensors in one pass.
fake_data_T, fake_labels_T = (torch.Tensor(array) for array in (fake_data, fake_labels))

# Bundle samples with their labels so each pair stays together when split.
fake_dataset = TensorDataset(fake_data_T, fake_labels_T)

# Data loader over the full (unsplit) dataset: shuffled, two samples per batch.
fake_dataset_dl = DataLoader(dataset=fake_dataset, batch_size=2, shuffle=True)

# Randomly partition the dataset by fraction (train / dev / test).
split_datasets = random_split(fake_dataset, [0.8, 0.1, 0.1])

# Show every (sample, label) pair of each resulting subset.
print("Split datasets\n")
for i, subset in enumerate(split_datasets):
    print(f"Dataset {i+1}\n")
    for sample in subset:
        print(sample)
    print("\n")

Split datasets

Dataset 1

(tensor([91., 92., 93., 94.]), tensor(1.))
(tensor([51., 52., 53., 54.]), tensor(0.))
(tensor([41., 42., 43., 44.]), tensor(0.))
(tensor([61., 62., 63., 64.]), tensor(1.))
(tensor([21., 22., 23., 24.]), tensor(0.))
(tensor([81., 82., 83., 84.]), tensor(1.))
(tensor([101., 102., 103., 104.]), tensor(1.))
(tensor([31., 32., 33., 34.]), tensor(0.))


Dataset 2

(tensor([11., 12., 13., 14.]), tensor(0.))


Dataset 3

(tensor([71., 72., 73., 74.]), tensor(1.))




## Partitioning Data Using numpy (manual)
- Less straightforward, but you don't have to convert data to tensors.
- In this method, 'boundaries' will be determined from the desired ratio / proportion of the data sets.
- A list of indices from the main dataset is created, with the indices being in a randomized order.
    - From this list the first **n** data points will go into the *training* set, the next **m** will go into the *dev set*, and the last **x** will go into the *test* set.

In [78]:
# Partition ratios / proportions, in the order train / dev / test. They must sum to 1.
train_ratio = 0.8
dev_ratio = 0.1
test_ratio = 0.1

partitions_array = np.array([train_ratio, dev_ratio, test_ratio])
assert abs(partitions_array.sum() - 1.0) < 1e-9, "partition ratios must sum to 1"
print("Partitions:", partitions_array)

# Convert the cumulative ratios into integer index boundaries.
# Round before casting: a bare astype(int) truncates, so a float artefact such as
# 0.57 * 100 == 56.99999999999999 would yield 56 and shift a sample into the wrong set.
n_samples = len(fake_labels)
partition_boundaries = np.rint(np.cumsum(partitions_array * n_samples)).astype(int)
print("Partition boundaries:", partition_boundaries)

# Random ordering of all sample indices (permutation of 0 .. n_samples-1).
random_indices = np.random.permutation(n_samples)
print("Random indices:", random_indices)

# Cut the shuffled indices at the first two boundaries -> train / dev / test index sets.
# The last boundary equals n_samples, so it is not needed as a cut point.
train_idx, devset_idx, test_idx = np.split(random_indices, partition_boundaries[:-1])

# Index data and labels with the same index set so each row keeps its label.
train_data_N, train_labels_N = fake_data[train_idx], fake_labels[train_idx]
devset_data_N, devset_labels_N = fake_data[devset_idx], fake_labels[devset_idx]
test_data_N, test_labels_N = fake_data[test_idx], fake_labels[test_idx]

# Sanity check: the three partitions together cover every sample exactly once.
assert len(train_idx) + len(devset_idx) + len(test_idx) == n_samples

# printing resulting sets
print(f"\nTraining set: size = {train_data_N.shape}\n", train_data_N, train_labels_N)
print(f"\nDevset: size = {devset_data_N.shape}\n", devset_data_N, devset_labels_N)
print(f"\nTest set: size = {test_data_N.shape}\n", test_data_N, test_labels_N)

Partitions: [0.8 0.1 0.1]
Partition boundaries: [ 8  9 10]
Random indices: [9 5 8 4 1 7 3 0 2 6]

Training set: size = (8, 4)
 [[101 102 103 104]
 [ 61  62  63  64]
 [ 91  92  93  94]
 [ 51  52  53  54]
 [ 21  22  23  24]
 [ 81  82  83  84]
 [ 41  42  43  44]
 [ 11  12  13  14]] [ True  True  True False False  True False False]

Devset: size = (1, 4)
 [[31 32 33 34]] [False]

Test set: size = (1, 4)
 [[71 72 73 74]] [ True]
