In [None]:
# %% Deep learning - Section 8.67
#    Splitting data into train, devset, test

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Ask the inline matplotlib backend to render figures as SVG
set_matplotlib_formats('svg')


In [None]:
# Fake dataset
#   10 samples x 4 features. Sample r (counting from 1) holds 10*r + [1,2,3,4],
#   so the tens digit(s) identify the row and the units digit the column, which
#   makes it easy to see where each row ends up after a split.
#   Labels are False for the first five samples and True for the last five.

row_offsets = 10*np.arange(1,11)     # 10, 20, ..., 100 (one per sample)
col_values  = np.array([1,2,3,4])    # 1..4 (one per feature)

# Broadcast a column of row offsets against a row of feature values
fake_data   = row_offsets[:,np.newaxis] + col_values
fake_labels = np.arange(10) > 4

# Show the data, a blank line, then the labels
print(fake_data,end='\n\n')
print(fake_labels)


In [None]:
# %% Partition with scikitlearn
#    Two-stage split: first separate the training set from a temporary
#    hold-out, then divide that hold-out between devset and test set

# Target proportions, in the order (train, dev, test)
partition = [.8,.1,.1]

# Stage 1: training set vs. temporary hold-out (everything that is not train)
train_data,testTMP_data,train_labels,testTMP_labels = train_test_split(
    fake_data,
    fake_labels,
    train_size = partition[0],
)

# Stage 2: hold-out -> devset and test set
#   The devset proportion has to be re-expressed relative to the hold-out
#   (dev / (dev + test)), not relative to the full dataset
split = partition[1] / np.sum(partition[1:])
dev_data,test_data,dev_labels,test_labels = train_test_split(
    testTMP_data,
    testTMP_labels,
    train_size = split,
)

# Report the size of each subset (followed by a blank line) ...
print(
    f'Training data size: {train_data.shape}',
    f'Devset data size: {dev_data.shape}',
    f'Test data size: {test_data.shape}',
    '',
    sep='\n',
)

# ... and then the rows that landed in each subset
print(
    f'Training data: \n{train_data}\n',
    f'Devset data: \n{dev_data}\n',
    f'Test data: \n{test_data}',
    sep='\n',
)


In [None]:
# %% Manual partition
#    Turn the (train, dev, test) proportions into cumulative integer row
#    boundaries, and draw a random ordering of the sample indices

# Proportions (train, dev, test)
partition = np.array([.8,.1,.1])
print(f'Partition proportions: {partition}')
print()

# Convert to integers (cumulative boundaries: end of train, end of dev, end of test)
#   Round before casting: astype(int) truncates toward zero, so a product that
#   lands just below a whole number because of floating-point error
#   (e.g. 0.58*100 = 57.99999999999999) would silently lose one sample.
#   Note that genuinely fractional boundaries are rounded to the nearest integer.
partition_int = np.round( np.cumsum(partition*len(fake_labels)) ).astype(int)
print(f'Partition boundaries: {partition_int}')
print()

# Randomise indices
#   Passing an integer n shuffles np.arange(n); no seed is set here, so the
#   ordering differs from run to run
random_indices = np.random.permutation(len(fake_labels))
print(f'Randomised indices: {random_indices}')


In [None]:
# %% Manual selection
#    Cut the shuffled indices at the first two partition boundaries and use
#    each chunk to pull the matching rows (and labels) out of the fake dataset

# Three consecutive chunks of the shuffled indices:
#   [0, boundary 0) -> train, [boundary 0, boundary 1) -> dev, [boundary 1, end) -> test
train_idx,dev_idx,test_idx = np.split( random_indices,partition_int[:2] )

# Get rows for training data
train_data   = fake_data[train_idx]
train_labels = fake_labels[train_idx]

# Get rows for devset data
dev_data   = fake_data[dev_idx]
dev_labels = fake_labels[dev_idx]

# Get rows for test data
test_data   = fake_data[test_idx]
test_labels = fake_labels[test_idx]

# Report the size of each subset (followed by a blank line) ...
print(
    f'Training data size: {train_data.shape}',
    f'Devset data size: {dev_data.shape}',
    f'Test data size: {test_data.shape}',
    '',
    sep='\n',
)

# ... and then the rows that landed in each subset
print(
    f'Training data: \n{train_data}\n',
    f'Devset data: \n{dev_data}\n',
    f'Test data: \n{test_data}',
    sep='\n',
)
