In [None]:
# %% Deep learning - Section 18.172
#    Creating and using custom DataLoaders

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [2]:
# %% Libraries and modules
import numpy                  as np
import matplotlib.pyplot      as plt
import torch
import torch.nn               as nn
import seaborn                as sns
import copy
import torch.nn.functional    as F
import pandas                 as pd
import scipy.stats            as stats
import sklearn.metrics        as skm
import time
import sys
import imageio.v2             as imageio
import torchvision
import torchvision.transforms as T

from torch.utils.data                 import DataLoader,TensorDataset,Dataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from scipy.stats                      import zscore
from sklearn.decomposition            import PCA
from scipy.signal                     import convolve2d
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')
plt.style.use('default')


In [None]:
# %% Order of operations when loading your (own) data and applying transforms

# 1) Import data
# 2) Create a custom DataSet class
# 3) Define the transformations
# 4) Create a DataSet with data and transformations
# 5) Create a DataLoader (same old)

# N.B. Steps 1 and 2 can be merged when importing a torchvision dataset, because
# those datasets already accept transformations (see code 18.171)


In [None]:
# %% Data

# Number of images kept for illustration purposes
n_images = 8

# Load data. Passing the path (instead of an open file object) lets numpy open
# and close the file itself, and max_rows avoids parsing rows that would be
# discarded anyway
raw = np.loadtxt('sample_data/mnist_train_small.csv',delimiter=',',max_rows=n_images)

# Split labels (first column) from pixel data (remaining columns)
labels = raw[:,0]
data   = raw[:,1:]

# Normalise data (original range is (0,255), technically not needed as the
# transformation also does this)
data_norm = data / np.max(data)

# Reshape to actual 2D images: (image, channel, height, width)
data_norm = data_norm.reshape(data_norm.shape[0],1,28,28)

print(data_norm.shape)
print(labels.shape)

# Convert to tensor (float and integers)
data_tensor   = torch.tensor(data_norm).float()
labels_tensor = torch.tensor(labels).long()



In [5]:
# The custom dataset class is modeled after the official PyTorch class.
# NOTE: `??` is IPython/Jupyter introspection syntax that displays the source of
# the object; it is a development aid and is not valid in a plain Python script.

??torch.utils.data.TensorDataset


In [6]:
# %% Custom DataSet class

# Since the original class does not include transforms, this is what we want to
# add here

class customDataSet(Dataset):
    """Dataset of (data, label) tensors with an optional per-sample transform.

    Parameters
    ----------
    tensors : sequence of tensors
        ``tensors[0]`` holds the samples and ``tensors[1]`` the labels. All
        tensors must have the same size along their first dimension.
    transform : callable or None, optional
        Applied to each sample (never to the label) when it is accessed.
        With ``None`` (default) samples are returned unchanged.

    Raises
    ------
    AssertionError
        If the tensors differ in size along their first dimension.
    """

    def __init__(self,tensors,transform=None):

        # Check that all data have a corresponding label
        assert all(tensors[0].size(0)==t.size(0) for t in tensors), "Size mismatch between tensors"

        # Assign inputs
        self.tensors   = tensors
        self.transform = transform

    def __getitem__(self,index):
        """Return the ``(data, label)`` pair at ``index``, transforming the data if requested."""

        x = self.tensors[0][index]

        # Compare against None explicitly: a truthiness test would silently
        # skip a callable that happens to evaluate as False (e.g. an object
        # defining __len__ or __bool__)
        if self.transform is not None:
            x = self.transform(x)

        # Labels are returned untouched
        y = self.tensors[1][index]

        # Return (data,label) tuple
        return x,y

    def __len__(self):
        """Return the number of samples (size of the first dimension of the data tensor)."""

        return self.tensors[0].size(0)


In [None]:
# %% From Data, do DataSet, to DataLoader

# Transformation pipeline. Several transforms only operate on PIL images, so
# the usual recipe is: tensor -> PIL -> (random) transforms -> tensor
transforms = T.Compose([
    T.ToPILImage(),
    T.RandomVerticalFlip(p=0.5),
    T.RandomRotation(degrees=90),
    T.ToTensor(),
])

# Wrap tensors and pipeline in the custom DataSet (with real data, the test
# set would need to go through the same step)
train_data = customDataSet(tensors=(data_tensor,labels_tensor),transform=transforms)
print(type(train_data))

# The transforms do not add samples: the dataset length is unchanged
print(len(train_data))

# Wrap the DataSet in a DataLoader
train_loader = DataLoader(dataset=train_data,batch_size=8,shuffle=False)


In [None]:
# %% Plotting

# Fetch one batch; the transform is applied as each sample is read from the
# dataset, and shuffle=False keeps X[i] aligned with data_tensor[i]
X,y = next(iter(train_loader))

# Take the number of panels from the batch instead of hard-coding it
n_images = X.shape[0]

phi = (1 + np.sqrt(5)) / 2

# squeeze=False keeps axs two-dimensional even for a single-image batch
fig,axs = plt.subplots(2,n_images,figsize=(2*phi*5,5),squeeze=False)

for i in range(n_images):

    # Top row: untransformed image; bottom row: image as served by the loader
    axs[0,i].imshow(data_tensor[i,0,:,:].detach(),cmap='gray')
    axs[1,i].imshow(X[i,0,:,:].detach(),cmap='gray')

    # Hide ticks on both panels of this column
    for ax in axs[:,i]:
        ax.set_xticks([])
        ax.set_yticks([])

axs[0,0].set_ylabel('Original')
axs[1,0].set_ylabel('torch dataset')

# Save before show, then trigger a browser download (google.colab only)
plt.savefig('figure31_custom_dataloader.png')
plt.show()
files.download('figure31_custom_dataloader.png')
