In [None]:
# %% Deep learning - Section 12.117
#    Anatomy of a torch dataset and dataloader

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [None]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')


In [None]:
# %% Data

# Synthetic data matrix with one row per observation and one column per
# feature, filled with draws from a standard normal distribution
n_observations,n_features = 100,20

data = np.random.standard_normal(size=(n_observations,n_features))


In [None]:
# %% Convert to pytorch tensor

# Build a tensor from the numpy array
data_T = torch.tensor(data)

# Print some info for comparison (class of variable, size, type of data); the
# trailing empty string leaves a blank line after each group
print('Numpy data:',type(data),data.shape,data.dtype,'',sep='\n')

# Note the inconsistency: the size is read from the .shape attribute in numpy
# but from the .size() method in pytorch
print('Tensor data:',type(data_T),data_T.size(),data_T.dtype,'',sep='\n')


In [None]:
# %% Data type conversion

# Sometimes data must be converted into specific types, for example float for
# numerical data and int for labels. Each conversion is shown in two equivalent
# ways: requesting the dtype at construction, or casting afterwards (.float()
# and .long() are shorthands for these casts)

# Floating point (torch.float is an alias of torch.float32)
data_T2 = torch.tensor(data,dtype=torch.float32)
data_T2 = torch.tensor(data).to(torch.float32)
print(data_T2.dtype)

# Integer (torch.long is an alias of torch.int64)
data_T3 = torch.tensor(data,dtype=torch.int64)
data_T3 = torch.tensor(data).to(torch.int64)
print(data_T3.dtype)


In [None]:
# %% Convert tensor into PyTorch Dataset

# TensorDataset requires tensors: passing the numpy 'data' array doesn't work
dataset = TensorDataset(data_T)

# dataset.tensors is a tuple holding the tensors given to the constructor; only
# the data were passed here (labels are still missing), so its length is 1
print(dataset.tensors,len(dataset.tensors),sep='\n')


In [None]:
# %% Add some labels

# Generate labels 1 to 4 (ceiling of evenly spaced values between .01 and 4)
# and store them as a column vector with one row per observation, rather than
# as a 1D tensor with no row/column orientation
labels = torch.linspace(.01,4,n_observations).ceil().reshape(-1,1)

# Make another dataset, this time pairing the data with the labels
dataset = TensorDataset(data_T,labels)

# The dataset is now a two-element tuple containing data and labels: show the
# tuple, its length, and then the size of each of its tensors
print(dataset.tensors,len(dataset.tensors),sep='\n')
print(*(t.size() for t in dataset.tensors),'',sep='\n')

# For comparison, the shape of a 1D numpy array of the same length
print(np.random.randint(5,size=n_observations).shape)


In [None]:
# %% DataLoader objects

# Wrap the dataset into an iterable that serves it in mini-batches; with
# shuffle=False the samples keep their original order, and with drop_last=False
# a final batch smaller than batch_size is kept rather than discarded
batch_size = 15
dataloader = DataLoader(dataset,batch_size=batch_size,shuffle=False,drop_last=False)

# Show size of the data tensor stored in the DataLoader (explicit print: a bare
# expression is only displayed when it is the last statement of a cell, so
# without print() this line would show nothing)
print(dataloader.dataset.tensors[0].size())

# Show size of each batch by calling dataloader as an iterable
for dat,lab in dataloader:
    print('BATCH INFO:')
    print(dat.size())
    print(lab.size())
    print()

# Inspect labels as well (transposed, so each batch prints as a single row)
for dat,lab in dataloader:
    print(lab.T)
    print()


In [None]:
# %% Same as above but with shuffling and drop last option

# shuffle=True serves the samples in random order, and drop_last=True discards
# a final batch smaller than batch_size
batch_size = 15
dataloader = DataLoader(dataset,batch_size=batch_size,shuffle=True,drop_last=True)

# Show size of the data tensor stored in the DataLoader (explicit print: a bare
# expression is only displayed when it is the last statement of a cell, so
# without print() this line would show nothing)
print(dataloader.dataset.tensors[0].size())

# Show size of each batch by calling dataloader as an iterable
for dat,lab in dataloader:
    print('BATCH INFO:')
    print(dat.size())
    print(lab.size())
    print()

# Inspect labels (notice that if shuffle is on, the dataloader is reshuffled
# any time it is called as an iterable, so you don't need to call the
# DataLoader() function again; try by commenting out that line)
for dat,lab in dataloader:
    print(lab.T)
    print()


In [None]:
# %% How to get only one batch

# For example, for testing: create an iterator over the dataloader and pull
# only its first batch
batch_iterator = iter(dataloader)
dat,lab = next(batch_iterator)

# Show the data and the labels of that batch, both transposed
print(dat.T,lab.T,sep='\n')
