# Dataset & Dataloader

* epoch: one forward & backward pass over ALL training samples
* batch_size: number of training samples in one forward & backward pass
* number of iterations: number of passes per epoch, each pass using batch_size samples  
Example: 100 samples, batch_size=20 -> 100 / 20 = 5 iterations for 1 epoch

In [1]:
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math

## Custom Dataset Class

In [2]:
class WineDataset(Dataset):
    """Map-style dataset that serves rows of a wine CSV file as tensors.

    The file is read once, in full, at construction time. The first row is
    skipped as a header, column 0 is used as the label and every remaining
    column is used as a feature. All values are loaded as float32.

    After construction:
      * ``X`` holds the features, one row per sample.
      * ``y`` holds the labels as a column (one single-element row per sample).
      * ``n_sample`` is the number of data rows read from the file.
    """

    def __init__(self, path='data/wine.csv'):
        """Load the CSV at ``path`` into memory.

        Args:
            path: Location of the comma-separated file. Defaults to
                'data/wine.csv', the location that was previously hard-coded.
        """
        # ndmin=2 keeps the array two-dimensional even when the file holds a
        # single data row; without it the column slicing below would fail.
        xy = np.loadtxt(path, delimiter=',', dtype=np.float32, skiprows=1, ndmin=2)
        self.X = torch.from_numpy(xy[:, 1:])
        # Indexing with [0] (a list) keeps the label as a column of width 1.
        self.y = torch.from_numpy(xy[:, [0]])
        self.n_sample = xy.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.X[index], self.y[index]

    def __len__(self):
        """Return the number of samples loaded from the file."""
        return self.n_sample

In [3]:
# Build the dataset once; the constructor reads the whole CSV into memory.
dataset = WineDataset()

In [4]:
# Indexing the dataset goes through WineDataset.__getitem__ and yields a
# (features, label) pair for that row.
first_data = dataset[0]
features = first_data[0]
labels = first_data[1]
print(features, labels)

tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
        3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
        1.0650e+03]) tensor([1.])


## Dataloader

In [5]:
# Wrap the dataset in a DataLoader that yields mini-batches.
#   batch_size  - number of samples grouped into each batch
#   shuffle     - whether to shuffle the samples
#   num_workers - number of processes used for data loading;
#                 0 is equivalent to no multiprocessing
# NOTE: only use num_workers > 0 inside an `if __name__ == '__main__'` guard.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=4,
    shuffle=True,
    num_workers=0,
)

In [6]:
# Pull a single batch from the loader to inspect its contents.
dataiter = iter(dataloader)
# Advance the iterator with the built-in next(); the Python-2-style
# `.next()` method is not available on DataLoader iterators in recent
# PyTorch releases.
data = next(dataiter)
features, labels = data
print(features, labels)

tensor([[1.2860e+01, 1.3500e+00, 2.3200e+00, 1.8000e+01, 1.2200e+02, 1.5100e+00,
         1.2500e+00, 2.1000e-01, 9.4000e-01, 4.1000e+00, 7.6000e-01, 1.2900e+00,
         6.3000e+02],
        [1.4380e+01, 1.8700e+00, 2.3800e+00, 1.2000e+01, 1.0200e+02, 3.3000e+00,
         3.6400e+00, 2.9000e-01, 2.9600e+00, 7.5000e+00, 1.2000e+00, 3.0000e+00,
         1.5470e+03],
        [1.2770e+01, 2.3900e+00, 2.2800e+00, 1.9500e+01, 8.6000e+01, 1.3900e+00,
         5.1000e-01, 4.8000e-01, 6.4000e-01, 9.9000e+00, 5.7000e-01, 1.6300e+00,
         4.7000e+02],
        [1.3780e+01, 2.7600e+00, 2.3000e+00, 2.2000e+01, 9.0000e+01, 1.3500e+00,
         6.8000e-01, 4.1000e-01, 1.0300e+00, 9.5800e+00, 7.0000e-01, 1.6800e+00,
         6.1500e+02]]) tensor([[3.],
        [1.],
        [3.],
        [3.]])


In [7]:
num_epochs = 2
total_sample = len(dataset)
# Iterations per epoch = ceil(samples / batch size). The batch size is read
# from the loader instead of repeating the literal, so this stays correct if
# the loader is reconfigured; ceil counts the final, smaller batch as well.
n_iters = math.ceil(total_sample / dataloader.batch_size)
print(total_sample, n_iters)

178 45


## Dummy Training Loop

In [8]:
# Dummy loop: iterate over every batch for num_epochs epochs and report the
# batch shapes on every fifth iteration. No model or optimizer is involved.
for epoch in range(num_epochs):
    for i, batch in enumerate(dataloader):
        inputs, labels = batch
        # Skip everything except iterations 5, 10, 15, ...
        if (i + 1) % 5 != 0:
            continue
        print(f"epoch: {epoch+1}, iter: {i+1}/{n_iters}, inputs: {inputs.shape}, labels: {labels.shape}")

epoch: 1, iter: 5/45, inputs: torch.Size([4, 13]), labels: torch.Size([4, 1])
epoch: 1, iter: 10/45, inputs: torch.Size([4, 13]), labels: torch.Size([4, 1])
epoch: 1, iter: 15/45, inputs: torch.Size([4, 13]), labels: torch.Size([4, 1])
epoch: 1, iter: 20/45, inputs: torch.Size([4, 13]), labels: torch.Size([4, 1])
epoch: 1, iter: 25/45, inputs: torch.Size([4, 13]), labels: torch.Size([4, 1])
epoch: 1, iter: 30/45, inputs: torch.Size([4, 13]), labels: torch.Size([4, 1])
epoch: 1, iter: 35/45, inputs: torch.Size([4, 13]), labels: torch.Size([4, 1])
epoch: 1, iter: 40/45, inputs: torch.Size([4, 13]), labels: torch.Size([4, 1])
epoch: 1, iter: 45/45, inputs: torch.Size([2, 13]), labels: torch.Size([2, 1])
epoch: 2, iter: 5/45, inputs: torch.Size([4, 13]), labels: torch.Size([4, 1])
epoch: 2, iter: 10/45, inputs: torch.Size([4, 13]), labels: torch.Size([4, 1])
epoch: 2, iter: 15/45, inputs: torch.Size([4, 13]), labels: torch.Size([4, 1])
epoch: 2, iter: 20/45, inputs: torch.Size([4, 13]), labels: torch.Size([4, 1])

## Other Datasets

In [9]:
# Some well-known datasets ship with torchvision.datasets,
# e.g. MNIST, Fashion-MNIST, CIFAR10, COCO.

# MNIST training split stored under ./data (downloaded if requested via
# download=True); ToTensor converts each image to a tensor.
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=torchvision.transforms.ToTensor(), download=True)

train_loader = DataLoader(dataset=train_dataset, batch_size=3, shuffle=True)

# Look at one random batch.
dataiter = iter(train_loader)
# Use the built-in next(); the `.next()` method is not available on
# DataLoader iterators in recent PyTorch releases.
data = next(dataiter)
inputs, targets = data
print(inputs.shape, targets.shape)

torch.Size([3, 1, 28, 28]) torch.Size([3])
