In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision
import math
import numpy as np
from sklearn.datasets import load_wine

# Datasets and Dataloader 
Working with the whole dataset at once might be *very time- and memory-consuming*, so a better way for a large dataset is **to divide the samples into mini-batches** and **do the optimization based on the current batch**.

PyTorch provides special classes (`Dataset` and `DataLoader`) that handle the splitting into batches for us, so we can optimize batch by batch.  
Some terms:  
- *epoch* - forward and backward pass of ALL training samples (might be many batches)  
- *batch size* - number of samples in one forward & backward pass  
- *number of iterations* - number of passes needed to complete one epoch (each pass uses [batch size] samples), e.g. 100 samples with batch size=20 --> 5 iterations per epoch

In [25]:
# a custom Dataset has to implement 3 methods: __init__, __getitem__ and __len__
class WineDataset(Dataset):
    """Map-style dataset exposing scikit-learn's wine data as float32 tensors.

    Attributes:
        dataset: the raw object returned by ``load_wine()``.
        X: feature tensor, one row per sample, dtype float32.
        y: target tensor reshaped to a column (n_samples, 1), dtype float32.
    """

    def __init__(self):
        """Load the wine data and convert features and targets to tensors."""
        # data loading
        self.dataset = load_wine()
        # Read from self.dataset rather than a global `dataset`, so the class
        # also works on a fresh kernel where no such global exists.
        self.X = torch.from_numpy(self.dataset.data.astype(np.float32))
        targets = torch.from_numpy(self.dataset.target.astype(np.float32))
        # we need a column vector: one row per sample
        self.y = targets.view(-1, 1)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        # dataset[index] -> (object, label/target)
        return (self.X[index], self.y[index])

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.X.shape[0]

In [27]:
# Instantiate the dataset; constructing it loads the wine data into tensors.
dataset = WineDataset()
# len() is served by WineDataset.__len__ (number of samples).
print('length of dataset is', len(dataset))
# Indexing is served by WineDataset.__getitem__ and yields a (features, target) tuple;
# as the last expression of the cell it is displayed as the cell output.
dataset[5]

length of dataset is 178


(tensor([1.4200e+01, 1.7600e+00, 2.4500e+00, 1.5200e+01, 1.1200e+02, 3.2700e+00,
         3.3900e+00, 3.4000e-01, 1.9700e+00, 6.7500e+00, 1.0500e+00, 2.8500e+00,
         1.4500e+03]),
 tensor([0.]))

In [38]:
# Wrap the dataset in a DataLoader that serves mini-batches of 32 samples,
# with the sample order shuffled.
# Passing num_workers=2 would load batches in worker subprocesses, which
# might make training faster.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

Now we can use our dataloader as an iterator

In [40]:
# Number of full passes over the dataset.
number_of_epoches = 2
for epoch in range(number_of_epoches):
    print()
    print(f'epoch is {epoch+1}  -----------------------------------------')
    # Iterating the dataloader yields one (features, target) mini-batch per step.
    for i, (features, target) in enumerate(dataloader):
        print(f'batch №{i+1}: size of batch is {features.shape}')
        # To also show the labels of the batch, print target.view(-1) here.


epoch is 1  -----------------------------------------
batch №1: size of batch is torch.Size([32, 13])
batch №2: size of batch is torch.Size([32, 13])
batch №3: size of batch is torch.Size([32, 13])
batch №4: size of batch is torch.Size([32, 13])
batch №5: size of batch is torch.Size([32, 13])
batch №6: size of batch is torch.Size([18, 13])

epoch is 2  -----------------------------------------
batch №1: size of batch is torch.Size([32, 13])
batch №2: size of batch is torch.Size([32, 13])
batch №3: size of batch is torch.Size([32, 13])
batch №4: size of batch is torch.Size([32, 13])
batch №5: size of batch is torch.Size([32, 13])
batch №6: size of batch is torch.Size([18, 13])


We can also use *PyTorch built-in datasets* (available through `torchvision.datasets`)

In [None]:
# MNIST needs a `root` directory for its files; download=True fetches the data
# into that directory if it is not already there, and ToTensor() converts each
# PIL image into a tensor.
MNIST_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=torchvision.transforms.ToTensor(),
    download=True,
)