## Loading Dataset in Batches

In [1]:
import torch

In [2]:
# Creating a tensor
a = torch.rand((10,3))
print(a)

tensor([[0.0762, 0.2023, 0.4843],
        [0.8552, 0.5819, 0.9306],
        [0.0195, 0.9490, 0.5095],
        [0.7226, 0.9338, 0.2665],
        [0.2333, 0.0095, 0.8025],
        [0.0247, 0.5006, 0.8099],
        [0.7040, 0.2090, 0.6323],
        [0.0022, 0.7018, 0.1988],
        [0.3765, 0.0320, 0.4911],
        [0.1608, 0.6095, 0.4241]])


In [3]:
from torch.utils.data import DataLoader

In [4]:
data = DataLoader(a, batch_size=3)
"""
This loads the whole dataset but in small chunks that can be used as mini-batches
"""
for i, data in enumerate(data):
    print(f"{1}) {data}")

1) tensor([[0.0762, 0.2023, 0.4843],
        [0.8552, 0.5819, 0.9306],
        [0.0195, 0.9490, 0.5095]])
1) tensor([[0.7226, 0.9338, 0.2665],
        [0.2333, 0.0095, 0.8025],
        [0.0247, 0.5006, 0.8099]])
1) tensor([[0.7040, 0.2090, 0.6323],
        [0.0022, 0.7018, 0.1988],
        [0.3765, 0.0320, 0.4911]])
1) tensor([[0.1608, 0.6095, 0.4241]])


## Creating a custom inheritance of pytorch of Dataset

In [5]:
# Joining 2 data set
# We need to create a custom joinDataset class that inheritate Dataset

from torch.utils.data import Dataset

class joinDataset(Dataset):
    """
    When we Create inheritance of Dataset class we need to have these 3 compulsory methods:
    1. __init__
    2. __len__
    3. __getitem__
    """
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [6]:
# creating datasets or tensors to join
d1 = torch.rand((6,3)) # think this is feature dataset
d2 = torch.rand((6)) # and this is target dataset

dataset = joinDataset(d1, d2)

In [7]:
for i in dataset:
    print(i)

(tensor([0.8812, 0.9455, 0.1917]), tensor(0.3988))
(tensor([0.8008, 0.0030, 0.2193]), tensor(0.4160))
(tensor([0.1905, 0.7179, 0.4375]), tensor(0.4118))
(tensor([0.6937, 0.7987, 0.8955]), tensor(0.6417))
(tensor([0.5297, 0.1944, 0.1129]), tensor(0.1690))
(tensor([0.8605, 0.7493, 0.7651]), tensor(0.7931))


In [8]:
torch.manual_seed(1) # fixing the random seed so that the suffled dataset will contain same row no matter how many time I rerun 
data_set = DataLoader(dataset, batch_size= 2, shuffle = True)

In [9]:
for i, batch in enumerate(data_set):
    print(f"{i}. {batch}")

0. [tensor([[0.8812, 0.9455, 0.1917],
        [0.8008, 0.0030, 0.2193]]), tensor([0.3988, 0.4160])]
1. [tensor([[0.5297, 0.1944, 0.1129],
        [0.6937, 0.7987, 0.8955]]), tensor([0.1690, 0.6417])]
2. [tensor([[0.1905, 0.7179, 0.4375],
        [0.8605, 0.7493, 0.7651]]), tensor([0.4118, 0.7931])]


In [11]:
""" For every epochs it the batchs will get suffled """
for epoch in range(2):
    print(f"epoch {epoch}")
    for i, batch in enumerate(data_set):
        print(f"{i}. {batch}")

epoch 0
0. [tensor([[0.8812, 0.9455, 0.1917],
        [0.5297, 0.1944, 0.1129]]), tensor([0.3988, 0.1690])]
1. [tensor([[0.1905, 0.7179, 0.4375],
        [0.8008, 0.0030, 0.2193]]), tensor([0.4118, 0.4160])]
2. [tensor([[0.8605, 0.7493, 0.7651],
        [0.6937, 0.7987, 0.8955]]), tensor([0.7931, 0.6417])]
epoch 1
0. [tensor([[0.8008, 0.0030, 0.2193],
        [0.6937, 0.7987, 0.8955]]), tensor([0.4160, 0.6417])]
1. [tensor([[0.1905, 0.7179, 0.4375],
        [0.8812, 0.9455, 0.1917]]), tensor([0.4118, 0.3988])]
2. [tensor([[0.5297, 0.1944, 0.1129],
        [0.8605, 0.7493, 0.7651]]), tensor([0.1690, 0.7931])]
