### Building Input Pipelines with PyTorch

In [1]:
import torch
from torch.utils.data import DataLoader

In [3]:
t = torch.arange(16,dtype=torch.float16)
t

tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
        14., 15.], dtype=torch.float16)

In [8]:
t = t.reshape((4,-1))

In [9]:
# create a simple dataloader out of it

dataloader = DataLoader(t)
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f5763972380>

In [14]:
#fetch from the dataloader
for i in dataloader:
    print(i)




tensor([[0., 1., 2., 3.]], dtype=torch.float16)
tensor([[4., 5., 6., 7.]], dtype=torch.float16)
tensor([[ 8.,  9., 10., 11.]], dtype=torch.float16)
tensor([[12., 13., 14., 15.]], dtype=torch.float16)


In [13]:
# create a dataloader with batchsizse
    
dataloader_2 = DataLoader(t,batch_size=2)
for i,batch in enumerate(dataloader_2):
    print(f'Batch no:{i+1} data:{batch}')

Batch no:1 data:tensor([[0., 1., 2., 3.],
        [4., 5., 6., 7.]], dtype=torch.float16)
Batch no:2 data:tensor([[ 8.,  9., 10., 11.],
        [12., 13., 14., 15.]], dtype=torch.float16)


In [15]:
#Drop the last full batch by argument drop_last=True, this will drop the last unfinished batch ie batch not of the same size that of the rest of the batches

Combining two tensors into a joint dataset

In [20]:
t_x = torch.rand((5,4), dtype=torch.float16)
t_y = torch.arange(5,dtype=torch.float16)

In [21]:
t_x, t_y

(tensor([[0.0825, 0.1021, 0.0024, 0.3403],
         [0.4878, 0.7627, 0.5527, 0.7520],
         [0.8945, 0.9390, 0.5469, 0.0903],
         [0.6040, 0.4917, 0.6421, 0.5015],
         [0.8022, 0.8433, 0.9526, 0.1558]], dtype=torch.float16),
 tensor([0., 1., 2., 3., 4.], dtype=torch.float16))

In [28]:
#now create a dataset that returns single row and subsequent target
from torch.utils.data import Dataset
class JointDataset(Dataset):
    def __init__(self, x,y):
        super().__init__()
        self.x = x
        self.y = y
    
    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [30]:
dt = JointDataset(t_x, t_y)
for i,batch in enumerate(dt):
    print(f"batch {i+1} and {batch}")


batch 1 and (tensor([0.0825, 0.1021, 0.0024, 0.3403], dtype=torch.float16), tensor(0., dtype=torch.float16))
batch 2 and (tensor([0.4878, 0.7627, 0.5527, 0.7520], dtype=torch.float16), tensor(1., dtype=torch.float16))
batch 3 and (tensor([0.8945, 0.9390, 0.5469, 0.0903], dtype=torch.float16), tensor(2., dtype=torch.float16))
batch 4 and (tensor([0.6040, 0.4917, 0.6421, 0.5015], dtype=torch.float16), tensor(3., dtype=torch.float16))
batch 5 and (tensor([0.8022, 0.8433, 0.9526, 0.1558], dtype=torch.float16), tensor(4., dtype=torch.float16))


In [32]:
#Dataloader in batches

torch.manual_seed(33)

data_loader = DataLoader(dt, batch_size=3,shuffle=True)
for i in data_loader:
    print(i)

[tensor([[0.6040, 0.4917, 0.6421, 0.5015],
        [0.0825, 0.1021, 0.0024, 0.3403],
        [0.8945, 0.9390, 0.5469, 0.0903]], dtype=torch.float16), tensor([3., 0., 2.], dtype=torch.float16)]
[tensor([[0.8022, 0.8433, 0.9526, 0.1558],
        [0.4878, 0.7627, 0.5527, 0.7520]], dtype=torch.float16), tensor([4., 1.], dtype=torch.float16)]


In [33]:
#when training a model for multiple epochs, we need to shuffle and iterate over the dataset

for epoch in range(5):
    print(f"epoch {epoch}")
    for i, batch in enumerate(data_loader,start=1):
        print(f"batch {i} and {batch}")

epoch 0
batch 1 and [tensor([[0.0825, 0.1021, 0.0024, 0.3403],
        [0.4878, 0.7627, 0.5527, 0.7520],
        [0.6040, 0.4917, 0.6421, 0.5015]], dtype=torch.float16), tensor([0., 1., 3.], dtype=torch.float16)]
batch 2 and [tensor([[0.8022, 0.8433, 0.9526, 0.1558],
        [0.8945, 0.9390, 0.5469, 0.0903]], dtype=torch.float16), tensor([4., 2.], dtype=torch.float16)]
epoch 1
batch 1 and [tensor([[0.4878, 0.7627, 0.5527, 0.7520],
        [0.8022, 0.8433, 0.9526, 0.1558],
        [0.6040, 0.4917, 0.6421, 0.5015]], dtype=torch.float16), tensor([1., 4., 3.], dtype=torch.float16)]
batch 2 and [tensor([[0.0825, 0.1021, 0.0024, 0.3403],
        [0.8945, 0.9390, 0.5469, 0.0903]], dtype=torch.float16), tensor([0., 2.], dtype=torch.float16)]
epoch 2
batch 1 and [tensor([[0.0825, 0.1021, 0.0024, 0.3403],
        [0.6040, 0.4917, 0.6421, 0.5015],
        [0.4878, 0.7627, 0.5527, 0.7520]], dtype=torch.float16), tensor([0., 3., 1.], dtype=torch.float16)]
batch 2 and [tensor([[0.8022, 0.8433, 0.952

In [49]:
import glob
import os
import pathlib
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
imdir = pathlib.Path('cat_dog')


imdir

PosixPath('cat_dog')

In [43]:
file_list = sorted([str(path) for path in imdir.glob('*.png')])
file_list

['cat_dog/im1.png',
 'cat_dog/im2.png',
 'cat_dog/im3.png',
 'cat_dog/im4.png',
 'cat_dog/im5.png',
 'cat_dog/im6.png']

In [75]:
fig = plt.figure(figsize=(10,5))

<Figure size 1000x500 with 0 Axes>

In [76]:
for i, file in enumerate(file_list):
    img = Image.open(file)
    print('Image shape',np.array(img).shape)
    ax = fig.add_subplot(2,3, i+1)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.imshow(img)
    ax.set_title(os.path.basename(file))
plt.tight_layout()
plt.show()
#mistakenly all the images were in single image


Image shape (600, 1179, 4)
Image shape (600, 1179, 4)
Image shape (600, 1179, 4)
Image shape (600, 1179, 4)
Image shape (600, 1179, 4)
Image shape (600, 1179, 4)


<Figure size 640x480 with 0 Axes>