## DataLoader Notebook 

This notebook prepares the Flickr30K dataset for training. 

It loads and preprocesses the dataset, expands the data so that each image is paired with all its associated captions, tokenizes the captions using the pretrained CLIP tokenizer (`openai/clip-vit-base-patch32`), and applies CLIP-style image transformations (resize to 224×224 and normalization with the CLIP mean and standard deviation). Finally, it wraps everything into a PyTorch Dataset and DataLoader to efficiently feed batches of images and tokenized captions into a model for training.

In [1]:
#Libraries
import os
from ast import literal_eval

import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import AutoTokenizer, CLIPTokenizer

# Move up to the project root so the relative "./data/raw/" paths resolve.
# Guarded: a bare os.chdir("..") climbs one directory higher every time this
# cell is re-run, which silently breaks every relative path below.
if not os.path.isdir(os.path.join("data", "raw")):
    os.chdir("..")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Paths (relative to the project root)
DATA_PATH = "./data/raw/"                   # raw dataset directory, with trailing slash
IMG_PATH = f"{DATA_PATH}Flickr30k_images/"  # image folder inside the raw data directory

In [3]:
# Load the annotations file and keep only the rows of the training split
annotations_path = DATA_PATH + "flickr_annotations_30k.csv"
df = pd.read_csv(annotations_path)
is_train = df["split"] == "train"
# Re-number the rows from 0 after filtering
df = df.loc[is_train].reset_index(drop=True)
print("Total Observations:", len(df))
df.head()

Total Observations: 29000


Unnamed: 0,raw,sentids,split,filename,img_id
0,"[""Two young guys with shaggy hair look at thei...","[0, 1, 2, 3, 4]",train,1000092795.jpg,0
1,"[""Several men in hard hats are operating a gia...","[5, 6, 7, 8, 9]",train,10002456.jpg,1
2,"[""A child in a pink dress is climbing up a set...","[10, 11, 12, 13, 14]",train,1000268201.jpg,2
3,"[""Someone in a blue shirt and hat is standing ...","[15, 16, 17, 18, 19]",train,1000344755.jpg,3
4,"[""Two men, one in a gray shirt, one in a black...","[20, 21, 22, 23, 24]",train,1000366164.jpg,4


In [4]:

def expand_captions(df, list_col='raw', filename_col='filename'):
    """Expand a frame with one row per image into one row per (image, caption).

    The input frame is NOT modified, so the cell calling this function can be
    re-run safely. (Parsing the column in place on the caller's frame made a
    second call fail, because ``literal_eval`` was then handed lists.)

    Args:
        df: DataFrame holding at least ``list_col`` and ``filename_col``.
        list_col: Column holding the captions of each image, either as the
            string representation of a Python list (as read from the CSV) or
            as an already-parsed list.
        filename_col: Column holding the image file name.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and exactly two columns,
        ``filename_col`` and ``'caption'``, with one row per caption.

    Raises:
        ValueError / SyntaxError: propagated from ``literal_eval`` when a
            string cell is not a valid Python literal.
    """
    def _parse_caption_list(value):
        # Only strings need parsing; values that are already lists pass through.
        return literal_eval(value) if isinstance(value, str) else value

    # Work on a copy of the two relevant columns so the caller's frame is untouched.
    pairs = df[[filename_col, list_col]].copy()
    pairs[list_col] = pairs[list_col].apply(_parse_caption_list)

    df_expanded = (
        pairs
        .explode(list_col)
        .rename(columns={list_col: 'caption'})
        .reset_index(drop=True)
    )
    return df_expanded[[filename_col, 'caption']]


# One row per (image, caption) pair
df_exploded = expand_captions(df)
n_pairs = len(df_exploded)
print(f"Total Observations after exploding captions: {n_pairs}")


Total Observations after exploding captions: 145000


In [15]:
# We initialize the CLIP tokenizer (requires `CLIPTokenizer` to be imported from `transformers` in the imports cell)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
MAX_LENGTH = 64  # every caption is padded / truncated to this many tokens

# Tokenize one caption as a sanity check. `.iloc[0]` selects the first row by
# position, so it does not depend on the index labels of df_exploded.
sample_caption = df_exploded['caption'].iloc[0]
tokens = tokenizer(sample_caption, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
print(f"Example tokenized caption: \nReal caption: {sample_caption} \nToken: {tokens}")

Example tokenized caption: 
Real caption: Two young guys with shaggy hair look at their hands while hanging out in the yard. 
Token: {'input_ids': tensor([[49406,  1237,  1888,  1791,   593, 42662,  2225,  1012,   536,   911,
          3500,  1519,  4850,   620,   530,   518,  4313,   269, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [16]:
# Image preprocessing based on CLIP: resize, convert to tensor, normalize per channel
CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
CLIP_STD = [0.26862954, 0.26130258, 0.27577711]

# NOTE(review): Resize((224, 224)) forces both sides to 224, so the original
# aspect ratio of the image is not preserved.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=CLIP_MEAN, std=CLIP_STD),
])

In [None]:
# We define a Pytorch Dataset that returns both the processed image tensor and the tokenized caption.
class Flickr30kDataset(Dataset):
    """Dataset yielding (image, tokenized caption) pairs from a captions frame.

    Every row of ``df`` must provide a ``filename`` column (image file name,
    joined onto ``img_root``) and a ``caption`` column (text handed to the
    tokenizer).

    Args:
        df: DataFrame with one row per (image, caption) pair. Its index is
            reset so that positions 0..len-1 address the rows.
        img_root: Directory containing the image files (with or without a
            trailing slash).
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            and returning a mapping of tensors.
        transform: Optional callable applied to the RGB PIL image.
        max_length: Length every caption is padded / truncated to.
    """

    def __init__(self, df, img_root, tokenizer, transform=None, max_length=64):
        self.df = df.reset_index(drop=True)
        self.img_root = img_root
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_length = max_length

    def __len__(self):
        """Return the number of (image, caption) pairs."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the ``idx``-th pair.

        Returns:
            A tuple ``(image, tokens)``: the RGB image (after ``transform`` if
            one was given) and a dict with the tokenizer outputs, each with
            its leading dimension squeezed away.
        """
        row = self.df.iloc[idx]

        # os.path.join avoids a doubled separator when img_root ends with "/".
        img_path = os.path.join(self.img_root, row["filename"])

        # Close the file handle deterministically instead of leaving it to the
        # garbage collector; convert() returns an image detached from the file.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        encoded = self.tokenizer(
            row["caption"],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # A single caption yields tensors with a leading dimension of size 1;
        # drop it so the DataLoader can stack samples into a batch.
        tokens = {key: value.squeeze(0) for key, value in encoded.items()}

        return image, tokens

In [17]:
# Build the dataset over the exploded (image, caption) pairs and wrap it in a shuffling DataLoader
dataset = Flickr30kDataset(
    df=df_exploded,
    img_root=IMG_PATH,
    tokenizer=tokenizer,
    transform=transform,
    max_length=MAX_LENGTH,
)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)