In [1]:
!pip install -q "datasets<3.0.0"

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/527.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/177.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.6.1 which is incompatible.[0m[31m
[0m

In [2]:
from datasets import load_dataset
import datasets

print("datasets version:", datasets.__version__)

ds = load_dataset("nlphuji/flickr30k")
print(ds)

datasets version: 2.21.0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/506M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/502M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/506M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/512M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/504M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/495M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/495M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/497M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/289M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/31014 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['image', 'caption', 'sentids', 'split', 'img_id', 'filename'],
        num_rows: 31014
    })
})


In [3]:
# Imports and basic config

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

from transformers import AutoTokenizer  # for text tokenization (CLIP tokenizer)
from datasets import load_dataset

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [5]:
# Load the Flickr30k dataset

ds = load_dataset("nlphuji/flickr30k")

# Check the available splits
print(ds)

# Look at one example to see the keys
print(ds["test"][0])

DatasetDict({
    test: Dataset({
        features: ['image', 'caption', 'sentids', 'split', 'img_id', 'filename'],
        num_rows: 31014
    })
})
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=333x500 at 0x79B96D50C530>, 'caption': ['Two young guys with shaggy hair look at their hands while hanging out in the yard.', 'Two young, White males are outside near many bushes.', 'Two men in green shirts are standing in a yard.', 'A man in a blue shirt standing in a garden.', 'Two friends enjoy time spent together.'], 'sentids': ['0', '1', '2', '3', '4'], 'split': 'train', 'img_id': '0', 'filename': '1000092795.jpg'}


In [6]:
# Define image and text transforms

# Image transform: PIL -> resized tensor in [-1, 1]
image_transform = transforms.Compose([
    transforms.Resize((512, 512)),              # resize to 512x512 for your model
    transforms.ToTensor(),                      # [0,1]
    transforms.Normalize(mean=[0.5, 0.5, 0.5],  # [0,1] -> [-1,1]
                         std=[0.5, 0.5, 0.5])
])

# Text tokenizer (you can swap this with your CLIP tokenizer path)
tokenizer_name = "openai/clip-vit-base-patch32"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Max token length used in your generate() code = 77
MAX_LEN = 77


tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [8]:
# Custom PyTorch Dataset for Flickr30k

class Flickr30kTorchDataset(Dataset):
    def __init__(self, hf_split, image_transform=None, tokenizer=None, max_len=77):
        """
        hf_split: a split from the HF dataset, e.g. ds["train"]
        image_transform: torchvision transforms to apply to images
        tokenizer: tokenizer to convert text caption -> token ids
        max_len: max sequence length for tokens (should match your model, e.g., 77)
        """
        self.data = hf_split
        self.image_transform = image_transform
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # Total number of samples
        return len(self.data)

    def __getitem__(self, idx):
        # Get raw HF example
        example = self.data[idx]

        # --- 1. Load image ---
        # For the nlphuji/flickr30k dataset, 'image' is usually a PIL.Image already.
        img = example["image"]  # adjust key name if needed

        # Apply image transforms
        if self.image_transform is not None:
            img = self.image_transform(img)  # shape (3, 512, 512), values in [-1, 1]

        # --- 2. Get caption text ---
        # Often there is something like 'sentences' or 'caption'; adjust if necessary.
        # Example structure: example["sentences"] = [{"raw": "..."} , ...]
        if "sentences" in example:
            # take the first caption's raw text
            caption = example["sentences"][0]["raw"]
        elif "caption" in example:
            # fallback if dataset uses 'caption' key
            caption = example["caption"]
        else:
            raise KeyError("Could not find caption field in example. Check ds['train'][0] keys.")

        # --- 3. Tokenize caption ---
        # Convert text into token IDs for your text encoder (CLIP)
        encoding = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        # encoding["input_ids"] has shape (1, max_len) -> squeeze to (max_len,)
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        # Return everything your training loop needs
        return {
            "pixel_values": img,             # (3, 512, 512) float tensor in [-1, 1]
            "input_ids": input_ids,          # (max_len,)
            "attention_mask": attention_mask,# (max_len,)
            "caption": caption               # raw text (optional, nice for debugging)
        }


In [10]:
# Build PyTorch datasets and dataloaders

# If your HF dataset has only "train", you can do a manual split for validation.
# Use the 'test' split from the loaded dataset, as there is no 'train' split available directly.
hf_data = ds["test"]
# Example: 90% train, 10% val
total_len = len(hf_data)
train_len = int(0.9 * total_len)
val_len = total_len - train_len

hf_train_split = hf_data.select(range(train_len))
hf_val_split = hf_data.select(range(train_len, total_len))

# Wrap them with our custom Dataset
train_dataset = Flickr30kTorchDataset(
    hf_split=hf_train_split,
    image_transform=image_transform,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

val_dataset = Flickr30kTorchDataset(
    hf_split=hf_val_split,
    image_transform=image_transform,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

# DataLoaders
BATCH_SIZE = 4  # adjust based on your GPU memory

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

len(train_dataset), len(val_dataset)

(27912, 3102)

In [11]:
# Sanity check one batch

batch = next(iter(train_loader))

print("pixel_values shape:", batch["pixel_values"].shape)  # (B, 3, 512, 512)
print("input_ids shape:", batch["input_ids"].shape)        # (B, 77)
print("attention_mask shape:", batch["attention_mask"].shape)
print("sample caption:", batch["caption"][0])

# Check value range
print("image min:", batch["pixel_values"].min().item())
print("image max:", batch["pixel_values"].max().item())


pixel_values shape: torch.Size([4, 3, 512, 512])
input_ids shape: torch.Size([4, 5, 77])
attention_mask shape: torch.Size([4, 5, 77])
sample caption: ['Two male police officers on patrol, wearing the normal gear and bright green reflective shirts.', 'A man with a white tank top is on a scooter is driving up the street.', 'Three people in shorts, two shirtless are in front of an Asian storefront, one resting in a chair and the other two cleaning and watering.', 'A guitar and bass player perform on a temporary stage.']
image min: -1.0
image max: 1.0
