In [None]:
from datasets import load_dataset
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
from PIL import Image
import matplotlib.pyplot as plt

### Download the dataset from Hugging Face

In [5]:


# Export the Flickr30k dataset from the Hugging Face Hub to local disk:
#   data/flickr30k/metadata.parquet  -> one row per image (filename, caption, split)
#   data/flickr30k/images/<filename> -> the image files themselves
# The cell is safe to re-run: images already on disk are skipped unless
# OVERWRITE_IMAGES is set to True.

# 0️⃣ Setup
ROOT    = Path("data/flickr30k")
IMG_DIR = ROOT / "images"
IMG_DIR.mkdir(parents=True, exist_ok=True)   # parents=True also creates ROOT
OVERWRITE_IMAGES = False                     # True -> rewrite files that already exist

# 1️⃣ Download & load with HF’s built-in progress bar
ds_all = load_dataset("nlphuji/flickr30k")

# 2️⃣ Build & save metadata.parquet
frames = []
for split_name, ds in ds_all.items():
    # NOTE(review): the Hub repo appears to ship every row under a single
    # "test" split and to carry the real train/val/test label in a per-row
    # `split` column — confirm against the dataset card. Prefer that column
    # when it exists; otherwise fall back to the Hub split name.
    if "split" in ds.column_names:
        split_values = list(ds["split"])
    else:
        split_values = [split_name] * ds.num_rows

    frames.append(pd.DataFrame({
        "filename": list(ds["filename"]),
        # Each entry of the caption column is indexed with [0]: only the
        # first caption of every image is kept.
        "caption":  [captions[0] for captions in ds["caption"]],
        "split":    split_values,
    }))
df = pd.concat(frames, ignore_index=True)
df.to_parquet(ROOT / "metadata.parquet", index=False)

# 3️⃣ Save images with your own progress bars
for split_name, ds in ds_all.items():
    filenames = list(ds["filename"])
    for idx, name in enumerate(tqdm(filenames, desc=f"Writing {split_name}", unit="img")):
        target = IMG_DIR / name
        if target.exists() and not OVERWRITE_IMAGES:
            continue                      # already exported by a previous run
        img = ds[idx]["image"]            # a PIL.Image
        try:
            img.save(target)
        except BaseException:
            # Interrupted or failed mid-write: drop the partial file so the
            # next run does not mistake it for a finished export.
            target.unlink(missing_ok=True)
            raise

# 4️⃣ Sanity check
print(f"Saved {len(list(IMG_DIR.glob('*.jpg')))} images into {IMG_DIR}")
missing = [name for name in df["filename"] if not (IMG_DIR / name).exists()]
assert not missing, f"{len(missing)} images listed in metadata are missing on disk, e.g. {missing[:3]}"


Writing test:   0%|          | 0/31014 [00:00<?, ?img/s]

### Display the first image and its caption

In [None]:
# Sanity check: show first image + caption
# Reads everything back from disk, so it verifies the exported files rather
# than the in-memory dataset.

# Load your metadata
data_root = Path("data/flickr30k")
meta_path = data_root / "metadata.parquet"
df = pd.read_parquet(meta_path)

# Grab the first row
first = df.iloc[0]
img_path = data_root / "images" / first["filename"]

# Open & display
fig, ax = plt.subplots()
# The context manager closes the underlying file once the pixels have been
# handed to matplotlib, instead of leaving the handle open.
with Image.open(img_path) as img:
    ax.imshow(img)
ax.axis("off")
ax.set_title(first["caption"], wrap=True)   # wrap long captions instead of clipping them
plt.show()

# Also print caption text
print("Caption:", first["caption"])
