# ImageCLEF 2025 Data Prep & Preview

Download (if a source URL is configured) and preview the ImageCLEF 2025 medical image captioning data. Set the `IMAGECLEF_2025_URL` environment variable to a private/authenticated ZIP link containing `captioning.jsonl`, `concept_detection.jsonl`, `explainability.jsonl`, and the referenced images. If the files already exist locally, no download is performed.

In [None]:
import os, sys
# Put the notebook's parent directory on the import path so that packages
# located there can be imported from this notebook.
sys.path.append(os.path.abspath(os.pardir))

In [None]:
from src.data.dataset import maybe_download_imageclef_2025, load_imageclef_2025_splits

In [None]:
# Resolve the dataset folder to an absolute path and create it if it does not
# exist yet.
data_dir = os.path.abspath("../data/imageclef_2025")
os.makedirs(data_dir, exist_ok=True)

# Report only whether the URL variable is set to a non-empty value; the value
# itself is never printed.
url_is_set = bool(os.environ.get("IMAGECLEF_2025_URL"))
print(f"Data directory: {data_dir}")
print("IMAGECLEF_2025_URL set:", url_is_set)

# Fetch the data into data_dir. The helper is expected to return right away
# when the files are already present (see its definition to confirm).
maybe_download_imageclef_2025(data_dir)

In [None]:
# Load all task splits from data_dir, then bind each task's dataset to its own
# name.
splits = load_imageclef_2025_splits(data_dir)
caption_ds, concept_ds, explain_ds = (
    splits["captioning"],
    splits["concept_detection"],
    splits["explainability"],
)

# Print one dataset summary per line, in the same order as the names above.
print(caption_ds, concept_ds, explain_ds, sep="\n")

In [None]:
from IPython.display import display
from PIL import Image
import random

# Preview one randomly chosen captioning record: print its text fields, then
# render the image it points to.
if len(caption_ds) == 0:
    raise ValueError("Captioning dataset is empty; ensure JSONL has records.")

# Pick a uniformly random valid index into the dataset.
idx = random.randrange(len(caption_ds))
sample = caption_ds[idx]

# Read the path once so the value printed below is exactly the one opened.
image_path = sample.get("image_path")

print(f"Sample #{idx}")
print("Instruction:", sample.get("instruction"))
print("Output:", sample.get("output"))
print("Image path:", image_path)

# Fail with an explicit message instead of an obscure error from inside PIL
# when the record carries no usable image path.
if not image_path:
    raise ValueError(f"Sample #{idx} has no image_path; cannot preview the image.")

# Image.open is lazy and keeps the file open; the context manager closes the
# handle deterministically. convert() returns a separate in-memory image, so
# img stays usable after the file has been closed.
with Image.open(image_path) as raw_image:
    img = raw_image.convert("RGB")
display(img)