# ImageCLEF 2025 Data Prep & Preview

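Download (when a download URL is configured) and preview the ImageCLEF 2025 medical image captioning data. Set the `IMAGECLEF_2025_URL` environment variable to a private/authenticated ZIP link containing `captioning.jsonl`, `concept_detection.jsonl`, `explainability.jsonl`, and the referenced images. If the files already exist locally, no download is performed; if they are missing and no URL is set, the download step raises `FileNotFoundError`. The final cell separately previews one example from the ROCOv2 radiology dataset on Hugging Face.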

In [4]:
import os
import sys

# Make the parent of the notebook's working directory importable so that the
# `src` package used by later cells can be resolved (presumably the project
# root -- confirm if the notebook is moved).
# The membership check keeps this cell idempotent: re-running it does not add
# a duplicate entry to sys.path.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [5]:
from src.data.dataset import maybe_download_imageclef_2025, load_imageclef_2025_splits

In [3]:
# Resolve the dataset folder relative to the notebook's working directory and
# create it up front (exist_ok makes this a no-op when it is already there).
data_dir = os.path.abspath("../data/imageclef_2025")
os.makedirs(data_dir, exist_ok=True)

# Report where the data is expected and whether a download URL is configured;
# an unset variable and an empty one both show up as False.
url_is_set = bool(os.environ.get("IMAGECLEF_2025_URL"))
print(f"Data directory: {data_dir}")
print("IMAGECLEF_2025_URL set:", url_is_set)

# Per the original note this downloads only when the data is missing. The
# recorded run shows it raising FileNotFoundError when the JSONL files are
# absent and no URL is set.
maybe_download_imageclef_2025(data_dir)

Data directory: /Users/yashwanth/Documents/OMSCS/Deep_Learning/dl_project_fall_2025/data/imageclef_2025
IMAGECLEF_2025_URL set: False


FileNotFoundError: ImageCLEF 2025 data not found. Place captioning.jsonl, concept_detection.jsonl, and explainability.jsonl under the data directory, or set IMAGECLEF_2025_URL to an authenticated download link (e.g., a private ZIP) before running.

In [None]:
# Load all task splits once, then bind each task's dataset to its own name.
splits = load_imageclef_2025_splits(data_dir)
caption_ds, concept_ds, explain_ds = (
    splits[task] for task in ("captioning", "concept_detection", "explainability")
)

# A single print with a newline separator emits the same three lines as
# printing each dataset individually.
print(caption_ds, concept_ds, explain_ds, sep="\n")

In [None]:
from IPython.display import display
from PIL import Image
import random

# Fixed seed so Restart & Run All previews the same record every time.
# Set to None to draw a different sample on each run (the previous behaviour).
PREVIEW_SEED = 42

if len(caption_ds) == 0:
    raise ValueError("Captioning dataset is empty; ensure JSONL has records.")

# A private Random instance makes the draw reproducible without reseeding the
# module-level `random` generator that other cells may rely on.
idx = random.Random(PREVIEW_SEED).randint(0, len(caption_ds) - 1)
sample = caption_ds[idx]
print(f"Sample #{idx}")
print("Instruction:", sample.get("instruction"))
print("Output:", sample.get("output"))
print("Image path:", sample.get("image_path"))

# Open inside a context manager so the file handle is released even if
# decoding fails; convert() returns a separate RGB image that stays usable
# after the `with` block closes the source file.
with Image.open(sample["image_path"]) as _opened_image:
    img = _opened_image.convert("RGB")
display(img)

In [None]:
from datasets import load_dataset
from PIL import Image
import matplotlib.pyplot as plt

# 1. Load ROCOv2 from Hugging Face (fetched over the network by `datasets`
#    if it is not already cached locally).
dataset = load_dataset("eltorio/ROCOv2-radiology")

train_ds = dataset["train"]
print(len(train_ds))  # ~60k examples (HF split)

# 2. Inspect one training example
example = train_ds[0]

print("Image ID:", example["image_id"])
print("Caption :", example["caption"])
print("CUIs    :", example["cui"])

# 3. Show the image
img = example["image"]  # this is already a PIL.Image
# Use the explicit Figure/Axes API rather than the implicit pyplot state so the
# plot cannot land on a figure left over from another cell.
fig, ax = plt.subplots()
# cmap only applies to single-channel data (it is ignored for RGB/RGBA), so a
# grayscale radiograph is drawn in gray instead of the default false-colour map.
ax.imshow(img, cmap="gray")
# Title the figure so it stands on its own when the notebook is skimmed.
ax.set_title(f"ROCOv2 train[0]: {example['image_id']}")
ax.axis("off")
plt.show()