# Download ROCOv2 locally

This notebook downloads ROCOv2 to local storage. By default it uses a hosted mirror, because the official `roco` loading script on Hugging Face requires you to place the dataset files manually. If you have already downloaded the data, you can switch to the official loader.

By default, data is stored under `./data/roco_v2`. Adjust the paths as needed.

In [None]:
# Install deps. transformers, numpy, pandas and datasets are pinned to exact
# versions (to avoid ABI mismatches on Colab); every other package listed is
# upgraded to the latest available release because of --upgrade.
# NOTE(review): matplotlib, imported by the final preview cell, is not
# installed here — presumably the runtime (e.g. Colab) already ships it; confirm.
!pip install -q --upgrade torch torchvision torchaudio transformers==4.42.3 accelerate peft bitsandbytes \
    numpy==1.26.4 pandas==2.2.2 datasets==2.19.1 pillow tqdm

In [None]:
# Paths and dataset choice
#
# Defines where everything is stored (BASE_DIR), which Hub dataset to load
# (DATASET_NAME / DATASET_CONFIG) and which slice of each split to keep
# (TRAIN_SPLIT / VAL_SPLIT / TEST_SPLIT). The later cells read these names.
from pathlib import Path
from datasets import load_dataset, DownloadConfig

# EmptyDatasetError is caught by the download cell.
# NOTE(review): the pinned datasets release appears to define it in
# datasets.data_files rather than datasets.exceptions — try both locations so
# this cell imports either way; confirm against the installed version.
try:
    from datasets.exceptions import EmptyDatasetError
except ImportError:
    from datasets.data_files import EmptyDatasetError

# Where to store (resolved to an absolute path; created if missing)
BASE_DIR = Path('./data/roco_v2').resolve()
BASE_DIR.mkdir(parents=True, exist_ok=True)
print('Saving to', BASE_DIR)

# Choose dataset source
# Option 1 (default): hosted mirror that includes images
DATASET_NAME = 'flaviagiammarino/roco-dataset'
DATASET_CONFIG = None

# Option 2: official script (requires you to download ROCO locally first)
# DATASET_NAME = 'roco'
# DATASET_CONFIG = 'en'

# Subset for quick tests; set to None to keep the whole split.
# These only limit which rows are kept afterwards: the download cell still
# loads the dataset without a split restriction first.
TRAIN_SPLIT = 'train[:100]'  # or None
VAL_SPLIT = 'validation[:20]'  # or None
TEST_SPLIT = None  # e.g., 'test[:20]'

In [None]:
# Download
#
# Loads the full DatasetDict into `raw_ds` (cached under BASE_DIR), then
# builds `splits` = {'train', 'validation', 'test'} -> Dataset or None.


def _load(split=None):
    """Load DATASET_NAME/DATASET_CONFIG, caching under BASE_DIR.

    Args:
        split: Optional split expression such as 'train[:100]'. When None,
            every split is loaded and a DatasetDict is returned.

    Raises:
        RuntimeError: if the loader reports an empty dataset (chained from
            EmptyDatasetError).
    """
    # No forced auth token: the deprecated use_auth_token=True made the call
    # fail for users who are not logged in to the Hub, even for public data.
    try:
        return load_dataset(
            DATASET_NAME,
            DATASET_CONFIG,
            split=split,
            cache_dir=str(BASE_DIR),
        )
    except EmptyDatasetError as e:
        raise RuntimeError(
            'Official ROCO loader needs local files. Place them under BASE_DIR and set DATASET_NAME="roco"; '
            'or use a mirror that bundles images.'
        ) from e


raw_ds = _load()

# Slice expressions like 'train[:100]' are not keys of a DatasetDict (indexing
# raw_ds with one raises KeyError); they are only understood by
# load_dataset(split=...), which reuses the cache populated above.
splits = {
    'train': _load(TRAIN_SPLIT) if TRAIN_SPLIT else raw_ds['train'],
    'validation': _load(VAL_SPLIT) if VAL_SPLIT else raw_ds.get('validation', None),
    'test': _load(TEST_SPLIT) if TEST_SPLIT else raw_ds.get('test', None),
}

# Row count per split; splits that are absent are reported as 0.
print({k: v.num_rows if v is not None else 0 for k, v in splits.items()})

In [None]:
# Persist the selected splits under BASE_DIR/arrow in Arrow format
# (reload later with datasets.load_from_disk).
from datasets import DatasetDict

# Target directory first, so it exists before anything is written.
out_path = BASE_DIR / 'arrow'
out_path.mkdir(parents=True, exist_ok=True)

# Drop the splits that were not loaded (stored as None in `splits`).
to_save = dict(item for item in splits.items() if item[1] is not None)
ds_to_save = DatasetDict(to_save)

ds_to_save.save_to_disk(out_path)
print('Saved to', out_path)
print(ds_to_save)

In [None]:
# Optional: export images + captions to simple folders/CSV for inspection
#
# Writes one JPEG per example to BASE_DIR/export/images and a captions.csv
# (split, index, image_path, caption) to BASE_DIR/export. Iterates `to_save`,
# so the save cell must have run first.
import csv

export_dir = BASE_DIR / 'export'
img_dir = export_dir / 'images'
export_dir.mkdir(parents=True, exist_ok=True)
img_dir.mkdir(parents=True, exist_ok=True)

# Use the first known caption column name present in the train split.
sample_cols = splits['train'].column_names
cap_key = next(
    (c for c in ['caption', 'text', 'caption_en', 'description'] if c in sample_cols),
    None,
)
if cap_key is None:
    raise ValueError('Could not find a caption column; please set cap_key manually.')

csv_path = export_dir / 'captions.csv'
# Explicit UTF-8: captions may contain non-ASCII characters, and the platform
# default encoding is not guaranteed to be able to represent them.
with open(csv_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['split', 'index', 'image_path', 'caption'])
    for split_name, ds in to_save.items():
        for idx, ex in enumerate(ds):
            # assumes the 'image' column decodes to a PIL image — TODO confirm
            pil_img = ex['image']
            # JPEG cannot store alpha/palette/high-bit-depth modes; convert
            # those to RGB instead of letting save() raise.
            if pil_img.mode not in ('L', 'RGB', 'CMYK'):
                pil_img = pil_img.convert('RGB')
            img_path = img_dir / f"{split_name}_{idx}.jpg"
            pil_img.save(img_path)
            writer.writerow([split_name, idx, str(img_path), ex[cap_key]])

print('Exported images to', img_dir)
print('Captions CSV:', csv_path)

In [None]:
# Quick visual sanity check on one sample
#
# Shows the first train example with its caption as the title. `cap_key` is
# set by the export cell; if that key is missing from the sample the title is
# left empty.
import matplotlib.pyplot as plt

sample = splits['train'][0]
# cmap only applies to single-channel images (it is ignored for RGB data);
# without it, grayscale scans would be drawn with matplotlib's default
# false-colour map.
plt.imshow(sample['image'], cmap='gray')
plt.axis('off')
plt.title(sample.get(cap_key, ''))
plt.show()