Use this notebook to merge the `datasets/roboflow_official` dataset into `datasets/roboflow_merged`, a flat dataset with a single `images/` folder and a single `labels/` folder.

The merge removes the default `train` / `test` / `valid` split shipped with the Roboflow export, so that the data can be re-split freely later on.


In [2]:
from pathlib import Path
import shutil
import os

# ---- User parameters ----
# Root of the source dataset; it must contain the 'train/', 'test/' and 'valid/' folders.
DATASET_ROOT = Path('../../datasets/roboflow_official')
# Directory name of the merged dataset (created next to DATASET_ROOT).
MERGED_NAME = 'roboflow_merged'
# How files are transferred into the merged dataset: 'copy' or 'symlink'.
COPY_MODE = 'copy'
# -------------------------

# Output locations derived from the parameters above.
TARGET_DIR = DATASET_ROOT.parent / MERGED_NAME
IMAGES_DIR = TARGET_DIR / 'images'
LABELS_DIR = TARGET_DIR / 'labels'

print(f'Merging into {TARGET_DIR.resolve()}')

# Create both output folders up front; existing folders are left untouched.
for out_dir in (IMAGES_DIR, LABELS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)


Merging into /home/andrea/work/AI-waste-detection/datasets/roboflow_merged


In [3]:
def merge_split(split: str):
    """Copy or symlink all images and labels from a given split (train/test/valid).

    Files are read from ``DATASET_ROOT/<split>/images`` and
    ``DATASET_ROOT/<split>/labels`` and written to ``IMAGES_DIR`` and
    ``LABELS_DIR``. An image and its label are matched by file stem (the name
    without the last extension), and both always receive the same destination
    stem so the pair is never broken.

    Name collisions are resolved per stem: if the stem is already used in the
    merged dataset (by an image *or* a label), ``<split>_<stem>`` is used
    instead; if that is taken too, ``<split>_2_<stem>``, ``<split>_3_<stem>``,
    ... are tried. Existing files in the merged dataset are never overwritten,
    so running the merge again on a populated target adds prefixed duplicates.

    Args:
        split: Name of the split sub-directory under ``DATASET_ROOT``.

    Raises:
        ValueError: If ``COPY_MODE`` is neither ``'copy'`` nor ``'symlink'``.
    """
    if COPY_MODE not in ('copy', 'symlink'):
        # Without this check any typo would silently fall through to symlinking.
        raise ValueError(f"COPY_MODE must be 'copy' or 'symlink', got {COPY_MODE!r}")

    images_src = DATASET_ROOT / split / 'images'
    labels_src = DATASET_ROOT / split / 'labels'

    # Regular files only (sub-directories cannot be copied with copy2);
    # sorted so that collision handling is deterministic across runs.
    image_files = sorted(p for p in images_src.glob('*') if p.is_file())
    label_files = sorted(p for p in labels_src.glob('*') if p.is_file())

    # Stems already present in the merged dataset, in either output folder.
    # glob() also lists dangling symlinks, which exists() would miss.
    taken = {p.stem for p in IMAGES_DIR.glob('*')} | {p.stem for p in LABELS_DIR.glob('*')}

    # Choose one destination stem per source stem, shared by image and label.
    dest_stems = {}
    for stem in sorted({p.stem for p in image_files} | {p.stem for p in label_files}):
        candidate = stem
        attempt = 0
        while candidate in taken:
            attempt += 1
            candidate = f'{split}_{stem}' if attempt == 1 else f'{split}_{attempt}_{stem}'
        taken.add(candidate)
        dest_stems[stem] = candidate

    def _transfer(src, dest_dir):
        # Place one source file in dest_dir under its resolved, collision-free name.
        dest = dest_dir / f'{dest_stems[src.stem]}{src.suffix}'
        if COPY_MODE == 'copy':
            shutil.copy2(src, dest)
        else:
            dest.symlink_to(src.resolve())

    for img_path in image_files:
        _transfer(img_path, IMAGES_DIR)
    for lbl_path in label_files:
        _transfer(lbl_path, LABELS_DIR)

    print(f'Finished {split}: {len(image_files)} images, {len(label_files)} labels')


In [4]:
# Merge each split of the original export into the flat output folders.
for split in ('train', 'test', 'valid'):
    merge_split(split)

# Report how many entries the merged dataset now holds.
total_images = sum(1 for _ in IMAGES_DIR.glob('*'))
total_labels = sum(1 for _ in LABELS_DIR.glob('*'))
print(f'All done! Total images: {total_images}, total labels: {total_labels}')


Finished train: 11466 images, 11466 labels
Finished test: 546 images, 546 labels
Finished valid: 1092 images, 1092 labels
All done! Total images: 13104, total labels: 13104
