# Split dataset

Splits the dataset into train/test/val sets and saves them in the dataset directory, each under its own sub-directory (e.g. `train`).

Also renames labels and images by adding the dataset identifier as a file-name prefix.

In [2]:
import time
from tqdm import tqdm
from pathlib import Path
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer

In [6]:
# Dataset root directory
DATASET_ROOT_DIR: str = "../datasets/001-COCO-"

# Dataset identifier
DATASET_IDENTIFIER: str = "001-COCO-"

# Labels directory
LABELS_DIR: str = "labels"

# Images directory
IMAGES_DIR: str = "images"

# Train directory name
TRAIN_DIR: str = "train"

# Val directory name
VAL_DIR: str = "val"

# Test directory name
TEST_DIR: str = "test"

In [None]:
# Add dataset prefix to annotations and images
print("Renaming labels...")

labels_paths = list(Path(f"{DATASET_ROOT_DIR}/{LABELS_DIR}").rglob("*.txt"))

for label in tqdm(labels_paths):
    if not label.name.startswith(DATASET_IDENTIFIER):
        label.rename(Path(label.parent, f"{DATASET_IDENTIFIER}{label.name}"))

print("Renaming images...")

images_paths = list(Path(f"{DATASET_ROOT_DIR}/{IMAGES_DIR}").rglob("*.jpg"))

for img in tqdm(images_paths):
    if not img.name.startswith(DATASET_IDENTIFIER):
        img.rename(Path(img.parent, f"{DATASET_IDENTIFIER}{img.name}"))


In [30]:
# Read labels and images
labels_paths = list(Path(f"{DATASET_ROOT_DIR}/{LABELS_DIR}").rglob("*.txt"))

annotations = []
classes = []

# Dict with labels stem and path objects for later use when moving files
annotations_path = {}

for label in tqdm(labels_paths):
    with open(label, 'r') as f:
        cl = []
        for line in f:
            bbox_class = int(line.strip().split(" ")[0])
            cl.append(bbox_class)
        annotations.append(label.stem)
        classes.append(cl)
        annotations_path[label.stem] = label

100%|██████████| 39349/39349 [00:03<00:00, 12742.39it/s]


In [16]:
def mskf_train_test_val_split(X, y):
    # Split dataset into train and test
    mskf = MultilabelStratifiedKFold(n_splits=5)

    train_indexes, test_indexes = next(mskf.split(X, y))

    X_train, y_train = X[train_indexes], y[train_indexes]
    _X_test, _y_test = X[test_indexes], y[test_indexes]

    # Split test into test and validation
    mskf = MultilabelStratifiedKFold(n_splits=5)

    test_indexes, val_indexes = next(mskf.split(_X_test, _y_test))

    X_test, y_test = _X_test[test_indexes], _y_test[test_indexes]
    X_val, y_val = _X_test[val_indexes], _y_test[val_indexes]

    return X_train, y_train, X_test, y_test, X_val, y_val, train_indexes, test_indexes, val_indexes

# Binarize labels to 'multilabel-indicator'
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(np.array(classes))

# Split the dataset (80:20 full dataset to train/test, 80:20 test set to test/val)
X_train, y_train, X_test, y_test, X_val, y_val, train_indexes, test_indexes, val_indexes = mskf_train_test_val_split(
    np.array(annotations),
    y,
)

  y = mlb.fit_transform(np.array(classes))


In [None]:
# Statistics
# Note: train/test proportion should be close to 5 (80% / 16% = 5)
print("Train set shape:", X_train.shape)
print("Test set shape", X_test.shape)
print("Val test shape:", X_val.shape)
print("Class occurences proportion:", y_train.sum(axis=0) / y_test.sum(axis=0))
print("Class instances proportion:", np.bincount(np.concatenate(np.array(classes, dtype=object)[train_indexes])) / np.bincount(np.concatenate(np.array(classes, dtype=object)[test_indexes])))
print("Train and test images overlap (should be 0):", np.intersect1d(X_train, X_test).shape)
print("Test and val overlap:", np.intersect1d(X_test, X_val).shape)
print("Val and train overlap:", np.intersect1d(X_val, X_train).shape)

In [37]:
# Move images and labels to appropriate subdirectory

# Make test, train, val if not exist
Path(f"{DATASET_ROOT_DIR}/{IMAGES_DIR}/test").mkdir(exist_ok=True)
Path(f"{DATASET_ROOT_DIR}/{IMAGES_DIR}/train").mkdir(exist_ok=True)
Path(f"{DATASET_ROOT_DIR}/{IMAGES_DIR}/val").mkdir(exist_ok=True)

Path(f"{DATASET_ROOT_DIR}/{LABELS_DIR}/test").mkdir(exist_ok=True)
Path(f"{DATASET_ROOT_DIR}/{LABELS_DIR}/train").mkdir(exist_ok=True)
Path(f"{DATASET_ROOT_DIR}/{LABELS_DIR}/val").mkdir(exist_ok=True)

# Move train
for file_stem in tqdm(X_train):
    file = annotations_path[file_stem]
    im = Path(str(file.with_suffix(".jpg")).replace(LABELS_DIR, IMAGES_DIR))
    file.rename(Path(file.parent.parent, "train", file.name))
    im.rename(Path(im.parent.parent, "train", im.name))

# Move test
for file_stem in tqdm(X_test):
    file = annotations_path[file_stem]
    im = Path(str(file.with_suffix(".jpg")).replace(LABELS_DIR, IMAGES_DIR))
    file.rename(Path(file.parent.parent, "test", file.name))
    im.rename(Path(im.parent.parent, "test", im.name))

# Move val
for file_stem in tqdm(X_val):
    file = annotations_path[file_stem]
    im = Path(str(file.with_suffix(".jpg")).replace(LABELS_DIR, IMAGES_DIR))
    file.rename(Path(file.parent.parent, "val", file.name))
    im.rename(Path(im.parent.parent, "val", im.name))

100%|██████████| 31521/31521 [00:02<00:00, 11783.30it/s]
100%|██████████| 6273/6273 [00:02<00:00, 2802.77it/s]
100%|██████████| 1555/1555 [00:00<00:00, 3123.30it/s]


In [40]:
# Verify if files were moved correctly

print(len(list(Path(f"{DATASET_ROOT_DIR}/{IMAGES_DIR}/train").rglob("*.jpg"))) == X_train.shape[0])
print(len(list(Path(f"{DATASET_ROOT_DIR}/{LABELS_DIR}/train").rglob("*.txt"))) == X_train.shape[0])

print(len(list(Path(f"{DATASET_ROOT_DIR}/{IMAGES_DIR}/test").rglob("*.jpg"))) == X_test.shape[0])
print(len(list(Path(f"{DATASET_ROOT_DIR}/{LABELS_DIR}/test").rglob("*.txt"))) == X_test.shape[0])

print(len(list(Path(f"{DATASET_ROOT_DIR}/{IMAGES_DIR}/val").rglob("*.jpg"))) == X_val.shape[0])
print(len(list(Path(f"{DATASET_ROOT_DIR}/{LABELS_DIR}/val").rglob("*.txt"))) == X_val.shape[0])

True
True
True
True
True
True
