In [5]:
from pathlib import Path
import numpy as np
import shutil

from typing import List


In [6]:
# Root folder that holds the raw images and their annotation files.
data_folder = Path("dataset")

# NOTE(review): the label pattern only matches names ending in ")" (e.g. "img (3).txt"),
# presumably to skip auxiliary .txt files in the dataset folder -- confirm against the data.
labels = sorted(data_folder.rglob("*).txt"), key=lambda path: path.stem)
images = sorted(data_folder.rglob("*.jpg"), key=lambda path: path.stem)

# Images and labels are paired purely by position after sorting by stem, so a single
# file without a partner would silently shift every following pair. Fail loudly instead.
label_stems = [path.stem for path in labels]
image_stems = [path.stem for path in images]
unpaired_stems = sorted(set(label_stems) ^ set(image_stems))
assert label_stems == image_stems, (
    f"image/label mismatch: {len(images)} images vs {len(labels)} labels; "
    f"unpaired stems (first 10): {unpaired_stems[:10]}"
)

dataset_size = len(labels)

print(dataset_size, len(images))

1162 1162


In [9]:
# Fixed seed so the same split is drawn on every run.
np.random.seed(69)

# Target share of the dataset for each split.
train_coeff = 0.7
val_coeff = 0.2
test_coeff = 0.1
assert np.isclose(train_coeff + val_coeff + test_coeff, 1.0), "split coefficients must sum to 1"

train_size = int(dataset_size * train_coeff)
val_size = int(dataset_size * val_coeff)
# The test split takes whatever is left, so the int() truncation above never drops a sample.
test_size = dataset_size - train_size - val_size

print(train_size, val_size, test_size, train_size + val_size + test_size)


all_indexes = np.arange(0, len(labels))
print(all_indexes)

# Uniform sampling weights. After each draw the chosen indices get weight 0 and the
# remaining weights are renormalised, so later splits can only pick unused samples.
probs = np.ones_like(all_indexes)
probs = probs / probs.sum()

train_indices = np.random.choice(all_indexes, [train_size], replace=False, p=probs)

probs[train_indices] = 0
probs = probs / probs.sum()

val_indices = np.random.choice(all_indexes, [val_size], replace=False, p=probs)

probs[val_indices] = 0
probs = probs / probs.sum()

test_indices = np.random.choice(all_indexes, [test_size], replace=False, p=probs)


print("Train:", train_indices[:10])
print("Val:", val_indices[:10])
print("Test:", test_indices[:10])

# Every index must land in exactly one split. Comparing the sorted union with the full
# index range checks both disjointness and coverage in one vectorised pass.
assigned_indices = np.sort(np.concatenate([train_indices, val_indices, test_indices]))
assert np.array_equal(assigned_indices, all_indexes), (
    "train/val/test splits must be disjoint and cover every sample exactly once"
)


[   0    1    2 ... 1159 1160 1161]
Train: [344 940 406 917 652 294 121  67 782 810]
Val: [ 447  498  162    7  685  850  889 1095  741  263]


  probs = probs / probs.sum()


In [10]:
def get_pairs(indices_array: np.ndarray):
    """Pick the image and label paths located at the given positions.

    Reads the module-level ``images`` and ``labels`` lists and indexes both
    with the same array, so entry ``i`` of each result comes from the same
    position in the two source lists.

    Args:
        indices_array: Positions to select from ``images`` and ``labels``.

    Returns:
        A ``(images_, labels_)`` tuple of NumPy arrays.
    """
    image_paths = np.array(images)
    label_paths = np.array(labels)
    return image_paths[indices_array], label_paths[indices_array]


# Resolve each split's indices into its (images, labels) pair of path arrays.
(
    (train_images, train_labels),
    (val_images, val_labels),
    (test_images, test_labels),
) = (get_pairs(split_indices) for split_indices in (train_indices, val_indices, test_indices))



In [12]:
# Destination root for the split dataset.
path_to_go = Path("datasets/data")

# One sub-folder per split.
train_path = path_to_go / "train"
val_path = path_to_go / "val"
test_path = path_to_go / "test"

# parents=True is required: the root is nested ("datasets/data"), and a plain mkdir()
# raises FileNotFoundError when the intermediate "datasets" folder does not exist yet.
# exist_ok=True keeps the cell safe to re-run.
path_to_go.mkdir(parents=True, exist_ok=True)
train_path.mkdir(parents=True, exist_ok=True)
val_path.mkdir(parents=True, exist_ok=True)
test_path.mkdir(parents=True, exist_ok=True)


In [13]:

def copy_files(target_folder: Path | str, files: List[Path | str]):
    
    for file in files:
        shutil.copy(file, target_folder)


def copy_data(target_folder: Path| str, labels_: List[Path | str], images_: List[Path | str]):
    """Copy one split's label files and image files into ``target_folder``.

    Labels are copied first, then images; both go through ``copy_files``.

    Args:
        target_folder: Destination passed on to ``copy_files``.
        labels_: Label file paths to copy.
        images_: Image file paths to copy.
    """
    for file_group in (labels_, images_):
        copy_files(target_folder, file_group)


# Write each split into its own folder, in train -> val -> test order.
for split_folder, split_labels, split_images in (
    (train_path, train_labels, train_images),
    (val_path, val_labels, val_images),
    (test_path, test_labels, test_images),
):
    copy_data(split_folder, split_labels, split_images)

