In [None]:
%load_ext autoreload
%autoreload 2

import os
import random
import shutil

from pathlib import Path

from drone_detection.config import (
    RAW_DATA_DIR, PROCESSED_DATA_DIR,
)

### Check data

In [None]:
# Folder of the first raw dataset, and every entry directly inside it whose
# name matches the "*.JPEG" pattern (the glob is not recursive).
dataset_1 = RAW_DATA_DIR / "dataset_1"
jpeg_imgs = [jpeg_path for jpeg_path in dataset_1.glob("*.JPEG")]

In [None]:
# Print how many "*.JPEG" files were found, then display the first two paths
# as the cell output for a quick visual check.
print(len(jpeg_imgs))
jpeg_imgs[:2]

In [None]:
# Build the rename target for every matched file by swapping only the file
# extension for ".jpg". `with_suffix` touches just the final suffix, whereas a
# plain string replace of "JPEG" would also rewrite that substring anywhere
# else in the path (a parent directory name or the file stem).
new_paths = [item.with_suffix(".jpg") for item in jpeg_imgs]
new_paths[:2]

In [None]:
# Apply the new extension on disk: each file is renamed to the path stored at
# the same position in `new_paths`.
for old_path, new_path in zip(jpeg_imgs, new_paths):
    old_path.rename(new_path)

In [None]:
# Locate the image and label folders of dataset_1 and list what they contain.
need_to_delete = []  # not used by any of the cells shown here
images, labels = dataset_1 / "images", dataset_1 / "labels"

imgs_lst = [entry for entry in images.glob("*")]
labels_lst = [entry for entry in labels.glob("*")]

# Display both counts side by side.
len(imgs_lst), len(labels_lst)

In [None]:
# For each image, look in the labels folder for a file with the same stem and
# a ".txt" extension; report every image for which no such file exists.
for img_path in imgs_lst:
    img_name_stem = img_path.stem
    corresponding_label = labels / (img_name_stem + ".txt")

    if corresponding_label.exists():
        continue
    print(f"Label for {img_path.name} is missing!")

### Train / validation / test split

In [None]:
# Pair every image with its own label file, then shuffle the pairs.
#
# The label path is derived from the image stem instead of being taken from
# `labels_lst` by position: the two folder listings are produced independently
# (their order is not guaranteed to match, and their lengths differ when a
# label is missing), so zipping them can attach the wrong label to an image.
combined = []
for img_path in sorted(imgs_lst):
    label_path = labels / f"{img_path.stem}.txt"
    if label_path.exists():
        combined.append((img_path, label_path))
    else:
        # There is no label file to copy for this image, so keep it out of the split.
        print(f"Skipping {img_path.name}: label is missing")

# Sorted input + a fixed seed on a private RNG gives the same split on every
# Restart & Run All without touching the global `random` state.
random.Random(42).shuffle(combined)

# `zip(*[])` yields nothing to unpack, so an empty dataset is handled explicitly.
imgs_lst_shuffled, labels_lst_shuffled = zip(*combined) if combined else ((), ())

In [None]:
# Split fractions. Only train_size and val_size feed the boundaries below;
# test_size is recorded here but not used in this computation.
train_size, val_size, test_size = 0.8, 0.1, 0.1

# Turn the fractions into slice boundaries over the shuffled sequence.
total_len = len(imgs_lst_shuffled)
train_end = int(total_len * train_size)
val_end = train_end + int(total_len * val_size)

In [None]:
# Cut the shuffled images and labels into three consecutive ranges:
# [0, train_end) -> train, [train_end, val_end) -> valid, [val_end, end) -> test.
imgs_train, labels_train = (
    imgs_lst_shuffled[:train_end],
    labels_lst_shuffled[:train_end],
)

imgs_val, labels_val = (
    imgs_lst_shuffled[train_end:val_end],
    labels_lst_shuffled[train_end:val_end],
)

imgs_test, labels_test = (
    imgs_lst_shuffled[val_end:],
    labels_lst_shuffled[val_end:],
)

In [None]:
# Number of images in the train, valid and test splits, in that order.
tuple(map(len, (imgs_train, imgs_val, imgs_test)))

In [None]:
# Destination roots for the three splits (the validation folder is named "valid").
processed_train, processed_val, processed_test = (
    PROCESSED_DATA_DIR / split_name for split_name in ("train", "valid", "test")
)

In [None]:
# Copy the split data into folders
# Train
# shutil.copy does not create missing parent folders, so the destination
# "images" and "labels" directories are created first; exist_ok keeps the
# cell safe to re-run.
train_images_dir = processed_train / "images"
train_labels_dir = processed_train / "labels"
train_images_dir.mkdir(parents=True, exist_ok=True)
train_labels_dir.mkdir(parents=True, exist_ok=True)

for img_path, label_path in zip(imgs_train, labels_train):
    new_img_path = train_images_dir / img_path.name
    new_label_path = train_labels_dir / label_path.name
    shutil.copy(img_path, new_img_path)
    shutil.copy(label_path, new_label_path)

In [None]:
# Valid
# shutil.copy does not create missing parent folders, so the destination
# "images" and "labels" directories are created first; exist_ok keeps the
# cell safe to re-run.
val_images_dir = processed_val / "images"
val_labels_dir = processed_val / "labels"
val_images_dir.mkdir(parents=True, exist_ok=True)
val_labels_dir.mkdir(parents=True, exist_ok=True)

for img_path, label_path in zip(imgs_val, labels_val):
    new_img_path = val_images_dir / img_path.name
    new_label_path = val_labels_dir / label_path.name
    shutil.copy(img_path, new_img_path)
    shutil.copy(label_path, new_label_path)

In [None]:
# Test
# shutil.copy does not create missing parent folders, so the destination
# "images" and "labels" directories are created first; exist_ok keeps the
# cell safe to re-run.
test_images_dir = processed_test / "images"
test_labels_dir = processed_test / "labels"
test_images_dir.mkdir(parents=True, exist_ok=True)
test_labels_dir.mkdir(parents=True, exist_ok=True)

for img_path, label_path in zip(imgs_test, labels_test):
    new_img_path = test_images_dir / img_path.name
    new_label_path = test_labels_dir / label_path.name
    shutil.copy(img_path, new_img_path)
    shutil.copy(label_path, new_label_path)