Project name: AMAG test

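Description: This notebook removes images that have no annotation file, creates the train/val directories, and moves the images and annotations into them according to a 90/10 train/val split.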

Author: Ali Saghafi

Date: 31/10/2023

In [1]:
from sklearn.model_selection import train_test_split
import os
import shutil

In [2]:
# Anchor every dataset path used below. When `__file__` is defined the
# directory of this file is used; otherwise (NameError) the current working
# directory is used instead.
try:
    __location__ = os.path.dirname(__file__)
except NameError:
    __location__ = os.getcwd()
else:
    __location__ = os.path.realpath(os.path.join(os.getcwd(), __location__))

### Utilities

In [3]:
def move_files_to_folder(list_of_files, destination_folder):
    """Move every path in ``list_of_files`` into ``destination_folder``.

    The destination directory is created first if it does not exist, so that
    ``shutil.move`` always moves files *into* it rather than renaming the
    first file to the destination path.

    Args:
        list_of_files: Iterable of file paths to move.
        destination_folder: Directory that receives the files.

    Raises:
        OSError: If the destination cannot be created or a move fails
            (``shutil.Error`` is a subclass of ``OSError``). The path of the
            file that failed is printed before the exception propagates.
    """
    os.makedirs(destination_folder, exist_ok=True)
    for f in list_of_files:
        try:
            shutil.move(f, destination_folder)
        except OSError:
            # Name the offending file, then let the real error (with its
            # traceback) propagate instead of masking it with an assertion
            # that would be stripped under `python -O`.
            print(f)
            raise

In [9]:
# The dataset has more images than annotation files (5000 images vs. 4952
# annotations when this was written), so the first step is to remove the
# images that do not have an annotation file.

# Build paths with os.path.join so they are valid on every platform,
# not only on Windows.
image_folder = os.path.join(__location__, 'Dataset', 'images')
annotation_folder = os.path.join(__location__, 'Dataset', 'labels')

# Get the list of regular files in each folder. Sub-directories (such as the
# train/val folders left by a previous run) are not samples and are skipped.
image_files = [file for file in os.listdir(image_folder)
               if os.path.isfile(os.path.join(image_folder, file))]
annotation_files = [file for file in os.listdir(annotation_folder)
                    if os.path.isfile(os.path.join(annotation_folder, file))]

# Extract file names without extensions
image_names = [os.path.splitext(file)[0] for file in image_files]
annotation_names = [os.path.splitext(file)[0] for file in annotation_files]

# Find images without corresponding annotations (sorted so the removal log is
# reproducible between runs)
images_without_annotations = sorted(set(image_names) - set(annotation_names))

# Remove images without annotations. Deletion is deliberately limited to
# '.jpg' files; change the extension here to match the image format.
for image_name in images_without_annotations:
    image_path = os.path.join(image_folder, image_name + '.jpg')
    if os.path.isfile(image_path):
        os.remove(image_path)
        print(f"Removed {image_path}")

In [11]:
# Read images and annotations. Only regular files are considered, so the
# train/val sub-directories created by a previous run are ignored.
images = sorted(
    os.path.join(image_folder, x)
    for x in os.listdir(image_folder)
    if os.path.isfile(os.path.join(image_folder, x))
)
annotation_by_stem = {
    os.path.splitext(x)[0]: os.path.join(annotation_folder, x)
    for x in os.listdir(annotation_folder)
    if x.endswith(".txt") and os.path.isfile(os.path.join(annotation_folder, x))
}

# Pair every image with its annotation explicitly by base name instead of
# relying on two independently sorted lists lining up; fail fast if the two
# folders do not describe the same set of samples.
image_stems = [os.path.splitext(os.path.basename(path))[0] for path in images]
if len(set(image_stems)) != len(image_stems):
    raise ValueError("Several image files share the same base name; "
                     "cannot pair them with annotations unambiguously.")
unpaired_stems = sorted(set(image_stems) ^ set(annotation_by_stem))
if unpaired_stems:
    raise ValueError(
        f"{len(unpaired_stems)} file(s) have no image/annotation counterpart, "
        f"e.g. {unpaired_stems[:5]}"
    )
annotations = [annotation_by_stem[stem] for stem in image_stems]

# Split the dataset into train/val splits (90% / 10%, fixed seed)
train_images, val_images, train_annotations, val_annotations = train_test_split(
    images, annotations, test_size=0.1, random_state=1
)


In [12]:
# Create the train/val folders for images.
# os.path.join keeps the paths valid on every platform (the hard-coded '\\'
# separator only works on Windows).
train_path = os.path.join(__location__, 'Dataset', 'images', 'train')
val_path = os.path.join(__location__, 'Dataset', 'images', 'val')

# exist_ok=True creates the directory only when it is missing, without a
# separate check-then-create step.
os.makedirs(train_path, exist_ok=True)
os.makedirs(val_path, exist_ok=True)


In [13]:
# Create the train/val folders for annotations.
# os.path.join keeps the paths valid on every platform (the hard-coded '\\'
# separator only works on Windows).
train_annot_path = os.path.join(__location__, 'Dataset', 'labels', 'train')
val_annot_path = os.path.join(__location__, 'Dataset', 'labels', 'val')

# exist_ok=True creates the directory only when it is missing, without a
# separate check-then-create step.
os.makedirs(train_annot_path, exist_ok=True)
os.makedirs(val_annot_path, exist_ok=True)

In [14]:
# Relocate the image files of each split into its own directory:
# training images first, then validation images.
move_files_to_folder(list_of_files=train_images, destination_folder=train_path)
move_files_to_folder(list_of_files=val_images, destination_folder=val_path)

In [15]:
# Relocate the annotation files of each split into its own directory:
# training annotations first, then validation annotations.
move_files_to_folder(list_of_files=train_annotations, destination_folder=train_annot_path)
move_files_to_folder(list_of_files=val_annotations, destination_folder=val_annot_path)