Create a train/test/val split in the dataset

In [None]:
import os
import random
import shutil
from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor

def train_test_val_split_image_annotation(image_dir, annotation_dir, output_dir, train_size=0.7, val_size=0.15, test_size=0.15, random_state=None):
    """
    Splits image and annotation files into train, validation, and test sets and copies them to new directories.

    Only ``.png`` files in ``image_dir`` that have a same-named ``.txt`` file in
    ``annotation_dir`` are used; images without an annotation are reported on
    stdout and skipped. The selected pairs are copied (the originals are left in
    place) to ``<output_dir>/images/<subset>`` and ``<output_dir>/labels/<subset>``
    where ``<subset>`` is ``train``, ``val`` or ``test``. The number of pairs in
    each subset is printed at the end.

    Parameters:
    image_dir (str): Directory containing image files.
    annotation_dir (str): Directory containing annotation files.
    output_dir (str): Base directory where images and labels directories will be created.
    train_size (float): Proportion of the dataset to include in the train split.
        Only used together with ``val_size`` to derive the train/val ratio.
    val_size (float): Proportion of the dataset to include in the validation split.
    test_size (float): Proportion of the dataset to include in the test split.
        Passed unchanged to the first ``train_test_split`` call.
    random_state (int): Controls the shuffling applied to the data before applying the split.
        It seeds the initial shuffle and is passed to both ``train_test_split`` calls.

    Raises:
    ValueError: If no image/annotation pair is found (``train_test_split`` may
        also raise it when a requested subset would be empty).
    OSError: If a file cannot be copied.
    """
    # os.listdir returns entries in arbitrary order; sort them so that a given
    # random_state always produces the same split.
    image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))

    # Keep only the images whose annotation file exists
    image_annotation_pairs = []
    for img in image_files:
        # Swap the extension only; str.replace would also rewrite any other
        # '.png' occurring inside the file name.
        ann_name = os.path.splitext(img)[0] + '.txt'
        img_path = os.path.join(image_dir, img)
        ann_path = os.path.join(annotation_dir, ann_name)
        if os.path.isfile(ann_path):
            image_annotation_pairs.append((img_path, ann_path))
        else:
            print(f"Skipping {img} as corresponding annotation {ann_name} does not exist.")

    if not image_annotation_pairs:
        raise ValueError(
            f"No .png images with a matching .txt annotation were found in "
            f"{image_dir!r} / {annotation_dir!r}; nothing to split."
        )

    # Shuffle data. A dedicated generator is used when a seed is given so the
    # module-level random state of the caller is not overwritten.
    rng = random.Random(random_state) if random_state is not None else random
    rng.shuffle(image_annotation_pairs)

    # Split into train+val and test sets
    train_val_size = train_size + val_size
    train_val_split, test_split = train_test_split(image_annotation_pairs, test_size=test_size, random_state=random_state)

    # The validation share must be expressed relative to the train+val subset
    val_adjusted_size = val_size / train_val_size

    # Split the train+val set into train and val sets
    train_split, val_split = train_test_split(train_val_split, test_size=val_adjusted_size, random_state=random_state)

    # Helper function to copy one subset into its images/ and labels/ folders
    def copy_files(file_pairs, subset_name):
        image_output_dir = os.path.join(output_dir, 'images', subset_name)
        annotation_output_dir = os.path.join(output_dir, 'labels', subset_name)
        os.makedirs(image_output_dir, exist_ok=True)
        os.makedirs(annotation_output_dir, exist_ok=True)

        def copy_pair(pair):
            img_path, ann_path = pair
            shutil.copy(img_path, image_output_dir)
            shutil.copy(ann_path, annotation_output_dir)

        with ThreadPoolExecutor() as executor:
            # Drain the result iterator: exceptions raised inside the worker
            # threads only surface when the results are retrieved.
            list(executor.map(copy_pair, file_pairs))

    # Copy files to train, val, and test directories
    copy_files(train_split, 'train')
    copy_files(val_split, 'val')
    copy_files(test_split, 'test')

    print("Train files:", len(train_split))
    print("Validation files:", len(val_split))
    print("Test files:", len(test_split))


# Example usage: 80/10/10 split of the Early Frames dataset with a fixed seed
if __name__ == "__main__":
    # Source folders holding the images and their annotation files
    image_dir = '/home/jovyan/work/datasets/Zebrafish Embryos Dataset V2 Early Frames/images'
    annotation_dir = '/home/jovyan/work/datasets/Zebrafish Embryos Dataset V2 Early Frames/labels'
    # Destination base folder; the images and labels directories are created beneath it
    output_dir = '/home/jovyan/work/datasets/Zebrafish Embryos Dataset V2 Early Frames/data'

    # Split the data and copy each subset to its new directory
    train_test_val_split_image_annotation(
        image_dir=image_dir,
        annotation_dir=annotation_dir,
        output_dir=output_dir,
        train_size=0.8,
        val_size=0.10,
        test_size=0.10,
        random_state=42,
    )


Now install Ultralytics and set up the YOLOv8 model for training

In [None]:
!pip install ultralytics

Change YOLO's default dataset directory (if necessary)

In [None]:
# Runs the `yolo settings` command in a shell via the notebook `!` escape.
# NOTE(review): presumably prints the current Ultralytics settings, including
# the default datasets directory — confirm against the Ultralytics docs.
!yolo settings

Download and run the model

In [None]:
# Root folder of the "V2 Early Frames" dataset.
# NOTE(review): ROOT_DIR is reassigned to the 'Zebrafish Embryos Dataset V3'
# path in the training cell further down before it is used — confirm which
# dataset is intended.
ROOT_DIR = '/home/jovyan/work/datasets/Zebrafish Embryos Dataset V2 Early Frames'

### Notes for training YOLOv8

from [https://github.com/ultralytics/ultralytics/issues/4106](https://github.com/ultralytics/ultralytics/issues/4106)

"@FatemaD1577 It really depends on a lot of parameters to answer this question. Here are some examples:

It depends on the model you will be using, for example yolov8n is much smaller model that yolov8x and will consume far less GB of GPU or RAM space for the training.
Also, the size of the training images which is specified in imgsz parameter is very crucial, larger image size for training images will consume more GB while training.
If your GPU is any new nvidia model and supports Mixed Precision then less space will be needed for the training.
Batch size is also a crucial parameter which plays a significant role in the training procedure and higher batch_size consume more memory while training.
Typically if you have a 4GB GPU without support for Mixed Precision, you will need to train using imgsz = 320 which i think is the lowest resolution accepted by the model and small batch_size like 1,2,4.

If you have 8GB of GPU or more, then you have nothing to worry you can train easy.

On CPU your training is going to be slower but if you have 12GB or more of RAM you can train."

In [None]:
import os
from ultralytics import YOLO


# Build the model from the "yolov8n.pt" weights file
model = YOLO("yolov8n.pt")

# Dataset root; the training configuration file is expected directly inside it
ROOT_DIR = '/home/jovyan/work/datasets/Zebrafish Embryos Dataset V3'
dataset_config = os.path.join(ROOT_DIR, "zebratrack.yaml")

# NOTE: If you would like to use YOLOv8n's built-in capability to determine batch size, set batch=-1
results = model.train(
    data=dataset_config,
    epochs=400,
    batch=8,
    workers=4,
    patience=10,
)

