# Data preparation and training

These scripts are designed to prepare data for YOLOv8 training.

The training folder must contain three elements:
- a 'labels' folder: in which annotation files are stored,
- an 'images' folder: in which image files are stored,
- a 'labels.txt' file: listing the classes used in the YOLO annotations, one per line (class index mapped to class name):
    - '0': 'class 0',
    - '1': 'class 1',
    - etc.
    
All these scripts are designed to process image and text data with the same name (except for the extension) contained in the 'labels' and 'images' folders.

## Environment


In [None]:
import os
import shutil
import random
import yaml
from ultralytics import YOLO
import time
from datetime import datetime

from modules.class_names_functions import get_labels

## Cleaning annotation files (.txt)

In [None]:
def clean_comma(dataset_folder):
    """
    Strip every comma from the annotation files of a dataset.

    Commas may appear in the .txt files when they are created from a csv file.
    Each file whose name ends with '.txt' in the 'labels' sub-folder is read, has all of
    its commas removed, and is written back in place. Other files are left untouched.

    The 'dataset_folder' parameter is the path to the folder in which the dataset is stored.
    """
    labels_dir = os.path.join(dataset_folder, 'labels')
    annotation_names = [name for name in os.listdir(labels_dir) if name.endswith('.txt')]

    for name in annotation_names:
        annotation_path = os.path.join(labels_dir, name)

        # Load the whole annotation file
        with open(annotation_path, 'r') as handle:
            raw_text = handle.read()

        # Overwrite the file with its comma-free content
        with open(annotation_path, 'w') as handle:
            handle.write(''.join(raw_text.split(',')))

## Create the training dataset

In [None]:
def create_training_dataset(dataset_folder, model_folder, newDistribution=True):

    """
    This script prepares the data sets that will be used to train and validate the trained model.
    It makes 3 .txt files available in the 'dataset_statistics' folder of the dataset:
    - A 'traindata.txt' file containing the list of images that will be used to train the model (80% of the data);
    - A 'valdata.txt' file containing the list of images that will be used to evaluate the model (20% of the data);
    - A 'training_dataset.txt' file containing the list of all the images used (training and validation).

    The images and annotations listed in 'traindata.txt' and 'valdata.txt' are then moved by
    split_data_for_training into the 'datasets/<dataset name>' folder located next to the dataset folder.

    N.B.: The creation of a "test" dataset has been removed because trained models are tested on similar data not contained in the training set.

    The 'dataset_folder' parameter is the path to the folder in which the dataset is stored.
    The 'model_folder' parameter is the path to the folder in which a pre-trained model data are stored;
    it is only read when 'newDistribution' is False.
    The 'newDistribution' parameter default setting is True, and will create new .txt files to distribute
    the images (files ending with '.jpg' or '.png') randomly between training and validation, as well as
    a .txt file with the entire training dataset.
    If False, no new distribution is drawn: the three .txt files are copied from the 'dataset_statistics'
    folder of 'model_folder', so the pre-existing train/val distribution of the data is kept.

    """

    folder_base = os.path.dirname(dataset_folder)
    dataset_name = os.path.basename(dataset_folder)
    images_folder = os.path.join(dataset_folder, 'images')

    # Folder in which all training-related files are stored
    stat_folder = os.path.join(dataset_folder, 'dataset_statistics')

    # Both branches write into this folder, so it has to exist in either case
    # (shutil.copyfile does not create the destination directory).
    os.makedirs(stat_folder, exist_ok=True)

    if newDistribution:
        # Keep only the file names with ".jpg" and ".png" extensions
        image_files = [f for f in os.listdir(images_folder) if f.endswith((".jpg", ".png"))]

        # Shuffle file names randomly
        random.shuffle(image_files)

        # Number of images of the training set; the remainder forms the validation set
        num_train = int(len(image_files) * 0.8)

        # Divide file names into two sets : one for the training, one for the validation
        train_files = image_files[:num_train]
        val_files = image_files[num_train:]

        # Create a file with the list for the train data
        with open(os.path.join(stat_folder, 'traindata.txt'), 'w') as f:
            f.writelines(os.path.join(images_folder, image_file) + "\n" for image_file in train_files)
        print(f"File create in {os.path.join(stat_folder, 'traindata.txt')}")

        # Create a file with the list for validation data
        with open(os.path.join(stat_folder, 'valdata.txt'), 'w') as f:
            f.writelines(os.path.join(images_folder, image_file) + "\n" for image_file in val_files)
        print(f"File create in {os.path.join(stat_folder, 'valdata.txt')}")

        # Create a file with all the dataset
        with open(os.path.join(stat_folder, 'training_dataset.txt'), 'w') as f:
            f.writelines(os.path.join(images_folder, image_file) + "\n" for image_file in image_files)
        print(f"File create {os.path.join(stat_folder, 'training_dataset.txt')}")

    else:
        print(f'Use pre-existing files from {model_folder}.')
        for list_name in ('traindata.txt', 'valdata.txt', 'training_dataset.txt'):
            shutil.copyfile(os.path.join(model_folder, 'dataset_statistics', list_name),
                            os.path.join(stat_folder, list_name))

    # Split images and txt files into folders from a .txt file
    split_data_for_training(os.path.join(stat_folder, 'traindata.txt'),
                            os.path.join(dataset_folder, 'labels'),
                            os.path.join(folder_base, 'datasets', dataset_name, 'images/train'),
                            os.path.join(folder_base, 'datasets', dataset_name, 'labels/train'))

    split_data_for_training(os.path.join(stat_folder, 'valdata.txt'),
                            os.path.join(dataset_folder, 'labels'),
                            os.path.join(folder_base, 'datasets', dataset_name, 'images/val'),
                            os.path.join(folder_base, 'datasets', dataset_name, 'labels/val'))
    


## Split the data for training

In [None]:
def split_data_for_training(txt_list, txt_folder, output_img_folder, output_txt_folder, dataset_folder=None):

    """
    This script is used to move, from one of the .txt files generated previously, the images and annotations
    into the folders used by YOLOv8 for training and model validation.

    YOLOv8's parameters require that the data be sent in a single folder, itself contained in the 'datasets' folder. Each training
    folder must contain an 'images' folder and a 'labels' folder, each containing a 'train' folder and a 'val' folder.
    One call fills one images folder and one labels folder, so the function is called once for the 'train' set
    and once for the 'val' set.

    For every image path listed in 'txt_list':
    - the image is moved into 'output_img_folder';
    - the annotation file with the same name and the '.txt' extension is moved from 'txt_folder' into
      'output_txt_folder'. A missing annotation file is only reported, it does not stop the process.
    Empty lines of 'txt_list' are ignored.

    Finally the .yaml file of the training session is (re)written with write_yaml_file.

    The parameters to be specified are :

    txt_list : path to the txt file containing the list of images (one path per line)
    txt_folder : path to the folder where the annotation files are currently stored
    output_img_folder : path to folder where the images are to be stored
    output_txt_folder : path to folder where the annotation files are to be stored
    dataset_folder : (optional) path to the dataset folder, used to locate 'labels.txt' and to name the .yaml file.
                     By default it is the parent folder of 'txt_folder'.

    According YOLOv8 documentation the folder must be contained in 'datasets' folder
    """

    # Create the output folder if it does not already exist
    os.makedirs(output_img_folder, exist_ok=True)
    os.makedirs(output_txt_folder, exist_ok=True)

    # The dataset folder is no longer read from a notebook global: unless given,
    # it is deduced from the annotation folder ('<dataset_folder>/labels').
    if dataset_folder is None:
        dataset_folder = os.path.dirname(os.path.normpath(txt_folder))

    folder_base = os.path.dirname(dataset_folder)
    dataset_name = os.path.basename(dataset_folder)

    # Open the text file containing the image paths
    with open(txt_list, "r") as f:
        # Browse through each line of the file
        for line in f:
            image_path = line.strip()
            if not image_path:
                # A blank line is not an image path
                continue

            image_name = os.path.basename(image_path)

            # Swap the image extension (whatever it is) for '.txt', on the file name only
            txt_name = os.path.splitext(image_name)[0] + '.txt'
            txt_file = os.path.join(txt_folder, txt_name)

            # Move image to output folder
            shutil.move(image_path, os.path.join(output_img_folder, image_name))

            # Move text file to output folder
            try:
                shutil.move(txt_file, os.path.join(output_txt_folder, txt_name))
            except FileNotFoundError:
                print(f'Text file {txt_file} does not exist')

    print(f'Image files move in {output_img_folder}')
    print(f'Text files move in {output_txt_folder}')

    #Create the yaml file
    write_yaml_file(dataset_folder, dataset_name, folder_base)
    

## Create the .yaml file

In [None]:
def write_yaml_file(dataset_folder, dataset_name, folder_base):

    '''
    Write the .yaml file describing the training data to YOLO.

    The file is written to '<folder_base>/datasets/<dataset_name>/<dataset_name>.yaml'. It contains the path
    of that training folder, the relative 'images/train' and 'images/val' locations and the class names,
    one '<index>: <name>' entry per class.

    The 'dataset_folder' parameter is the path to the dataset folder; its 'labels.txt' file is read with get_labels
    (the result is used as a mapping whose keys can be converted to integers).
    The 'dataset_name' parameter is the name of the training session.
    The 'folder_base' parameter is the path to the root folder.
    '''

    # Read the annotation classes and index them by integer
    raw_classes = get_labels(os.path.join(dataset_folder, 'labels.txt'))
    classes_by_index = {int(index): label for index, label in raw_classes.items()}

    training_root = os.path.join(folder_base, 'datasets', dataset_name)
    yaml_path = os.path.join(training_root, dataset_name + '.yaml')

    # Header of the file, in the order expected in the output
    yaml_lines = [
        f"path: {training_root}/\n",
        "train: 'images/train'\n",
        "val: 'images/val'\n",
        "\n",
        "#class names\n",
        "names:\n",
    ]

    # One indented entry per class
    yaml_lines.extend(f"  {index}: '{label}'\n" for index, label in classes_by_index.items())

    with open(yaml_path, 'w') as yaml_file:
        yaml_file.write(''.join(yaml_lines))
    print(f"File edit in {yaml_path}")

## Model training

In [None]:
def yolo_training(dataset_folder, use_model, img_size, epochs, batch, workers):

    """
    Train a YOLO model on the prepared dataset, then evaluate it on the validation set.

    Only a few training arguments are exposed here to adjust the model's performance, speed, and accuracy.
    For a complete list of modifiable arguments,
    see :  https://docs.ultralytics.com/modes/train/#arguments.

    The 'dataset_folder' parameter is the path to the dataset folder. Its name is the name of the training session
    and the .yaml file is read from the 'datasets/<name>' folder located next to it.
    The 'use_model' parameter is the model loaded with YOLO() (e.g. 'yolov8l.pt').
    The 'img_size', 'epochs', 'batch' and 'workers' parameters are passed to the training as
    'imgsz', 'epochs', 'batch' and 'workers'.

    **Warning 1** : The values proposed in the notebook are suggestions rather than recommendations, and should be
    adapted to the data and capacity of the equipment used.

    **Warning 2** : The model name is built from the name of the training dataset, the date (YYYYMMDD), the fourth
    character from the end of 'use_model' (the size letter for a name such as 'yolov8l.pt') and the parameters
    used for training.

    The training output folder is '<parent of dataset_folder>/runs/train/<model name>' (project/name).
    The validation run is named '<model name>/<model name>_val'.
    """

    root_dir, session_name = os.path.split(dataset_folder)

    today = datetime.now().strftime('%Y%m%d')
    size_letter = use_model[-4]
    model_name = f'{session_name}_{today}_{size_letter}_i{img_size}_e{epochs}_b{batch}_w{workers}'

    train_settings = {
        'data': os.path.join(root_dir, 'datasets', session_name, session_name + '.yaml'),  # path to the datasets and classes
        'imgsz': img_size,  # image size
        'epochs': epochs,
        'batch': batch,
        'workers': workers,
        'name': model_name,  # output folder
        'project': os.path.join(root_dir, 'runs/train'),
    }

    # Load a pretrained YOLO model and train it
    model = YOLO(use_model)
    model.train(**train_settings)

    # Evaluate the model's performance on the validation set
    model.val(name=f'{model_name}/{model_name}_val')

### Resuming an interrupted training (optional)

In [None]:
def resume_training(model_folder):

    '''
    Resume an interrupted training session, then evaluate the model on the validation set.

    The 'model_folder' parameter is the path to the folder of the interrupted training. Two values are derived from it:
    - the last trained weight, 'weights/last.pt' inside that folder, which is loaded as the model;
    - the name of the folder, which is used to name the validation run ('<name>/<name>_val').

    Training is restarted with 'resume=True', so no other training parameter is given here.
    '''

    run_name = os.path.basename(model_folder)
    checkpoint = os.path.join(model_folder, 'weights/last.pt')

    # Load the partially trained model and carry on with its training
    partial_model = YOLO(checkpoint)
    partial_model.train(resume=True)

    # Evaluate the model's performance on the validation set
    partial_model.val(name=f'{run_name}/{run_name}_val')

## Re-arrange in pristine state

In [None]:
def dispatch_data(dataset_folder, model_folder):

    """
    This function moves the 'dataset_statistics' folder (the .txt files containing the distribution list of data
    used for training) and the .yaml file into the model folder, copies 'labels.txt' there, and returns the image
    data and annotations to the original 'images' and 'labels' folders.
    Finally, the folder used for training ('datasets/<dataset name>') is deleted.

    In this way, a new model can be trained by reusing this notebook, and each trained model will have a specific folder
    with all the data linked to it.

    The 'dataset_folder' parameter is the path to the folder in which the dataset is stored.
    The 'model_folder' parameter is the path to the folder of the trained model. If it is empty, the folder is
    rebuilt as '<parent of dataset_folder>/runs/train/<model name>', the model name being composed like in
    yolo_training from today's date and from the notebook variables 'use_model', 'img_size', 'epochs', 'batch'
    and 'workers', which must then be defined.

    Raises FileNotFoundError, before any file is moved, if the model folder does not exist.
    """

    folder_base = os.path.dirname(dataset_folder)
    dataset_name = os.path.basename(dataset_folder)
    training_folder = os.path.join(folder_base, 'datasets', dataset_name)

    if not model_folder:
        # Only needed when no model folder is given, so that an explicit
        # 'model_folder' works without the training settings being defined.
        date = datetime.now().strftime('%Y%m%d')
        model_name = f'{dataset_name}_{date}_{use_model[-4]}_i{img_size}_e{epochs}_b{batch}_w{workers}'
        model_folder = os.path.join(folder_base, 'runs/train', model_name)

    # Moving 'dataset_statistics' onto a missing path would rename it to that path
    # instead of storing it inside the model folder, so stop before touching anything.
    if not os.path.isdir(model_folder):
        raise FileNotFoundError(f'The model folder {model_folder} does not exist, no data has been moved.')

    # Move the data used for the training session into the model folder

    shutil.move(os.path.join(dataset_folder, 'dataset_statistics'), model_folder)
    print(f'The .txt files with the training data have been moved to {model_folder}.')

    shutil.move(os.path.join(training_folder, dataset_name + '.yaml'), os.path.join(model_folder, dataset_name + '.yaml'))
    print(f'The .yaml file has been moved into {model_folder}')

    shutil.copyfile(os.path.join(dataset_folder, 'labels.txt'), os.path.join(model_folder, 'labels.txt'))
    print(f'The labels.txt file has been copied in {model_folder}')

    # Replace the data in the dataset folder
    img_folder_train = os.path.join(training_folder, 'images/train')
    txt_folder_train = os.path.join(training_folder, 'labels/train')
    img_folder_val = os.path.join(training_folder, 'images/val')
    txt_folder_val = os.path.join(training_folder, 'labels/val')

    images_folder = os.path.join(dataset_folder, 'images')
    labels_folder = os.path.join(dataset_folder, 'labels')

    _move_folder_content(img_folder_train, images_folder)
    print(f"Files from {img_folder_train} have been moved into {images_folder}")

    _move_folder_content(img_folder_val, images_folder)
    print(f"Files from {img_folder_val} move into {images_folder}")

    _move_folder_content(txt_folder_train, labels_folder)
    print(f"Files from {txt_folder_train} move into {labels_folder}")

    _move_folder_content(txt_folder_val, labels_folder)
    print(f"Files from {txt_folder_val} move into {labels_folder}")

    shutil.rmtree(training_folder)
    print(f"The {training_folder} has been deleted")


def _move_folder_content(source_folder, target_folder):
    """Move every entry of 'source_folder' into 'target_folder', keeping its name."""
    for entry in os.listdir(source_folder):
        shutil.move(os.path.join(source_folder, entry), os.path.join(target_folder, entry))

## Process

In [None]:
# Absolute path to the dataset used for the training session (to be changed).
# The name of the training session is the last component of this path, so do not end it with a path separator.
dataset_folder = 'ABSPATHTODATASET'

# Absolute path to the folder of an existing model (to be changed).
# Leave it as '' for a first training session or when the pre-existing train/val files are not reused.
model_folder = ''

### Clean and split data

In [None]:
# Remove the commas from the .txt annotation files (only needed when they were created from a csv file)
clean_comma(dataset_folder)

In [None]:
# Generate the train/val distribution files, then move the images and annotations into the 'datasets' folder used for training
create_training_dataset(dataset_folder,model_folder, newDistribution=True)

### Start a training session

In [None]:
use_model = 'yolov8l.pt' # model loaded for training, to be changed as needed (suggested: 'yolov8x.pt')
img_size = 640 # image size, to be changed as needed (suggested: 640)
epochs = 100 # number of training epochs, to be changed as needed (suggested: 100)
batch = 8 # batch size, to be changed as needed (suggested: 8; Ultralytics selects AutoBatch with batch=-1)
workers = 24 # number of data-loading workers, to be changed as needed (suggested: 24)

In [None]:
%%time

# Start a training session with the settings defined above; the model is evaluated on the validation set afterwards
yolo_training(dataset_folder, use_model, img_size, epochs, batch, workers)

### Dispatch the data

In [None]:
# Move the .txt files describing the train/val distribution of the training data into the model folder,
# and put the images and labels themselves back into their original folders
dispatch_data(dataset_folder, model_folder)

### Resume an uncompleted training session (Optional)

In [None]:
%time

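# Resume an interrupted training
# NOTE: the `%time` line magic above only times a statement written on its own line;
# use `%%time` as the first line of the cell to time the whole cell.
# resume_training(model_folder)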

In [None]:
# Move the .txt files describing the train/val distribution of the training data into the model folder,
# and put the images and labels themselves back into their original folders
# dispatch_data(dataset_folder, model_folder)