# Extracting Annotated Training Data

This notebook is the first step in preparing your dataset for model training.

It processes Label Studio JSON exports to build a YOLO-compatible training folder with:
- `images/` – containing **only** annotated images
- `labels/` – YOLO-format `.txt` files
- `labels.txt` – listing **only used** annotation classes

📂 **This notebook expects**:
- JSON annotations exported from Label Studio
- A valid `project/` folder structure as described in the [README](../../README.md)

It will generate a YOLO-formatted training dataset under `data/`, based on the project name.

---

&copy; 2023 Marion Charpier — use of this notebook requires appropriate citation.

## Environment

In [None]:
import os
import json
import shutil
from pathlib import Path

import pandas as pd
from PIL import Image

import sys
sys.path.append(os.path.join('..', 'modules'))

from transform_coordinates_functions import from_ls_to_yolo
from class_names_functions import get_labels, get_class_code
from folders_path import img_folder_training, ground_truth_folder_training, get_data_folder
from manipulate_files import open_json_file

## Functions

### Create a CSV file with image metadata

In [24]:
def create_csv_file(project_folder, extensions=('.jpg', '.jpeg', '.png', '.tif', '.tiff')):
    """
    This function generates a CSV file containing metadata for each image of a project.
    It extracts information such as image dimensions, format, and file paths. The resulting CSV
    is designed for use in annotation processes or to facilitate analysis of training or inference sessions.

    :param project_folder:
        - Type: str
        - Description: The path to the folder named after the project. The image folder is resolved
                       from it with `img_folder_training`.

    :param extensions:
        - Type: tuple of str
        - Description: Lower-case file extensions (with the leading dot) of the files to include.
                       The comparison is case-insensitive.

    :return:
        - Type: None
        - Description: This function does not return a value. It saves the extracted image metadata
                       as a CSV file in the image folder.

    Metadata Extracted for Each Image:
        - 'Image_name': file name without its extension (e.g., 'image1' for 'image1.jpg').
        - 'Folder': path to the folder where the images are stored.
        - 'Absolute_path': path of the image file as reported by PIL (`img.filename`).
        - 'Format': format of the image as detected by PIL (e.g., JPEG, PNG).
        - 'Width': width of the image in pixels.
        - 'Height': height of the image in pixels.
        - 'Image_size': total number of pixels (Width x Height).

    The CSV file uses ';' as separator and is saved in the image folder with the name
    '<project_folder basename>_data.csv'.
    """

    columns = ['Image_name', 'Folder', 'Absolute_path', 'Format', 'Width', 'Height', 'Image_size']

    img_folder = img_folder_training(project_folder)

    # Hidden files (e.g. macOS '._image.jpg' resource forks) are not real images and are skipped,
    # like the annotation listings do elsewhere in this notebook. Sorting keeps the CSV reproducible.
    images = sorted(
        name for name in os.listdir(img_folder)
        if not name.startswith('.') and name.lower().endswith(tuple(extensions))
    )

    # Retrieve the size for each image and save the relevant information in a dictionary
    data = []
    for file in images:
        with Image.open(os.path.join(img_folder, file)) as img:
            absolute_path = img.filename
            img_format = img.format
            width, height = img.size

        data.append({
            'Image_name': os.path.splitext(file)[0],
            'Folder': img_folder,
            'Absolute_path': absolute_path,
            'Format': img_format,
            'Width': width,
            'Height': height,
            'Image_size': width * height,
        })

    # Passing the columns explicitly keeps the header even when no image was found
    df = pd.DataFrame(data, columns=columns)

    # Save DataFrame to a CSV file
    csv_filename = os.path.join(img_folder, os.path.basename(project_folder) + '_data.csv')
    df.to_csv(csv_filename, sep=';', index=False)

    print(f"Image data saved to {csv_filename}")

### Create txt files for data

#### Create the labels file in txt format

In [25]:
def create_labels_file(project_folder):
    """
    This function generates a text file containing the unique class labels found in the annotation
    files of the project. The labels file maps class indices to their corresponding labels for
    training or evaluation purposes.

    :param project_folder:
        - Type: str
        - Description: The path to the folder named after the project. The annotation folder and the
                       image folder are resolved from it with `ground_truth_folder_training` and
                       `img_folder_training`.

    :return:
        - Type: None
        - Description: This function does not return a value. It creates a text file named 'labels.txt'
                       in the image folder, one line per class in the form `'<index>': '<label>'`.

    Only the first entry of each result's 'rectanglelabels' list is considered. Labels are written in
    alphabetical order so that the index assigned to a class is the same on every run.
    """

    img_folder = img_folder_training(project_folder)
    annotation_folder = ground_truth_folder_training(project_folder)

    annotation_files = [file for file in os.listdir(annotation_folder) if not file.startswith('.')]

    unique_classes = set()

    for annotation_file in annotation_files:
        annotations = open_json_file(os.path.join(annotation_folder, annotation_file))

        for result in annotations['result']:
            unique_classes.add(result['value']['rectanglelabels'][0])

    # A set has no stable order for strings across interpreter runs; sort it so that
    # class indices are reproducible.
    classes = sorted(unique_classes)
    print(classes)

    labels_path = os.path.join(img_folder, 'labels.txt')
    with open(labels_path, 'w') as labels_file:
        for index, classe in enumerate(classes):
            labels_file.write(f"'{index}': '{classe}'\n")

    print(f"Labels file write in {labels_path} ")

#### Create the annotations files in txt format

In [26]:
def create_annotations_file(project_folder):
    """
    This function generates one annotation file per annotated image, converting the Label Studio
    annotations into the YOLO format. It reads annotation data from JSON files and writes individual
    text files containing the bounding boxes of each image. These files are saved in a 'labels'
    subdirectory of the project's image folder.

    :param project_folder:
        - Type: str
        - Description: The path to the folder named after the project. The image folder (which must
                       already contain the 'labels.txt' written by `create_labels_file`) and the
                       annotation folder are resolved from it.

    :return:
        - Type: None
        - Description: This function does not return a value. It creates and saves annotation files
                       in the YOLO format for each annotated image of the project.

    Each line of an output file has the form `<class_id> <x> <y> <w> <h>`, where the coordinates are
    the values returned by `from_ls_to_yolo`.

    The resulting annotation files are saved as `<image_name>.txt` in the 'labels' folder, where
    `<image_name>` is the image file name without its last extension.
    """

    img_folder = img_folder_training(project_folder)
    annotation_folder = ground_truth_folder_training(project_folder)

    # If not exists, create the folder for the labels file
    labels_folder = os.path.join(img_folder, 'labels')
    os.makedirs(labels_folder, exist_ok=True)

    # Get the classes of the dataset from the labels file created with create_labels_file
    labels = get_labels(os.path.join(img_folder, 'labels.txt'))
    print(labels)

    # Get a list of the annotation files
    annotation_files = [file for file in os.listdir(annotation_folder) if not file.startswith('.')]

    for annotation_file in annotation_files:
        annotations = open_json_file(os.path.join(annotation_folder, annotation_file))

        # Get the name of the image. Only the last extension is removed so that a name such as
        # 'scan.v2.jpg' gives 'scan.v2.txt' and still matches the image file.
        name = annotations['task']['data']['image']
        img_name = os.path.splitext(os.path.basename(name))[0]

        with open(os.path.join(labels_folder, img_name + '.txt'), 'w') as yolo_annotation:
            for result in annotations['result']:
                value = result['value']
                x, y, w, h = from_ls_to_yolo(value['x'], value['y'], value['width'], value['height'])
                classe_name = value['rectanglelabels'][0]
                classe_id = get_class_code(classe_name, labels)

                yolo_annotation.write(f"{classe_id} {x} {y} {w} {h}\n")

    print("Annotations succeffully converted and saved")

#### Create the folder for the training session

In [27]:
def create_training_folder(project_folder, data_folder):
    """
    Create a new, empty folder that will hold the training data of a project.

    The folder is named after the basename of `project_folder` and is created inside `data_folder`.
    When that name is already taken, the first free name among '<name>_1', '<name>_2', ... is used
    instead, so an existing folder is never reused or overwritten.

    :param project_folder:
        - Type: str
        - Description: The path to the project folder. Only its basename is used, to name the
                       training folder.

    :param data_folder:
        - Type: str
        - Description: The path to the directory in which the training folder is created.

    :return:
        - Type: str
        - Description: The path of the folder that has just been created.
    """

    base_path = os.path.join(data_folder, os.path.basename(project_folder))

    # Start with the plain name, then try numbered variants until one is free
    candidate = base_path
    suffix = 0
    while os.path.exists(candidate):
        suffix += 1
        candidate = f'{base_path}_{suffix}'

    os.makedirs(candidate)
    print(f'folder created {candidate}')

    return candidate

#### Copy the data into the training folder

In [28]:
def get_training_data(project_folder):
    """
    This function assembles the training data of a project in a new training folder. The annotated
    images are copied into an 'images' subdirectory, while the 'labels' folder and the 'labels.txt'
    file previously written in the image folder are moved next to it.

    :param project_folder:
        - Type: str
        - Description: The path to the project folder containing the images and annotations
                       to be used for training. The image, annotation and data folders are all
                       resolved from it.

    :return:
        - Type: None
        - Description: This function does not return a value. It copies the annotated images and moves
                       the label files into a newly created training folder.

    :raises FileNotFoundError:
        If an image referenced by an annotation file is missing from the image folder. The check is
        done before any folder is created, so a failed run does not leave a half-filled training folder.

    Only the images referenced by an annotation file are copied; the original images stay in place.
    """

    img_folder = img_folder_training(project_folder)
    annotation_folder = ground_truth_folder_training(project_folder)

    # Get the list of annotations
    annotation_files = [file for file in os.listdir(annotation_folder) if not file.startswith('.')]

    # Get the file names of the annotated images (each image once, in annotation order)
    annotated_img = list(dict.fromkeys(
        os.path.basename(
            open_json_file(os.path.join(annotation_folder, annotation_file))['task']['data']['image']
        )
        for annotation_file in annotation_files
    ))

    # Fail early, before creating anything, if an annotated image cannot be found
    missing = [img for img in annotated_img if not os.path.isfile(os.path.join(img_folder, img))]
    if missing:
        raise FileNotFoundError(
            f"Annotated images not found in {img_folder}: {', '.join(missing)}"
        )

    # If not exists, create the data folder
    data_folder = get_data_folder(project_folder)
    os.makedirs(data_folder, exist_ok=True)

    # Create the folder in which training data have to be stored
    training_folder = create_training_folder(project_folder, data_folder)

    # Copy the annotated images in an 'images' folder, created if it does not exist
    images_destination = os.path.join(training_folder, 'images')
    os.makedirs(images_destination, exist_ok=True)

    for img in annotated_img:
        shutil.copyfile(os.path.join(img_folder, img), os.path.join(images_destination, img))

    print(f"Images copied in {images_destination}")

    # Move the labels folder in the training folder
    labels_destination = os.path.join(training_folder, 'labels')
    shutil.move(os.path.join(img_folder, 'labels'), labels_destination)
    print(f"Labels folder moved to {labels_destination}")

    # Move the labels file in the training folder
    labels_file_destination = os.path.join(training_folder, 'labels.txt')
    shutil.move(os.path.join(img_folder, 'labels.txt'), labels_file_destination)
    print(f"Labels file moved in {labels_file_destination}")

In [None]:
def clean_classes_file(project_folder):
    """
    Convert the 'classes.txt' file of a project folder into a 'labels.txt' file.

    'classes.txt' is expected to hold one class name per line; blank lines are ignored and
    surrounding whitespace is stripped. Each class is written to 'labels.txt' (same folder) as
    `'<index>': '<class_name>'`, with indices following the order of the lines. Once 'labels.txt'
    has been written, 'classes.txt' is removed.

    :param project_folder:
        - Type: str
        - Description: The path to the folder containing 'classes.txt'.

    :return:
        - Type: None
        - Description: This function does not return a value. File-system and decoding errors are
                       reported with a printed message instead of being raised.
    """
    # Build both paths from the folder: text replacement on the full path would also rewrite
    # a parent folder whose name happens to contain 'classes'.
    classes_txt_path = os.path.join(project_folder, 'classes.txt')
    labels_txt_path = os.path.join(project_folder, 'labels.txt')

    try:
        # Read class names
        with open(classes_txt_path, 'r') as f:
            classes = [line.strip() for line in f if line.strip()]

        # Write the formatted labels, leaving classes.txt untouched until this succeeds
        with open(labels_txt_path, 'w') as f:
            for i, class_name in enumerate(classes):
                f.write(f"'{i}': '{class_name}'\n")

        os.remove(classes_txt_path)
        print(f"Labels file written and renamed to : {labels_txt_path}")

    except (OSError, UnicodeError) as e:
        print(f"There was a probleme loading the classes file :\n{e}")

# NOTE(review): this call runs every time the cell is executed and targets a hard-coded,
# machine-specific absolute path. create_dataset() already calls clean_classes_file() when
# manually_downloaded is True, so this looks like a leftover test call; consider removing it.
clean_classes_file('/Users/marioncharpier/Documents/TORNE-H/GitHub/TiamaT/data/TEST')

In [None]:
def create_dataset(project_folder, manually_downloaded):
    """
    Prepares a dataset for training by organizing files and generating required metadata.

    Parameters:
    ----------
    project_folder : str
        Path to the project directory. When `manually_downloaded` is True it must contain
        'classes.txt' and an 'images' subfolder.

    manually_downloaded : bool
        If True, processes a manually downloaded dataset by:
        - Cleaning and formatting the 'classes.txt' file (it becomes 'labels.txt').
        - Renaming the existing 'project' folder, located two levels above `project_folder`,
          to the project folder's name.
        - Copying the files of the 'images' subfolder into
          '<renamed project>/image_inputs/ground_truth_images' (created if missing).
        - Generating a CSV file from the prepared structure.

        If False, assumes the project folder is already structured and runs the full pipeline:
        - create_csv_file
        - create_labels_file
        - create_annotations_file
        - get_training_data

    Returns:
    -------
    None

    Notes:
    -----
    - The function checks for the existence of required folders before renaming or copying.
    - If the 'project' folder is missing or the target folder already exists, it prints an error
      and stops before renaming; the 'classes.txt' conversion has already been done at that point.
    """

    if not manually_downloaded:
        create_csv_file(project_folder)
        create_labels_file(project_folder)
        create_annotations_file(project_folder)
        get_training_data(project_folder)
        return

    clean_classes_file(project_folder)

    # Normalize first so that a trailing separator does not yield an empty project name
    normalized_folder = os.path.normpath(project_folder)
    project_name = os.path.basename(normalized_folder)
    project_base_path = os.path.dirname(os.path.dirname(normalized_folder))

    old_project_path = os.path.join(project_base_path, 'project')
    new_project_path = os.path.join(project_base_path, project_name)

    # Check if the 'project' folder exists
    if not os.path.isdir(old_project_path):
        print(f"[ERREUR] The 'project' folder doesn't existe : {old_project_path}")
        return

    # Check if the destination folder already exists
    if os.path.exists(new_project_path):
        print(f"[ERREUR] The destination folder already exists : {new_project_path}")
        return

    # Rename the 'project' folder
    os.rename(old_project_path, new_project_path)
    print(f"[INFO] The 'project' folder renamed to : {new_project_path}")

    # Path to ground truth image folder (named so that it does not hide the imported
    # ground_truth_folder_training function)
    ground_truth_images_folder = os.path.join(new_project_path, 'image_inputs', 'ground_truth_images')

    # Copy images from project_folder/images
    image_folder = os.path.join(project_folder, 'images')
    if os.path.isdir(image_folder):
        # Without an existing destination directory, shutil.copy would either fail or write
        # every image to a single file named 'ground_truth_images'.
        os.makedirs(ground_truth_images_folder, exist_ok=True)
        for file in os.listdir(image_folder):
            file_path = os.path.join(image_folder, file)
            if os.path.isfile(file_path):
                shutil.copy(file_path, ground_truth_images_folder)
    else:
        print(f"[AVERTISSEMENT] The 'images/' folder can't be find : {image_folder}")

    create_csv_file(new_project_path)

## Processing

In [None]:
# NOTE(review): 'PROJECT_DIR' looks like a placeholder — presumably to be replaced with the
# path to your project folder before running the next cell.
project_folder = 'PROJECT_DIR'

In [None]:
# Create the training folder: with manually_downloaded=False, create_dataset runs the full
# pipeline (create_csv_file, create_labels_file, create_annotations_file, get_training_data).
create_dataset(project_folder, manually_downloaded=False)