# Descriptive statistics of training data set

These scripts are designed to describe the data prepared for YOLOv8 training.
All these scripts are designed to process image and text data with the same name (except for the extension) contained in two folders, one named 'labels' with all the .txt files and one 'images' with all the images.

The results obtained should make it possible to evaluate the relevance of the corpus used for training and to investigate the reasons for the model's robustness.


**Warning before using the notebook**

Create a 'labels.txt' file containing annotation classes in the following format:

'0': 'class_name0',  
'1': 'class_name1',  
'2': 'class_name2'

## Environment

In [None]:
import os
import re
import codecs
import shutil
import pandas as pd
import matplotlib.pyplot as plt

from modules.class_names_functions import get_labels

## Functions

### Functions to create the folder for statistics results and the path to data folders

In [None]:
def create_stats_folder(folder_path):

    """
    Create the 'dataset_statistics' sub-folder of the training data folder.

    This folder is where the statistics computed on the training dataset
    (CSV files and plots) are stored. Nothing happens if the folder already
    exists; no other file is created or moved.

    The "folder_path" parameter is the path to the folder in which the
    training session data are stored.

    """

    # exist_ok=True replaces the separate existence check, so there is no
    # window between "check" and "create" in which the call could fail.
    os.makedirs(os.path.join(folder_path, 'dataset_statistics'), exist_ok=True)

### Functions to describe the annotated sources

#### Clean up the names of files annotated with Label Studio

In [None]:
def clean_LS(folder_path, annotated_with_LS):
    """
    Remove the Label Studio prefix from the names of the image and label files.

    Files retrieved in YOLO format from Label Studio have a string of 8 characters
    (letters or numbers) followed by a '-' prepended to their name. This function
    removes that prefix so that the following functions, which use the file names
    to produce statistics on the manuscripts, can process the data.

    Only files whose name actually starts with such a prefix are renamed, so the
    function can be run again on an already cleaned folder. A file is left
    untouched (and reported) when the cleaned name is already taken, to avoid
    overwriting data.

    The "folder_path" parameter is the path to the folder containing the 'images'
    and 'labels' sub-folders.
    The "annotated_with_LS" parameter enables the renaming; nothing is done when
    it is false.

    """

    if not annotated_with_LS:
        return

    # 8 alphanumeric characters and a dash, followed by at least one character
    # so that a file is never renamed to an empty name.
    ls_prefix = re.compile(r'^[0-9A-Za-z]{8}-(?=.)')

    for kind, subfolder in (('image', 'images'), ('label', 'labels')):
        directory = os.path.join(folder_path, subfolder)

        # Browse the files of the directory in a deterministic order
        for old_name in sorted(os.listdir(directory)):
            match = ls_prefix.match(old_name)
            if match is None:
                # No Label Studio prefix: the name is already clean
                continue

            new_name = old_name[match.end():]
            new_path = os.path.join(directory, new_name)

            if os.path.exists(new_path):
                # os.rename may silently replace an existing file
                print(f"Skipped {kind} file : {old_name} -> {new_name} already exists")
                continue

            os.rename(os.path.join(directory, old_name), new_path)
            print(f"Renamed {kind} file : {old_name} -> {new_name}")

#### Number of manuscripts from which training images are sourced

In [None]:
def nb_manuscripts(folder_path):

    """
    Count the distinct manuscripts represented in the 'images' folder.

    Only files ending in .jpg, .jpeg or .png are considered. The manuscript
    name of an image is its file name with the last two '_'-separated
    segments removed (the whole file name when it has no '_').

    The "folder_path" parameter is the path to the folder containing the
    'images' sub-folder.
    Returns the number of distinct manuscript names.

    """
    images_dir = os.path.join(folder_path, 'images')
    valid_suffixes = (".jpg", ".jpeg", ".png")

    # A set keeps each manuscript name only once
    distinct_names = {
        entry.rsplit('_', 2)[0]
        for entry in os.listdir(images_dir)
        if entry.endswith(valid_suffixes)
    }

    return len(distinct_names)


#### Number of images per manuscript

In [None]:
def img_per_ms(folder_path):

    """
    Compute the number of images per manuscript and save it as a CSV file and a bar chart.

    Only files of the 'images' folder ending in .jpg, .jpeg or .png are
    considered. The manuscript name of an image is its file name with the
    last two '_'-separated segments removed.

    Two files are written in the 'dataset_statistics' folder (created if
    missing): 'img_per_ms.csv' (';'-separated, columns 'ms_name' and
    'nb_images') and 'img_per_ms.png'. The chart is also displayed.

    The "folder_path" parameter is the path to the folder containing the
    'images' sub-folder.

    """

    image_extensions = (".jpg", ".jpeg", ".png")
    image_files = [
        filename
        for filename in os.listdir(os.path.join(folder_path, 'images'))
        if filename.endswith(image_extensions)
    ]

    # Number of images per manuscript, in order of first appearance
    images_per_manuscript = {}
    for file_name in image_files:
        ms_name = file_name.rsplit('_', 2)[0]
        images_per_manuscript[ms_name] = images_per_manuscript.get(ms_name, 0) + 1

    # Make sure the output folder exists even if it was not created beforehand
    stats_folder = os.path.join(folder_path, 'dataset_statistics')
    os.makedirs(stats_folder, exist_ok=True)

    # Create a DataFrame from the results
    df = pd.DataFrame(list(images_per_manuscript.items()), columns=['ms_name', 'nb_images'])

    # Write the DataFrame to a CSV file with ';' as the separator
    csv_file_path = os.path.join(stats_folder, 'img_per_ms.csv')
    df.to_csv(csv_file_path, index=False, sep=';')

    print(f'{csv_file_path} created')

    # Draw on a dedicated figure so that earlier or later plots cannot
    # end up on the same axes
    fig = plt.figure()
    plt.bar(list(images_per_manuscript.keys()), list(images_per_manuscript.values()))

    # Hide x-axis labels
    plt.xticks([])

    # Set the axis labels and the title
    plt.xlabel('Manuscripts')
    plt.ylabel('Number of images')
    plt.title('Distribution of images by manuscript')

    # Save and display the graph, then release the figure
    plt.savefig(os.path.join(stats_folder, 'img_per_ms.png'), bbox_inches='tight')
    plt.show()
    plt.close(fig)

###  Distribution of annotations

#### Get the annotation files

In [None]:
def get_annotation_files(img_folder, txt_folder):

    """
    Return the paths of the annotation files that match an image.

    For every file of "img_folder" ending in .jpg, .jpeg or .png, the file
    with the same name and the '.txt' extension is looked up in "txt_folder";
    its path (txt_folder joined with the file name) is kept when it exists.

    The "img_folder" parameter is the path to the folder in which images are stored.
    The "txt_folder" parameter is the path to the folder in which annotations are stored.
    Returns the list of existing annotation file paths.

    """

    picture_suffixes = (".jpg", ".jpeg", ".png")

    # Expected annotation path for each image of the folder
    candidates = (
        os.path.join(txt_folder, os.path.splitext(entry)[0] + '.txt')
        for entry in os.listdir(img_folder)
        if entry.endswith(picture_suffixes)
    )

    return [path for path in candidates if os.path.exists(path)]

#### Check that all annotation files are utf-8 encoded

In [None]:
def encoding(folder_path):

    """
    Report the annotation files that are not encoded in utf-8, the format
    required for model training with YOLOv8.

    Each annotation file matching an image is read as bytes and decoded as
    utf-8. A message is printed for every file that fails to decode; files
    are never modified.

    The "folder_path" parameter is the path to the folder containing the
    'images' and 'labels' sub-folders.

    """

    annotations_txt = get_annotation_files(os.path.join(folder_path, 'images'), os.path.join(folder_path, 'labels'))

    # get_annotation_files already returns paths that include the labels
    # folder; joining the folder a second time breaks relative paths.
    for file_path in annotations_txt:
        with open(file_path, 'rb') as f:
            rawdata = f.read()
        try:
            rawdata.decode('utf-8')
        except UnicodeDecodeError:
            # ISO-8859-1 maps every possible byte to a character, so it can
            # decode any content: there is no "not recognized" case to handle,
            # and the message below is a best guess rather than a detection.
            print(f"{file_path} is encoded in ISO-8859-1")

#### Function to get the number of images without annotations

In [None]:
def img_without_annotations(img_folder, txt_folder):

    """
    Count the unannotated images, whether they have no annotation file
    or an empty annotation file.

    An annotation file is considered empty when it contains nothing but
    whitespace. A message is printed for each image that has no annotation
    file at all.

    The "img_folder" parameter is the path to the folder in which images are stored.
    The "txt_folder" parameter is the path to the folder in which annotations are stored.
    Returns the number of unannotated images.
    """

    # Paths returned here already include txt_folder
    annotation_files = get_annotation_files(img_folder, txt_folder)
    # Set used only for fast membership tests
    existing_annotations = set(annotation_files)

    image_extensions = (".jpg", ".jpeg", ".png")
    # Use the img_folder argument rather than a global variable
    image_files = [filename for filename in os.listdir(img_folder) if filename.endswith(image_extensions)]

    count = 0
    for image_file in image_files:
        image_name, _ = os.path.splitext(image_file)
        annotation_file = os.path.join(txt_folder, image_name + '.txt')
        if annotation_file not in existing_annotations:
            count += 1
            print(f"Image {image_file} has no annotation file")

    for annotation_file in annotation_files:
        with open(annotation_file, 'r') as f:
            # A file holding only blank lines carries no annotation
            if not f.read().strip():
                count += 1
    return count

#### Get number of annotations per image

In [None]:
def annotations_per_img(folder_path):
    """
    Compute the number of annotations per image and write the results to a .csv file.

    Every file of the 'images' folder ending in .jpg, .jpeg or .png that has
    an annotation file of the same name ('.txt') in the 'labels' folder is
    listed. The number of annotations is the number of non-blank lines of the
    annotation file. Images without an annotation file are not listed.

    The result is written to 'dataset_statistics/annotations_per_img.csv'
    (';'-separated, columns 'image_name' and 'annotations_nb'), sorted by
    decreasing number of annotations. The 'image_name' column holds the path
    of the image inside the 'images' folder, with its real extension.

    The "folder_path" parameter is the path to the folder containing the
    'images' and 'labels' sub-folders.

    """

    img_folder = os.path.join(folder_path, 'images')
    txt_folder = os.path.join(folder_path, 'labels')
    image_extensions = (".jpg", ".jpeg", ".png")

    lines_per_file = {}

    # Start from the images so that the real image name and extension are
    # known (they cannot be recovered from the annotation path alone).
    for image_file in os.listdir(img_folder):
        if not image_file.endswith(image_extensions):
            continue

        annotation_file = os.path.join(txt_folder, os.path.splitext(image_file)[0] + '.txt')
        if not os.path.exists(annotation_file):
            continue

        with open(annotation_file, 'r') as f:
            # Blank lines are not annotations
            nb_lines = sum(1 for line in f if line.strip())

        lines_per_file[os.path.join(img_folder, image_file)] = nb_lines

    # Most annotated images first
    sorted_counts = sorted(lines_per_file.items(), key=lambda item: item[1], reverse=True)

    # Create a DataFrame from the results
    df = pd.DataFrame(sorted_counts, columns=['image_name', 'annotations_nb'])

    # Make sure the output folder exists even if it was not created beforehand
    stats_folder = os.path.join(folder_path, 'dataset_statistics')
    os.makedirs(stats_folder, exist_ok=True)

    # Write the DataFrame to a CSV file with ';' as the separator
    csv_file_path = os.path.join(stats_folder, 'annotations_per_img.csv')
    df.to_csv(csv_file_path, index=False, sep=';')

    print(f'{csv_file_path} created')

#### Get total number of annotations

In [None]:
def total_annotations(img_folder, txt_folder):

    """
    Compute the total number of annotations in the training dataset.

    The total is the number of non-blank lines found in the annotation files
    that match an image. The total is printed and returned.

    The "img_folder" parameter is the path to the folder in which images are stored.
    The "txt_folder" parameter is the path to the folder in which annotations are stored.
    Returns the total number of annotations.
    """

    # Paths returned here already include txt_folder
    annotation_files = get_annotation_files(img_folder, txt_folder)

    total_lines = 0

    for annotation_file in annotation_files:
        with open(annotation_file, 'r') as f:
            # Ignore blank lines
            total_lines += sum(1 for line in f if line.strip())

    # Report before returning: a statement placed after the return never runs
    print(f"The total number of annotations is {total_lines}.")
    return total_lines

#### Get the number of annotations for each class

In [None]:
def classes_distribution(folder_path):

    """
    Count the number of annotations per class and save the result as a
    .csv file and a horizontal bar chart.

    The class code of an annotation is the first whitespace-separated token
    of its line; blank lines are ignored. Codes are mapped to class names
    with the labels read from the 'labels.txt' file of "folder_path".

    Two files are written in the 'dataset_statistics' folder (created if
    missing): 'class_distribution.csv' (';'-separated, columns 'class_name'
    and 'nb_occurrences') and 'class_distribution.png'. The chart is also
    displayed.

    The "folder_path" parameter is the path to the folder containing the
    'images' and 'labels' sub-folders and the 'labels.txt' file.

    """

    # Get the labels from the labels.txt file
    # NOTE(review): assumed to be a mapping from class code (str) to class name — confirm against get_labels
    annotation_labels = get_labels(os.path.join(folder_path, 'labels.txt'))
    annotation_files = get_annotation_files(os.path.join(folder_path, 'images'), os.path.join(folder_path, 'labels'))

    occurrences = {}
    # Paths returned by get_annotation_files already include the labels folder
    for annotation_file in annotation_files:
        with open(annotation_file, 'r', encoding='ascii') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank line: no class code to read
                    continue
                annotation_code = fields[0]
                occurrences[annotation_code] = occurrences.get(annotation_code, 0) + 1

    # Map annotation codes to class names
    class_names = [annotation_labels[code].strip() for code in occurrences]
    class_counts = list(occurrences.values())

    # Create a DataFrame from the results
    df = pd.DataFrame({'class_name': class_names, 'nb_occurrences': class_counts})

    # Make sure the output folder exists even if it was not created beforehand
    stats_folder = os.path.join(folder_path, 'dataset_statistics')
    os.makedirs(stats_folder, exist_ok=True)

    # Write the DataFrame to a CSV file with ';' as the separator
    csv_file_path = os.path.join(stats_folder, 'class_distribution.csv')
    df.to_csv(csv_file_path, index=False, sep=';')

    print(f'{csv_file_path} created')

    # Draw on a dedicated figure so that earlier or later plots cannot
    # end up on the same axes
    fig = plt.figure()
    plt.barh(class_names, class_counts)

    # Setting axis and title labels
    plt.xlabel('Nombre d\'occurrences')
    plt.ylabel('Classes')
    plt.title('Distribution des classes')

    # Save and display the graph, then release the figure
    plt.savefig(os.path.join(stats_folder, 'class_distribution.png'), bbox_inches='tight')
    plt.show()
    plt.close(fig)


#### Output global statistics

In [None]:
def get_global_results(folder_path):

    """
    Gather the global figures of the dataset and write them to a CSV file.

    The figures are the results of 'nb_manuscripts', 'img_without_annotations'
    and 'total_annotations'. They are written to
    'dataset_statistics/global_data.csv' (';'-separated, columns 'metric'
    and 'value').

    The "folder_path" parameter is the path to the folder containing the
    'images' and 'labels' sub-folders.

    """

    img_dir = os.path.join(folder_path, 'images')
    txt_dir = os.path.join(folder_path, 'labels')

    # Compute each figure, in the order in which it is reported
    manuscripts_count = nb_manuscripts(folder_path)
    unannotated_count = img_without_annotations(img_dir, txt_dir)
    annotations_count = total_annotations(img_dir, txt_dir)

    rows = [
        ('Number of manuscripts', manuscripts_count),
        ('Number of files without annotations', unannotated_count),
        ('Total number of annotations', annotations_count),
    ]

    # One line per metric
    summary = pd.DataFrame(rows, columns=['metric', 'value'])

    # ';' is the separator used for the CSV output
    out_path = os.path.join(folder_path, 'dataset_statistics', 'global_data.csv')
    summary.to_csv(out_path, index=False, sep=';')

    print(f'{out_path} created')

## Processing

In [None]:
# Folder of the training session; the functions above read its 'images' and
# 'labels' sub-folders and its 'labels.txt' file.
folder_path = 'ABSPATHTOTHEFOLDER' # to be modified, absolute path to the folder in which the training session data are stored

In [None]:
# Create the 'dataset_statistics' folder inside folder_path (if it does not exist yet)
create_stats_folder(folder_path)

In [None]:
# Clean the names of files exported from Label Studio; disabled by default,
# set annotated_with_LS=True if the data were annotated with Label Studio
clean_LS(folder_path, annotated_with_LS=False)

In [None]:
# Write 'img_per_ms.csv' and 'img_per_ms.png' (number of images per manuscript) to the statistics folder
img_per_ms(folder_path)

In [None]:
# Check the encoding of the annotation files and print those that are not utf-8
encoding(folder_path)

In [None]:
# Write 'annotations_per_img.csv' (number of annotations per image) to the statistics folder
annotations_per_img(folder_path)

In [None]:
# Write 'class_distribution.csv' and 'class_distribution.png' (distribution of classes in the training dataset)
classes_distribution(folder_path)

In [None]:
# Write 'global_data.csv' with the number of manuscripts used for training,
# the number of unannotated images and the total number of annotations
get_global_results(folder_path)