# Dataset Preparation


### Display the frame rate of a video


In [None]:
import cv2


def print_fps(video_path: str) -> "float | None":
    """
    Print the frame rate (FPS) of the given video and return it.

    Args:
        video_path (str): Path to the video file.

    Returns:
        float: FPS of the video if successfully retrieved, None otherwise
        (the file could not be opened, or the reported FPS is 0).
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Error: Could not open video file {video_path}")
            return None
        fps = cap.get(cv2.CAP_PROP_FPS)
    finally:
        # Release the capture handle on every path, including the early return.
        cap.release()

    # A value of 0 is treated as "FPS information unavailable".
    if fps == 0:
        print("Error: Could not retrieve FPS information")
        return None

    print(f"Frames per second: {fps}")
    return fps


# Script entry point: print the frame rate of a single sample video.
if __name__ == "__main__":
    video_path = "../../data/AARP/videos/test/test_indoor1.avi"  # Change this to the path of the video file you want to analyse
    print_fps(video_path)

### Step 1: Frames Extraction

In [None]:
import cv2
import os


# Function to extract frames from video
def extract_frames(video_path, output_folder):
    """
    Extract frames from a video and save them as JPEG images.

    Frames are written as ``frame_<index>.jpg`` where ``<index>`` is the
    zero-based position of the frame in the video.

    Args:
        video_path (str): The path to the video file from which frames are to be extracted.
        output_folder (str): The directory where the extracted frames will be saved.
            It is created if it does not exist.

    Returns:
        None
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        print(f"Error: Could not open video file {video_path}")
        return

    # Make sure the destination exists before writing any frame into it.
    os.makedirs(output_folder, exist_ok=True)

    count = 0
    try:
        while True:
            # Read each new frame; `success` turns False once the stream is exhausted
            success, img = cap.read()
            if not success:
                print("End of video reached.")
                break

            # Save the current frame as a JPEG image and report failed writes
            output_file = os.path.join(output_folder, f"frame_{count}.jpg")
            if not cv2.imwrite(output_file, img):
                print(f"Warning: Could not write {output_file}")

            # The counter follows the frame index even when a write fails
            count += 1
    finally:
        # Always release the capture; no window is opened here, so there is
        # nothing to destroy.
        cap.release()

#### Usage Example

In [None]:
# Replace the input and output folder paths below with your own video and frame directories
# input_folder_path = "../../data/AARP/videos/test"
# output_folder_path = "../../data/AARP/frames"
# for video in os.listdir(input_folder_path):
#     video_path = os.path.join(input_folder_path, video)
#     file_name = video.split(".")[0]
#     output_path = os.path.join(output_folder_path, file_name)
#     os.makedirs(output_path, exist_ok=True)
#     extract_frames(video_path, output_path)
#     print(f"Frames extracted from {video_path}")

## Label Studio Conversion

In [None]:
def convert_json(input_file: str, output_file: str):
    """
    Converts the input JSON file containing bounding box annotations in percentage format
    to a new JSON file with bounding box coordinates in pixel format.

    Only the first result of the first annotation of each item is converted;
    items without annotations or without results are skipped.

    Args:
        input_file (str): Path to the input JSON file that contains the annotations.
        output_file (str): Path to the output JSON file where the converted annotations will be saved.

    Returns:
        None: The function saves the converted data to the specified output file.
    """
    # Imported locally: no import of `json` precedes this definition in the file.
    import json

    with open(input_file, "r") as f:
        data = json.load(f)

    output_data = []

    for item in data:
        # Frame name = text after the last "-" of the uploaded file name, up to the first "."
        frame = item["file_upload"].split("-")[-1].split(".")[0]

        annotations = item["annotations"]
        if not annotations:
            continue

        results = annotations[0]["result"]
        if not results:
            continue

        first_result = results[0]
        bbox = first_result["value"]
        image_width = first_result["original_width"]
        image_height = first_result["original_height"]

        # Convert percentages to pixel values based on original image width/height
        bbox_xywh = [
            bbox["x"] * image_width / 100.0,
            bbox["y"] * image_height / 100.0,
            bbox["width"] * image_width / 100.0,
            bbox["height"] * image_height / 100.0,
        ]

        output_data.append({"frame": frame, "bbox": bbox_xywh})

    with open(output_file, "w") as f:
        json.dump(output_data, f, indent=4)

In [None]:
# Specify your input and output file paths
# input_file = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/code/LSTM/data/outdoor2/test_outdoor2.json"
# output_file = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/code/LSTM/data/outdoor2/outdoor2_full_dataset.json"

# convert_json(input_file, output_file)

### Function to create training, validation & test datasets with a classic JSON file

In [None]:
import json
from sklearn.model_selection import train_test_split


def create_datasets(
    annotation_file: str,
    train_ratio: float = 0.64,
    val_ratio: float = 0.16,
    test_ratio: float = 0.2,
    output_dir: str = ".",
):
    """
    Splits a dataset of annotations into training, validation, and test sets based on the provided ratios
    and saves each split as a JSON file.

    The split is contiguous (no shuffling): the first part of the list goes to
    training, the next to validation and the remainder to test.

    Args:
        annotation_file (str): Path to the JSON file containing the dataset annotations.
        train_ratio (float, optional): Proportion of the data to include in the training set. Defaults to 0.64.
        val_ratio (float, optional): Proportion of the data to include in the validation set. Defaults to 0.16.
        test_ratio (float, optional): Proportion of the data to include in the test set. Defaults to 0.2.
        output_dir (str, optional): Directory where the split datasets will be saved. Defaults to the current directory.

    Returns:
        None: The function saves the split datasets into JSON files in the specified output directory.

    Raises:
        AssertionError: If the three ratios do not sum to 1.0 (within float tolerance).
    """
    import math

    # Tolerance for binary floating-point error in ratio sums and products.
    epsilon = 1e-9

    # Compare with a tolerance: sums such as 0.6 + 0.3 + 0.1 are not exactly 1.0 in floating point
    assert math.isclose(
        train_ratio + val_ratio + test_ratio, 1.0, abs_tol=epsilon
    ), "Ratios should sum to 1.0"

    # Loads the annotations json file
    with open(annotation_file, "r") as f:
        annotations = json.load(f)

    num_annotations = len(annotations)

    # Calculate the division indices. The epsilon keeps floor semantics while
    # preventing a product like 8.999999999999998 from being truncated to 8.
    train_end = math.floor(train_ratio * num_annotations + epsilon)
    val_end = math.floor((train_ratio + val_ratio) * num_annotations + epsilon)

    # Split the annotations into continuous sections for training, validation, and testing
    subsets = (
        ("train", annotations[:train_end]),
        ("validation", annotations[train_end:val_end]),
        ("test", annotations[val_end:]),
    )

    # Saves JSON files for training, validation, and testing
    for subset_name, subset in subsets:
        with open(f"{output_dir}/{subset_name}.json", "w") as f:
            json.dump(subset, f, indent=4)

    print(
        "The files train.json, validation.json, and test.json have been created and saved."
    )

In [None]:

# create_datasets(
#     "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/code/LSTM/data/outdoor1/outdoor1_full_dataset.json",
#     output_dir="/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/code/LSTM/data/outdoor1",
# )

### Function to create training, validation & test datasets with a YOLO format dataset folder

In [None]:
def create_splits_and_yaml(dataset_path, train_size=0.7, val_size=0.1, test_size=0.2):
    """
    Splits a dataset of images and labels into training, validation, and test sets,
    and generates a YAML configuration file for the dataset.

    Files are copied (not moved), so the original 'images' and 'labels' folders
    are left untouched.

    Args:
        dataset_path (str): Path to the dataset directory containing 'images' and 'labels' subdirectories.
        train_size (float, optional): Proportion of the dataset to include in the training set. Defaults to 0.7.
        val_size (float, optional): Proportion of the dataset to include in the validation set. Defaults to 0.1.
        test_size (float, optional): Proportion of the dataset to include in the test set. Defaults to 0.2.

    Returns:
        None: The function creates 'train', 'val', and 'test' directories with 'images' and 'labels' subdirectories,
              and saves the split data and a YAML configuration file to the dataset path.

    Raises:
        AssertionError: If the 'images' or 'labels' folder is missing.
    """
    # Imported locally so the function does not depend on imports made elsewhere in the file.
    import shutil
    import textwrap
    from pathlib import Path

    dataset_path = Path(dataset_path)
    images_path = dataset_path / "images"
    labels_path = dataset_path / "labels"

    assert (
        images_path.exists() and labels_path.exists()
    ), "Invalid dataset path, or missing 'images' and 'labels' folders."

    # Get all image files; sorted so that the fixed random_state below yields
    # the same split regardless of the order the filesystem lists files in.
    images = sorted(images_path.glob("*.jpg")) + sorted(
        images_path.glob("*.png")
    )  # depending on your dataset format

    # Split the dataset: first carve out the test set, then split the remainder
    # into train/val (val_size is rescaled to the remaining fraction).
    train_images, test_images = train_test_split(
        images, test_size=test_size, random_state=42
    )
    train_images, val_images = train_test_split(
        train_images, test_size=val_size / (train_size + val_size), random_state=42
    )

    splits = {"train": train_images, "val": val_images, "test": test_images}
    for split_name, split_images in splits.items():
        dest_images_folder = dataset_path / split_name / "images"
        dest_labels_folder = dataset_path / split_name / "labels"
        dest_images_folder.mkdir(parents=True, exist_ok=True)
        dest_labels_folder.mkdir(parents=True, exist_ok=True)

        for image in split_images:
            # The label shares the image's stem; images without a label are still copied
            label_file = labels_path / (image.stem + ".txt")
            if label_file.exists():
                shutil.copy(label_file, dest_labels_folder / label_file.name)
            shutil.copy(image, dest_images_folder / image.name)

    # Create the .yaml file (dedented so every top-level key starts at column 0)
    yaml_content = textwrap.dedent(
        f"""\
        # class names
        names:
            0: fuel port

        # number of classes
        nc: 1

        train: {str(dataset_path / 'train')}
        val: {str(dataset_path / 'val')}
        test: {str(dataset_path / 'test')}
        """
    )

    with open(dataset_path / "dataset.yaml", "w") as yaml_file:
        yaml_file.write(yaml_content)

In [None]:
# Example usage:
# Root of the YOLO dataset to split; expected to contain 'images' and 'labels' subfolders.
dataset_folder = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/data/frames/full_dataset_annotated_YOLO"  # replace with your actual dataset path

# NOTE(review): the three video lists below are not passed to the call at the
# end of this cell — confirm whether a per-video split was intended.
training_videos = [
    "video_1",
    "video_2",
    "video_3",
    "video_4",
    "video_5",
    "video_6",
    "video_7",
    "video_8",
    "video_9",
    "video_10",
]

validation_videos = ["video_11", "video_12", "video_13"]


test_videos = [
    "video_14",
    "video_15",
    "video_16",
    "video_17",
    "video_18",
    "video_19",
    "video_20",
]


# Runs the split with the default 0.7 / 0.1 / 0.2 ratios.
create_splits_and_yaml(dataset_folder)

### Converter for the provided dataset to YOLO format

In [None]:
import os
import json
from glob import glob
import shutil


def convert_annotation(json_file, output_dir_labels):
    """
    Convert a single annotation from a JSON file to YOLO format and save it as a text file.

    Only shapes whose ``shape_type`` is ``"rectangle"`` are converted; each one
    becomes a ``class x_center y_center width height`` line, normalized by the
    image size stored in the JSON file. The output file has the same base name
    as the JSON file with a ``.txt`` extension.

    Args:
        json_file (str): Path to the JSON annotation file.
        output_dir_labels (str): Directory where the YOLO format label files should be saved.

    Returns:
        None
    """
    with open(json_file, "r") as f:
        data = json.load(f)

    txt_output_path = os.path.join(
        output_dir_labels, os.path.splitext(os.path.basename(json_file))[0] + ".txt"
    )

    with open(txt_output_path, "w") as txt_out:
        for shape in data["shapes"]:
            if shape["shape_type"] != "rectangle":
                continue

            points = shape["points"]
            x1, y1 = points[0]
            x2, y2 = points[1]

            # Convert to YOLO format
            width = data["imageWidth"]
            height = data["imageHeight"]
            xc = (x1 + x2) / 2 / width
            yc = (y1 + y2) / 2 / height
            # abs(): the two corners may be stored in any order, and YOLO
            # box sizes must be non-negative
            w = abs(x2 - x1) / width
            h = abs(y2 - y1) / height

            class_id = (
                0  # Update this if you have multiple classes and a class mapping system
            )
            txt_out.write(f"{class_id} {xc} {yc} {w} {h}\n")


def convert_annotations_in_directory(input_dir, output_dir_images, output_dir_labels):
    """
    Build a YOLO-style dataset from a directory of JSON annotations and JPEG images.

    Every ``*.json`` file in ``input_dir`` is converted to a YOLO label file,
    and every ``*.jpg`` file is copied to the images directory unless a file
    with the same name is already there.

    Args:
        input_dir (str): Directory containing the JSON annotation files and images.
        output_dir_images (str): Directory where the images should be copied.
        output_dir_labels (str): Directory where the YOLO format label files should be saved.

    Returns:
        None
    """
    # Make sure both output directories are available
    for target_dir in (output_dir_images, output_dir_labels):
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

    # Convert each annotation file into a label file
    for annotation_path in glob(os.path.join(input_dir, "*.json")):
        convert_annotation(annotation_path, output_dir_labels)

    # Copy the images, leaving already-present files untouched
    for source_image in glob(os.path.join(input_dir, "*.jpg")):
        target_image = os.path.join(output_dir_images, os.path.basename(source_image))
        if os.path.exists(target_image):
            continue
        shutil.copy(source_image, target_image)

In [None]:
# if __name__ == '__main__':
#     input_directory = '/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/data/official_dataset_annotated/experiments/video_lab_platform_8'  # Replace with the path to your input directory
#     output_directory_images = input_directory + "_yolo/images"  # Replace with the path to your output images directory
#     output_directory_labels = input_directory + "_yolo/labels"  # Replace with the path to your output labels directory

#     convert_annotations_in_directory(input_directory, output_directory_images, output_directory_labels)

# YOLO Fine Tuning - New Dataset Preparation Method

In [None]:
import csv
import os
import shutil


def split_YOLO_Dataset_from_CSV_exported_file(
    export_csv_file_path, yolo_dataset_folder_path, output_folder_path
):
    """
    Split a YOLO dataset based on a CSV file that maps images to different videos.

    Rows are read in order. A row whose frame ID (the number after the last
    "_" of the image name) is 0 marks the start of a new video, provided at
    least one frame was already copied for the current one. Each video gets a
    ``video_<n>`` folder containing ``images`` and ``labels`` subfolders.

    Args:
        export_csv_file_path (str): Path to the CSV file exported from the source system.
            It must have an "image" column.
        yolo_dataset_folder_path (str): Path to the YOLO dataset containing 'images' and 'labels' folders.
        output_folder_path (str): Directory where the split dataset will be saved, with one folder per video.

    Returns:
        None
    """
    video_id = 1
    frames_per_video = 0

    with open(export_csv_file_path, "r", newline="") as file:
        reader = csv.DictReader(file)
        for row in reader:
            try:
                # Extract the frame ID and image file name
                frame_id = int(row["image"].split("_")[-1].split(".")[0])
                image_file = row["image"].split("/")[-1]  # Extract the image file name
                # Label file shares the image's base name, whatever its extension
                image_label_file = os.path.splitext(image_file)[0] + ".txt"

                # Switch to the next video BEFORE copying, so that frame 0 of a
                # new video lands in the new folder and not in the previous one
                if frames_per_video and frame_id == 0:
                    video_id += 1
                    frames_per_video = 0

                # Retrieve the image and label file paths
                from_image_file_path = os.path.join(
                    yolo_dataset_folder_path, "images", image_file
                )
                from_image_label_file_path = os.path.join(
                    yolo_dataset_folder_path, "labels", image_label_file
                )

                # Create the per-video folder and its "images"/"labels" subfolders
                to_video_folder_path = os.path.join(
                    output_folder_path, f"video_{video_id}"
                )
                to_images_folder_path = os.path.join(to_video_folder_path, "images")
                to_labels_folder_path = os.path.join(to_video_folder_path, "labels")
                os.makedirs(to_images_folder_path, exist_ok=True)
                os.makedirs(to_labels_folder_path, exist_ok=True)

                # Copy the image file to the "images" folder
                shutil.copy2(
                    from_image_file_path,
                    os.path.join(to_images_folder_path, image_file),
                )

                # Copy the label file to the "labels" folder
                shutil.copy2(
                    from_image_label_file_path,
                    os.path.join(to_labels_folder_path, image_label_file),
                )

                # Increment the frame count for the current video
                frames_per_video += 1

            except Exception as e:
                # Best effort: report the faulty row and keep processing the rest
                print(f"Error processing row: {row}")
                print(e)

In [None]:
# Example run: split the annotated YOLO dataset into one folder per video.
# The output folder is the same path as the source dataset folder, so the
# video_<n> folders are created next to its 'images' and 'labels' folders.
export_csv_file_path = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/data/frames/full_dataset_annotated_fpp/raw_data.csv"
yolo_dataset_folder_path = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/data/frames/full_dataset_annotated_YOLO"
output_folder_path = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/data/frames/full_dataset_annotated_YOLO"
split_YOLO_Dataset_from_CSV_exported_file(
    export_csv_file_path, yolo_dataset_folder_path, output_folder_path
)

### Convert Label Studio CSV Export file to full training json file

In [None]:
import csv
import json


def verify_value_is_between_0_and_1(value):
    """
    Clamp a value to the closed interval [0, 1].

    Args:
        value (float): The value to be checked.

    Returns:
        float: ``value`` itself when it lies strictly between 0 and 1,
        otherwise the nearest bound (0 or 1).
    """
    # Cap at the upper bound first, then lift to the lower bound
    capped = value if value < 1 else 1
    return capped if capped > 0 else 0


def convert_percentage_to_normalized(
    x_pct, y_pct, width_pct, height_pct, original_width, original_height
):
    """
    Turn a bounding box expressed in percentages into fractions of the image size.

    Each percentage is divided by 100; the image dimensions are accepted but
    not used in the computation.

    Args:
        x_pct (float): The x coordinate as a percentage.
        y_pct (float): The y coordinate as a percentage.
        width_pct (float): The width as a percentage.
        height_pct (float): The height as a percentage.
        original_width (int): The original width of the image.
        original_height (int): The original height of the image.

    Returns:
        tuple: Normalized x, y, width, and height.
    """
    percent_scale = 100.0
    percentages = (x_pct, y_pct, width_pct, height_pct)
    return tuple([component / percent_scale for component in percentages])


def convert_csv_to_json(csv_file):
    """
    Group the annotated frames of a CSV export into per-video records.

    Rows are read in order from the "image" and "label" columns. A frame ID of
    0 closes the current video (when it already holds frames) and starts the
    next one. Rows without an annotation add no frame, and only the first
    annotation of a row is used.

    Args:
        csv_file (str): Path to the input CSV file.

    Returns:
        list: A list of dictionaries, each containing a video ID and associated frames with bounding box data.
    """
    videos = []
    current_video_id = 1
    current_frames = []

    with open(csv_file, "r") as handle:
        for row in csv.DictReader(handle):
            try:
                boxes = json.loads(row["label"]) if row["label"] else []
                image_ref = row["image"]
                frame_number = int(image_ref.split("_")[-1].split(".")[0])
                file_name = image_ref.split("/")[-1]

                # Frame 0 with pending frames: flush them as a finished video
                if frame_number == 0 and current_frames:
                    videos.append(
                        {"video_id": current_video_id, "frames": current_frames}
                    )
                    current_video_id += 1
                    current_frames = []

                # Unannotated rows add nothing to the current video
                if not boxes:
                    continue

                first_box = boxes[0]
                image_width = first_box["original_width"]
                image_height = first_box["original_height"]

                # Convert the percentage box to normalized coordinates
                x_norm, y_norm, width_norm, height_norm = (
                    convert_percentage_to_normalized(
                        first_box["x"],
                        first_box["y"],
                        first_box["width"],
                        first_box["height"],
                        image_width,
                        image_height,
                    )
                )

                current_frames.append(
                    {
                        "image_name": file_name,
                        "frame": frame_number,
                        "bbox": [x_norm, y_norm, width_norm, height_norm],
                    }
                )

            except json.JSONDecodeError:
                print(f"Skipping row due to JSON decode error: {row}")
                continue

    # Flush the last video
    if current_frames:
        videos.append({"video_id": current_video_id, "frames": current_frames})

    return videos

In [None]:
# Example usage
# Convert the CSV export into per-video records...
csv_file = '/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/data/frames/full_dataset_annotated_fpp/raw_data.csv'
json_data = convert_csv_to_json(csv_file)

# ...and save them as an indented JSON file next to the source CSV.
with open('/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/data/frames/full_dataset_annotated_fpp/full_dataset_formatted.json', 'w') as file:
    json.dump(json_data, file, indent=4)

### Function to create the final YOLO Training set

In [None]:
import os
import shutil
import random
import math


def collect_images(source_folder, split):
    """
    List the image files stored under ``<source_folder>/<split>/all/images``.

    Args:
        source_folder (str): Path to the folder containing the images for a specific class.
        split (str): The data split to collect images from (e.g., 'train', 'val', 'test').

    Returns:
        list: Paths of the ``.jpg``, ``.jpeg`` and ``.png`` files found in that
        folder; empty when the folder does not exist.
    """
    everything_dir = os.path.join(os.path.join(source_folder, split), "all")
    if not os.path.isdir(everything_dir):
        return []

    image_dir = os.path.join(everything_dir, "images")
    if not os.path.isdir(image_dir):
        return []

    image_suffixes = (".jpg", ".jpeg", ".png")
    found = []
    for entry in os.listdir(image_dir):
        if entry.endswith(image_suffixes):
            found.append(os.path.join(image_dir, entry))
    return found


def split_and_balance_dataset(
    source_folders, dest_folder, train_ratio, val_ratio, test_ratio
):
    """
    Splits and balances a dataset into training, validation, and test sets according to specified ratios.
    The dataset is balanced across classes and saved in the YOLO format.

    For each split, every class is shuffled and truncated to the size of the
    smallest class, then the classes are merged and shuffled again. The
    validation and test sizes are derived from ``val_ratio`` and ``test_ratio``;
    the training set receives the remainder, capped by the number of available
    training images. Images and labels are copied into
    ``<dest_folder>/<split>/images|labels``, an image list ``<split>.txt`` is
    written per split, and a ``data.yaml`` file describes the dataset.

    Args:
        source_folders (dict): A dictionary where keys are class names and values are paths to the corresponding class folders.
        dest_folder (str): The destination folder where the balanced dataset will be saved.
        train_ratio (float): Ratio of the dataset to use for training. Not used in the
            computation: the training size is whatever remains after validation and test.
        val_ratio (float): Ratio of the dataset to use for validation.
        test_ratio (float): Ratio of the dataset to use for testing.

    Returns:
        None
    """
    splits = ("train", "val", "test")

    # Create the destination folder and the images/labels subfolders of each split
    os.makedirs(dest_folder, exist_ok=True)
    for split in splits:
        for subfolder in ("images", "labels"):
            os.makedirs(os.path.join(dest_folder, split, subfolder), exist_ok=True)

    # Collect images for each class from their respective folders
    images_by_class = {
        key: {split: collect_images(source_folder, split) for split in splits}
        for key, source_folder in source_folders.items()
    }

    # Smallest class size per split; default=0 covers an empty source_folders dict
    min_images = {
        split: min(
            (len(class_splits[split]) for class_splits in images_by_class.values()),
            default=0,
        )
        for split in splits
    }

    # Balance each split by shuffling, then truncating to the minimum class size
    for class_splits in images_by_class.values():
        for split in splits:
            random.shuffle(class_splits[split])
            class_splits[split] = class_splits[split][: min_images[split]]

    # Merge the classes into one list per split
    train_set = []
    val_set = []
    test_set = []
    for class_splits in images_by_class.values():
        train_set.extend(class_splits["train"])
        val_set.extend(class_splits["val"])
        test_set.extend(class_splits["test"])

    # Shuffle the sets to ensure randomness
    random.shuffle(train_set)
    random.shuffle(val_set)
    random.shuffle(test_set)

    # Calculate the total number of images
    total_images = len(train_set) + len(val_set) + len(test_set)

    # Determine the number of images for each split based on the ratios
    val_num_images = math.floor(total_images * val_ratio)
    test_num_images = math.floor(total_images * test_ratio)
    train_num_images = min(
        total_images - val_num_images - test_num_images, len(train_set)
    )

    # When the training set was capped, shrink val/test until their combined
    # share of the selected images no longer exceeds the requested share.
    # `total` is checked first so an empty dataset cannot divide by zero.
    total = val_num_images + test_num_images + train_num_images
    holdout_ratio = val_ratio + test_ratio
    while total and ((val_num_images + test_num_images) / total) > holdout_ratio:
        test_num_images -= 1
        val_num_images -= 1

    # Truncate the sets to the calculated number of images. The counts are
    # clamped at 0 because a negative slice bound would drop the last items
    # instead of producing an empty list.
    train_set = train_set[: max(train_num_images, 0)]
    val_set = val_set[: max(val_num_images, 0)]
    test_set = test_set[: max(test_num_images, 0)]

    # Copy the files to the destination directory in YOLO format
    final_sets = (("train", train_set), ("val", val_set), ("test", test_set))
    for split, image_paths in final_sets:
        # Define the destination folders for images and labels
        images_folder = os.path.join(dest_folder, split, "images")
        labels_folder = os.path.join(dest_folder, split, "labels")

        # Create a text file listing all images in the split
        with open(os.path.join(dest_folder, f"{split}.txt"), "w") as f:
            for image_path in image_paths:
                image_name = os.path.basename(image_path)
                label_name = os.path.splitext(image_name)[0] + ".txt"
                # Labels live in a "labels" folder next to the "images" folder
                label_path = os.path.join(
                    os.path.dirname(os.path.dirname(image_path)), "labels", label_name
                )

                # Copy the image file
                shutil.copy(image_path, os.path.join(images_folder, image_name))

                # Write the relative path to the image in the list file
                f.write(f"{split}/images/{image_name}\n")

                # Copy the label file if it exists
                if os.path.exists(label_path):
                    shutil.copy(label_path, os.path.join(labels_folder, label_name))
                else:
                    print(
                        f"Warning: Label missing for {image_name} in {os.path.dirname(image_path)}"
                    )

    # Create the data.yaml file for YOLO training
    with open(os.path.join(dest_folder, "data.yaml"), "w") as f:
        f.write(f"train: {os.path.join(dest_folder, 'train.txt')}\n")
        f.write(f"val: {os.path.join(dest_folder, 'val.txt')}\n")
        f.write(f"test: {os.path.join(dest_folder, 'test.txt')}\n")
        f.write(f"nc: {len(source_folders)}\n")
        f.write(f"names: {list(source_folders.keys())}\n")

    # Print statistics about the number of images in each split
    # (ratios are reported as 0 when nothing was selected)
    total_images = len(train_set) + len(val_set) + len(test_set)
    train_share = len(train_set) / total_images if total_images else 0.0
    val_share = len(val_set) / total_images if total_images else 0.0
    test_share = len(test_set) / total_images if total_images else 0.0
    print("Number of images per class in each split:")
    print(
        f"Training set: Number of images: {len(train_set)} and Ratio: {train_share:.2f}"
    )
    print(
        f"Validation set: Number of images: {len(val_set)} and Ratio: {val_share:.2f}"
    )
    print(
        f"Test set: Number of images: {len(test_set)} and Ratio: {test_share:.2f}"
    )


# Example usage: class name -> root folder of that class. The paths are
# placeholders and must be replaced before running this cell.
source_folders = {
    "Close": "/path/to/close_class",
    "Open": "/path/to/open_class",
    "Semi-Open": "/path/to/semiopen_class",
}
# Requested share of each split
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Placeholder destination of the balanced YOLO dataset
destination_folder_path = "/path/to/save/balanced_dataset"

split_and_balance_dataset(
    source_folders, destination_folder_path, train_ratio, val_ratio, test_ratio
)

### Function to create the final FPP Training set

In [None]:
import os
import shutil
import random
import json


def collect_data(source_folder, split):
    """
    Collects image and label data from the specified source folder and split (train, val, or test).

    Each sub-folder of ``<source_folder>/<split>`` that contains an ``images``
    folder is treated as one video. For every image, the first line of the
    matching file in the sibling ``labels`` folder is read as
    ``class x y w h``; when the label file is missing or malformed, the class
    and box values are ``None``.

    Args:
        source_folder (str): The path to the folder containing the dataset for a specific class.
        split (str): The data split to collect images from (e.g., 'train', 'val', 'test').

    Returns:
        list: A list of dictionaries, each representing a video with its frames and associated data.
        Videos are ordered by folder name and frames by frame ID. The list is
        empty when the split folder does not exist.
    """
    all_data = []
    split_path = os.path.join(source_folder, split)

    # A class without this split simply contributes no video
    if not os.path.isdir(split_path):
        return all_data

    # Sorted so the output does not depend on the filesystem's listing order
    for video_folder in sorted(os.listdir(split_path)):
        images_path = os.path.join(split_path, video_folder, "images")
        if not os.path.isdir(images_path):
            continue

        labels_path = os.path.join(split_path, video_folder, "labels")
        frames = []

        for image_name in os.listdir(images_path):
            if not image_name.endswith((".jpg", ".jpeg", ".png")):
                continue

            # The label file shares the image's base name
            label_path = os.path.join(
                labels_path, os.path.splitext(image_name)[0] + ".txt"
            )

            # The frame ID is the number after the last "_" of the image name
            frame_id = int(image_name.split("_")[-1].split(".")[0])

            # Bounding box values stay None unless a well-formed label is found
            class_id, x, y, w, h = None, None, None, None, None
            if os.path.exists(label_path):
                with open(label_path, "r") as f:
                    fields = f.readline().strip().split()
                if len(fields) == 5:
                    class_id = int(fields[0])
                    x, y, w, h = (float(field) for field in fields[1:])

            frames.append(
                {
                    "frame_id": frame_id,
                    "image_name": image_name,
                    "class_id": class_id,
                    "bbox": [x, y, w, h],
                }
            )

        # Sort frames by their frame ID
        frames.sort(key=lambda frame: frame["frame_id"])
        all_data.append({"video_id": video_folder, "frames": frames})

    return all_data


def prepare_datasets(source_folders, dest_folder):
    """
    Prepares datasets by collecting data from multiple source folders for different classes
    and merging them into training, validation, and test datasets. The resulting datasets
    are saved as JSON files in the destination folder.

    Args:
        source_folders (dict): A dictionary where keys are class names and values are paths to the corresponding class folders.
        dest_folder (str): The destination folder where the JSON files (train, val, test) will be saved.
            It is created if it does not exist.

    Returns:
        None
    """
    # Make sure the destination exists before writing into it
    os.makedirs(dest_folder, exist_ok=True)

    # One combined list of videos per split, filled class by class
    combined = {"train": [], "val": [], "test": []}
    for folder in source_folders.values():
        for split, videos in combined.items():
            videos.extend(collect_data(folder, split))

    # Save the combined data to train.json, val.json and test.json
    for split, videos in combined.items():
        with open(os.path.join(dest_folder, f"{split}.json"), "w") as f:
            json.dump(videos, f, indent=4)


# Example usage: class name -> root folder of that class. The paths are
# placeholders and must be replaced before running this cell.
source_folders = {
    "Close": "/path/to/close_class",
    "Open": "/path/to/open_class",
    "Semi-Open": "/path/to/semiopen_class",
}

# Placeholder folder that receives train.json, val.json and test.json
destination_folder_path = "/path/to/save/annotated_fpp"

prepare_datasets(source_folders, destination_folder_path)