# Dataset Preparation


## Frames Extraction


In [None]:
import cv2
import os


# Function to extract frames from video
def extract_frames(video_path, output_folder):
    """Save every frame of a video as ``frame_<index>.jpg`` in ``output_folder``.

    Frames are numbered from 0 in the order they are read.

    Parameters:
        video_path (str): Path to the video file to read.
        output_folder (str): Directory receiving the JPEG files; it is
            created if it does not exist yet.

    Returns:
        int: Number of frames successfully written to disk.
    """
    # cv2.imwrite signals failure only through its boolean return value, so a
    # missing directory would otherwise go unnoticed.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Initialize the video capture object
    cap = cv2.VideoCapture(video_path)

    count = 0  # index of the frame currently being read
    saved = 0  # frames actually written to disk

    try:
        if not cap.isOpened():
            # Distinguish "cannot open" from a genuine end of stream.
            print(f"Could not open video: {video_path}")
            return saved

        while True:
            # Read each new frame
            success, img = cap.read()

            # A failed read means there are no more frames
            if not success:
                print("End of video reached.")
                break

            # Save the current frame as a JPEG image
            output_file = os.path.join(output_folder, f"frame_{count}.jpg")
            if cv2.imwrite(output_file, img):
                saved += 1
            else:
                print(f"Failed to write frame: {output_file}")

            # Keep the index aligned with the position in the video even
            # when a single write fails.
            count += 1
    finally:
        # Always release the capture, even if reading or writing raised.
        # No window is ever opened here, so no GUI cleanup call is made
        # (such calls can fail on headless OpenCV builds).
        cap.release()

    return saved

In [None]:
# Replace the two folder paths below with your own input (videos) and output (frames) directories
# input_folder_path = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/data/official_dataset/test"
# output_folder_path = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/data/frames"
# for video in os.listdir(input_folder_path):
#     video_path = os.path.join(input_folder_path, video)
#     file_name = video.split(".")[0]
#     output_path = os.path.join(output_folder_path, file_name)
#     os.makedirs(output_path, exist_ok=True)
#     extract_frames(video_path, output_path)
#     print(f"Frames extracted from {video_path}")

## Label Studio Conversion

In [None]:
import json 

def convert_json(input_file, output_file):
    """Convert a Label Studio JSON export into a flat list of pixel bounding boxes.

    For every task that has at least one annotation with at least one result,
    the first result of the first annotation is converted from percentages to
    pixels and written as ``{"frame": <name>, "bbox": [x, y, width, height]}``.
    Tasks without annotations or results are left out.

    Parameters:
        input_file (str): Path to the JSON file to read.
        output_file (str): Path of the JSON file to write.
    """

    def to_pixels(percent, extent):
        # Coordinates are stored as percentages of the original image size.
        return percent * extent / 100.0

    with open(input_file, "r") as source:
        tasks = json.load(source)

    converted = []

    for task in tasks:
        # Frame name: last "-"-separated chunk of the uploaded file name,
        # without its extension.
        frame_name = task["file_upload"].split("-")[-1].split(".")[0]

        annotations = task["annotations"]
        if not annotations:
            continue

        results = annotations[0]["result"]
        if not results:
            continue

        region = results[0]
        box = region["value"]

        left = to_pixels(box["x"], region["original_width"])
        top = to_pixels(box["y"], region["original_height"])
        box_width = to_pixels(box["width"], region["original_width"])
        box_height = to_pixels(box["height"], region["original_height"])

        converted.append(
            {
                "frame": frame_name,
                "bbox": [left, top, box_width, box_height],
            }
        )

    with open(output_file, "w") as target:
        json.dump(converted, target, indent=4)

In [None]:
# Specify your input and output file paths
# input_file = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/code/LSTM/data/outdoor2/test_outdoor2.json"
# output_file = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/code/LSTM/data/outdoor2/outdoor2_full_dataset.json"

# convert_json(input_file, output_file)

### Function to create training, validation & test datasets with a classic JSON file

In [None]:
import json
from sklearn.model_selection import train_test_split


def create_datasets(
    annotation_file, train_ratio=0.64, val_ratio=0.16, test_ratio=0.2, output_dir="."
):
    """
    Creates train.json, validation.json and test.json files from annotations.json.

    The annotations are split into three contiguous, non-shuffled slices, in
    the order train, validation, test.

    Parameters :
        annotation_file (str): Path to the annotations.json file
        train_ratio (float): Ratio of data allocated to training
        val_ratio (float): Ratio of data assigned to validation
        test_ratio (float): Ratio of data allocated to tests
        output_dir (str): Directory where the resultant files will be saved

    Raises :
        AssertionError: If the three ratios do not sum to 1.0
    """
    # Checks that the division ratios are correct. A tolerance is required:
    # sums such as 0.7 + 0.2 + 0.1 are not exactly 1.0 in floating point.
    assert (
        abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6
    ), "Ratios should sum to 1.0"

    # Loads the annotations json file
    with open(annotation_file, "r") as f:
        annotations = json.load(f)

    num_annotations = len(annotations)

    # Compute the split indices
    train_end = int(train_ratio * num_annotations)
    val_end = int((train_ratio + val_ratio) * num_annotations)

    # Slice the annotations into contiguous runs for training, validation and testing
    train_annotations = annotations[:train_end]
    val_annotations = annotations[train_end:val_end]
    test_annotations = annotations[val_end:]

    # Saves JSON files for training, validation and testing
    with open(f"{output_dir}/train.json", "w") as f:
        json.dump(train_annotations, f, indent=4)

    with open(f"{output_dir}/validation.json", "w") as f:
        json.dump(val_annotations, f, indent=4)

    with open(f"{output_dir}/test.json", "w") as f:
        json.dump(test_annotations, f, indent=4)

    print(
        "‘The files train.json, validation.json and test.json have been created and saved."
    )

In [None]:

# create_datasets(
#     "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/code/LSTM/data/outdoor1/outdoor1_full_dataset.json",
#     output_dir="/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/code/LSTM/data/outdoor1",
# )

### Function to create training, validation & test datasets with a YOLO format dataset folder

In [1]:
import os
import random
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split

def create_splits_and_yaml(dataset_path, train_size=0.64, val_size=0.16, test_size=0.2):
    """Split a YOLO-format dataset into train/val/test folders and write dataset.yaml.

    ``dataset_path`` must contain an ``images`` folder (``*.jpg`` / ``*.png``)
    and a ``labels`` folder. Each image, and its same-named ``.txt`` label
    when one exists, is copied (the originals are left in place) into
    ``<dataset_path>/<split>/images`` and ``<dataset_path>/<split>/labels``.
    A ``dataset.yaml`` describing the single class and the three split
    folders is then written at the dataset root.

    Parameters:
        dataset_path (str | Path): Root folder of the dataset.
        train_size (float): Share of the data used for training.
        val_size (float): Share of the data used for validation.
        test_size (float): Share of the data used for testing.

    Raises:
        AssertionError: If the ``images`` or ``labels`` folder is missing.
    """
    dataset_path = Path(dataset_path)
    images_path = dataset_path / "images"
    labels_path = dataset_path / "labels"

    assert images_path.exists() and labels_path.exists(), "Invalid dataset path, or missing 'images' and 'labels' folders."

    # Get all image files. glob() yields them in a filesystem-dependent order,
    # so sort first: otherwise the fixed random_state below does not give the
    # same split on every machine / run.
    images = sorted(list(images_path.glob("*.jpg")) + list(images_path.glob("*.png")))

    # Split off the test set first, then carve validation out of the remainder.
    # val_size is rescaled because it is expressed relative to the full dataset.
    train_images, test_images = train_test_split(images, test_size=test_size, random_state=42)
    train_images, val_images = train_test_split(train_images, test_size=val_size / (train_size + val_size), random_state=42)

    def copy_files(file_list, dest_images_folder, dest_labels_folder):
        # Copy each image and, when present, the label sharing its stem.
        for image in file_list:
            label_file = labels_path / (image.stem + ".txt")
            if label_file.exists():
                shutil.copy(label_file, dest_labels_folder / label_file.name)
            shutil.copy(image, dest_images_folder / image.name)

    splits = {"train": train_images, "val": val_images, "test": test_images}
    for folder, split_images in splits.items():
        dest_images_folder = dataset_path / folder / "images"
        dest_labels_folder = dataset_path / folder / "labels"
        dest_images_folder.mkdir(parents=True, exist_ok=True)
        dest_labels_folder.mkdir(parents=True, exist_ok=True)
        copy_files(split_images, dest_images_folder, dest_labels_folder)

    # Create the .yaml file. It is built line by line so that no source-code
    # indentation leaks into the file.
    yaml_lines = [
        "# class names",
        "names:",
        "    0: fuel port",
        "",
        "# number of classes",
        "nc: 1",
        "",
        f"train: {str(dataset_path / 'train')}",
        f"val: {str(dataset_path / 'val')}",
        f"test: {str(dataset_path / 'test')}",
    ]

    with open(dataset_path / "dataset.yaml", "w") as yaml_file:
        yaml_file.write("\n".join(yaml_lines) + "\n")

In [2]:
# Example usage: split the annotated YOLO dataset in place. The call copies the
# images/labels into train/, val/ and test/ sub-folders of dataset_folder and
# writes dataset.yaml at its root.
# NOTE: this cell is not commented out, so it runs against the path below.
dataset_folder = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/data/frames/full_dataset_annotated_YOLO"  # replace with your actual dataset path
create_splits_and_yaml(dataset_folder)

### Converter for the provided dataset to YOLO format

In [None]:
import os
import json
from glob import glob
import shutil

def convert_annotation(json_file, output_dir_labels):
    """Convert one JSON annotation file into a YOLO ``.txt`` label file.

    The input is expected to hold ``imageWidth``, ``imageHeight`` and a
    ``shapes`` list whose entries carry ``shape_type`` and ``points``. Every
    shape of type ``rectangle`` produces one line
    ``<class_id> <x_center> <y_center> <width> <height>``, all values being
    normalized by the image size. Other shape types are skipped.

    The output file is named after the JSON file (extension replaced by
    ``.txt``) and is created even when no rectangle is found.

    Parameters:
        json_file (str): Path to the annotation file to convert.
        output_dir_labels (str): Existing directory receiving the label file.
    """
    with open(json_file, 'r') as f:
        data = json.load(f)

    label_name = os.path.splitext(os.path.basename(json_file))[0] + '.txt'
    txt_output_path = os.path.join(output_dir_labels, label_name)

    class_id = 0  # Update this if you have multiple classes and a class mapping system

    with open(txt_output_path, 'w') as txt_out:
        for shape in data['shapes']:
            if shape['shape_type'] != 'rectangle':
                continue

            points = shape['points']
            x1, y1 = points[0]
            x2, y2 = points[1]

            # Convert to YOLO format
            width = data['imageWidth']
            height = data['imageHeight']
            xc = (x1 + x2) / 2 / width
            yc = (y1 + y2) / 2 / height
            # The two corners may be stored in any order (e.g. a box drawn
            # from bottom-right to top-left); abs() keeps the size positive.
            w = abs(x2 - x1) / width
            h = abs(y2 - y1) / height

            txt_out.write(f"{class_id} {xc} {yc} {w} {h}\n")

def convert_annotations_in_directory(input_dir, output_dir_images, output_dir_labels):
    """Convert every JSON annotation of a folder and gather its JPEG images.

    Each ``*.json`` file directly inside ``input_dir`` is passed to
    ``convert_annotation`` with ``output_dir_labels`` as destination, then
    each ``*.jpg`` file of ``input_dir`` is copied into ``output_dir_images``
    unless a file with the same name is already there.

    Parameters:
        input_dir (str): Folder holding the ``.json`` and ``.jpg`` files.
        output_dir_images (str): Destination folder for the images.
        output_dir_labels (str): Destination folder for the label files.
    """
    # Make sure both destination folders exist before writing anything.
    for target_dir in (output_dir_images, output_dir_labels):
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

    for annotation_path in glob(os.path.join(input_dir, '*.json')):
        convert_annotation(annotation_path, output_dir_labels)

    # Copy image files to the output images directory, never overwriting
    # an image that is already present.
    for source_image in glob(os.path.join(input_dir, '*.jpg')):
        target_image = os.path.join(output_dir_images, os.path.basename(source_image))
        if os.path.exists(target_image):
            continue
        shutil.copy(source_image, target_image)

In [None]:
# if __name__ == '__main__':
#     input_directory = '/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/data/official_dataset_annotated/experiments/video_lab_platform_8'  # Replace with the path to your input directory
#     output_directory_images = input_directory + "_yolo/images"  # Replace with the path to your output images directory
#     output_directory_labels = input_directory + "_yolo/labels"  # Replace with the path to your output labels directory

#     convert_annotations_in_directory(input_directory, output_directory_images, output_directory_labels)

In [None]:
import os
import shutil

# Merge several per-experiment YOLO datasets into a single dataset.
# Every direct sub-directory of `experiments_folder` is read for an "images"
# and a "labels" folder; their files are copied into one shared images/ and
# one shared labels/ folder under `merged_folder`.

# Set the path to the "experiments" folder
experiments_folder = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/data/official_dataset_annotated/experiments/yolo"

# Set the path to the merged dataset folder
merged_folder = "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/data/official_dataset_annotated/experiments/yolo_merged_dataset"
merged_images_folder = os.path.join(merged_folder, "images")
merged_labels_folder = os.path.join(merged_folder, "labels")

# Create the merged dataset folder if it doesn't exist
os.makedirs(merged_images_folder, exist_ok=True)
os.makedirs(merged_labels_folder, exist_ok=True)

# Iterate over each subfolder in the "experiments" folder
for subfolder in os.listdir(experiments_folder):
    subfolder_path = os.path.join(experiments_folder, subfolder)

    # Check if the subfolder is a directory (plain files are ignored)
    if os.path.isdir(subfolder_path):
        # NOTE(review): both folders are assumed to exist in every
        # sub-directory; os.listdir below raises if one is missing.
        images_folder = os.path.join(subfolder_path, "images")
        labels_folder = os.path.join(subfolder_path, "labels")

        # Iterate over each file in the "images" folder
        for filename in os.listdir(images_folder):
            old_image_path = os.path.join(images_folder, filename)
            # Prefix with the sub-directory name so that identically named
            # files from different experiments do not overwrite each other.
            new_image_filename = f"{subfolder}_{filename}"
            new_image_path = os.path.join(merged_images_folder, new_image_filename)

            # Copy the image file to the merged dataset folder with the new filename
            shutil.copy2(old_image_path, new_image_path)

        # Iterate over each file in the "labels" folder
        for filename in os.listdir(labels_folder):
            old_label_path = os.path.join(labels_folder, filename)
            # Same prefix as for the images, so image/label stems keep matching.
            new_label_filename = f"{subfolder}_{filename}"
            new_label_path = os.path.join(merged_labels_folder, new_label_filename)

            # Copy the label file to the merged dataset folder with the new filename
            shutil.copy2(old_label_path, new_label_path)

print("Merging complete!")

### Convert Label Studio CSV Export file to full training json file

In [None]:
import csv
import json


def convert_csv_to_json(csv_file):
    """Group the rows of a Label Studio CSV export into per-video frame lists.

    Each row must provide an ``image`` column, whose file name ends with
    ``_<frame number>.<ext>``, and a ``label`` column holding either an empty
    string or a JSON list of annotations. Only the first annotation of a row
    is used; its percentage coordinates are converted to pixels. Rows without
    annotation get the bounding box ``[0, 0, 0, 0]``.

    A row whose frame number is 0 starts a new video. Videos are numbered
    from 1 in order of appearance.

    Parameters:
        csv_file (str): Path to the CSV file to read.

    Returns:
        list[dict]: One ``{"video_id": int, "frames": [...]}`` entry per
        video, each frame being ``{"frame": int, "bbox": [x, y, w, h]}``.
    """
    data = []
    video_id = 1
    frames = []

    # newline="" is what the csv module recommends so that quoted fields
    # containing line breaks are parsed correctly.
    with open(csv_file, "r", newline="") as file:
        reader = csv.DictReader(file)
        for row in reader:
            try:
                annotation = json.loads(row["label"]) if row["label"] else []
            except json.JSONDecodeError:
                # Skip any rows that have an issue with the 'label' field
                print(f"Skipping row: {row}")
                continue

            # Extract the frame number from the 'image' field
            frame_id = int(row["image"].split("_")[-1].split(".")[0])

            # Frame 0 marks the beginning of a new video: store the one
            # collected so far before starting over.
            if frame_id == 0 and frames:
                data.append({"video_id": video_id, "frames": frames})
                video_id += 1
                frames = []

            if annotation:
                # Convert percentages to pixel values based on original image width/height
                first = annotation[0]
                x = first["x"] * first["original_width"] / 100.0
                y = first["y"] * first["original_height"] / 100.0
                width = first["width"] * first["original_width"] / 100.0
                height = first["height"] * first["original_height"] / 100.0
            else:
                x = y = width = height = 0

            frames.append({"frame": frame_id, "bbox": [x, y, width, height]})

    # The last video is not followed by another "frame 0" row, so it has to
    # be stored explicitly once the file is exhausted.
    if frames:
        data.append({"video_id": video_id, "frames": frames})

    return data

In [None]:
# Example usage
# csv_file = '/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/data/frames/full_dataset_annotated_fpp/raw_data.csv'
# json_data = convert_csv_to_json(csv_file)

# with open('/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/data/frames/full_dataset_annotated_fpp/full_dataset_formatted.json', 'w') as file:
#     json.dump(json_data, file, indent=2)

### Prepare training dataset for the future position prediction model

In [None]:
import json

def split_video(video, train_ratio, val_ratio, test_ratio):
    """Cut the frames of one video into contiguous train/validation/test parts.

    The frame order is preserved: the first ``train_ratio`` share goes to
    training, the next ``val_ratio`` share to validation and everything left
    to testing (``test_ratio`` itself is not used in the computation).

    Parameters:
        video (dict): Mapping with a ``video_id`` and a ``frames`` list.
        train_ratio (float): Share of the frames assigned to training.
        val_ratio (float): Share of the frames assigned to validation.
        test_ratio (float): Share of the frames assigned to testing.

    Returns:
        tuple: Three ``{"video_id": ..., "frames": [...]}`` dicts, in the
        order (train, validation, test).
    """
    all_frames = video['frames']
    total = len(all_frames)

    first_cut = int(train_ratio * total)
    second_cut = int((train_ratio + val_ratio) * total)

    # (start, stop) pairs of the three consecutive slices; None is an open end.
    bounds = ((None, first_cut), (first_cut, second_cut), (second_cut, None))

    return tuple(
        {'video_id': video['video_id'], 'frames': all_frames[start:stop]}
        for start, stop in bounds
    )

def create_datasets(annotation_file, train_ratio=0.64, val_ratio=0.16, test_ratio=0.2, output_dir="."):
    """
    Create train.json, validation.json and test.json from an annotations file.

    Each video of the annotations file is split on its own with
    ``split_video``, so every output file holds one entry per video.

    Parameters:
        annotation_file (str): Path to the annotations JSON file
        train_ratio (float): Share of the data allocated to training
        val_ratio (float): Share of the data allocated to validation
        test_ratio (float): Share of the data allocated to testing
        output_dir (str): Directory where the resulting files are saved
    """
    # Load the annotations JSON file
    with open(annotation_file, "r") as f:
        annotations = json.load(f)

    # Check that the ratios add up to 1 (with a small floating-point tolerance)
    ratio_total = train_ratio + val_ratio + test_ratio
    assert abs(ratio_total - 1.0) < 1e-6, "Les ratios doivent correspondre à 1.0"

    # Output file stem -> list of per-video parts, in writing order
    subsets = {"train": [], "validation": [], "test": []}

    # Split every video separately
    for video in annotations:
        train_part, val_part, test_part = split_video(video, train_ratio, val_ratio, test_ratio)
        subsets["train"].append(train_part)
        subsets["validation"].append(val_part)
        subsets["test"].append(test_part)

    # Save the JSON files for training, validation and testing
    for stem, entries in subsets.items():
        with open(f"{output_dir}/{stem}.json", "w") as f:
            json.dump(entries, f, indent=4)

    print("Les fichiers train.json, validation.json et test.json ont été créés et enregistrés.")

In [None]:
# Exemple d'utilisation
# create_datasets(
#     "/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/data/frames/full_dataset_annotated_fpp/full_dataset_formatted.json",
#     output_dir="/Users/alexis/Library/CloudStorage/OneDrive-Balayre&Co/Cranfield/Thesis/thesis-github-repository/data/frames/full_dataset_annotated_fpp",
# )