## Creating YOLO Training Dataset

This script generates YOLO-compatible training data from annotated JSON files and images. It performs the following tasks:

1. **Define Labels and Create a Label Map**: Defines a list of object labels and maps them to class IDs.
2. **Ensure Directories Exist**: Checks that the label directory exists, creating it if necessary.
3. **Open Output CSV File**: Opens a CSV file to write annotations.
4. **Read JSON Annotations**: Reads annotations from a JSON file and processes each frame.
5. **Process Each Image**:
    - Constructs the image and label file paths.
    - Checks if the image file exists before processing.
    - Reads the image and retrieves its dimensions.
    - Opens a label file for writing YOLO annotations.
6. **Extract and Write Annotations**:
    - Extracts bounding box coordinates and labels from the JSON annotations.
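    - Converts each bounding box from top-left pixel coordinates (`x`, `y`, `w`, `h`) to the YOLO format: box-center `x`/`y`, width, and height, each divided by the image width or height so the values are relative to the image size.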
    - Writes annotations to the CSV file and the YOLO label file.
    - Draws bounding boxes and labels on the image for visualization.
7. **Display Annotated Images**: Displays each annotated image with its bounding boxes for a brief period (10 ms per frame).
8. **Ensure Label File Existence**: Ensures that each image has a corresponding label file, creating empty label files if necessary.
9. **Print Completion Message**: Prints a message indicating the completion of CSV file creation, YOLO label files, and validation.

The function `create_yolo_training_dataset` is called with the paths to the image and label directories to generate the necessary files for training a YOLO model.


In [2]:
import json
import os
import cv2
import csv

def _to_yolo_box(x, y, box_width, box_height, img_width, img_height):
    """Convert a top-left pixel box into normalized YOLO (x_center, y_center, w, h)."""
    return (
        (x + box_width / 2) / img_width,
        (y + box_height / 2) / img_height,
        box_width / img_width,
        box_height / img_height,
    )


def create_yolo_training_dataset(image_directory, label_directory, json_file='index_train.json', output_csv='annotations.csv', *, show=True):
    """Generate YOLO label files and a summary CSV from a JSON annotation index.

    For every frame listed under the ``frames`` key of ``json_file`` the
    matching image in ``image_directory`` is read to obtain its size, and one
    ``<image stem>.txt`` file is written to ``label_directory`` with one line
    per known-label annotation: ``class_id x_center y_center width height``
    (all four box values divided by the image width/height). The same
    annotations are written, in pixel units, to ``output_csv``. Finally, every
    ``.jpg`` in ``image_directory`` that still has no label file gets an empty
    one.

    Args:
        image_directory: Directory containing the ``.jpg`` images.
        label_directory: Directory for the YOLO ``.txt`` files; created if
            missing.
        json_file: Path of the JSON annotation index.
        output_csv: Path of the CSV file to (over)write.
        show: When true (the default), draw the boxes on each image and show
            it for 10 ms. Pass ``False`` on machines without a display.

    Raises:
        OSError: If ``json_file`` or ``image_directory`` cannot be read or an
            output file cannot be written.
        KeyError: If a frame or annotation lacks an expected key.
    """
    labels = [
        "person",
        "bike",
        "car",
        "motor",
        "bus",
        "train",
        "truck",
        "scooter",
        "other_vehicle"
    ]

    label_map = {label: idx for idx, label in enumerate(labels)}

    # Ensure the labels directory exists
    os.makedirs(label_directory, exist_ok=True)

    # Load the annotations before touching the CSV so that a missing or
    # malformed JSON file does not leave a truncated, header-only CSV behind.
    with open(json_file, encoding='utf-8') as f:
        frames = json.load(f)['frames']

    window_opened = False
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(['image_name', 'label', 'x', 'y', 'width', 'height'])

        try:
            for frame in frames:
                meta = frame['videoMetadata']
                frame_index = str(meta['frameIndex']).zfill(6)
                image_name = f"video-{meta['videoId']}-frame-{frame_index}-{frame['datasetFrameId']}.jpg"
                image_path = os.path.join(image_directory, image_name)
                label_path = os.path.join(label_directory, f"{os.path.splitext(image_name)[0]}.txt")

                if not os.path.isfile(image_path):
                    print(f"Image {image_path} not found, skipping annotation.")
                    continue

                img = cv2.imread(image_path)
                if img is None:
                    # imread signals an unreadable/corrupt file by returning
                    # None instead of raising.
                    print(f"Image {image_path} could not be read, skipping annotation.")
                    continue
                img_height, img_width = img.shape[:2]

                with open(label_path, mode='w') as label_file:
                    for anno in frame["annotations"]:
                        anno_labels = anno['labels']
                        if not anno_labels:
                            # Nothing to classify this box as.
                            continue
                        label = anno_labels[0]
                        if label not in label_map:
                            continue
                        class_id = label_map[label]

                        box = anno["boundingBox"]
                        x, y = box["x"], box["y"]
                        bbox_width, bbox_height = box["w"], box["h"]

                        x_center, y_center, norm_width, norm_height = _to_yolo_box(
                            x, y, bbox_width, bbox_height, img_width, img_height
                        )

                        # CSV keeps the original pixel-space box.
                        csv_writer.writerow([image_name, label, x, y, bbox_width, bbox_height])
                        label_file.write(f"{class_id} {x_center} {y_center} {norm_width} {norm_height}\n")

                        if show:
                            # OpenCV drawing calls need integer pixel coordinates.
                            left, top = int(round(x)), int(round(y))
                            right = int(round(x + bbox_width))
                            bottom = int(round(y + bbox_height))
                            cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0), 2)
                            cv2.putText(img, label, (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

                if show:
                    cv2.imshow('Image', img)
                    window_opened = True
                    cv2.waitKey(10)  # Keep each frame on screen for 10 ms
        finally:
            # Close the preview window once, rather than recreating it per frame.
            if window_opened:
                cv2.destroyAllWindows()

    # Ensure each image has a corresponding (possibly empty) label file
    for image_name in os.listdir(image_directory):
        if not image_name.endswith('.jpg'):
            continue
        label_path = os.path.join(label_directory, f"{os.path.splitext(image_name)[0]}.txt")
        if not os.path.exists(label_path):
            with open(label_path, 'w'):
                pass

    print("CSV file creation, YOLO label files, and validation complete.")

# Build the dataset: images are read from data_train/ and the YOLO label
# files are written to label_train/ (JSON index and CSV paths use defaults).
create_yolo_training_dataset(
    image_directory='data_train/',
    label_directory='label_train/',
)


CSV file creation, YOLO label files, and validation complete.
