### Create a Training and Validation Dataset

This script, authored by AliRKhojasteh, converts image and mask files into the **COCO dataset format**. It randomly separates frames into **training** and **validation sets**, processes each frame to extract contours from masks, and stores image and annotation data in the COCO format. 

The **total number of frames** equals the number of training frames plus the number of evaluation (validation) frames.

The resulting **COCO-format datasets** are saved as JSON files.

**Author:** AliRKhojasteh  
**License:** Apache 2.0

In [10]:
import os

# Resolve the data folder relative to the parent of the working directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Pass each path component separately: a literal 'Demo\Train_data' contains
# the invalid escape sequence '\T' and only forms a nested path on Windows.
image_dir = os.path.join(parent_dir, 'Demo', 'Train_data')
# Masks are read from the same folder as the images.
mask_dir = os.path.join(parent_dir, 'Demo', 'Train_data')
print(image_dir)

# Split configuration: overall frame count and how many are held out.
Total_frames = 10
Evaluation_frames = 3

# Share of frames reserved for evaluation, reported for information only.
evaluation_percentage = (Evaluation_frames / Total_frames) * 100
print(f"\n\nTotal frames: {Total_frames}, using {evaluation_percentage} % for evaluation.")


i:\My Drive\Flow_segmentation\Demo\Train_data


Total frames: 10, using 30.0 % for evaluation.


In [11]:
import cv2
import json
import random
import numpy as np
from PIL import Image
from pycocotools import mask as maskUtils


# One COCO container per split; both declare the single category "jet" (id 1).
coco_format_train = {"images": [], "annotations": [], "categories": [{"id": 1, "name": "jet"}]}
coco_format_val = {"images": [], "annotations": [], "categories": [{"id": 1, "name": "jet"}]}

# range() excludes its stop value, so the stop must be Total_frames + 1 for
# the list to hold exactly Total_frames frame numbers (1 .. Total_frames).
# NOTE(review): assumes frame files are numbered starting at 1 - confirm
# against the contents of the data folder.
all_frames = list(range(1, Total_frames + 1))
# Evaluation frames: drawn at random without replacement (unseeded, so the
# split differs between runs).
val_frames = random.sample(all_frames, Evaluation_frames)
# Training frames: everything that was not drawn for evaluation.
train_frames = [frame for frame in all_frames if frame not in val_frames]

def process_frame(i, coco_format):
    """Add the image record and contour annotations of frame ``i``.

    Opens ``Jet_image_{i}.png`` from ``image_dir`` to obtain its size and
    reads ``Jet_mask_{i}.png`` from ``mask_dir`` as a grayscale image. Every
    contour found in the mask becomes one annotation holding its bounding
    box and an RLE encoding of the filled contour.

    Args:
        i: Frame number. Used to build the file names, as the image id, and
            as the base of the annotation ids.
        coco_format: Dictionary with "images" and "annotations" lists; it is
            extended in place.

    Raises:
        FileNotFoundError: If the image file is missing or the mask file
            cannot be read.
    """
    image_filename = f"Jet_image_{i}.png"
    mask_filename = f"Jet_mask_{i}.png"

    # Only the dimensions are needed, so close the file handle right away.
    with Image.open(os.path.join(image_dir, image_filename)) as image:
        width, height = image.size

    mask_path = os.path.join(mask_dir, mask_filename)
    mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
    if mask is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than inside findContours.
        raise FileNotFoundError(f"Could not read mask file: {mask_path}")

    # NOTE(review): RETR_TREE also returns nested contours and the hierarchy
    # is discarded, so inner contours get their own annotation - confirm
    # this is intended.
    contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    coco_format["images"].append({
        "file_name": image_filename,
        "height": height,
        "width": width,
        "id": i
    })

    for j, contour in enumerate(contours):

        # Get the bounding box coordinates
        x, y, w, h = cv2.boundingRect(contour)

        # Rasterize this contour alone (filled with 1) on an image-sized canvas.
        mask_contour = np.zeros((height, width), dtype=np.uint8)
        cv2.drawContours(mask_contour, [contour], -1, (1), thickness=cv2.FILLED)

        # pycocotools expects a Fortran-ordered array; the encoded counts are
        # bytes and are decoded to str so they can be written as JSON.
        mask_rle = maskUtils.encode(np.asfortranarray(mask_contour))
        mask_rle_list = mask_rle['counts'].decode('utf-8')

        # Add annotation information
        coco_format["annotations"].append({
            "bbox": [x, y, w, h],
            "image_id": i,
            "category_id": 1,
            # Unique as long as a frame has fewer than 1000 contours.
            "id": i * 1000 + j,
            "segmentation": {"counts": mask_rle_list, "size": mask_rle['size']}  # Add the RLE-encoded mask
        })

# Fill each COCO dictionary from its own frame list: training first, then
# validation.
for frame_ids, target in ((train_frames, coco_format_train), (val_frames, coco_format_val)):
    for i in frame_ids:
        process_frame(i, target)


# Destination files for the two COCO-format datasets
annotations_train_path = os.path.join(parent_dir, "Demo/Train_data/annotations_train.json")
annotations_val_path = os.path.join(parent_dir, "Demo/Train_data/annotations_val.json")

# Serialize each dataset to its JSON file, training set first.
for out_path, dataset in ((annotations_train_path, coco_format_train), (annotations_val_path, coco_format_val)):
    with open(out_path, "w") as f:
        json.dump(dataset, f)