In [1]:
import os
import json
import shutil
from collections import OrderedDict
import yaml # For data.yaml
from PIL import Image # To verify/get image dimensions if needed
import concurrent.futures
import time # For timing

# --- Utility Function: Convert to YOLO Format ---
def convert_yolo_format(image_width, image_height, points, class_id):
    """
    Converts a single bounding box given as corner points to a YOLO label line.

    Args:
        image_width: Image width in pixels.
        image_height: Image height in pixels.
        points: Iterable of [x, y] pairs (e.g. [[x1, y1], [x2, y2]]); the box is
            the axis-aligned extent of these points, so corner order is irrelevant.
        class_id: Class index written as the first field of the line.

    Returns:
        "class_id x_center_norm y_center_norm width_norm height_norm" with every
        coordinate normalized by the image dimensions and printed with six
        decimals, or None when there are no points or the box has no area after
        being clipped to the image.
    """
    if not points:
        return None  # Nothing to convert

    x_coords = [p[0] for p in points]
    y_coords = [p[1] for p in points]

    # Clip to the image extent [0, width] x [0, height]. Annotation coordinates
    # are continuous positions, not pixel indices, so the upper bound is the
    # full width/height; clipping to (size - 1) would shrink every box that
    # touches the right or bottom edge.
    x_min_abs = max(0.0, float(min(x_coords)))
    y_min_abs = max(0.0, float(min(y_coords)))
    x_max_abs = min(float(image_width), float(max(x_coords)))
    y_max_abs = min(float(image_height), float(max(y_coords)))

    # Also rejects non-positive image sizes (the max bound collapses to <= 0),
    # which guarantees the divisions below never see a zero denominator.
    if x_min_abs >= x_max_abs or y_min_abs >= y_max_abs:
        return None  # Invalid box

    box_width_abs = x_max_abs - x_min_abs
    box_height_abs = y_max_abs - y_min_abs

    x_center_abs = x_min_abs + box_width_abs / 2.0
    y_center_abs = y_min_abs + box_height_abs / 2.0

    # Normalize
    x_center_norm = x_center_abs / image_width
    y_center_norm = y_center_abs / image_height
    width_norm = box_width_abs / image_width
    height_norm = box_height_abs / image_height

    return f"{class_id} {x_center_norm:.6f} {y_center_norm:.6f} {width_norm:.6f} {height_norm:.6f}"

# --- Fixed Class Definition ---
def get_fixed_class_map(class_name="bark_beetle", class_id=0):
    """
    Builds the class-name -> class-ID mapping for the single fixed class.

    The chosen class is echoed to stdout. The returned OrderedDict contains
    exactly one entry: class_name mapped to class_id.
    """
    print(f"Setting up fixed class: '{class_name}' with ID {class_id}")

    single_class_map = OrderedDict()
    single_class_map[class_name] = class_id

    print("Using 1 class:")
    for label in single_class_map:
        print(f"  '{label}': {single_class_map[label]}")

    return single_class_map

# --- Worker for processing one image-JSON pair (Single Class) ---
def _process_single_image_json_pair_single_class(args_tuple):
    """
    Copies one PNG to the output folder and writes its single-class YOLO label file.

    Every shape in the JSON whose shape_type is "rectangle" and which has exactly
    two points is converted with convert_yolo_format using the fixed class ID.

    Args:
        args_tuple: (img_filename_no_ext, source_data_path, fixed_class_id,
                     output_image_path, output_label_path), packed into one
                     tuple so the function takes a single argument.

    Returns:
        True if the image was copied. A label file is written only when at least
        one valid box exists; otherwise the image is kept as a negative sample
        and any label file already present at output_label_path is removed.
        False if the PNG or JSON is missing, the image dimensions are unusable,
        or any error occurs; on the latter two the copied image and any label
        file at the output paths are removed.
    """
    img_filename_no_ext, source_data_path, fixed_class_id, \
    output_image_path, output_label_path = args_tuple

    source_png_path = os.path.join(source_data_path, f"{img_filename_no_ext}.png")
    source_json_path = os.path.join(source_data_path, f"{img_filename_no_ext}.json")

    if not (os.path.exists(source_png_path) and os.path.exists(source_json_path)):
        print(f"Warning (Worker): Missing PNG or JSON for {img_filename_no_ext} in {source_data_path}. Skipping.")
        return False

    try:
        # Copy image first
        shutil.copy2(source_png_path, output_image_path)

        with open(source_json_path, 'r') as f:
            data = json.load(f)

        image_height = data.get("imageHeight")
        image_width = data.get("imageWidth")

        # Fall back to the actual image file only for dimensions the JSON lacks.
        if image_height is None or image_width is None:
            with Image.open(source_png_path) as img_pil:
                pil_width, pil_height = img_pil.size
            if image_width is None:
                image_width = pil_width
            if image_height is None:
                image_height = pil_height

        if not image_height or not image_width:  # Handles 0 or None
            # Routed through the handler below so cleanup lives in one place.
            raise ValueError(f"unusable image dimensions {image_width}x{image_height}")

        yolo_annotations = []
        for shape in data.get("shapes", []):  # Assuming 'shapes' from LabelMe JSON
            points = shape.get("points")  # Assuming format [[x1,y1],[x2,y2]]
            shape_type = shape.get("shape_type")

            if not points or shape_type != "rectangle" or len(points) != 2:
                continue  # Skip non-rectangle or malformed shapes

            yolo_str = convert_yolo_format(image_width, image_height, points, fixed_class_id)
            if yolo_str:
                yolo_annotations.append(yolo_str)

        if yolo_annotations:
            with open(output_label_path, 'w') as f_label:
                f_label.write("\n".join(yolo_annotations) + "\n")
        elif os.path.exists(output_label_path):
            # Negative sample: a label left over from an earlier run would
            # attach boxes to an image that no longer has any.
            os.remove(output_label_path)

        return True

    except Exception as e:
        # Worker boundary: report the problem and leave no half-written pair
        # behind instead of propagating the error to the caller.
        print(f"Warning (Worker): Error processing {source_png_path} or {source_json_path}: {e}")
        for leftover_path in (output_image_path, output_label_path):
            if os.path.exists(leftover_path):
                os.remove(leftover_path)
        return False

# --- Main Function to Create YOLO Test Dataset (Single Class) ---
def create_yolo_test_dataset_single_class(
    source_test_data_dir,
    output_yolo_test_dir,
    fixed_class_name="bark_beetle", # Name of the single class
    fixed_class_id=0,             # YOLO ID for this class (usually 0)
    max_workers=None
):
    """
    Creates a YOLO-formatted test dataset with all objects mapped to a single class.

    Every PNG in source_test_data_dir that has a same-named JSON next to it is
    handed to _process_single_image_json_pair_single_class in a process pool.
    Images go to <output>/images/test, labels to <output>/labels/test, and a
    data.yaml describing the dataset is written to the output root.

    Args:
        source_test_data_dir: Folder containing the .png images and .json annotations.
        output_yolo_test_dir: Dataset root to create (directories are made if missing).
        fixed_class_name: Name of the single class.
        fixed_class_id: YOLO ID for that class.
        max_workers: Passed to ProcessPoolExecutor; None uses its default.

    Returns:
        None. Progress, per-task failures and a failure summary are printed.
        Returns early (after printing) if the source directory does not exist
        or contains no PNG/JSON pairs.
    """
    overall_start_time = time.time()
    print(f"\nProcessing Test Dataset for YOLO (Single Class: '{fixed_class_name}')...")

    if not os.path.isdir(source_test_data_dir):
        print(f"Error: Source test data directory '{source_test_data_dir}' not found.")
        return

    # Get the class map (will contain only the fixed class)
    class_map = get_fixed_class_map(fixed_class_name, fixed_class_id)
    actual_fixed_class_id = class_map[fixed_class_name] # Get the ID from the map

    # Define output structure (YOLO typical structure)
    test_images_dir = os.path.join(output_yolo_test_dir, "images", "test")
    test_labels_dir = os.path.join(output_yolo_test_dir, "labels", "test")
    os.makedirs(test_images_dir, exist_ok=True)
    os.makedirs(test_labels_dir, exist_ok=True)

    print(f"  Output images to: {test_images_dir}")
    print(f"  Output labels to: {test_labels_dir}")

    # Collect the stems of all PNGs that have a sibling JSON, in sorted order.
    # The extension match is case-insensitive, but the worker opens
    # "<stem>.png"; on a case-sensitive filesystem a "<stem>.PNG" file is
    # therefore listed here and then shows up in the failure summary below.
    image_basenames = []
    for entry in os.listdir(source_test_data_dir):
        if not entry.lower().endswith(".png"):
            continue
        stem = os.path.splitext(entry)[0]
        if os.path.exists(os.path.join(source_test_data_dir, f"{stem}.json")):
            image_basenames.append(stem)
    image_basenames.sort()

    if not image_basenames:
        print(f"  No matching PNG/JSON pairs found in '{source_test_data_dir}'.")
        return

    tasks = [
        (
            img_basename,
            source_test_data_dir,
            actual_fixed_class_id,
            os.path.join(test_images_dir, f"{img_basename}.png"),
            os.path.join(test_labels_dir, f"{img_basename}.txt"),
        )
        for img_basename in image_basenames
    ]

    processed_count = 0
    failed_basenames = []
    print(f"  Found {len(tasks)} images to process...")
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_task = {
            executor.submit(_process_single_image_json_pair_single_class, task_args): task_args
            for task_args in tasks
        }
        for future in concurrent.futures.as_completed(future_to_task):
            img_basename = future_to_task[future][0]
            try:
                succeeded = future.result()
            except Exception as exc:
                # One bad pair must not abort the whole conversion, but it
                # must not disappear silently either.
                print(f"  Warning: Task for image '{img_basename}' generated an exception: {exc}")
                succeeded = False
            if succeeded:
                processed_count += 1
            else:
                failed_basenames.append(img_basename)

    print(f"  Processed {processed_count} image-JSON pairs.")
    if failed_basenames:
        failed_basenames.sort()
        preview = ", ".join(failed_basenames[:10])
        if len(failed_basenames) > 10:
            preview += ", ..."
        print(f"  Warning: {len(failed_basenames)} image-JSON pairs could not be processed: {preview}")

    # Create data.yaml
    # NOTE(review): only a 'test' split is declared. Some YOLO tooling may
    # require 'train' and 'val' keys as well — confirm the consumer of this
    # file accepts a test-only data.yaml.
    data_yaml_content = {
        'path': os.path.abspath(output_yolo_test_dir), # Absolute path to dataset root
        'test': os.path.join('images', 'test'),       # Relative path to test images from dataset root
        'nc': len(class_map),
        'names': list(class_map.keys())
    }
    data_yaml_path = os.path.join(output_yolo_test_dir, "data.yaml")
    try:
        with open(data_yaml_path, 'w') as f:
            yaml.dump(data_yaml_content, f, sort_keys=False, default_flow_style=False)
        print(f"  Successfully created 'data.yaml' at {data_yaml_path}")
    except Exception as e:
        print(f"  Error writing data.yaml: {e}")

    total_script_time = time.time() - overall_start_time
    print(f"YOLO Single-Class Test Dataset preparation complete in {total_script_time:.2f} seconds!")
    print(f"Dataset ready under: {output_yolo_test_dir}")


# --- Configuration and Execution (Single Class for Test Set) ---
if __name__ == "__main__":
    # NOTE: adjust the paths and settings below to your environment before running.

    # Folder that holds the raw test images (.png) together with their annotation files (.json).
    SOURCE_TEST_DATA_DIR_SINGLE_CLASS = "/blue/hulcr/gmarais/PhD/phase_1_data/1_data_splitting/test_set_output"

    # Destination root for the YOLO-formatted test set; created on demand.
    OUTPUT_YOLO_TEST_DIR_SINGLE_CLASS = "/blue/hulcr/gmarais/PhD/phase_1_data/2_object_detection_phase_2/ultralytics/test"

    # The one class every annotation is mapped to, and its YOLO ID.
    FIXED_CLASS_NAME = "bark_beetle"
    FIXED_CLASS_ID = 0

    # Number of parallel worker processes; None leaves the choice to the executor.
    MAX_WORKERS = None

    print("="*50)
    print(f"Starting YOLO dataset creation for TEST SET (SINGLE CLASS: '{FIXED_CLASS_NAME}')")

    # Run only when neither path is still one of the template placeholders.
    if SOURCE_TEST_DATA_DIR_SINGLE_CLASS != "/path/to/your/source_test_data_folder" and \
       OUTPUT_YOLO_TEST_DIR_SINGLE_CLASS != "/path/to/your_output_yolo_test_dir_single_class":
        create_yolo_test_dataset_single_class(
            source_test_data_dir=SOURCE_TEST_DATA_DIR_SINGLE_CLASS,
            output_yolo_test_dir=OUTPUT_YOLO_TEST_DIR_SINGLE_CLASS,
            fixed_class_name=FIXED_CLASS_NAME,
            fixed_class_id=FIXED_CLASS_ID,
            max_workers=MAX_WORKERS
        )
        print("\nYOLO Single-Class Test Set script execution finished.")
    else:
        print("\nPLEASE UPDATE 'SOURCE_TEST_DATA_DIR_SINGLE_CLASS' and 'OUTPUT_YOLO_TEST_DIR_SINGLE_CLASS' before running!")

Starting YOLO dataset creation for TEST SET (SINGLE CLASS: 'bark_beetle')

Processing Test Dataset for YOLO (Single Class: 'bark_beetle')...
Setting up fixed class: 'bark_beetle' with ID 0
Using 1 class:
  'bark_beetle': 0
  Output images to: /blue/hulcr/gmarais/PhD/phase_1_data/2_object_detection_phase_2/ultralytics/test/images/test
  Output labels to: /blue/hulcr/gmarais/PhD/phase_1_data/2_object_detection_phase_2/ultralytics/test/labels/test
  Found 2031 images to process...
  Processed 2031 image-JSON pairs.
  Successfully created 'data.yaml' at /blue/hulcr/gmarais/PhD/phase_1_data/2_object_detection_phase_2/ultralytics/test/data.yaml
YOLO Single-Class Test Dataset preparation complete in 2.20 seconds!
Dataset ready under: /blue/hulcr/gmarais/PhD/phase_1_data/2_object_detection_phase_2/ultralytics/test

YOLO Single-Class Test Set script execution finished.
