In [1]:
import os
import json
import shutil
from PIL import Image
import concurrent.futures
import time
import datetime

# --- COCO Bounding Box Conversion ---
def convert_to_coco_bbox(points, image_height, image_width):
    """
    Build a COCO bbox [x_min, y_min, width, height] from a list of [x, y] points.

    The smallest and largest x / y values among the points are used as the
    box corners. The corners are clipped to [0, image_width - 1] horizontally
    and [0, image_height - 1] vertically. Returns None when the clipped box
    has no positive width or no positive height.
    """
    xs = []
    ys = []
    for point in points:
        xs.append(point[0])
        ys.append(point[1])

    # Raw extremes first, then clip them to the image bounds.
    raw_left, raw_top = float(min(xs)), float(min(ys))
    raw_right, raw_bottom = float(max(xs)), float(max(ys))

    left = max(0.0, raw_left)
    top = max(0.0, raw_top)
    right = min(float(image_width - 1), raw_right)
    bottom = min(float(image_height - 1), raw_bottom)

    # A box that collapsed (or inverted) after clipping is not usable.
    if left >= right or top >= bottom:
        return None

    return [left, top, right - left, bottom - top]

# --- Category Discovery for Test Set ---
def _get_labels_from_single_json(json_path):
    """Return the set of non-empty "label" values found under "shapes" in one JSON file.

    Any error while opening, parsing or walking the file is reported as a
    warning on stdout; labels gathered before the error (possibly none) are
    still returned.
    """
    found = set()
    try:
        with open(json_path, 'r') as handle:
            annotation = json.load(handle)
        for entry in annotation.get("shapes", []):
            name = entry.get("label")
            if not name:
                # Missing or empty labels carry no category information.
                continue
            found.add(name)
    except Exception as e:
        print(f"Warning (Category Discovery Worker): Error reading {json_path}: {e}")
    return found

def discover_coco_categories_for_test_set(source_test_data_dir, max_workers=None):
    """Collect the distinct labels used by the JSON files of a directory, in parallel.

    Every file in `source_test_data_dir` whose name ends with ".json"
    (case-insensitive) is handed to a process pool. The union of the returned
    labels is sorted and turned into category dicts with ids starting at 1,
    where "supercategory" repeats the label name. Returns an empty list when
    the directory is missing, holds no JSON files, or yields no labels.
    """
    print("Discovering COCO categories from the test set (parallelized)...")
    started_at = time.time()

    if not os.path.isdir(source_test_data_dir):
        print(f"Error (Category Discovery): Source directory '{source_test_data_dir}' not found.")
        return []

    annotation_paths = [
        os.path.join(source_test_data_dir, entry)
        for entry in os.listdir(source_test_data_dir)
        if entry.lower().endswith(".json")
    ]
    if not annotation_paths:
        print("Warning: No JSON files found in the test set for category discovery.")
        return []

    label_pool = set()
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for path in annotation_paths:
            pending[pool.submit(_get_labels_from_single_json, path)] = path
        for finished in concurrent.futures.as_completed(pending):
            try:
                label_pool.update(finished.result())
            except Exception as exc:
                path = pending[finished]
                print(f"Warning (Category Discovery Main): Generated an exception for {path}: {exc}")

    # COCO category IDs typically start from 1.
    categories_list = [
        {"id": category_id, "name": label_name, "supercategory": label_name}
        for category_id, label_name in enumerate(sorted(label_pool), start=1)
    ]

    discovery_time = time.time() - started_at
    if not categories_list:
        print(f"Warning: No categories discovered in the test set after parallel processing in {discovery_time:.2f} seconds.")
        return categories_list

    print(f"Discovered {len(categories_list)} COCO categories in {discovery_time:.2f} seconds:")
    for cat in categories_list:
        print(f"  ID: {cat['id']}, Name: {cat['name']}")
    return categories_list

# --- Worker for processing one image-JSON pair (Multi-Class) ---
def _process_single_image_to_coco_data_multiclass(args_tuple):
    """
    Processes one image and its JSON, preserving original labels.

    args_tuple: (img_filename_no_ext, source_data_path, new_img_filename_for_coco)

    Reads "<name>.json" next to "<name>.png" in source_data_path. Image size
    comes from the JSON keys "imageHeight" / "imageWidth"; a missing value is
    read from the PNG itself via PIL. Only shapes with a non-empty "label",
    shape_type "rectangle" and exactly two points are converted.

    Returns a dict with:
      - "image_info": file_name, height, width, original_path (no id yet)
      - "annotations_data": list of {"category_name", "bbox", "area"}
      - "original_source_path": path of the source PNG
    Returns None when either file is missing, the image size is unusable, or
    any error occurs. Errors and unusable sizes are reported as warnings on
    stdout so that dropped images are visible instead of vanishing silently.
    """
    img_filename_no_ext, source_data_path, new_img_filename_for_coco = args_tuple

    source_png_path = os.path.join(source_data_path, f"{img_filename_no_ext}.png")
    source_json_path = os.path.join(source_data_path, f"{img_filename_no_ext}.json")

    if not (os.path.exists(source_png_path) and os.path.exists(source_json_path)):
        return None

    try:
        with open(source_json_path, 'r') as f:
            user_json_data = json.load(f)

        image_height = user_json_data.get("imageHeight")
        image_width = user_json_data.get("imageWidth")

        # Only open the image when the JSON does not already carry the size.
        if image_height is None or image_width is None:
            with Image.open(source_png_path) as img_pil:
                pil_width, pil_height = img_pil.size
            if image_width is None:
                image_width = pil_width
            if image_height is None:
                image_height = pil_height

        if not image_height or not image_width:
            print(f"Warning (Worker): Unusable image size for {source_png_path} (height={image_height}, width={image_width}); skipping.")
            return None

        image_info = {
            "file_name": new_img_filename_for_coco,
            "height": int(image_height),
            "width": int(image_width),
            # Removed by the aggregation step before the entry is written out.
            "original_path": source_png_path
        }

        annotations_data = []
        for shape in user_json_data.get("shapes", []):
            original_label = shape.get("label")
            points = shape.get("points")
            shape_type = shape.get("shape_type")

            if not original_label or not points or shape_type != "rectangle" or len(points) != 2:
                continue

            coco_bbox = convert_to_coco_bbox(points, image_height, image_width)
            if coco_bbox:
                annotations_data.append({
                    "category_name": original_label,  # Will be mapped to category_id later
                    "bbox": coco_bbox,
                    "area": coco_bbox[2] * coco_bbox[3]
                })

        return {"image_info": image_info, "annotations_data": annotations_data, "original_source_path": source_png_path}

    except Exception as e:
        # Best-effort: one bad pair must not abort the whole conversion, but
        # the failure is reported so the missing image can be investigated.
        print(f"Warning (Worker): Error processing {source_png_path} or {source_json_path}: {e}")
        return None

# --- Function to create COCO Test Dataset (Multi-Class) ---
def create_coco_test_dataset_multiclass(
    source_test_data_dir,
    output_coco_test_dir,
    max_workers=None,
    image_output_folder_name="data"
):
    """
    Creates a COCO-formatted test dataset preserving original class labels.

    Args:
        source_test_data_dir: Folder holding "<name>.png" / "<name>.json" pairs.
        output_coco_test_dir: Destination root; images are copied into
            "<root>/<image_output_folder_name>" and the annotation file is
            written to "<root>/annotations/instances_test.json".
        max_workers: Passed to the process pools (None lets the pool decide).
        image_output_folder_name: Name of the image sub-folder in the output.

    Returns:
        None. Progress, warnings and errors are printed. The function returns
        early (after printing) when the source directory is missing, no
        categories are discovered, or no PNG/JSON pairs are found.

    Image and annotation ids start at 1 and are assigned in sorted file-name
    order, so repeated runs over the same input produce the same ids.
    """
    overall_start_time = time.time()
    print("\nProcessing Test Dataset (COCO Format, Multi-Class)...")

    if not os.path.isdir(source_test_data_dir):
        print(f"Error: Source test data directory '{source_test_data_dir}' not found.")
        return

    os.makedirs(output_coco_test_dir, exist_ok=True)

    # 1. Discover COCO categories from the test set
    coco_categories = discover_coco_categories_for_test_set(source_test_data_dir, max_workers)
    if not coco_categories:
        print("Error: No categories discovered in the test set. Cannot proceed.")
        return
    category_name_to_id = {cat['name']: cat['id'] for cat in coco_categories}

    # 2. Define output paths
    images_dest_dir = os.path.join(output_coco_test_dir, image_output_folder_name)
    annotations_dest_dir = os.path.join(output_coco_test_dir, "annotations")
    os.makedirs(images_dest_dir, exist_ok=True)
    os.makedirs(annotations_dest_dir, exist_ok=True)

    print(f"  Output images to: {images_dest_dir}")
    print(f"  Output annotations to: {annotations_dest_dir}")

    # 3. Initialize COCO output structure
    # Aware "now" stripped of tzinfo yields the same naive-UTC string that the
    # deprecated datetime.utcnow() produced.
    created_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    coco_output_data = {
        "info": {
            "description": "COCO-style Test Dataset (Multi-Class)",
            "version": "1.0",
            "year": datetime.date.today().year,
            "date_created": created_utc.isoformat(' ')
        },
        "licenses": [{"name": "Placeholder License", "id": 0, "url": ""}],
        "categories": coco_categories,  # Use discovered categories
        "images": [],
        "annotations": []
    }

    # 4. Prepare tasks from the source test directory (sorted for stable ids)
    image_basenames = sorted(
        os.path.splitext(f)[0] for f in os.listdir(source_test_data_dir)
        if f.lower().endswith(".png")
        and os.path.exists(os.path.join(source_test_data_dir, f"{os.path.splitext(f)[0]}.json"))
    )

    if not image_basenames:
        print(f"  No matching PNG/JSON pairs found in '{source_test_data_dir}'.")
        return

    tasks_for_processing = [
        (img_basename, source_test_data_dir, f"{img_basename}.png")
        for img_basename in image_basenames
    ]

    # 5. Process images and annotations
    print(f"  Found {len(tasks_for_processing)} images to process from '{source_test_data_dir}'...")
    processed_results = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        submitted = [
            (task_args, executor.submit(_process_single_image_to_coco_data_multiclass, task_args))
            for task_args in tasks_for_processing
        ]
        # Collect in submission order, not completion order: completion order
        # varies between runs and would make the assigned ids non-reproducible.
        for task_args, future in submitted:
            try:
                result = future.result()
            except Exception as exc:
                img_filename_no_ext = task_args[0]
                print(f"  Warning: Image '{img_filename_no_ext}.png' generated an exception during processing: {exc}")
                continue
            if result:
                processed_results.append(result)

    # 6. Aggregate results into COCO format
    coco_images = coco_output_data["images"]
    coco_annotations = coco_output_data["annotations"]
    for result_data in processed_results:
        if not result_data.get("image_info"):
            continue

        target_image_path = os.path.join(images_dest_dir, result_data["image_info"]["file_name"])
        try:
            shutil.copy2(result_data["original_source_path"], target_image_path)
        except Exception as e:
            print(f"  Error copying image {result_data['original_source_path']} to {target_image_path}: {e}")
            continue

        # An image only receives an id once its file has been copied.
        image_id = len(coco_images) + 1
        img_entry = result_data["image_info"]
        img_entry["id"] = image_id
        img_entry.pop("original_path", None)  # Internal bookkeeping, not part of the output
        coco_images.append(img_entry)

        for ann_data in result_data["annotations_data"]:
            category_name = ann_data["category_name"]
            if category_name not in category_name_to_id:
                print(f"  Warning: Category '{category_name}' found in annotation for image {img_entry['file_name']} but not in discovered categories. Skipping this annotation.")
                continue

            coco_annotations.append({
                "id": len(coco_annotations) + 1,
                "image_id": image_id,
                "category_id": category_name_to_id[category_name],  # Use mapped ID
                "bbox": ann_data["bbox"],
                "area": ann_data["area"],
                "iscrowd": 0,
                "segmentation": []
            })

    images_processed_count = len(coco_images)
    annotations_added_count = len(coco_annotations)

    # 7. Write the COCO JSON annotation file
    output_json_filename = "instances_test.json"
    output_json_path = os.path.join(annotations_dest_dir, output_json_filename)
    try:
        with open(output_json_path, 'w') as f:
            json.dump(coco_output_data, f, indent=4)
        processing_time = time.time() - overall_start_time
        print(f"\n  Successfully created '{output_json_filename}' with {images_processed_count} images and {annotations_added_count} annotations.")
        print(f"Test set processing complete in {processing_time:.2f} seconds!")
        print(f"Test dataset is ready under: {output_coco_test_dir}")
    except Exception as e:
        print(f"  Error writing COCO JSON for test set '{output_json_filename}': {e}")

# --- Configuration and Execution for Test Set (Multi-Class) ---
if __name__ == "__main__":
    # **IMPORTANT**: Adjust the paths below to your own dataset and output location.

    # Single folder that holds the raw test images (.png) and their annotation files (.json).
    SOURCE_TEST_DATA_DIR = "/blue/hulcr/gmarais/PhD/phase_1_data/1_data_splitting/test_set_output"

    # Directory that will receive the COCO-formatted test set.
    OUTPUT_COCO_TEST_DIR = "/blue/hulcr/gmarais/PhD/phase_1_data/3_classification_phase_2/coco/test"

    # Number of parallel processes; None leaves the choice to the process pool.
    MAX_WORKERS = None

    # Name of the image sub-folder inside the output directory.
    IMAGE_OUTPUT_FOLDER_NAME = "data"

    if MAX_WORKERS is None:
        max_workers_display = f'Default (likely {os.cpu_count()})'
    else:
        max_workers_display = MAX_WORKERS

    print("=" * 50)
    print("Starting COCO dataset creation for the MULTI-CLASS TEST SET:")
    print(f"  Source Test Data Directory: {SOURCE_TEST_DATA_DIR}")
    print(f"  Output COCO Test Directory: {OUTPUT_COCO_TEST_DIR}")
    print(f"  Image Output Subfolder: '{IMAGE_OUTPUT_FOLDER_NAME}'")
    print(f"  Max Workers for Parallelization: {max_workers_display}")
    print("-" * 30)

    # Refuse to run while either path is still one of the template placeholders.
    paths_are_placeholders = (
        SOURCE_TEST_DATA_DIR == "/path/to/your/single_folder_test_data_multiclass"
        or OUTPUT_COCO_TEST_DIR == "/path/to/your/output_coco_test_directory_multiclass"
    )
    if not paths_are_placeholders:
        create_coco_test_dataset_multiclass(
            source_test_data_dir=SOURCE_TEST_DATA_DIR,
            output_coco_test_dir=OUTPUT_COCO_TEST_DIR,
            max_workers=MAX_WORKERS,
            image_output_folder_name=IMAGE_OUTPUT_FOLDER_NAME
        )
        print("\nCOCO Multi-Class Test Set Script execution finished.")
    else:
        print("\nPLEASE UPDATE 'SOURCE_TEST_DATA_DIR' and 'OUTPUT_COCO_TEST_DIR' before running the script!")

Starting COCO dataset creation for the MULTI-CLASS TEST SET:
  Source Test Data Directory: /blue/hulcr/gmarais/PhD/phase_1_data/1_data_splitting/test_set_output
  Output COCO Test Directory: /blue/hulcr/gmarais/PhD/phase_1_data/3_classification_phase_2/coco/test
  Image Output Subfolder: 'data'
  Max Workers for Parallelization: Default (likely 128)
------------------------------

Processing Test Dataset (COCO Format, Multi-Class)...
Discovering COCO categories from the test set (parallelized)...
Discovered 63 COCO categories in 0.78 seconds:
  ID: 1, Name: Ambrosiodmus_minor
  ID: 2, Name: Ambrosiophilus_atratus
  ID: 3, Name: Anisandrus_dispar
  ID: 4, Name: Anisandrus_sayi
  ID: 5, Name: Cnestus_mutilatus
  ID: 6, Name: Coccotrypes_carpophagus
  ID: 7, Name: Coccotrypes_dactyliperda
  ID: 8, Name: Coptoborus_ricini
  ID: 9, Name: Cryptocarenus_heveae
  ID: 10, Name: Ctonoxylon_hagedorn
  ID: 11, Name: Cyclorhipidion_pelliculosum
  ID: 12, Name: Dendroctonus_rufipennis
  ID: 13, Name