In [1]:
import os
import json
import shutil
# from collections import OrderedDict # Not strictly needed for this version
from PIL import Image
import concurrent.futures
import time
import datetime

# --- COCO Bounding Box Conversion ---
def convert_to_coco_bbox(points, image_height, image_width):
    """
    Turn a list of [x, y] corner points into a COCO box [x_min, y_min, width, height].

    The corners may be given in any order. The left/top edge is clamped to 0 and the
    right/bottom edge to the last pixel index (width - 1 / height - 1). Returns None
    when the clamped box has no positive extent along either axis.
    """
    xs = []
    ys = []
    for corner in points:
        xs.append(corner[0])
        ys.append(corner[1])

    # Extremes of the raw points, clamped to the image; all values are floats.
    left = max(0.0, float(min(xs)))
    top = max(0.0, float(min(ys)))
    # The image size is a pixel count, so the largest valid index is size - 1.
    right = min(float(image_width - 1), float(max(xs)))
    bottom = min(float(image_height - 1), float(max(ys)))

    # A box collapsed to a line/point (or pushed fully outside the image) is invalid.
    if right <= left or bottom <= top:
        return None

    return [left, top, right - left, bottom - top]

# --- Fixed Category Definition for COCO ---
def get_fixed_coco_category_list(class_name="bark_beetle", class_id=1):
    """Build the one-element COCO ``categories`` list for the single fixed class.

    The class name doubles as its own supercategory. The chosen ID and name are
    echoed to stdout.
    """
    print(f"Defining fixed COCO category: ID: {class_id}, Name: {class_name}")
    single_category = dict(
        id=class_id,
        name=class_name,
        supercategory=class_name,  # swap in a broader group name if one applies
    )
    return [single_category]

# --- Worker for processing one image-JSON pair for COCO (Single Class) ---
def _process_single_image_to_coco_data_single_class(args_tuple):
    """
    Build the COCO image record and bbox annotations for one PNG/JSON pair.

    Every two-point ``rectangle`` shape in the JSON is mapped to ``fixed_class_name``;
    any other shape is ignored. Images without usable shapes are still returned, with
    an empty annotation list.

    Args:
        args_tuple: ``(img_filename_no_ext, source_data_path,
            new_img_filename_for_coco, fixed_class_name)``. Packed into one tuple so
            the function can be submitted to an executor with a single argument.

    Returns:
        dict | None: ``{"image_info", "annotations_data", "original_source_path"}``
        on success. ``image_info`` carries ``file_name``, ``height``, ``width`` and a
        temporary ``original_path``; each annotation carries ``category_name``,
        ``bbox`` and ``area`` (IDs are assigned later by the caller). ``None`` when
        either file is missing, the image size cannot be determined or is not
        positive, or any error occurs while reading/parsing (a warning is printed
        in the error case).
    """
    img_filename_no_ext, source_data_path, new_img_filename_for_coco, fixed_class_name = args_tuple

    source_png_path = os.path.join(source_data_path, f"{img_filename_no_ext}.png")
    source_json_path = os.path.join(source_data_path, f"{img_filename_no_ext}.json")

    if not (os.path.exists(source_png_path) and os.path.exists(source_json_path)):
        return None

    try:
        # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
        with open(source_json_path, 'r', encoding='utf-8') as f:
            user_json_data = json.load(f)

        image_height = user_json_data.get("imageHeight")
        image_width = user_json_data.get("imageWidth")

        # If dimensions are not in the JSON, fall back to the image file itself.
        if image_height is None or image_width is None:
            with Image.open(source_png_path) as img_pil:
                pil_width, pil_height = img_pil.size
            if image_width is None:
                image_width = pil_width
            if image_height is None:
                image_height = pil_height

        if not image_height or not image_width:
            return None

        # Normalise once so the recorded image size and the bbox clamping agree
        # (the JSON may hold floats or numeric strings).
        image_height = int(image_height)
        image_width = int(image_width)
        if image_height <= 0 or image_width <= 0:
            return None

        image_info = {
            "file_name": new_img_filename_for_coco,  # Filename as it will appear in the COCO dataset
            "height": image_height,
            "width": image_width,
            "original_path": source_png_path  # Temp field; the caller removes it after copying
        }

        annotations_data = []
        # "shapes" may be absent or explicitly null; both mean "no annotations".
        for shape in user_json_data.get("shapes") or []:
            points = shape.get("points")

            # Only LabelMe-style rectangles given as [[x1, y1], [x2, y2]] are converted.
            if not points or shape.get("shape_type") != "rectangle" or len(points) != 2:
                continue

            coco_bbox = convert_to_coco_bbox(points, image_height, image_width)
            if coco_bbox:
                annotations_data.append({
                    "category_name": fixed_class_name,
                    "bbox": coco_bbox,
                    "area": coco_bbox[2] * coco_bbox[3]  # width * height
                })

        return {
            "image_info": image_info,
            "annotations_data": annotations_data,
            "original_source_path": source_png_path
        }

    except Exception as e:
        # Worker boundary: one bad file must not abort the whole batch.
        print(f"Warning (Worker): Error processing {source_png_path} or {source_json_path}: {e}")
        return None

# --- Function to create COCO dataset for a single test set ---
def create_coco_test_dataset_single_class(
    source_test_data_dir,
    output_coco_test_dir,
    fixed_class_name="bark_beetle",
    fixed_class_id=1,
    max_workers=None,
    image_output_folder_name="data" # Subfolder name for images, e.g., "data" or "images"
):
    """
    Creates a COCO-formatted dataset for a single test set.
    All objects are mapped to a single class.

    Images are copied into ``<output>/<image_output_folder_name>/`` and the
    annotations are written to ``<output>/annotations/instances_test.json``.
    Image and annotation IDs start at 1 and are assigned in sorted-filename order,
    so repeated runs over the same input yield the same IDs.

    Args:
        source_test_data_dir (str): Path to the directory containing raw test .png images and .json files.
        output_coco_test_dir (str): Path to the directory where the COCO-formatted test set will be saved.
        fixed_class_name (str): The single class name to assign to all annotations.
        fixed_class_id (int): The single class ID to assign.
        max_workers (int, optional): Maximum number of worker processes. Defaults to the
            ProcessPoolExecutor default when None.
        image_output_folder_name (str): Name of the subfolder within output_coco_test_dir for storing images.

    Returns:
        None. Problems (missing source directory, no PNG/JSON pairs, per-image failures,
        JSON write failure) are reported on stdout rather than raised.
    """
    overall_start_time = time.time()
    print(f"\nProcessing Test Dataset (COCO Format, Single Class: '{fixed_class_name}')...")

    if not os.path.isdir(source_test_data_dir):
        print(f"Error: Source test data directory '{source_test_data_dir}' not found.")
        return

    os.makedirs(output_coco_test_dir, exist_ok=True)

    # 1. Get the fixed COCO category list
    coco_categories = get_fixed_coco_category_list(class_name=fixed_class_name, class_id=fixed_class_id)
    category_name_to_id = {cat['name']: cat['id'] for cat in coco_categories}

    # 2. Define output paths
    images_dest_dir = os.path.join(output_coco_test_dir, image_output_folder_name)
    annotations_dest_dir = os.path.join(output_coco_test_dir, "annotations")
    os.makedirs(images_dest_dir, exist_ok=True)
    os.makedirs(annotations_dest_dir, exist_ok=True)

    print(f"  Output images to: {images_dest_dir}")
    print(f"  Output annotations to: {annotations_dest_dir}")

    # 3. Initialize COCO output structure
    # Timezone-aware "now" with tzinfo stripped: same naive-UTC text as the deprecated utcnow().
    created_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    coco_output_data = {
        "info": {
            "description": f"COCO-style Test Dataset (Single Class: {fixed_class_name})",
            "version": "1.0",
            "year": datetime.date.today().year,
            "date_created": created_utc.isoformat(' ')
        },
        "licenses": [{"name": "Placeholder License", "id": 0, "url": ""}], # Add actual license if applicable
        "categories": coco_categories,
        "images": [],
        "annotations": []
    }

    current_image_id = 1
    current_annotation_id = 1

    # 4. Prepare tasks from the source test directory
    # A set removes duplicate basenames (e.g. "a.png" next to "a.PNG") before sorting.
    # NOTE(review): the extension match is case-insensitive, while the worker looks for
    # "<basename>.png"; on a case-sensitive filesystem an upper-case ".PNG" is counted
    # here but may be skipped by the worker - confirm against the data.
    image_basenames = sorted({
        os.path.splitext(f)[0] for f in os.listdir(source_test_data_dir)
        if f.lower().endswith(".png")
        and os.path.exists(os.path.join(source_test_data_dir, f"{os.path.splitext(f)[0]}.json"))
    })

    if not image_basenames:
        print(f"  No matching PNG/JSON pairs found in '{source_test_data_dir}'.")
        return

    # The COCO file name is the original file name; it is placed inside the
    # `image_output_folder_name` directory.
    tasks_for_processing = [
        (img_basename, source_test_data_dir, f"{img_basename}.png", fixed_class_name)
        for img_basename in image_basenames
    ]

    # 5. Process images and annotations
    print(f"  Found {len(tasks_for_processing)} images to process from '{source_test_data_dir}'...")
    processed_results = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        submitted = [
            (task_args, executor.submit(_process_single_image_to_coco_data_single_class, task_args))
            for task_args in tasks_for_processing
        ]
        # Collect in submission (sorted) order rather than completion order so that
        # ID assignment below is deterministic; the work itself still runs in parallel.
        for task_args, future in submitted:
            try:
                result = future.result()
            except Exception as exc:
                img_filename_no_ext = task_args[0]
                print(f"  Warning: Image '{img_filename_no_ext}.png' generated an exception during processing: {exc}")
                continue
            if result:
                processed_results.append(result)

    # 6. Aggregate results into COCO format
    images_processed_count = 0
    annotations_added_count = 0
    for result_data in processed_results:
        if not result_data or not result_data.get("image_info"):
            continue

        # Copy image to the destination image folder
        img_entry = result_data["image_info"]
        target_image_path = os.path.join(images_dest_dir, img_entry["file_name"])
        try:
            shutil.copy2(result_data["original_source_path"], target_image_path)
        except Exception as e:
            print(f"  Error copying image {result_data['original_source_path']} to {target_image_path}: {e}")
            continue # Skip this image if copy fails; no ID is consumed for it

        img_entry["id"] = current_image_id
        # The file_name in COCO is just the filename (e.g., "image1.png"); the consumer
        # resolves it against the image folder.
        img_entry.pop("original_path", None) # Remove temporary path
        coco_output_data["images"].append(img_entry)
        images_processed_count += 1

        for ann_data in result_data["annotations_data"]:
            if ann_data["category_name"] not in category_name_to_id:
                print(f"  Logic Error: Fixed class name '{ann_data['category_name']}' not found in category map. Skipping annotation.")
                continue

            coco_output_data["annotations"].append({
                "id": current_annotation_id,
                "image_id": current_image_id, # Link to the current image
                "category_id": category_name_to_id[ann_data["category_name"]],
                "bbox": ann_data["bbox"],
                "area": ann_data["area"],
                "iscrowd": 0,
                "segmentation": [] # Bbox-only data: no polygons available
            })
            current_annotation_id += 1
            annotations_added_count += 1
        current_image_id += 1 # Increment for the next image

    # 7. Write the COCO JSON annotation file
    output_json_filename = "instances_test.json"
    output_json_path = os.path.join(annotations_dest_dir, output_json_filename)
    try:
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(coco_output_data, f, indent=4)
        processing_time = time.time() - overall_start_time
        print(f"\n  Successfully created '{output_json_filename}' with {images_processed_count} images and {annotations_added_count} annotations.")
        print(f"Test set processing complete in {processing_time:.2f} seconds!")
        print(f"Test dataset is ready under: {output_coco_test_dir}")
    except Exception as e:
        print(f"  Error writing COCO JSON for test set '{output_json_filename}': {e}")

# --- Configuration and Execution for Test Set ---
if __name__ == "__main__":
    # **IMPORTANT**: every value below is a placeholder/default - edit before running.

    # Single folder holding the raw test images (.png) together with their
    # annotation files (.json), e.g. "/mnt/data/my_project/raw_test_images_and_json/".
    SOURCE_TEST_DATA_DIR = "/path/to/your/single_folder_test_data"

    # Destination for the COCO-formatted test set (created when missing). It receives
    # an image subfolder plus an "annotations/" subfolder,
    # e.g. "/mnt/data/my_project/coco_formatted_test_set/".
    OUTPUT_COCO_TEST_DIR = "/path/to/your/output_coco_test_directory"

    # The one class every annotation is mapped to, and its ID (COCO IDs usually start
    # at 1). Keep the ID in line with whatever the model expects.
    FIXED_CLASS_NAME = "bark_beetle"
    FIXED_CLASS_ID = 1

    # Worker processes for the parallel stage: None = executor default, or an
    # explicit integer such as 4.
    MAX_WORKERS = None

    # Image subfolder inside OUTPUT_COCO_TEST_DIR; "data", "images" and "test_images"
    # are common choices.
    IMAGE_OUTPUT_FOLDER_NAME = "data"

    if MAX_WORKERS is None:
        max_workers_display = f'Default (likely {os.cpu_count()})'
    else:
        max_workers_display = MAX_WORKERS

    # --- Run the Test Set Creation ---
    print("\n".join([
        "=" * 50,
        f"Starting COCO dataset creation for the TEST SET (Single Class: '{FIXED_CLASS_NAME}'):",
        f"  Source Test Data Directory: {SOURCE_TEST_DATA_DIR}",
        f"  Output COCO Test Directory: {OUTPUT_COCO_TEST_DIR}",
        f"  Image Output Subfolder: '{IMAGE_OUTPUT_FOLDER_NAME}'",
        f"  Max Workers for Parallelization: {max_workers_display}",
        "-" * 30,
    ]))

    # Only run once both placeholder paths have been replaced with real ones.
    if SOURCE_TEST_DATA_DIR != "/path/to/your/single_folder_test_data" and \
       OUTPUT_COCO_TEST_DIR != "/path/to/your/output_coco_test_directory":
        create_coco_test_dataset_single_class(
            source_test_data_dir=SOURCE_TEST_DATA_DIR,
            output_coco_test_dir=OUTPUT_COCO_TEST_DIR,
            fixed_class_name=FIXED_CLASS_NAME,
            fixed_class_id=FIXED_CLASS_ID,
            max_workers=MAX_WORKERS,
            image_output_folder_name=IMAGE_OUTPUT_FOLDER_NAME
        )
        print("\nCOCO Test Set Script execution finished.")
    else:
        print("\nPLEASE UPDATE 'SOURCE_TEST_DATA_DIR' and 'OUTPUT_COCO_TEST_DIR' before running the script!")

Starting COCO dataset creation for the TEST SET (Single Class: 'bark_beetle'):
  Source Test Data Directory: /path/to/your/single_folder_test_data
  Output COCO Test Directory: /path/to/your/output_coco_test_directory
  Image Output Subfolder: 'data'
  Max Workers for Parallelization: Default (likely 128)
------------------------------

PLEASE UPDATE 'SOURCE_TEST_DATA_DIR' and 'OUTPUT_COCO_TEST_DIR' before running the script!


In [2]:
# --- K-Fold COCO creation (reference only; kept disabled, run it separately) ---
# create_kfold_coco_datasets_single_class(
#     source_dir=SOURCE_DIR,
#     base_dest_dir=BASE_DEST_DIR,
#     source_fold_names_str=SOURCE_FOLD_NAMES_STR,
#     fixed_class_name=FIXED_CLASS_NAME,
#     fixed_class_id=FIXED_CLASS_ID,
#     max_workers=MAX_WORKERS
# )

# --- Settings for the TEST dataset (single-class COCO) ---
# Input: folder with the raw test .png/.json pairs. Output: where the COCO test set
# is written. Adjust both to your own layout.
SOURCE_TEST_DATA_DIR = "/blue/hulcr/gmarais/PhD/phase_1_data/1_data_splitting/test_set_output"
OUTPUT_COCO_TEST_DIR = "/blue/hulcr/gmarais/PhD/phase_1_data/2_object_detection_phase_2/coco/test"

# Class name/ID must match the ones used for the training/validation sets.
FIXED_CLASS_NAME = "bark_beetle"
FIXED_CLASS_ID = 1

# Number of worker processes for this run (None would use the executor default).
MAX_WORKERS = 4

# --- Run the Test Set Creation ---
print("\n".join([
    "\n" + "=" * 50,
    f"Starting COCO dataset creation for the TEST SET (Single Class: '{FIXED_CLASS_NAME}'):",
    f"  Source Test Data Directory: {SOURCE_TEST_DATA_DIR}",
    f"  Output COCO Test Directory: {OUTPUT_COCO_TEST_DIR}",
    f"  Max Workers for Parallelization: {f'Default (likely {os.cpu_count()})' if MAX_WORKERS is None else MAX_WORKERS}",
    "-" * 30,
]))

# Requires create_coco_test_dataset_single_class to be defined already, i.e. the
# cell containing its definition must have been executed first.
create_coco_test_dataset_single_class(
    source_test_data_dir=SOURCE_TEST_DATA_DIR,
    output_coco_test_dir=OUTPUT_COCO_TEST_DIR,
    fixed_class_name=FIXED_CLASS_NAME,
    fixed_class_id=FIXED_CLASS_ID,
    max_workers=MAX_WORKERS,
    image_output_folder_name="data"  # "images" works equally well if preferred
)

print("\nCOCO Test Set Script execution finished.")


Starting COCO dataset creation for the TEST SET (Single Class: 'bark_beetle'):
  Source Test Data Directory: /blue/hulcr/gmarais/PhD/phase_1_data/1_data_splitting/test_set_output
  Output COCO Test Directory: /blue/hulcr/gmarais/PhD/phase_1_data/2_object_detection_phase_2/coco/test
  Max Workers for Parallelization: 4
------------------------------

Processing Test Dataset (COCO Format, Single Class: 'bark_beetle')...
Defining fixed COCO category: ID: 1, Name: bark_beetle
  Output images to: /blue/hulcr/gmarais/PhD/phase_1_data/2_object_detection_phase_2/coco/test/data
  Output annotations to: /blue/hulcr/gmarais/PhD/phase_1_data/2_object_detection_phase_2/coco/test/annotations
  Found 2031 images to process from '/blue/hulcr/gmarais/PhD/phase_1_data/1_data_splitting/test_set_output'...

  Successfully created 'instances_test.json' with 2031 images and 16480 annotations.
Test set processing complete in 95.37 seconds!
Test dataset is ready under: /blue/hulcr/gmarais/PhD/phase_1_data/2_