In [1]:
import os
import json
import shutil
from collections import OrderedDict
import yaml # Still useful for reading/writing other configs if needed, but mainly json here
from PIL import Image
import concurrent.futures
import time
import datetime

# --- COCO Bounding Box Conversion ---
def convert_to_coco_bbox(points, image_height, image_width):
    """
    Turn a list of [x, y] corner points into a COCO bbox.

    The extreme x/y values of ``points`` define the rectangle. Its edges are
    clamped to the ranges [0, image_width - 1] and [0, image_height - 1].

    Returns:
        [x_min, y_min, width, height] as floats, or None when the clamped
        rectangle has no positive extent along x or y.
    """
    xs = [pt[0] for pt in points]
    ys = [pt[1] for pt in points]

    # Raw extremes first, so conversion happens in the same order for every axis.
    raw_left = float(min(xs))
    raw_top = float(min(ys))
    raw_right = float(max(xs))
    raw_bottom = float(max(ys))

    # Pull each edge back inside the image.
    left = max(0.0, raw_left)
    top = max(0.0, raw_top)
    right = min(float(image_width - 1), raw_right)
    bottom = min(float(image_height - 1), raw_bottom)

    degenerate = left >= right or top >= bottom
    if degenerate:
        return None

    return [left, top, right - left, bottom - top]

# --- Parallelized Category Discovery for COCO ---
def _get_labels_from_single_json_for_coco(json_path):
    """
    Collect the distinct non-empty "label" values found under "shapes" in one JSON file.

    Best-effort: any error while opening, parsing or walking the file is
    reported with a printed warning, and whatever labels were gathered up to
    that point (possibly none) are returned.
    """
    found = set()
    try:
        with open(json_path, 'r') as handle:
            payload = json.load(handle)
        for entry in payload.get("shapes", []):
            name = entry.get("label")
            if not name:
                # Missing or empty label: nothing to record for this shape.
                continue
            found.add(name)
    except Exception as e:
        print(f"Warning (Category Discovery Worker): Error reading {json_path}: {e}")
    return found

def discover_coco_categories_parallel(source_root_dir, all_fold_names, max_workers=None):
    """
    Build the COCO category list from every ``.json`` file in the given folds.

    Each fold is a sub-directory of ``source_root_dir``; folds that do not exist
    are skipped with a warning. Label extraction is fanned out over a process
    pool sized by ``max_workers``.

    Returns:
        A list of ``{"id", "name", "supercategory"}`` dicts. IDs start at 1 and
        follow the sorted order of the label names; ``supercategory`` repeats
        the name. The list is empty when no JSON file or no label is found.
    """
    print("Discovering COCO categories across all specified folds (parallelized)...")
    started_at = time.time()

    annotation_files = []
    for fold in all_fold_names:
        fold_dir = os.path.join(source_root_dir, fold)
        if os.path.isdir(fold_dir):
            annotation_files.extend(
                os.path.join(fold_dir, entry)
                for entry in os.listdir(fold_dir)
                if entry.lower().endswith(".json")
            )
        else:
            print(f"Warning (Category Discovery): Source fold '{fold_dir}' not found. Skipping.")

    if not annotation_files:
        print("Warning: No JSON files found for category discovery.")
        return []

    seen_labels = set()
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for file_path in annotation_files:
            pending[pool.submit(_get_labels_from_single_json_for_coco, file_path)] = file_path
        for finished in concurrent.futures.as_completed(pending):
            try:
                seen_labels.update(finished.result())
            except Exception as exc:
                path = pending[finished]
                print(f"Warning (Category Discovery Main): Generated an exception for {path}: {exc}")

    # COCO category IDs conventionally start at 1, hence start=1.
    categories_list = [
        {"id": category_id, "name": label_name, "supercategory": label_name}
        for category_id, label_name in enumerate(sorted(seen_labels), start=1)
    ]

    discovery_time = time.time() - started_at
    if not categories_list:
        print(f"Warning: No categories discovered after parallel processing in {discovery_time:.2f} seconds.")
        return categories_list

    print(f"Discovered {len(categories_list)} COCO categories in {discovery_time:.2f} seconds:")
    for cat in categories_list:
        print(f"  ID: {cat['id']}, Name: {cat['name']}")
    return categories_list


# --- Worker for processing one image-JSON pair for COCO ---
def _process_single_image_to_coco_data(args_tuple):
    """
    Build the COCO image record and raw annotation records for one PNG/JSON pair.

    Args:
        args_tuple: ``(img_filename_no_ext, source_fold_path, new_img_filename)``.
            The pair ``<img_filename_no_ext>.png`` / ``.json`` is looked up inside
            ``source_fold_path``; ``new_img_filename`` is stored as the COCO
            ``file_name``.

    Returns:
        None when either file is missing, when the image dimensions are
        missing/zero, or when reading/parsing fails (a warning is printed in
        the failure case). Otherwise a dict with:

        * ``"image_info"``: ``file_name``, ``height``, ``width`` plus the helper
          key ``original_path`` (the source PNG path);
        * ``"annotations_data"``: one dict per valid rectangle shape with
          ``category_name``, ``bbox`` and ``area`` (IDs are assigned by the caller);
        * ``"original_source_path"``: the source PNG path.

        Images without any valid annotation are still returned, with an empty
        ``annotations_data`` list.
    """
    img_filename_no_ext, source_fold_path, new_img_filename = args_tuple

    source_png_path = os.path.join(source_fold_path, f"{img_filename_no_ext}.png")
    source_json_path = os.path.join(source_fold_path, f"{img_filename_no_ext}.json")

    if not (os.path.exists(source_png_path) and os.path.exists(source_json_path)):
        return None

    try:
        with open(source_json_path, 'r') as f:
            user_json_data = json.load(f)

        image_height = user_json_data.get("imageHeight")
        image_width = user_json_data.get("imageWidth")

        # Only open the image when the JSON lacks one of the dimensions.
        if image_height is None or image_width is None:
            with Image.open(source_png_path) as img_pil:
                pil_width, pil_height = img_pil.size
            if image_width is None:
                image_width = pil_width
            if image_height is None:
                image_height = pil_height

        if not image_height or not image_width:  # Handles 0 or None
            return None

        # Normalise once so the recorded size and the clamping bounds agree.
        image_height = int(image_height)
        image_width = int(image_width)

        image_info = {
            "file_name": new_img_filename,  # Relative to the split's image directory
            "height": image_height,
            "width": image_width,
            "original_path": source_png_path,  # Helper key; the caller removes it
        }

        annotations_data = []
        for shape in user_json_data.get("shapes", []):
            original_label = shape.get("label")
            points = shape.get("points")  # Expected as [[x1, y1], [x2, y2]]
            shape_type = shape.get("shape_type")

            # Only labelled two-point rectangles are converted.
            if not original_label or not points or shape_type != "rectangle" or len(points) != 2:
                continue

            coco_bbox = convert_to_coco_bbox(points, image_height, image_width)
            if coco_bbox:
                annotations_data.append({
                    "category_name": original_label,  # Mapped to category_id by the caller
                    "bbox": coco_bbox,
                    "area": coco_bbox[2] * coco_bbox[3],
                })

        return {
            "image_info": image_info,
            "annotations_data": annotations_data,
            "original_source_path": source_png_path,
        }

    except Exception as e:
        # Best-effort worker: report the problem instead of dropping the image silently.
        print(f"Warning (COCO Worker): Error processing {source_png_path} or {source_json_path}: {e}")
        return None


# --- Main Orchestrator Function for Jupyter Notebook ---
def create_kfold_coco_datasets(source_dir, base_dest_dir, source_fold_names_str, max_workers=None):
    """
    Create k-fold cross-validation datasets in COCO format.

    Every name in ``source_fold_names_str`` is a sub-directory of ``source_dir``.
    For CV iteration ``i`` (1-based) the i-th fold is the validation set and all
    other folds form the training set. Each iteration is written to::

        <base_dest_dir>/cv_iteration_<i>/train/        copied training images
        <base_dest_dir>/cv_iteration_<i>/val/          copied validation images
        <base_dest_dir>/cv_iteration_<i>/annotations/  instances_train.json, instances_val.json

    Image and annotation IDs restart at 1 in every JSON file and follow the
    (fold order, sorted image name) order, so repeated runs over the same input
    produce the same IDs.

    Args:
        source_dir: Directory containing the fold sub-directories.
        base_dest_dir: Root output directory (created if missing).
        source_fold_names_str: Comma-separated fold names; at least two are required.
        max_workers: Forwarded to ``ProcessPoolExecutor`` (None = library default).

    Returns:
        None. Invalid input (fewer than two folds, missing source directory,
        no categories) is reported via ``print`` and ends the run early.
    """
    overall_start_time = time.time()
    all_original_fold_names = [name.strip() for name in source_fold_names_str.split(',') if name.strip()]

    if len(all_original_fold_names) < 2:
        print("Error: Please provide at least two source fold names for cross-validation.")
        return
    if not os.path.isdir(source_dir):
        print(f"Error: Source directory '{source_dir}' not found.")
        return
    os.makedirs(base_dest_dir, exist_ok=True)

    # 1. Discover all COCO categories globally so every split shares the same IDs.
    coco_categories = discover_coco_categories_parallel(source_dir, all_original_fold_names, max_workers)
    if not coco_categories:
        print("Error: No categories found. Cannot proceed.")
        return

    category_name_to_id = {cat['name']: cat['id'] for cat in coco_categories}

    num_cv_folds = len(all_original_fold_names)
    print(f"\nPreparing data for {num_cv_folds}-Fold Cross-Validation (COCO Format)...")

    for i, current_val_fold_name in enumerate(all_original_fold_names):
        cv_iteration_start_time = time.time()
        current_train_fold_names = [f_name for idx, f_name in enumerate(all_original_fold_names) if idx != i]

        cv_iteration_dir_name = f"cv_iteration_{i+1}"
        current_cv_split_output_root = os.path.join(base_dest_dir, cv_iteration_dir_name)
        print(f"\n--- Processing CV Iteration {i+1}/{num_cv_folds} ---")
        print(f"  Output to: {current_cv_split_output_root}")

        train_img_dir = os.path.join(current_cv_split_output_root, "train")
        val_img_dir = os.path.join(current_cv_split_output_root, "val")
        annotations_dir = os.path.join(current_cv_split_output_root, "annotations")
        os.makedirs(train_img_dir, exist_ok=True)
        os.makedirs(val_img_dir, exist_ok=True)
        os.makedirs(annotations_dir, exist_ok=True)

        for split_type, source_fold_list in [("train", current_train_fold_names), ("val", [current_val_fold_name])]:
            print(f"  Processing {split_type} data for CV Iteration {i+1}...")
            split_start_time = time.time()

            coco_output_data = {
                "info": {
                    "description": f"COCO-style dataset for CV Iteration {i+1} - {split_type}",
                    "version": "1.0",
                    "year": datetime.date.today().year,
                    # Naive UTC timestamp; same text as the deprecated utcnow() produced.
                    "date_created": datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat(' ')
                },
                "licenses": [{"name": "Placeholder License", "id": 0, "url": ""}],
                "categories": coco_categories,
                "images": [],
                "annotations": []
            }

            # IDs restart at 1 for every output JSON file (train / val).
            current_image_id = 1
            current_annotation_id = 1

            target_image_dir_for_split = train_img_dir if split_type == "train" else val_img_dir

            tasks_for_split = []
            for fold_idx, fold_name in enumerate(source_fold_list):
                source_fold_path = os.path.join(source_dir, fold_name)
                if not os.path.isdir(source_fold_path):
                    print(f"    Warning: Source fold '{source_fold_path}' for {split_type} not found. Skipping.")
                    continue

                image_basenames = sorted(os.path.splitext(f)[0] for f in os.listdir(source_fold_path) if f.lower().endswith(".png"))
                for img_basename in image_basenames:
                    # Prefix with split and fold index so equal names from different folds cannot clash.
                    new_img_filename = f"{split_type}_fold{fold_idx}_{img_basename}.png"
                    tasks_for_split.append((img_basename, source_fold_path, new_img_filename))

            if not tasks_for_split:
                print(f"    No images found to process for {split_type} set in this CV iteration.")
            else:
                processed_results = []
                failed_tasks = []
                with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
                    futures = [executor.submit(_process_single_image_to_coco_data, task_args) for task_args in tasks_for_split]
                    # Read results in submission order (not completion order) so the
                    # IDs assigned below are reproducible between runs.
                    for task_args, future in zip(tasks_for_split, futures):
                        try:
                            result = future.result()
                        except Exception as exc:
                            failed_tasks.append((task_args[0], exc))
                            continue
                        if result:
                            processed_results.append(result)

                if failed_tasks:
                    first_name, first_exc = failed_tasks[0]
                    print(f"    Warning (COCO Split Processor): {len(failed_tasks)} task(s) raised an exception; first was '{first_name}': {first_exc}")
                skipped_count = len(tasks_for_split) - len(processed_results) - len(failed_tasks)
                if skipped_count:
                    print(f"    Note: {skipped_count} image(s) produced no data and were skipped.")

                # Sequentially copy images and assign unique IDs.
                for result_data in processed_results:
                    img_entry = result_data["image_info"]
                    shutil.copy2(result_data["original_source_path"], os.path.join(target_image_dir_for_split, img_entry["file_name"]))

                    img_entry["id"] = current_image_id
                    # Helper key from the worker; it must not end up in the COCO JSON.
                    img_entry.pop("original_path", None)
                    coco_output_data["images"].append(img_entry)

                    for ann_data in result_data["annotations_data"]:
                        category_name = ann_data["category_name"]
                        if category_name not in category_name_to_id:
                            # Not expected when category discovery covered the same folds.
                            print(f"    Skipping annotation with unknown category: {category_name}")
                            continue

                        coco_output_data["annotations"].append({
                            "id": current_annotation_id,
                            "image_id": current_image_id,
                            "category_id": category_name_to_id[category_name],
                            "bbox": ann_data["bbox"],
                            "area": ann_data["area"],
                            "iscrowd": 0,
                            "segmentation": []
                        })
                        current_annotation_id += 1
                    current_image_id += 1

            # The JSON is written even for an empty split, with empty image/annotation lists.
            output_json_filename = f"instances_{split_type}.json"
            output_json_path = os.path.join(annotations_dir, output_json_filename)

            try:
                with open(output_json_path, 'w') as f:
                    json.dump(coco_output_data, f, indent=4)
                split_processing_time = time.time() - split_start_time
                print(f"    Successfully created '{output_json_filename}' with {len(coco_output_data['images'])} images and {len(coco_output_data['annotations'])} annotations in {split_processing_time:.2f}s.")
            except Exception as e:
                print(f"    Error writing COCO JSON '{output_json_filename}': {e}")

        cv_iteration_time = time.time() - cv_iteration_start_time
        print(f"  CV Iteration {i+1} processing took {cv_iteration_time:.2f}s.")

    total_script_time = time.time() - overall_start_time
    print(f"\n{num_cv_folds}-Fold Cross-Validation COCO dataset preparation complete in {total_script_time:.2f} seconds!")
    print(f"All CV iteration datasets are ready under: {base_dest_dir}")

In [None]:
# --- Configuration for Your Dataset (COCO Format) ---

# **IMPORTANT**: Modify these paths and names to match your actual dataset and desired output.
SOURCE_DIR = "/blue/hulcr/gmarais/PhD/phase_1_data/1_data_splitting/classification_folds_output"  # CHANGE THIS
BASE_DEST_DIR = "/blue/hulcr/gmarais/PhD/phase_1_data/3_classification_phase_2/coco" # CHANGE THIS
SOURCE_FOLD_NAMES_STR = "fold1,fold2,fold3,fold4,fold5" # CHANGE THIS

# Optional: control the number of parallel worker processes.
import os  # used below for os.cpu_count() in the status message
MAX_WORKERS = 6  # Fixed worker count; set to None to use the executor's own default
# MAX_WORKERS = 4 # Or set a different specific number

# --- Run the Dataset Creation ---
# Shown as-is when a number is configured, otherwise as a hint about the default.
max_workers_label = MAX_WORKERS if MAX_WORKERS is not None else f"Default (likely {os.cpu_count()})"

print("Starting COCO dataset creation for Co-DETR with K-Fold CV:")
print(f"  Source Directory: {SOURCE_DIR}")
print(f"  Base Destination Directory: {BASE_DEST_DIR}")
print(f"  Source Fold Names: {SOURCE_FOLD_NAMES_STR.split(',')}")
print(f"  Max Workers for Parallelization: {max_workers_label}")
print("-" * 30)

# Ensure the function create_kfold_coco_datasets is defined by running the cell above first!
create_kfold_coco_datasets(
    source_dir=SOURCE_DIR,
    base_dest_dir=BASE_DEST_DIR,
    source_fold_names_str=SOURCE_FOLD_NAMES_STR,
    max_workers=MAX_WORKERS
)

print("\nCOCO Script execution finished in this cell.")

Starting COCO dataset creation for Co-DETR with K-Fold CV:
  Source Directory: /blue/hulcr/gmarais/PhD/phase_1_data/1_data_splitting/classification_folds_output
  Base Destination Directory: /blue/hulcr/gmarais/PhD/phase_1_data/3_classification_phase_2/coco
  Source Fold Names: ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']
  Max Workers for Parallelization: 6
------------------------------
Discovering COCO categories across all specified folds (parallelized)...
Discovered 63 COCO categories in 1.57 seconds:
  ID: 1, Name: Ambrosiodmus_minor
  ID: 2, Name: Ambrosiophilus_atratus
  ID: 3, Name: Anisandrus_dispar
  ID: 4, Name: Anisandrus_sayi
  ID: 5, Name: Cnestus_mutilatus
  ID: 6, Name: Coccotrypes_carpophagus
  ID: 7, Name: Coccotrypes_dactyliperda
  ID: 8, Name: Coptoborus_ricini
  ID: 9, Name: Cryptocarenus_heveae
  ID: 10, Name: Ctonoxylon_hagedorn
  ID: 11, Name: Cyclorhipidion_pelliculosum
  ID: 12, Name: Dendroctonus_rufipennis
  ID: 13, Name: Dendroctonus_terebrans
  ID: 14, N