In [1]:
# Definitions for building K-fold YOLO datasets from PNG images and their JSON annotations.
# This cell contains all imports and all function definitions:
# convert_yolo_format, get_fixed_class_map, _process_single_image_json_pair,
# process_single_fold_for_yolo_parallel, and create_kfold_yolo_datasets.

import os
import json
import shutil
from collections import OrderedDict
import yaml # For data.yaml
from PIL import Image # To verify/get image dimensions if needed
import concurrent.futures
import time # For timing

# --- Utility Functions (largely unchanged but crucial) ---
def convert_yolo_format(image_width, image_height, points, class_id):
    """Convert absolute pixel points into one YOLO bounding-box label line.

    The axis-aligned bounding box of ``points`` is clamped to the image and
    then expressed as centre/size values normalised by the image dimensions.

    Args:
        image_width: Image width in pixels.
        image_height: Image height in pixels.
        points: Sequence of ``(x, y)`` pairs in absolute pixel coordinates.
            The order of the points does not matter.
        class_id: Class index written as the first field of the label line.

    Returns:
        ``"<class_id> <x_center> <y_center> <width> <height>"`` with the four
        box values formatted to six decimals, or ``None`` when ``points`` is
        empty/``None`` or the clamped box has no positive width or height.
    """
    # min()/max() raise ValueError on an empty sequence; treat "no points"
    # the same way as a degenerate box.
    if points is None or len(points) == 0:
        return None

    x_coords = [p[0] for p in points]
    y_coords = [p[1] for p in points]

    # Clamp the box to the image.
    # NOTE(review): the upper bound is width - 1 / height - 1, so a box that
    # reaches the right or bottom image edge is shrunk slightly — confirm
    # this is intended rather than clamping to width / height.
    x_min_abs = max(0, min(x_coords))
    y_min_abs = max(0, min(y_coords))
    x_max_abs = min(image_width - 1, max(x_coords))
    y_max_abs = min(image_height - 1, max(y_coords))

    # A box with no positive extent after clamping (this also covers images
    # of width/height <= 1) cannot be expressed as a label.
    if x_min_abs >= x_max_abs or y_min_abs >= y_max_abs:
        return None

    box_width_abs = x_max_abs - x_min_abs
    box_height_abs = y_max_abs - y_min_abs

    x_center_abs = x_min_abs + box_width_abs / 2.0
    y_center_abs = y_min_abs + box_height_abs / 2.0

    x_center_norm = x_center_abs / image_width
    y_center_norm = y_center_abs / image_height
    width_norm = box_width_abs / image_width
    height_norm = box_height_abs / image_height

    return f"{class_id} {x_center_norm:.6f} {y_center_norm:.6f} {width_norm:.6f} {height_norm:.6f}"


# --- Modified Class "Discovery" ---
def get_fixed_class_map():
    """Build the fixed single-class mapping and echo it to stdout.

    Returns:
        A new ``OrderedDict`` mapping the class name ``'bark_beetle'`` to id 0.
    """
    print("Setting up fixed class: 'bark_beetle'")

    class_map = OrderedDict()
    class_map["bark_beetle"] = 0

    print("Using 1 class:")
    for class_name in class_map:
        print(f"  '{class_name}': {class_map[class_name]}")

    return class_map


# --- Parallelized File Processing for a Single Fold ---
def _remove_file_if_present(path):
    """Delete ``path`` when it exists (used to undo a partial conversion)."""
    if os.path.exists(path):
        os.remove(path)


def _process_single_image_json_pair(args_tuple):
    """Copy one PNG to the output image directory and write its YOLO label file.

    Takes a single tuple so it can be submitted to an executor as one argument.

    Args:
        args_tuple: ``(img_filename_no_ext, source_fold_path,
            master_class_to_id_map, output_images_dir, output_labels_dir)``.
            ``<img_filename_no_ext>.png`` and ``<img_filename_no_ext>.json``
            are looked up in ``source_fold_path``; the map must contain the
            key ``"bark_beetle"``.

    Returns:
        ``True`` when a label file with at least one box was written,
        ``False`` otherwise. On a failure (unreadable/invalid JSON, unknown
        image size, label file not writable) the copied image is removed
        again. When the JSON simply yields no usable rectangle, the copied
        image is left in place without a label file.
    """
    (img_filename_no_ext, source_fold_path, master_class_to_id_map,
     output_images_dir, output_labels_dir) = args_tuple

    bark_beetle_class_id = master_class_to_id_map["bark_beetle"]
    png_file = f"{img_filename_no_ext}.png"
    json_file = f"{img_filename_no_ext}.json"
    source_png_path = os.path.join(source_fold_path, png_file)
    source_json_path = os.path.join(source_fold_path, json_file)

    if not (os.path.exists(source_png_path) and os.path.exists(source_json_path)):
        return False

    dest_png_path = os.path.join(output_images_dir, png_file)
    try:
        shutil.copy2(source_png_path, dest_png_path)
        with open(source_json_path, 'r') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        _remove_file_if_present(dest_png_path)
        return False

    # The annotation must be a JSON object; anything else cannot be queried
    # for image size or shapes.
    if not isinstance(data, dict):
        _remove_file_if_present(dest_png_path)
        return False

    image_height = data.get("imageHeight")
    image_width = data.get("imageWidth")
    if image_height is None or image_width is None:
        # Fall back to the image file for whichever dimension is missing.
        try:
            with Image.open(source_png_path) as img:
                image_width_pil, image_height_pil = img.size
        except Exception:
            # Any failure to read the image means the size is unknown;
            # kept broad because the image library may raise several types.
            _remove_file_if_present(dest_png_path)
            return False
        if image_height is None:
            image_height = image_height_pil
        if image_width is None:
            image_width = image_width_pil

    if not image_height or not image_width:
        _remove_file_if_present(dest_png_path)
        return False

    yolo_annotations = []
    # "shapes" may be absent or null in the JSON; both mean "no shapes".
    for shape in data.get("shapes") or []:
        points = shape.get("points")
        shape_type = shape.get("shape_type")
        # Only two-point rectangles are converted.
        if not points or shape_type != "rectangle" or len(points) != 2:
            continue
        yolo_str = convert_yolo_format(image_width, image_height, points, bark_beetle_class_id)
        if yolo_str:
            yolo_annotations.append(yolo_str)

    if not yolo_annotations:
        # NOTE(review): the copied image is kept although no label is written;
        # presumably intended as an unlabeled/background sample — confirm.
        return False

    dest_label_path = os.path.join(output_labels_dir, f"{img_filename_no_ext}.txt")
    try:
        with open(dest_label_path, 'w') as f_label:
            f_label.write("\n".join(yolo_annotations) + "\n")
    except OSError:
        # Never leave an annotated image behind without its label file.
        _remove_file_if_present(dest_label_path)
        _remove_file_if_present(dest_png_path)
        return False
    return True


def process_single_fold_for_yolo_parallel(source_fold_path, image_basenames_in_fold,
                                          master_class_to_id_map,
                                          output_images_dir, output_labels_dir, max_workers=None):
    """Convert every image/JSON pair of one fold using a pool of worker processes.

    Args:
        source_fold_path: Directory holding the fold's ``.png``/``.json`` files.
        image_basenames_in_fold: Iterable of file names without extension.
        master_class_to_id_map: Class-name-to-id mapping handed to each worker.
        output_images_dir: Destination directory for copied images.
        output_labels_dir: Destination directory for generated label files.
        max_workers: Passed to ``ProcessPoolExecutor``; ``None`` uses its default.

    Returns:
        Number of pairs for which the worker reported success. Pairs whose
        worker raised are not counted; they are summarised on stdout instead
        of being dropped silently.
    """
    tasks = [
        (img_basename, source_fold_path, master_class_to_id_map,
         output_images_dir, output_labels_dir)
        for img_basename in image_basenames_in_fold
    ]
    if not tasks:
        return 0

    processed_count = 0
    failures = []  # (basename, exception) for workers that raised
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_basename = {
            executor.submit(_process_single_image_json_pair, task_args): task_args[0]
            for task_args in tasks
        }
        for future in concurrent.futures.as_completed(future_to_basename):
            try:
                succeeded = future.result()
            except Exception as exc:
                # Keep converting the remaining files, but remember the failure.
                failures.append((future_to_basename[future], exc))
                continue
            if succeeded:
                processed_count += 1

    if failures:
        max_shown = 5  # keep the report short even if every task failed
        print(f"    Warning: {len(failures)} file(s) from '{source_fold_path}' raised an error and were not counted.")
        for basename, exc in failures[:max_shown]:
            print(f"      {basename}: {exc!r}")
        if len(failures) > max_shown:
            print(f"      ... and {len(failures) - max_shown} more.")
    return processed_count


# --- Main Orchestrator Function for Jupyter Notebook ---
def _list_png_basenames(fold_path):
    """Return the sorted, de-duplicated extension-less names of the PNG files in ``fold_path``."""
    return sorted({
        os.path.splitext(entry)[0]
        for entry in os.listdir(fold_path)
        if entry.lower().endswith(".png")
    })


def _convert_source_fold(source_dir, fold_name, is_validation, master_class_to_id_map,
                         output_images_dir, output_labels_dir, max_workers):
    """Convert one source fold into the given output directories; return the labelled-image count."""
    source_fold_path = os.path.join(source_dir, fold_name)
    if not os.path.isdir(source_fold_path):
        if is_validation:
            print(f"  Warning: Validation source fold '{source_fold_path}' not found. Skipping val set for this iter.")
        else:
            print(f"  Warning: Training source fold '{source_fold_path}' not found. Skipping.")
        return 0

    fold_proc_start_time = time.time()
    image_basenames = _list_png_basenames(source_fold_path)
    if not image_basenames:
        role = "validation" if is_validation else "training"
        print(f"  No PNG images found in {role} source fold '{source_fold_path}'.")
        return 0

    count = process_single_fold_for_yolo_parallel(
        source_fold_path, image_basenames, master_class_to_id_map,
        output_images_dir, output_labels_dir, max_workers
    )
    fold_proc_time = time.time() - fold_proc_start_time
    split_word = "validation" if is_validation else "train"
    print(f"    Processed {count} image files (labels generated if objects found) from source {split_word} fold '{fold_name}' in {fold_proc_time:.2f}s.")
    return count


def create_kfold_yolo_datasets(source_dir, base_dest_dir, source_fold_names_str, max_workers=None):
    """Build one YOLO dataset per cross-validation iteration from pre-split folds.

    For iteration ``k`` the ``k``-th fold is the validation set and all other
    folds form the training set. Each iteration is written to
    ``<base_dest_dir>/cv_iteration_<k>/`` with ``images/{train,val}``,
    ``labels/{train,val}`` and a ``data.yaml``.

    Existing output directories are reused (not cleared). All training folds
    of an iteration are copied into the same directory, so files with the
    same name in different folds overwrite each other.

    Args:
        source_dir: Directory containing one sub-directory per fold.
        base_dest_dir: Root directory for the generated datasets (created if missing).
        source_fold_names_str: Comma-separated fold directory names; surrounding
            whitespace and empty entries are ignored, duplicates are used once.
        max_workers: Worker-process count passed on to the per-fold conversion.

    Returns:
        ``None``. Problems are reported on stdout; nothing is written when
        fewer than two distinct fold names are given or ``source_dir`` is missing.
    """
    overall_start_time = time.time()

    requested_fold_names = [name.strip() for name in source_fold_names_str.split(',') if name.strip()]
    # A fold listed twice would be both validation and training data in the
    # same iteration, so keep only the first occurrence of each name.
    all_original_fold_names = list(dict.fromkeys(requested_fold_names))
    if len(all_original_fold_names) != len(requested_fold_names):
        print("Warning: Duplicate source fold names were ignored; each fold is used once.")
    if len(all_original_fold_names) < 2:
        print("Error: Please provide at least two source fold names for cross-validation.")
        return
    if not os.path.isdir(source_dir):
        print(f"Error: Source directory '{source_dir}' not found.")
        return

    os.makedirs(base_dest_dir, exist_ok=True)
    master_class_to_id_map = get_fixed_class_map()
    num_cv_folds = len(all_original_fold_names)
    print(f"\nPreparing data for {num_cv_folds}-Fold Cross-Validation (all objects as 'bark_beetle')...")

    for i, current_val_fold_name in enumerate(all_original_fold_names):
        cv_iteration_start_time = time.time()
        current_train_fold_names = [f_name for idx, f_name in enumerate(all_original_fold_names) if idx != i]
        cv_iteration_dir_name = f"cv_iteration_{i+1}"
        current_cv_split_output_root = os.path.join(base_dest_dir, cv_iteration_dir_name)
        print(f"\n--- Processing CV Iteration {i+1}/{num_cv_folds} ---")
        print(f"  Validation Fold: {current_val_fold_name}")
        print(f"  Training Folds: {', '.join(current_train_fold_names)}")
        print(f"  Output to: {current_cv_split_output_root}")

        train_images_dir = os.path.join(current_cv_split_output_root, "images", "train")
        train_labels_dir = os.path.join(current_cv_split_output_root, "labels", "train")
        val_images_dir = os.path.join(current_cv_split_output_root, "images", "val")
        val_labels_dir = os.path.join(current_cv_split_output_root, "labels", "val")
        for split_dir in (train_images_dir, train_labels_dir, val_images_dir, val_labels_dir):
            os.makedirs(split_dir, exist_ok=True)

        print(f"  Processing training data for CV Iteration {i+1}...")
        total_train_images_for_cv_iter = sum(
            _convert_source_fold(source_dir, train_fold_name, False, master_class_to_id_map,
                                 train_images_dir, train_labels_dir, max_workers)
            for train_fold_name in current_train_fold_names
        )
        print(f"  Total training images with labels for CV Iteration {i+1}: {total_train_images_for_cv_iter}")

        print(f"  Processing validation data for CV Iteration {i+1}...")
        total_val_images_for_cv_iter = _convert_source_fold(
            source_dir, current_val_fold_name, True, master_class_to_id_map,
            val_images_dir, val_labels_dir, max_workers
        )
        print(f"  Total validation images with labels for CV Iteration {i+1}: {total_val_images_for_cv_iter}")

        data_yaml_content = {
            'path': os.path.abspath(current_cv_split_output_root),
            # Relative to 'path'; written with forward slashes so the file
            # does not depend on the path separator of the generating OS.
            'train': 'images/train',
            'val': 'images/val',
            'nc': len(master_class_to_id_map),
            'names': list(master_class_to_id_map.keys())
        }
        data_yaml_path = os.path.join(current_cv_split_output_root, "data.yaml")
        try:
            with open(data_yaml_path, 'w') as f:
                yaml.dump(data_yaml_content, f, sort_keys=False, default_flow_style=False)
            cv_iteration_time = time.time() - cv_iteration_start_time
            print(f"  Successfully created 'data.yaml' for CV Iteration {i+1}. Iteration took {cv_iteration_time:.2f}s.")
        except Exception as e:
            # Report and continue with the next iteration rather than aborting the run.
            print(f"  Error writing data.yaml for CV Iteration {i+1}: {e}")

    total_script_time = time.time() - overall_start_time
    print(f"\n{num_cv_folds}-Fold Cross-Validation dataset preparation complete in {total_script_time:.2f} seconds!")
    print(f"All CV iteration datasets are ready under: {base_dest_dir}")
    print("All objects have been mapped to the class 'bark_beetle'.")

# Note: The actual call to create_kfold_yolo_datasets will be in the next cell.
# Ensure this cell is run first to define all functions.

In [None]:
# --- Configuration for Your Dataset ---

# **IMPORTANT**: Modify these paths and names to match your actual dataset and desired output.
# Example for a Linux-like environment:
SOURCE_DIR = "/blue/hulcr/gmarais/PhD/phase_1_data/1_data_splitting/detection_folds_output"
BASE_DEST_DIR = "/blue/hulcr/gmarais/PhD/phase_1_data/2_object_detection_phase_2/ultralytics"
SOURCE_FOLD_NAMES_STR = "detection_fold1,detection_fold2,detection_fold3,detection_fold4,detection_fold5" # Comma-separated fold directory names, looked up inside SOURCE_DIR

# Example for a Windows environment (use raw strings r"..." or double backslashes \\):
# SOURCE_DIR = r"C:\Users\YourUser\Documents\datasets\my_original_beetle_data_folds"
# BASE_DEST_DIR = r"C:\Users\YourUser\Documents\datasets\yolo_kfold_bark_beetle_output"
# SOURCE_FOLD_NAMES_STR = "beetle_fold_01,beetle_fold_02,beetle_fold_03,beetle_fold_04,beetle_fold_05"


# Optional: Control the number of parallel processes.
# MAX_WORKERS is passed through as max_workers to the ProcessPoolExecutor used per fold.
# 'None' lets the executor pick its own default (the message printed below guesses os.cpu_count()).
# Any positive integer fixes the pool size, e.g. MAX_WORKERS = 4.
# The work per file is mostly copying and small JSON parsing, so the best value
# depends on the storage as much as on the CPU count.
import os # Used below for the os.cpu_count() hint that is printed when MAX_WORKERS is None.
MAX_WORKERS = 6 # Fixed pool of 6 worker processes; set to None to use the executor's default instead.
# MAX_WORKERS = 4 # Or, set a different fixed number of workers


# --- Run the Dataset Creation ---
if __name__ == '__main__': # The recorded output of this cell shows that the guarded body runs
                           # when the cell is executed in the notebook, just as it would when
                           # the code is run as a .py script.
                           # NOTE(review): the conversion starts worker processes; on platforms that
                           # spawn (rather than fork) workers this guard is presumably required in a script — confirm.

    # Echo the effective configuration before starting the (potentially long) run.
    print(f"Starting dataset creation with 'bark_beetle' as the single class:")
    print(f"  Source Directory: {SOURCE_DIR}")
    print(f"  Base Destination Directory: {BASE_DEST_DIR}")
    print(f"  Source Fold Names: {SOURCE_FOLD_NAMES_STR.split(',')}") # Shown as a list; display only, names are not stripped here
    print(f"  Max Workers for Parallelization: {MAX_WORKERS if MAX_WORKERS is not None else f'Default (likely {os.cpu_count()})'}")
    print("-" * 30)

    # create_kfold_yolo_datasets must already be defined: run the definitions cell above first.
    create_kfold_yolo_datasets(
        source_dir=SOURCE_DIR,
        base_dest_dir=BASE_DEST_DIR,
        source_fold_names_str=SOURCE_FOLD_NAMES_STR,
        max_workers=MAX_WORKERS
    )

    print("\nScript execution finished in this cell.")

Starting dataset creation with 'bark_beetle' as the single class:
  Source Directory: /blue/hulcr/gmarais/PhD/phase_1_data/1_data_splitting/detection_folds_output
  Base Destination Directory: /blue/hulcr/gmarais/PhD/phase_1_data/2_object_detection_phase_2/ultralytics
  Source Fold Names: ['detection_fold1', 'detection_fold2', 'detection_fold3', 'detection_fold4', 'detection_fold5']
  Max Workers for Parallelization: 6
------------------------------
Setting up fixed class: 'bark_beetle'
Using 1 class:
  'bark_beetle': 0

Preparing data for 5-Fold Cross-Validation (all objects as 'bark_beetle')...

--- Processing CV Iteration 1/5 ---
  Validation Fold: detection_fold1
  Training Folds: detection_fold2, detection_fold3, detection_fold4, detection_fold5
  Output to: /blue/hulcr/gmarais/PhD/phase_1_data/2_object_detection_phase_2/ultralytics/cv_iteration_1
  Processing training data for CV Iteration 1...
    Processed 1640 image files (labels generated if objects found) from source train f