In [1]:
import os
import glob
import shutil
import random
from pathlib import Path
from tqdm import tqdm
import bs4 as bs

# ===================================================================
# FIX 1: Always create label files (even if empty)
# ===================================================================

def convertPascal2YOLOv8(filePath, outputDir):
    """
    Convert one Pascal VOC XML annotation into a YOLO label file.

    The label file is written into ``outputDir`` under the XML file's
    base name with its extension swapped for ``.txt``. The file is always
    created, even when no object of a known class is present, so that
    object-free images still get an (empty) label file.

    Each kept object becomes one line ``cls cx cy bw bh`` where the box
    centre and size are divided by the image width/height read from the
    ``<size>`` element. Objects whose class name is not in the mapping
    are skipped, as are boxes that have no area once clipped to the image.

    Args:
        filePath: Path to XML file
        outputDir: Where to save the .txt label file (created if missing)

    Raises:
        ValueError: If a known-class object has to be normalized but the
            annotation reports a non-positive image width or height.
    """
    class_mapping = {
        "D00": 0,
        "D10": 1,
        "D20": 2,
        "D40": 3
    }

    # Hand raw bytes to the parser so it can honour the encoding declared
    # in the XML itself rather than the platform's default text encoding.
    with open(filePath, "rb") as f:
        soup = bs.BeautifulSoup(f.read(), "xml")

    size = soup.find("size")
    w = int(size.width.text)
    h = int(size.height.text)

    os.makedirs(outputDir, exist_ok=True)

    # Swap only the real extension: str.replace would also rewrite any
    # ".xml" inside the stem and would leave other suffixes untouched.
    stem = os.path.splitext(os.path.basename(filePath))[0]
    outputFile = os.path.join(outputDir, stem + ".txt")

    lines = []

    for obj in soup.find_all("object"):
        name = obj.find("name").text
        if name not in class_mapping:
            continue

        if w <= 0 or h <= 0:
            # Fail with the file name instead of a bare ZeroDivisionError.
            raise ValueError(
                f"Invalid image size {w}x{h} in annotation: {filePath}"
            )

        cls = class_mapping[name]
        box = obj.find("bndbox")

        # Clip the box to the image so normalized values stay in [0, 1].
        xmin = min(max(float(box.xmin.text), 0.0), w)
        ymin = min(max(float(box.ymin.text), 0.0), h)
        xmax = min(max(float(box.xmax.text), 0.0), w)
        ymax = min(max(float(box.ymax.text), 0.0), h)

        # Inverted or fully out-of-image boxes have no usable area.
        if xmax <= xmin or ymax <= ymin:
            continue

        cx = ((xmin + xmax) / 2) / w
        cy = ((ymin + ymax) / 2) / h
        bw = (xmax - xmin) / w
        bh = (ymax - ymin) / h

        lines.append(f"{cls} {cx:.6f} {cy:.6f} {bw:.6f} {bh:.6f}")

    # Always write the file; joining an empty list yields an empty file.
    with open(outputFile, "w") as f:
        f.write("\n".join(lines))


# ===================================================================
# FIX 2: Convert labels DURING split (not before)
# ===================================================================

def CopyDatasetSplit(
    baseDir,
    baseOutputDir=r"C:\Projects Datasets\RoadDamageDetection\handled_imbalance_processed_data\YOLOv8",
    split_ratio=0.85,
    backgroundImages_Percentage=0.1,
    seed=1337,
):
    """
    Split dataset AND convert labels at the same time

    Images in ``baseDir/images`` are paired with the XML file of the same
    stem in ``baseDir/annotations``; images without an annotation are
    reported and dropped. Pairs whose annotation contains no ``<object>``
    are kept only up to a cap, in sorted file order. The surviving pairs
    are shuffled and split into train/val; each image is copied and its
    annotation converted into
    ``baseOutputDir/<name>/{images,labels}/{train,val}``, where ``<name>``
    is the name of ``baseDir``'s parent directory.

    Args:
        baseDir: Base directory containing images/ and annotations/ folders
        baseOutputDir: Root directory of the generated output tree
        split_ratio: Fraction of the kept pairs assigned to training
        backgroundImages_Percentage: Cap on object-free images, expressed
            as a fraction of all matched image/annotation pairs
        seed: Seed applied to the global ``random`` generator before
            shuffling, so the split is reproducible
    """
    random.seed(seed)

    countryName = Path(baseDir).parents[0].name

    baseImageDir = os.path.join(baseDir, "images")
    baseAnnotDir = os.path.join(baseDir, "annotations")

    image_list_all = sorted(glob.glob(os.path.join(baseImageDir, "*")))

    # Match images to annotations
    image_annot_pairs = []
    for img_path in image_list_all:
        img_name = Path(img_path).stem
        annot_path = os.path.join(baseAnnotDir, f"{img_name}.xml")

        if os.path.exists(annot_path):
            image_annot_pairs.append((img_path, annot_path))
        else:
            print(f"⚠️ Warning: No annotation for {img_name}")

    # Background image filtering
    filtered_pairs = []
    background_count = 0
    max_background = int(len(image_annot_pairs) * backgroundImages_Percentage)

    for img_path, annot_path in image_annot_pairs:
        # Read bytes so the parser applies the encoding declared in the XML
        # instead of the platform's default text encoding.
        with open(annot_path, "rb") as f:
            soup = bs.BeautifulSoup(f.read(), "xml")

        # NOTE(review): any <object> counts here, including class names the
        # label converter may skip — confirm that is the intended rule.
        if soup.find("object") is not None:  # Has annotations
            filtered_pairs.append((img_path, annot_path))
        elif background_count < max_background:  # Background image
            filtered_pairs.append((img_path, annot_path))
            background_count += 1

    # Split dataset
    dataset_length = len(filtered_pairs)
    middle_point = round(split_ratio * dataset_length)

    indices = list(range(dataset_length))
    random.shuffle(indices)

    train_indices = indices[:middle_point]
    val_indices = indices[middle_point:]

    print(f"Training/Validation Samples: {len(train_indices)}, {len(val_indices)}")

    # One loop handles both splits; only the folder name and the log
    # wording differ between them.
    splits = (
        ("training", "train", train_indices),
        ("validation", "val", val_indices),
    )
    for description, split_name, split_indices in splits:
        print(f"Processing {description} data for {countryName}")

        outputImagesDir = os.path.join(baseOutputDir, countryName, "images", split_name)
        outputLabelsDir = os.path.join(baseOutputDir, countryName, "labels", split_name)

        # Create the folders once per split (also when the split is empty)
        # rather than once per copied file.
        os.makedirs(outputImagesDir, exist_ok=True)
        os.makedirs(outputLabelsDir, exist_ok=True)

        for i in tqdm(split_indices):
            img_path, annot_path = filtered_pairs[i]

            # Copy image
            shutil.copy2(img_path, outputImagesDir)

            # Convert and save label
            convertPascal2YOLOv8(annot_path, outputLabelsDir)


# ===================================================================
# USAGE
# ===================================================================

# Base directory holding the images/ and annotations/ folders to process.
baseDir = r"C:\Projects Datasets\RoadDamageDetection\handled_imbalance_processed_data"

# Split the dataset and convert its labels in a single pass.
CopyDatasetSplit(baseDir=baseDir)

Training/Validation Samples: 593, 105
Processing training data for RoadDamageDetection


100%|██████████| 593/593 [00:01<00:00, 377.34it/s]


Processing validation data for RoadDamageDetection


100%|██████████| 105/105 [00:00<00:00, 386.74it/s]
