In [6]:

import random, shutil
from pathlib import Path

import cv2, numpy as np
from tqdm import tqdm


# CONFIG

# Dataset locations: source images, ground-truth masks, and the YOLO output tree.
ROOT = Path("/home/imadb/Ghiles/lost_and_found")
IMG_ROOT = ROOT.joinpath("leftImg8bit")
GT_ROOT = ROOT.joinpath("gtCoarse")
OUT_ROOT = ROOT.joinpath("yolo_lostfound")

# Reproducibility of the automatic train/val split, and the share sent to val.
SEED = 42
VAL_RATIO = 0.10

# Mask pixel values treated as obstacles; adjust if needed after inspecting
# np.unique(mask) on a few ground-truth files.
OBSTACLE_VALUES = (1,)
# Minimum bounding-box width / height (in pixels) for a contour to be kept.
MIN_W = MIN_H = 10

# Single-class detection setup written into the labels and data.yaml.
CLS_ID = 0
CLASS_NAME = "obstacle"



# HELPERS

def ensure_dirs():
    """Create OUT_ROOT/{images,labels}/{train,val,test}.

    Missing parent directories are created and already-existing directories
    are left as they are, so the call is safe to repeat.
    """
    for split in ("train", "val", "test"):
        for kind in ("images", "labels"):
            target = OUT_ROOT / kind / split
            target.mkdir(parents=True, exist_ok=True)

def list_images(split: str):
    """Return the sorted list of source images found under IMG_ROOT/<split>.

    The search is recursive and only matches files named
    ``*_leftImg8bit.png``. An empty list is returned when the split folder
    does not exist.
    """
    split_dir = IMG_ROOT / split
    if split_dir.exists():
        matches = list(split_dir.rglob("*_leftImg8bit.png"))
        matches.sort()
        return matches
    return []

def label_write(label_out: Path, lines):
    """Write a YOLO label file, one entry of ``lines`` per text line.

    The parent directory is created if needed. Every entry is terminated by
    a newline; an empty ``lines`` produces an empty (zero-byte) file.
    """
    target_dir = label_out.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    payload = "".join(row + "\n" for row in lines)
    label_out.write_text(payload, encoding="utf-8")

def mask_path(img_path: Path, out_split: str, base_split_for_val: str = "train") -> Path:
    """Return the ground-truth mask path that corresponds to a source image.

    out_split: 'train' / 'val' / 'test', i.e. the split in the YOLO output.
    base_split_for_val: the split folder where the source files (images and
      masks) really live when ``out_split`` is 'val':
        - 'train' if val was carved out of the train images,
        - 'val' if a real val folder exists.
      It is ignored for any other ``out_split``; the source split is then
      ``out_split`` itself.

    The mask keeps the image's sub-folder below the split and swaps the
    ``_leftImg8bit.png`` suffix for ``_gtCoarse_labelTrainIds.png``.
    Raises ValueError (from ``Path.relative_to``) if ``img_path`` is not
    located under IMG_ROOT/<source split>.
    """
    if out_split == "val":
        base_split = base_split_for_val
    else:
        base_split = out_split

    sub_dir = img_path.relative_to(IMG_ROOT / base_split).parent
    mask_name = img_path.name.replace("_leftImg8bit.png", "_gtCoarse_labelTrainIds.png")
    return GT_ROOT.joinpath(base_split, sub_dir, mask_name)

def mask_to_yolo(mask_p: Path, img_p: Path, label_out: Path) -> bool:
    """Convert one ground-truth mask into a YOLO bounding-box label file.

    The mask is loaded as grayscale and the image is loaded only to confirm
    that it is readable. Pixels whose value is in OBSTACLE_VALUES form a
    binary map; each external contour of that map yields one bounding box.
    Boxes narrower than MIN_W or shorter than MIN_H pixels are discarded.
    Remaining boxes are written as
    ``CLS_ID x_center y_center width height``, normalised by the mask size.

    Returns:
        False if the mask or the image could not be read (an empty label
        file is still written); True otherwise, including when no box
        survives the size filter.
    """
    mask = cv2.imread(str(mask_p), cv2.IMREAD_GRAYSCALE)
    img = cv2.imread(str(img_p))

    if any(arr is None for arr in (mask, img)):
        label_write(label_out, [])
        return False

    height, width = mask.shape[:2]
    obstacle_map = np.isin(mask, OBSTACLE_VALUES).astype(np.uint8) * 255
    contours, _hierarchy = cv2.findContours(
        obstacle_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    rows = []
    for left, top, box_w, box_h in map(cv2.boundingRect, contours):
        # Keep only boxes that reach the minimum size on both axes.
        if box_w >= MIN_W and box_h >= MIN_H:
            x_center = (left + box_w / 2) / width
            y_center = (top + box_h / 2) / height
            rel_w = box_w / width
            rel_h = box_h / height
            rows.append(f"{CLS_ID} {x_center:.6f} {y_center:.6f} {rel_w:.6f} {rel_h:.6f}")

    label_write(label_out, rows)
    return True

def convert_list(img_list, out_split: str, base_split_for_val: str):
    """Copy images into the YOLO tree and write one label file per image.

    Args:
        img_list: source image paths to convert (must support ``len``).
        out_split: destination split under OUT_ROOT ('train', 'val', 'test').
        base_split_for_val: source split folder to use when ``out_split`` is
            'val'; forwarded to ``mask_path``.

    For every image:
      - the image is copied to OUT_ROOT/images/<out_split>/ unless a file
        with the same name is already there;
      - if its mask file does not exist, an empty label file is written and
        the image is counted as ``masks_missing``;
      - otherwise ``mask_to_yolo`` writes the label; success is counted as
        ``ok`` and a failure (unreadable mask or image) as ``unreadable``.

    A one-line summary is printed at the end. ``unreadable`` is appended only
    when non-zero, so that ``ok + masks_missing`` not adding up to ``images``
    is never silent.
    """
    out_img_dir = OUT_ROOT / "images" / out_split
    out_lab_dir = OUT_ROOT / "labels" / out_split

    ok = 0
    miss = 0
    unreadable = 0

    for img_p in tqdm(img_list, desc=f"convert {out_split}", total=len(img_list)):
        out_img = out_img_dir / img_p.name
        if not out_img.exists():
            shutil.copy2(img_p, out_img)

        lbl_out = out_lab_dir / f"{img_p.stem}.txt"
        mpath = mask_path(img_p, out_split, base_split_for_val=base_split_for_val)

        if not mpath.exists():
            # No ground truth: keep the image but give it an empty label file.
            miss += 1
            label_write(lbl_out, [])
        elif mask_to_yolo(mpath, img_p, lbl_out):
            ok += 1
        else:
            # mask_to_yolo already wrote an empty label; record the failure
            # so it shows up in the summary instead of vanishing.
            unreadable += 1

    summary = f"[{out_split}] images={len(img_list)} ok={ok} masks_missing={miss}"
    if unreadable:
        summary += f" unreadable={unreadable}"
    print(summary)


# =========================
# MAIN
# =========================
# Clean + recreate output
if OUT_ROOT.exists():
    # WARNING: destructive — deletes everything previously generated under OUT_ROOT.
    shutil.rmtree(OUT_ROOT)
ensure_dirs()

# Seed the global RNG so the shuffle-based train/val split below is reproducible.
random.seed(SEED)

raw_train = list_images("train")
raw_val   = list_images("val")
raw_test  = list_images("test")

print("Found raw:", "train", len(raw_train), "val", len(raw_val), "test", len(raw_test))

# Decide how val is sourced
val_from_train = False

# If no real val folder, create val split from train
if len(raw_val) == 0 and len(raw_train) > 0:
    val_from_train = True
    # Shuffles raw_train in place; the first n_val images become val, the rest train.
    random.shuffle(raw_train)
    # int() truncates, so n_val is 0 (empty val split) when
    # len(raw_train) * VAL_RATIO < 1.
    # NOTE(review): the split is per image. If the source images are consecutive
    # frames of the same sequences, near-duplicates may end up in both train and
    # val — confirm whether a per-sequence split is needed.
    n_val = int(len(raw_train) * VAL_RATIO)
    yolo_val_imgs = raw_train[:n_val]
    yolo_train_imgs = raw_train[n_val:]
    print("Auto split:", "train", len(yolo_train_imgs), "val", len(yolo_val_imgs), "(val from train)")
else:
    # Also reached when both raw_train and raw_val are empty; the message below
    # is printed in that case too.
    yolo_train_imgs = raw_train
    yolo_val_imgs = raw_val
    print("Using existing val folder.")

# Convert train
if yolo_train_imgs:
    convert_list(yolo_train_imgs, "train", base_split_for_val="train")

# Convert val
# If val_from_train=True, images are in leftImg8bit/train so base_split_for_val must be 'train'
base_for_val = "train" if val_from_train else "val"
if yolo_val_imgs:
    convert_list(yolo_val_imgs, "val", base_split_for_val=base_for_val)

# Convert test (labels empty if masks missing)
# base_split_for_val is only consulted by mask_path when out_split == "val",
# so the value passed here has no effect for the test split.
if raw_test:
    convert_list(raw_test, "test", base_split_for_val="train")

# Write data.yaml
# The dataset description lists all three splits relative to OUT_ROOT and a
# single class named CLASS_NAME.
yaml = f"""path: {OUT_ROOT}
train: images/train
val: images/val
test: images/test

nc: 1
names: ["{CLASS_NAME}"]
"""
(OUT_ROOT / "data.yaml").write_text(yaml, encoding="utf-8")
print(" Wrote:", OUT_ROOT / "data.yaml")

# Final counts
# Only train and val are counted here; the test split is not reported.
print("Counts YOLO:")
print("train images:", len(list((OUT_ROOT/'images/train').glob('*.png'))))
print("val images:",   len(list((OUT_ROOT/'images/val').glob('*.png'))))
print("train labels:", len(list((OUT_ROOT/'labels/train').glob('*.txt'))))
print("val labels:",   len(list((OUT_ROOT/'labels/val').glob('*.txt'))))

# Print (do not run) a suggested training command for the generated dataset.
print("\nTrain YOLOv8l:")
print(f"yolo detect train model=yolov8l.pt data={OUT_ROOT/'data.yaml'} epochs=50 imgsz=640 batch=8")


Found raw: train 1036 val 0 test 1203
Auto split: train 933 val 103 (val from train)


convert train: 100%|██████████| 933/933 [01:10<00:00, 13.16it/s]


[train] images=933 ok=933 masks_missing=0


convert val: 100%|██████████| 103/103 [00:08<00:00, 12.15it/s]


[val] images=103 ok=103 masks_missing=0


convert test: 100%|██████████| 1203/1203 [01:14<00:00, 16.21it/s]

[test] images=1203 ok=832 masks_missing=371
✅ Wrote: /home/imadb/Ghiles/lost_and_found/yolo_lostfound/data.yaml
Counts YOLO:
train images: 933
val images: 103
train labels: 933
val labels: 103

Train YOLOv8l:
yolo detect train model=yolov8l.pt data=/home/imadb/Ghiles/lost_and_found/yolo_lostfound/data.yaml epochs=50 imgsz=640 batch=8



