In [None]:
!pip install numpy

In [None]:
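#  Dataset split builder — Colby Bowie, July 2025
#  Creates the Testing, Fixed_Num, and Best_case directory trees
#  exactly as described in Colby Bowie's July-23 design doc.
#
#  Usage: run this cell from the repo root, i.e. the directory that contains
#  the `frames/` export; all output is written under `Data_Frames/`.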


from pathlib import Path
import shutil, math
import numpy as np

# ------------ CONFIG ---------------------------------------------------------
# Input: flat source directory from the Roboflow export; images and labels
# are read from <SRC_ROOT>/train/images and <SRC_ROOT>/train/labels.
SRC_ROOT = Path("frames")
IMG_DIR = SRC_ROOT / "train" / "images"
LBL_DIR = SRC_ROOT / "train" / "labels"

# Output: every generated split is written below this new root.
DEST_ROOT = Path("Data_Frames")

# Split fractions.
TEST_FRACTION = 0.10   # 10 %  -> every 10th frame
TRAIN_FRAC_S = 0.70    # single-light (Fixed_Num) train share
VAL_FRAC_S = 0.20      # single-light (Fixed_Num) validation share

# Filename keyword -> short tag used in output directory names.
# Keywords are tried in this insertion order when classifying a frame.
LIGHT_KEYS = {
    "daylight": "day",
    "shoplight": "shop",
    "led": "led",
}

# Seeds numpy's global RNG for reproducibility of any numpy shuffling.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# -----------------------------------------------------------------------------
def even_sample(items, k):
    """Return up to ``k`` elements of ``items``, evenly spaced and deterministic.

    Positions come from ``np.linspace(0, len(items) - 1, k, endpoint=False)``
    rounded to the nearest index, so the first element is always included
    and the selection depends only on ``len(items)`` and ``k``.

    Args:
        items: Indexable sequence to sample from.
        k: Number of elements requested.

    Returns:
        A new list of elements (not indices). It is empty when ``k <= 0`` or
        ``items`` is empty, and a copy of all of ``items`` when
        ``k >= len(items)``.
    """
    n = len(items)
    if k <= 0 or n == 0:
        return []
    if k >= n:
        # The linspace step would drop below 1 here, so rounded positions
        # would collide and the same element would be returned twice.
        return list(items)
    positions = np.linspace(0, n - 1, k, endpoint=False)
    return [items[int(round(pos))] for pos in positions]

def copy_pair(img_path, dst_img, dst_lbl):
    """Copy an image and its same-stem ``.txt`` label to their destinations.

    The label is looked up as ``LBL_DIR / f"{img_path.stem}.txt"``. Its
    presence is verified before anything is written, so a missing label
    does not leave an image without a label in the destination tree.

    Args:
        img_path: Path of the source image.
        dst_img: Destination path for the image file.
        dst_lbl: Destination path for the label file.

    Raises:
        FileNotFoundError: If the label file for ``img_path`` does not exist.
    """
    src_lbl = LBL_DIR / f"{img_path.stem}.txt"
    if not src_lbl.is_file():
        raise FileNotFoundError(
            f"Label file not found for {img_path.name}: {src_lbl}"
        )

    # Destination folders are created on demand, parents included.
    dst_img.parent.mkdir(parents=True, exist_ok=True)
    dst_lbl.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(img_path, dst_img)
    shutil.copy2(src_lbl, dst_lbl)

# ------------ Build per-lighting master lists --------------------------------
# lighting_map[<filename keyword>] -> sorted list of image Paths,
# e.g. lighting_map["daylight"] = [Path(...), …]
lighting_map = {keyword: [] for keyword in LIGHT_KEYS}

for frame in sorted(IMG_DIR.iterdir()):
    lowered = frame.name.lower()
    # The first keyword (in LIGHT_KEYS order) contained in the lowercased
    # file name decides which list the frame goes into.
    matched = next((kw for kw in LIGHT_KEYS if kw in lowered), None)
    if matched is None:
        raise ValueError(f"No lighting keyword found in {frame.name}")
    lighting_map[matched].append(frame)

frame_counts = {LIGHT_KEYS[kw]: len(frames) for kw, frames in lighting_map.items()}
print(" Frames per lighting:", frame_counts)

# ------------ STEP 1: create TEST sets --------------------------------------
# The stride is derived from TEST_FRACTION so the configured value is actually
# honoured (0.10 -> every 10th frame, i.e. indices 9, 19, …).
if not 0 < TEST_FRACTION <= 1:
    raise ValueError(f"TEST_FRACTION must be in (0, 1], got {TEST_FRACTION}")
test_stride = round(1 / TEST_FRACTION)

for key, imgs in lighting_map.items():
    short = LIGHT_KEYS[key]            # "day", "shop", or "led"
    test_imgs = imgs[test_stride - 1::test_stride]
    for img in test_imgs:
        dst_img = DEST_ROOT / "Testing" / f"{short}_test/images" / img.name
        dst_lbl = DEST_ROOT / "Testing" / f"{short}_test/labels" / f"{img.stem}.txt"
        copy_pair(img, dst_img, dst_lbl)
    # Remove the test frames from the pool by position (every test_stride-th
    # entry) instead of scanning the test list once per frame.
    lighting_map[key] = [im for pos, im in enumerate(imgs, start=1)
                         if pos % test_stride]
    print(f"✓  {short}: {len(test_imgs)} test frames")

# helper: quick counts after test removal
N_per_light = {k: len(v) for k, v in lighting_map.items()}
print("Frames remaining (after test removal):", N_per_light)

# ---------------------------------------------------------------------------
#  PART A — FIXED_Num (equal total images across model types)
# ---------------------------------------------------------------------------
# Both targets are taken from the daylight pool left after test removal.
daylight_pool_size = N_per_light["daylight"]
fixed_train_single = math.floor(daylight_pool_size * TRAIN_FRAC_S)
fixed_val_single = math.floor(daylight_pool_size * VAL_FRAC_S)

train_pct = TRAIN_FRAC_S * 100
val_pct = VAL_FRAC_S * 100
print(f"\nFixed_Num target: {fixed_train_single} train + "
      f"{fixed_val_single} val  (= {train_pct:.0f}% / {val_pct:.0f}%)\n")

def build_fixed_subset(lights_in_model, tag):
    """Create the Fixed_Num train/val splits for one model tag (e.g. 'day_shop').

    The module-level totals ``fixed_train_single`` and ``fixed_val_single``
    are floor-divided equally among the lights in the model. For each light,
    train frames are sampled evenly from its pool in ``lighting_map``, and
    validation frames are then sampled evenly from what is left.

    Args:
        lights_in_model: Keys of ``lighting_map`` that the model includes.
        tag: Directory name of the model under ``DEST_ROOT / "Fixed_Num"``.

    Raises:
        ZeroDivisionError: If ``lights_in_model`` is empty.
        KeyError: If a light key is not present in ``lighting_map``.
    """
    m = len(lights_in_model)
    train_per_light = math.floor(fixed_train_single / m)
    val_per_light = math.floor(fixed_val_single / m)
    model_root = DEST_ROOT / "Fixed_Num" / tag

    for light_key in lights_in_model:
        imgs = lighting_map[light_key]
        # deterministic order → evenly spaced sampling
        train_imgs = even_sample(imgs, train_per_light)
        # A set makes the exclusion test O(1) per frame instead of a list scan.
        train_lookup = set(train_imgs)
        rest = [im for im in imgs if im not in train_lookup]
        val_imgs = even_sample(rest, val_per_light)

        for phase, subset in (("train", train_imgs), ("val", val_imgs)):
            for img in subset:
                dst_img = model_root / f"{phase}/images" / img.name
                dst_lbl = model_root / f"{phase}/labels" / f"{img.stem}.txt"
                copy_pair(img, dst_img, dst_lbl)

# One Fixed_Num dataset per lighting combination, built in this order:
# single-lighting, then two-lighting, then three-lighting.
fixed_num_models = [
    (["daylight"], "day_only"),
    (["shoplight"], "shop_only"),
    (["led"], "led_only"),
    (["daylight", "shoplight"], "day_shop"),
    (["daylight", "led"], "day_led"),
    (["shoplight", "led"], "shop_led"),
    (list(LIGHT_KEYS.keys()), "all_lighting"),
]
for model_lights, model_tag in fixed_num_models:
    build_fixed_subset(model_lights, model_tag)

print("Fixed_Num directory built.")

# ---------------------------------------------------------------------------
#  PART B — Best_case (use full 70 % / 20 % of any light the model includes)
# ---------------------------------------------------------------------------
def build_best_subset(lights_in_model, tag):
    """Create the Best_case train/val splits for one model tag.

    For every light in the model, ``TRAIN_FRAC_S`` of that light's pool in
    ``lighting_map`` is sampled evenly for training, and ``VAL_FRAC_S`` of
    the pool size is then sampled evenly from the remaining frames for
    validation. Both counts are rounded down.

    Args:
        lights_in_model: Keys of ``lighting_map`` that the model includes.
        tag: Directory name of the model under ``DEST_ROOT / "Best_case"``.

    Raises:
        KeyError: If a light key is not present in ``lighting_map``.
    """
    model_root = DEST_ROOT / "Best_case" / tag

    for light_key in lights_in_model:
        imgs = lighting_map[light_key]

        n_train = math.floor(len(imgs) * TRAIN_FRAC_S)
        n_val = math.floor(len(imgs) * VAL_FRAC_S)

        train_imgs = even_sample(imgs, n_train)
        # A set makes the exclusion test O(1) per frame instead of a list scan.
        train_lookup = set(train_imgs)
        rest = [im for im in imgs if im not in train_lookup]
        val_imgs = even_sample(rest, n_val)

        for phase, subset in (("train", train_imgs), ("val", val_imgs)):
            for img in subset:
                dst_img = model_root / f"{phase}/images" / img.name
                dst_lbl = model_root / f"{phase}/labels" / f"{img.stem}.txt"
                copy_pair(img, dst_img, dst_lbl)

# Best_case datasets, keyed by output tag; dict order is the build order:
# single-lighting, then two-lighting, then three-lighting.
best_case_models = {
    "day": ["daylight"],
    "shop": ["shoplight"],
    "led": ["led"],
    "day_shop": ["daylight", "shoplight"],
    "day_led": ["daylight", "led"],
    "shop_led": ["shoplight", "led"],
    "all_lighting": list(LIGHT_KEYS.keys()),
}
for best_tag, best_lights in best_case_models.items():
    build_best_subset(best_lights, best_tag)

print("Best_case directory built.")
print("\nAll splits created under", DEST_ROOT.resolve())


🗂️  Frames per lighting: {'day': 1867, 'shop': 1867, 'led': 1867}
✓  day: 186 test frames
✓  shop: 186 test frames
✓  led: 186 test frames
Frames remaining (after test removal): {'daylight': 1681, 'shoplight': 1681, 'led': 1681}

🔧 Fixed_Num target: 1176 train + 336 val  (= 70% / 20%)

✅  Fixed_Num directory built.
✅  Best_case directory built.

🎉  All splits created under C:\Users\11bow\Documents\Chinnok summer research\Data_Frames
