In [None]:
# Use q to exit, and a / d for the previous / next filter (the first filter doesn't show when exiting recording).

#Check camera settings

# Adjusting Parameters:
# Canny Edge Detector: You can adjust the threshold values (100, 200) in cv2.Canny to fine-tune edge detection.

# Level 6: adjust the match threshold (thr = 0.6); the matching method is set via method = cv2.TM_CCOEFF_NORMED.
# Level 7: to look for objects other than hands, or to filter the detections, call model.set_classes(["hands"]) with your own class list.


In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os, cv2, time, torch, numpy as np, functools, unicodedata, tkinter as tk
from tkinter import simpledialog, messagebox
from dataclasses import dataclass
from collections import defaultdict
from ultralytics import YOLO, YOLOWorld

# ---------- Small runtime niceties ----------
# setdefault() only writes the variable when the user has not already set it.
# NOTE(review): this runs after `import cv2`; confirm OpenCV reads the variable
# lazily (when a capture is opened) rather than at import time.
os.environ.setdefault("OPENCV_VIDEOIO_PRIORITY_MSMF", "0")  # helps some webcams on Windows
# Process-wide PyTorch flag; see the torch.backends.cudnn documentation for its effect.
torch.backends.cudnn.benchmark = True

# ---------- Colors ----------
# Fixed palette of 10 colour triples; get_color_map() cycles through it to give
# each template name its own box colour.
# NOTE(review): the triples are handed straight to cv2 drawing calls, so they
# are presumably in BGR order - confirm.
COLOR_PALETTE = [
    (255, 0, 0), (0, 255, 0), (0, 0, 255),
    (255, 255, 0), (255, 0, 255), (0, 255, 255),
    (128, 0, 0), (0, 128, 0), (0, 0, 128), (128, 128, 0)
]


# Title of the single OpenCV window; the window helpers below address the
# window by this name.
WIN_NAME = 'Visualization - Press q to quit'

def init_window(mode='maximize'):
    """Open the shared display window and apply the requested initial layout.

    Args:
        mode: 'fullscreen' switches the window to fullscreen, 'maximize'
            stretches it over the primary screen; any other value leaves the
            window at its default size.
    """
    window_flags = cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO | cv2.WINDOW_GUI_EXPANDED
    cv2.namedWindow(WIN_NAME, window_flags)
    if mode == 'maximize':
        maximize_window()
        return
    if mode == 'fullscreen':
        cv2.setWindowProperty(WIN_NAME, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

def _screen_size():
    """Best-effort primary screen size (fallback to 1920x1080)."""
    try:
        import tkinter as tk
        r = tk.Tk(); r.withdraw()
        w, h = r.winfo_screenwidth(), r.winfo_screenheight()
        r.destroy()
        return int(w), int(h)
    except Exception:
        return 1920, 1080

def maximize_window():
    """Stretch the display window over the primary screen, anchored at (0, 0)."""
    screen = _screen_size()
    cv2.resizeWindow(WIN_NAME, *screen)
    cv2.moveWindow(WIN_NAME, 0, 0)
    try:
        # Ask the window to keep its aspect ratio when the user drags it.
        cv2.setWindowProperty(WIN_NAME, cv2.WND_PROP_ASPECT_RATIO, cv2.WINDOW_KEEPRATIO)
    except Exception:
        # Some OpenCV builds do not support this property; ignoring is fine.
        pass

def fit_window_to_frame(frame):
    """Resize the display window to match the frame's pixel dimensions."""
    rows, cols = frame.shape[:2]
    target_w = int(cols)
    target_h = int(rows)
    cv2.resizeWindow(WIN_NAME, target_w, target_h)

def toggle_fullscreen():
    """Flip the display window between fullscreen and normal mode.

    If the fullscreen property cannot be read or written, the window is
    maximized instead.
    """
    try:
        current = int(cv2.getWindowProperty(WIN_NAME, cv2.WND_PROP_FULLSCREEN))
        if current == int(cv2.WINDOW_FULLSCREEN):
            target = cv2.WINDOW_NORMAL
        else:
            target = cv2.WINDOW_FULLSCREEN
        cv2.setWindowProperty(WIN_NAME, cv2.WND_PROP_FULLSCREEN, target)
    except Exception:
        # Fullscreen not supported on this build: fall back to maximizing.
        maximize_window()


# ---------- Text sanitizer + safe put_text ----------
def _sanitize_ascii(s: str) -> str:
    if not isinstance(s, str):
        s = str(s)
    s = (s.replace("“", '"').replace("”", '"')
           .replace("‘", "'").replace("’", "'")
           .replace("–", "-").replace("—", "-")
           .replace("…", "..."))
    s = ''.join(c for c in unicodedata.normalize('NFKD', s) if ord(c) < 128)
    return s

def put_text(img, text, org, font_scale=0.8, color=(255,255,255),
             thickness=2, line_type=cv2.LINE_AA, font=cv2.FONT_HERSHEY_SIMPLEX):
    """Draw `text` on `img` in place after reducing it to ASCII.

    The text goes through _sanitize_ascii() before being handed to
    cv2.putText with the given origin, font, scale, color, thickness and
    line type.
    """
    ascii_text = _sanitize_ascii(text)
    cv2.putText(img, ascii_text, org, font, font_scale, color, thickness, line_type)

# ---------- Language dictionary ----------
# Translation table, looked up through T().
#   LANG[<language code>]["levels"] - list of level titles; the list index is
#                                     the level number noted next to each entry.
#   LANG[<language code>]["ui"]     - captions, hints and popup labels by key.
# T() falls back to the "en" entries, so "en" should stay the complete set.
LANG = {
    "en": {
        "levels": [
            "Preview (idle)",                 # 0
            "Recording",                      # 1
            "Step 1: RGB numbers vs pixels",  # 2
            "Step 2: Convolution sweep",      # 3
            "Step 3: Edges (white)",          # 4
            "Step 4: Color quantization",     # 5
            "Step 5: Edges + quantization",   # 6
            "Step 6: Template matching",      # 7
            "Step 7: YOLO-World detection",   # 8
            "Step 8: Person tracking",        # 9
            "Step 9: Pose estimation",        #10
            "Step 10: Left vs Right people",  #11
            "Step 11: Live YOLO-World (classes.txt)" #12 (real-time)
        ],
        "ui": {
            "record_caption": "Recording…",
            "footer": "q: quit • A: previous • D: next • P: toggle autoplay • Space: pause",
            "idle_hint": "Preview — press D to start recording",
            "popup_title": "Demo Settings",
            "popup_autoplay": "Enable autoplay?",
            "popup_secs": "Seconds per step:",
            "popup_lang": "Language (en/nl):",
            "left": "Left",
            "right": "Right"
        }
    },
    # Dutch: same order and same keys as the English block above.
    "nl": {
        "levels": [
            "Voorbeeld (idle)",
            "Opnemen",
            "Stap 1: RGB-getallen vs pixels",
            "Stap 2: Convolutie-scan",
            "Stap 3: Randen (wit)",
            "Stap 4: Kleurquantisatie",
            "Stap 5: Randen + quantisatie",
            "Stap 6: Template matching",
            "Stap 7: YOLO-World detectie",
            "Stap 8: Persoon-tracking",
            "Stap 9: Pose schatting",
            "Stap 10: Links vs Rechts personen",
            "Stap 11: Live YOLO-World (classes.txt)"
        ],
        "ui": {
            "record_caption": "Opname…",
            "footer": "q: stop • A: vorige • D: volgende • P: autoplay • Spatie: pauze",
            "idle_hint": "Voorbeeld — druk D om opname te starten",
            "popup_title": "Instellingen",
            "popup_autoplay": "Autoplay inschakelen?",
            "popup_secs": "Seconden per stap:",
            "popup_lang": "Taal (en/nl):",
            "left": "Links",
            "right": "Rechts"
        }
    },
}

def T(lang: str, group: str, key: str = None, idx: int = None):
    """Look up a translated string, falling back to English.

    Args:
        lang: language code; unknown codes use the English table.
        group: "levels" for level titles, anything else for UI strings.
        key: UI string key (used when group is not "levels").
        idx: level index (used when group is "levels"); None returns the
            whole list of level titles for the language.

    Returns:
        The translated string, or the full list of level titles.

    Raises:
        IndexError: idx is out of range for the English level list as well.
        KeyError: key exists neither in the selected language nor in English.
    """
    english = LANG["en"]
    table = LANG.get(lang, english)
    if group == "levels":
        labels = table["levels"]
        if idx is None:
            return labels
        if idx >= len(labels):
            # Translation is shorter than the English list: use English.
            labels = english["levels"]
        return labels[idx]
    ui = table["ui"]
    if key in ui:
        return ui[key]
    # Resolve the English fallback only when it is needed, so a key that exists
    # only in the selected language no longer raises KeyError.
    return english["ui"][key]

# ---------- Config + popup ----------
@dataclass
class Config:
    """Demo settings; prompt_user_settings() copies and edits an instance."""
    language: str = "en"          # key into LANG; the popup only accepts codes present in LANG
    autoplay: bool = False        # set to True when the user answers "yes" in the popup
    seconds_per_step: int = 10    # popup restricts this to 3..120 (presumably the autoplay interval - confirm against the main loop)

def prompt_user_settings(defaults: Config) -> Config:
    """Ask the user for language / autoplay / step duration via Tk dialogs.

    Args:
        defaults: starting values; this object is not modified.

    Returns:
        A new Config. Any dialog failure (for example no display) is reported
        with a printed warning and the values gathered so far are returned.
    """
    cfg = Config(**defaults.__dict__)
    # Stays None when Tk itself cannot start, so cleanup below is explicit
    # instead of relying on a swallowed NameError.
    root = None
    try:
        root = tk.Tk()
        root.withdraw()
        lang = simpledialog.askstring(
            T(cfg.language, "ui", "popup_title"),
            T(cfg.language, "ui", "popup_lang"),
            initialvalue=cfg.language, parent=root
        )
        if lang:
            lang = lang.strip().lower()
            if lang in LANG:
                cfg.language = lang

        # Store the answer either way: answering "No" must also switch off an
        # autoplay=True default.
        cfg.autoplay = bool(messagebox.askyesno(
            T(cfg.language, "ui", "popup_title"),
            T(cfg.language, "ui", "popup_autoplay"),
            parent=root
        ))
        if cfg.autoplay:
            secs = simpledialog.askinteger(
                T(cfg.language, "ui", "popup_title"),
                T(cfg.language, "ui", "popup_secs"),
                initialvalue=cfg.seconds_per_step, minvalue=3, maxvalue=120, parent=root
            )
            # None means the dialog was cancelled: keep the default duration.
            if secs:
                cfg.seconds_per_step = int(secs)
    except Exception as e:
        print(f"[warn] settings popup failed ({e}); using defaults.")
    finally:
        if root is not None:
            try:
                root.destroy()
            except Exception:
                pass
    return cfg

# ---------- Pause controller ----------
class PauseController:
    """Tracks paused intervals so elapsed time can ignore pauses.

    `paused` is the current state, `_t_start` the monotonic timestamp at which
    the running pause began (None while not paused) and `_cum` the total
    seconds spent in completed pauses.
    """

    def __init__(self):
        self.paused = False
        self._t_start = None
        self._cum = 0.0

    def toggle(self):
        """Switch between paused and running, accumulating finished pauses."""
        now = time.monotonic()
        if self.paused:
            # `is not None` rather than truthiness: 0.0 is a valid timestamp.
            if self._t_start is not None:
                self._cum += now - self._t_start
            self._t_start = None
            self.paused = False
        else:
            self._t_start = now
            self.paused = True

    def mark(self) -> float:
        """Return total paused seconds so far, including a pause in progress."""
        if self.paused and self._t_start is not None:
            return self._cum + (time.monotonic() - self._t_start)
        return self._cum

    def reset(self):
        """Return to the initial running state with no accumulated pause."""
        self.paused = False
        self._t_start = None
        self._cum = 0.0

def _draw_footer(img, footer_text, paused=False):
    """Return a copy of `img` with a darkened footer strip and help text.

    A 70-pixel black band at the bottom is blended over the image (60% band,
    40% original) and `footer_text` is written into it. When `paused` is
    true, a black "PAUSED" badge is drawn in the top-left corner as well.
    The input image itself is left untouched.
    """
    height, width = img.shape[:2]
    band = img.copy()
    cv2.rectangle(band, (0, height - 70), (width, height), (0, 0, 0), -1)
    blended = cv2.addWeighted(band, 0.6, img, 0.4, 0)
    put_text(blended, footer_text, (10, height - 15),
             font_scale=0.6, color=(200,200,200), thickness=1)
    if not paused:
        return blended
    cv2.rectangle(blended, (8, 8), (268, 52), (0,0,0), -1)
    put_text(blended, "PAUSED", (18, 40), font_scale=0.9, color=(255,255,255), thickness=2)
    return blended

# ---------- Model + camera ----------
def initialize_model():
    """Load the YOLO-World model, set its class list and pick a device.

    Returns:
        The YOLOWorld model, moved to CUDA when a GPU is available.
    """
    model = YOLOWorld("yolov8s-worldv2.pt")
    default_classes = ["person", "hand", "face", "bottle", "cell phone", "backpack", "book", "chair"]
    try:
        model.set_classes(default_classes)
    except Exception as e:
        # Still best-effort, but no longer silent: a failure here changes what
        # the detector looks for, so make it visible.
        print(f"[warn] set_classes failed ({e}); continuing without the custom class list.")
    if torch.cuda.is_available():
        model = model.to('cuda')
        print(f"Using GPU ({torch.cuda.get_device_name(0)}) for inference.")
    else:
        print("GPU not available; using CPU.")
    return model

def initialize_camera():
    """Open the first working camera among indices 0-4 and request 1920x1200.

    Returns:
        (cap, width, height): the opened cv2.VideoCapture and the resolution
        the driver actually reports after the request.

    Raises:
        RuntimeError: none of the five indices could be opened.
    """
    # DirectShow exists only on Windows; elsewhere let OpenCV pick a backend.
    backend = cv2.CAP_DSHOW if os.name == "nt" else cv2.CAP_ANY
    cap = None
    for index in range(5):
        candidate = cv2.VideoCapture(index, backend)
        if candidate.isOpened():
            cap = candidate
            print(f"Camera {index} opened successfully.")
            break
        # Release failed probes instead of leaving them to the garbage collector.
        candidate.release()
        print(f"Camera {index} not found.")
    if cap is None:
        raise RuntimeError("No camera found")

    desired_w, desired_h = 1920, 1200
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, desired_w)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, desired_h)
    # Read the values back: the driver may silently choose another mode.
    fw, fh = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    if (fw, fh) != (desired_w, desired_h):
        print(f"Desired {desired_w}x{desired_h} not supported. Using {fw}x{fh}.")
    else:
        print(f"Camera initialized with resolution: {fw}x{fh}")
    return cap, fw, fh

# ---------- Level 1 helpers (better numbers stacking) ----------
def _compute_text_metrics_for_block(block_size: int):
    """Pick font settings so three stacked "000" lines fit inside one block.

    Starts from a scale proportional to the block size and shrinks it by 10%
    up to three times while the three lines plus padding do not fit.

    Returns:
        (font_scale, line_step, pad, thickness, font)
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    thickness = 1
    pad = max(2, int(block_size * 0.15))
    usable_height = block_size - 2 * pad
    font_scale = max(0.35, block_size / 70.0)
    attempts = 0
    while attempts < 3:
        text_size, _baseline = cv2.getTextSize("000", font, font_scale, thickness)
        line_step = text_size[1] + 2
        if 3 * line_step + 2 <= usable_height:
            break
        font_scale *= 0.9
        attempts += 1
    return font_scale, line_step, pad, thickness, font

def visualize_rgb_region(frame, regions, block_size=25):
    """Replace each region with a black panel showing per-block RGB numbers.

    Args:
        frame: image indexed as [row, column, channel]; channels 0/1/2 are
            read as B/G/R.
        regions: iterable of (x, y, width, height) rectangles; each is clipped
            to the frame and skipped when empty.
        block_size: side length of the square blocks that are averaged.

    Returns:
        (image copy with the regions replaced, explanation string)
    """
    result = frame.copy()
    font_scale, line_step, pad, thickness, _font = _compute_text_metrics_for_block(block_size)
    frame_h, frame_w = frame.shape[0], frame.shape[1]
    for x0, y0, region_w, region_h in regions:
        x1 = min(x0 + region_w, frame_w)
        y1 = min(y0 + region_h, frame_h)
        roi = frame[y0:y1, x0:x1]
        if roi.size == 0:
            continue
        panel = np.zeros_like(roi)
        roi_h, roi_w = roi.shape[:2]
        for top in range(0, roi_h, block_size):
            bottom = min(top + block_size, roi_h)
            for left in range(0, roi_w, block_size):
                right = min(left + block_size, roi_w)
                block = roi[top:bottom, left:right]
                if block.size == 0:
                    continue
                mean = block.mean(axis=(0, 1)).astype(int)
                color = (int(mean[0]), int(mean[1]), int(mean[2]))  # BGR
                # Three stacked lines: "[RRR", " GGG", " BBB]".
                lines = (f"[{mean[2]:03d}", f" {mean[1]:03d}", f" {mean[0]:03d}]")
                for row, text in enumerate(lines):
                    put_text(panel, text, (left + pad, top + pad + row * line_step),
                             font_scale, color, thickness, font=cv2.FONT_HERSHEY_SIMPLEX)
        result[y0:y1, x0:x1] = panel
    return result, "Left: Pixelated. Right: numbers show RGB per block."

def visualize_rgb_region_black_background(frame, regions, block_size=25):
    """Black out each region and print the mean [R G B] of every block in it.

    Args:
        frame: image indexed as [row, column, channel]; channels 0/1/2 are
            read as B/G/R.
        regions: iterable of (x, y, width, height) rectangles; each is clipped
            to the frame and skipped when empty.
        block_size: side length of the square blocks that are averaged.

    Returns:
        (image copy with the regions replaced, explanation string)
    """
    result = frame.copy()
    font_scale, line_step, pad, thickness, _font = _compute_text_metrics_for_block(block_size)
    frame_h, frame_w = frame.shape[0], frame.shape[1]
    for x0, y0, region_w, region_h in regions:
        x1 = min(x0 + region_w, frame_w)
        y1 = min(y0 + region_h, frame_h)
        roi = frame[y0:y1, x0:x1]
        if roi.size == 0:
            continue
        canvas = np.zeros_like(roi)  # black background
        roi_h, roi_w = roi.shape[:2]
        for top in range(0, roi_h, block_size):
            bottom = min(top + block_size, roi_h)
            for left in range(0, roi_w, block_size):
                right = min(left + block_size, roi_w)
                block = roi[top:bottom, left:right]
                if block.size == 0:
                    continue
                mean = block.mean(axis=(0, 1)).astype(int)
                color = (int(mean[0]), int(mean[1]), int(mean[2]))  # BGR
                # Three stacked lines: "[RRR", " GGG", " BBB]".
                lines = (f"[{mean[2]:03d}", f" {mean[1]:03d}", f" {mean[0]:03d}]")
                for row, text in enumerate(lines):
                    put_text(canvas, text, (left + pad, top + pad + row * line_step),
                             font_scale, color, thickness)
        result[y0:y1, x0:x1] = canvas
    return result, "A kernel looks at neighbors to update pixels."

def pixelate_image(frame, block_size=25):
    """Pixelate by shrinking the frame and blowing it back up.

    The frame is downscaled by `block_size` (at least 1x1, linear
    interpolation) and then resized to its original size with
    nearest-neighbour interpolation.
    """
    height, width = frame.shape[:2]
    small_size = (max(1, width // block_size), max(1, height // block_size))
    shrunk = cv2.resize(frame, small_size, interpolation=cv2.INTER_LINEAR)
    return cv2.resize(shrunk, (width, height), interpolation=cv2.INTER_NEAREST)

def visualize_level_1(frame, block_size=25):
    """Level 1 view: pixelated left half, per-block RGB numbers on the right.

    Returns:
        (combined image, explanation string)
    """
    height, width = frame.shape[:2]
    split_x = width // 2
    combined = pixelate_image(frame, block_size).copy()
    right_region = [(split_x, 0, width - split_x, height)]
    numbers_view = visualize_rgb_region(frame, right_region, block_size)[0]
    combined[:, split_x:] = numbers_view[:, split_x:]
    return combined, "Left: Pixelated. Right: numbers show RGB values per block."

# ---------- Level 2 (diagonal sweep) ----------
def enhance_saturation(frame, sat_thr=100, sat_mul=1.5):
    """Boost strongly saturated pixels and desaturate the rest.

    In HSV space, pixels whose saturation exceeds `sat_thr` get it multiplied
    by `sat_mul` (capped at 255); every other pixel's saturation is set to 0.
    Returns the result converted back to BGR.
    """
    hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV).astype(float)
    saturation = hsv[..., 1]
    vivid = saturation > sat_thr
    boosted = saturation * sat_mul
    boosted[boosted > 255] = 255
    hsv[..., 1] = np.where(vivid, boosted, 0)
    return cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)

# NOTE: visualize_rgb_region_black_background is defined once, in the
# "Level 1 helpers" section above. An identical second definition used to live
# here and silently shadowed the first one; it was removed so the helper used
# by visualize_level_2 has a single definition.

def visualize_level_2(frame, filtered_frame, processed_mask, kernel_position, kernel_size=500, block_size=25):
    """Level 2 view: a 3x3 kernel window sweeping over the frame.

    The centre cell of the window is marked as processed in `processed_mask`
    (modified in place); processed pixels are shown from `filtered_frame`.
    A green 3x3 grid is drawn, and the eight outer cells are replaced by
    per-block RGB numbers on black.

    Args:
        frame: current image.
        filtered_frame: image to reveal where the mask is 255.
        processed_mask: 2-D mask with 255 marking already processed pixels.
        kernel_position: (x, y) of the window's top-left corner.
        kernel_size: side length of the window in pixels.
        block_size: block size for the RGB numbers.

    Returns:
        (image, processed_mask, explanation string)
    """
    frame_h, frame_w = frame.shape[:2]
    x, y = kernel_position
    x_end = min(x + kernel_size, frame_w)
    y_end = min(y + kernel_size, frame_h)
    third = kernel_size // 3

    # Mark the centre cell (clipped to the frame) as processed.
    centre_rows = slice(max(0, y + third), min(frame_h, y + 2 * third))
    centre_cols = slice(max(0, x + third), min(frame_w, x + 2 * third))
    processed_mask[centre_rows, centre_cols] = 255

    out = frame.copy()
    done = processed_mask == 255
    out[done] = filtered_frame[done]

    green = (0, 255, 0)
    for grid_x in (x + third, x + 2 * third):
        cv2.line(out, (grid_x, y), (grid_x, y_end), green, 2)
    for grid_y in (y + third, y + 2 * third):
        cv2.line(out, (x, grid_y), (x_end, grid_y), green, 2)
    cv2.rectangle(out, (x, y), (x_end, y_end), green, 2)

    # Every cell of the 3x3 grid except the centre one.
    regions = [
        (x + col * third, y + row * third, third, third)
        for row in range(3)
        for col in range(3)
        if not (row == 1 and col == 1)
    ]
    out, _ = visualize_rgb_region_black_background(frame=out, regions=regions, block_size=block_size)
    return out, processed_mask, "Convolutions: the kernel moves; the center is updated from its neighbors."

# ---------- Level 3 (white edges) ----------
def visualize_level_3(frame, low_threshold=100, high_threshold=200):
    """Level 3 view: white Canny edges on a black background.

    Args:
        frame: BGR image.
        low_threshold: lower hysteresis threshold passed to cv2.Canny.
        high_threshold: upper hysteresis threshold passed to cv2.Canny.
            The defaults reproduce the previously hard-coded (100, 200).

    Returns:
        (edge image with the same shape as `frame`, explanation string)
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, low_threshold, high_threshold)
    edge_rgb = np.zeros_like(frame)
    edge_rgb[edges > 0] = (255, 255, 255)
    return edge_rgb, "Edge detection highlights object boundaries."

# ---------- Level 4 (quantization, denoise + temporal smoothing) ----------
@functools.lru_cache(maxsize=None)
def _palette_cache():
    """Build the fixed 8-colour palette once and cache it.

    Returns:
        (palette_bgr, palette_lab_T, palette_lab_sq_norm): the uint8 palette,
        its LAB conversion transposed to shape (3, 8) as float32, and the
        squared LAB norm of each palette colour.
    """
    colors = (
        (255,   0,   0), (  0, 255,   0), (  0,   0, 255),
        (255, 255,   0), (255,   0, 255), (  0, 255, 255),
        (255, 165,   0), (128,   0, 128),
    )
    palette_bgr = np.array(colors, dtype=np.uint8)
    # cvtColor wants an image, so view the palette as an 8x1 picture.
    as_image = palette_bgr.reshape(-1, 1, 3)
    pal_lab = cv2.cvtColor(as_image, cv2.COLOR_BGR2LAB).reshape(-1, 3).astype(np.float32)
    transposed = pal_lab.T.copy()
    squared_norms = (pal_lab * pal_lab).sum(axis=1)
    return palette_bgr, transposed, squared_norms

def visualize_level_4(frame, scale=0.5, brighten=1.2, bias=30):
    """Level 4 view: snap every pixel to the nearest colour of a fixed palette.

    Steps: optionally downscale (only when 0 < scale < 1), find the nearest
    palette colour in LAB space, upscale back with nearest-neighbour, then
    apply `value * brighten + bias` clipped to 0..255.

    Returns:
        (quantized uint8 image, explanation string)
    """
    palette_bgr, pal_lab_T, pal_lab_norm = _palette_cache()

    downscaled = 0 < scale < 1.0
    if downscaled:
        work = cv2.resize(frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
    else:
        work = frame

    pixels_lab = cv2.cvtColor(work, cv2.COLOR_BGR2LAB).astype(np.float32).reshape(-1, 3)
    pixel_norm = (pixels_lab * pixels_lab).sum(axis=1)

    # |x - p|^2 = |x|^2 + |p|^2 - 2 x.p, evaluated for all pixel/palette pairs.
    dist_sq = pixel_norm[:, None] + pal_lab_norm[None, :] - 2.0 * (pixels_lab @ pal_lab_T)
    nearest = np.argmin(dist_sq, axis=1).astype(np.int32)
    quantized = palette_bgr[nearest].reshape(work.shape)

    if downscaled:
        full_size = (frame.shape[1], frame.shape[0])
        quantized = cv2.resize(quantized, full_size, interpolation=cv2.INTER_NEAREST)

    adjusted = quantized.astype(np.float32) * float(brighten) + float(bias)
    result = np.clip(adjusted, 0, 255).astype(np.uint8)
    return result, "Each pixel is replaced by the closest color in a fixed palette (computed once)."

# ---------- Level 5 (white edges over quant) ----------
def visualize_level_5(frame, low_threshold=100, high_threshold=200):
    """Level 5 view: white, slightly thickened edges over the quantized image.

    Args:
        frame: BGR image.
        low_threshold: lower hysteresis threshold passed to cv2.Canny.
        high_threshold: upper hysteresis threshold passed to cv2.Canny.
            The defaults reproduce the previously hard-coded (100, 200).

    Returns:
        (fused image, explanation string)
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, low_threshold, high_threshold)
    # One 3x3 dilation thickens the edges so they stay visible on flat colours.
    edges = cv2.dilate(edges, np.ones((3,3), np.uint8), iterations=1)
    quantized, _ = visualize_level_4(frame)
    fused = quantized.copy()
    fused[edges > 0] = 255  # paint edge pixels white in all channels
    return fused, "Combining white edges with color groups makes boundaries pop."

# ---------- Level 6 (template matching) ----------
def get_color_map(names):
    """Map each name to a COLOR_PALETTE entry, cycling when names outnumber colours."""
    color_map = {}
    for position, name in enumerate(names):
        color_map[name] = COLOR_PALETTE[position % len(COLOR_PALETTE)]
    return color_map

def _to_gray(img):
    if img is None: return None
    if img.ndim == 2: return img
    if img.shape[2] == 4: img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

def non_max_suppression_fast(boxes, overlapThresh):
    """Greedy non-maximum suppression over (x, y, w, h) boxes.

    Boxes are visited from the largest bottom edge (y + h) downwards. Each
    visited box is kept, and every remaining box whose intersection with it
    covers more than `overlapThresh` of that remaining box's own area is
    discarded.

    Args:
        boxes: array of shape (N, 4) holding x, y, w, h per row.
        overlapThresh: overlap ratio above which a box is suppressed.

    Returns:
        List of kept row indices into `boxes` ([] for empty input).
    """
    if len(boxes) == 0:
        return []
    rects = boxes.astype("float")
    left, top = rects[:, 0], rects[:, 1]
    right = left + rects[:, 2]
    bottom = top + rects[:, 3]
    areas = (right - left + 1) * (bottom - top + 1)

    remaining = np.argsort(bottom)
    kept = []
    while len(remaining) > 0:
        current = remaining[-1]
        others = remaining[:-1]
        kept.append(current)

        inter_w = np.maximum(
            0, np.minimum(right[current], right[others]) - np.maximum(left[current], left[others]) + 1)
        inter_h = np.maximum(
            0, np.minimum(bottom[current], bottom[others]) - np.maximum(top[current], top[others]) + 1)
        ratio = (inter_w * inter_h) / areas[others]

        # Keep only the boxes that do not exceed the overlap threshold.
        remaining = others[~(ratio > overlapThresh)]
    return kept

def visualize_level_6(frame, thr=0.6):
    """Level 6 view: template matching against images from 'templates/'.

    On the first call, up to five image files from the 'templates/' folder
    are loaded as grayscale and cached on the function object, together with
    a colour per template name. Every call matches each cached template
    against the frame with cv2.TM_CCOEFF_NORMED, keeps locations scoring at
    least `thr`, removes overlapping hits with non_max_suppression_fast and
    draws labelled boxes.

    Args:
        frame: BGR image.
        thr: minimum normalized match score for a detection (default 0.6,
            the previously hard-coded value).

    Returns:
        (annotated copy of the frame, explanation string)
    """
    out = frame.copy()
    if not hasattr(visualize_level_6, "templates"):
        visualize_level_6.templates = []
        tdir = 'templates/'
        if os.path.isdir(tdir):
            # sorted(): os.listdir order is arbitrary, so without it the five
            # templates that get picked could differ from run to run.
            for fn in sorted(os.listdir(tdir)):
                if fn.lower().endswith(('.png','.jpg','.jpeg','.bmp','.tiff','.tif')):
                    path = os.path.join(tdir, fn)
                    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
                    gray = _to_gray(img)
                    if gray is not None:
                        name = os.path.splitext(fn)[0].replace('_template','').replace('Template','').replace('template','').strip()
                        visualize_level_6.templates.append((name, gray))
                        if len(visualize_level_6.templates) >= 5: break
        visualize_level_6.color_map = get_color_map([t[0] for t in visualize_level_6.templates])

    if not getattr(visualize_level_6, "templates", []):
        return out, "Add a few small PNG templates to the 'templates/' folder."

    gray = cv2.cvtColor(out, cv2.COLOR_BGR2GRAY)
    method = cv2.TM_CCOEFF_NORMED
    det = []; H, W = gray.shape[:2]
    for name, templ in visualize_level_6.templates:
        th, tw = templ.shape[:2]
        # Only templates larger than the frame are impossible to match; one of
        # exactly the frame size still yields a 1x1 result.
        if th > H or tw > W: continue
        res = cv2.matchTemplate(gray, templ, method)
        loc = np.where(res >= thr)
        for (x, y) in zip(loc[1], loc[0]):
            det.append({'name': name, 'xywh': (int(x), int(y), int(tw), int(th)), 'conf': float(res[y, x])})
    if not det:
        return out, "No template matches yet—try moving a known object into view."
    boxes = np.array([d['xywh'] for d in det], dtype=np.int32)
    keep = non_max_suppression_fast(boxes, 0.5)
    for i in keep:
        x,y,w,h = boxes[i]; name = det[i]['name']; conf = det[i]['conf']
        color = visualize_level_6.color_map.get(name, (0,0,0))
        cv2.rectangle(out, (x,y), (x+w,y+h), color, 2)
        label = f"{name}: {conf:.2f}"
        (tw2, th2), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        # Filled strip above the box so the white label stays readable.
        cv2.rectangle(out, (x, y - th2 - 6), (x + tw2, y), color, cv2.FILLED)
        put_text(out, label, (x, y - 4), font_scale=0.5, color=(255,255,255), thickness=1)
    return out, "Template matching: compare a small example image with each frame."

# ---------- Level 7 (YOLO-World detection) ----------
def visualize_level_7(frame, model):
    """Level 7 view: run the detection model on the frame and plot the result.

    The model is called with imgsz=1056 and verbose=False under
    torch.inference_mode(); the first result's plot() output is returned
    together with the explanation string.
    """
    with torch.inference_mode():
        detections = model(frame, imgsz=1056, verbose=False)
    first = detections[0]
    return first.plot(), "YOLO-World: open-vocabulary object detection."

# ---------- Level 8 (tracking) ----------
def visualize_level_8(frame, model_lvl8, track_history, reset=False):
    """
    Person tracking with persistent per-ID colors, dot trail + connecting lines.
    Args:
      frame: BGR frame
      model_lvl8: model exposing .track(...) (e.g., YOLO('yolov8n.pt'))
      track_history: dict[int -> list[(x,y)]], persists across calls; a plain
        dict or a defaultdict(list) both work
      reset: if True, clears trails and color assignments (use at start of replay loop)
    Returns:
      annotated_frame, explanation
    Raises:
      Whatever the model raises when both trackers fail.
    """
    import colorsys

    # --- persistent color map per ID (lives across calls) ---
    if not hasattr(visualize_level_8, "_id_colors"):
        visualize_level_8._id_colors = {}
    id_colors = visualize_level_8._id_colors

    if reset:
        track_history.clear()
        id_colors.clear()

    def color_for_id(tid: int):
        # Deterministic vivid color per ID (HSV -> BGR, golden ratio step for good spread)
        if tid in id_colors:
            return id_colors[tid]
        h = (tid * 0.61803398875) % 1.0
        s, v = 0.95, 1.0
        r, g, b = colorsys.hsv_to_rgb(h, s, v)
        col = (int(b * 255), int(g * 255), int(r * 255))  # BGR
        id_colors[tid] = col
        return col

    # --- run tracker (BoT-SORT preferred; ByteTrack fallback) ---
    # NOTE(review): Ultralytics documents a tuple imgsz as (height, width);
    # (960, 544) looks like (width, height) for a landscape camera - confirm.
    track_kwargs = dict(
        persist=True,
        classes=0,              # person
        imgsz=(960, 544),
        conf=0.35,
        iou=0.5,
        verbose=False,
    )
    trackers = ("botsort.yaml", "bytetrack.yaml")
    results = None
    for attempt, tracker_name in enumerate(trackers):
        try:
            with torch.inference_mode():
                results = model_lvl8.track(frame, tracker=tracker_name, **track_kwargs)
            break
        except Exception:
            # Only the last tracker's failure is allowed to propagate.
            if attempt == len(trackers) - 1:
                raise

    annotated = frame.copy()
    if not results or results[0] is None or results[0].boxes is None or len(results[0].boxes) == 0:
        return annotated, f"Person tracking ({tracker_name}): no people this frame."

    r = results[0]
    boxes = r.boxes

    # Sort large boxes first so labels remain visible when overlapping
    try:
        xyxy = boxes.xyxy.cpu().numpy()
    except Exception:
        xyxy = boxes.xyxy.numpy()
    areas = (xyxy[:, 2] - xyxy[:, 0]) * (xyxy[:, 3] - xyxy[:, 1])
    order = np.argsort(-areas)

    ids = boxes.id
    confs = boxes.conf

    MAX_TRAIL = 100       # how many points per ID to keep
    DOT_EVERY = 1         # draw every n-th dot (1 = every point)
    for idx in order:
        x1, y1, x2, y2 = xyxy[idx].astype(int).tolist()
        tid = None
        if ids is not None:
            try:
                tid = int(ids[idx].item() if hasattr(ids[idx], "item") else ids[idx])
            except Exception:
                tid = None
        conf = float(confs[idx].item() if hasattr(confs[idx], "item") else confs[idx]) if confs is not None else 0.0

        if tid is None:
            # draw box in grey if tracker didn't assign an ID (rare)
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (200, 200, 200), 1)
            continue

        col = color_for_id(tid)

        # Update path with current center. setdefault() keeps this working for
        # a plain dict as well as a defaultdict(list).
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
        path = track_history.setdefault(tid, [])
        path.append((cx, cy))
        if len(path) > MAX_TRAIL:
            del path[:-MAX_TRAIL]

        # Draw connecting lines along the path (same stable color)
        for j in range(1, len(path)):
            p1, p2 = path[j - 1], path[j]
            # Slightly thicker for recent segments
            thickness = 2 + int(j > len(path) * 0.7)
            cv2.line(annotated, p1, p2, col, thickness, cv2.LINE_AA)

        # Draw dots on the path (skip some if very long)
        for j, p in enumerate(path):
            if (j % DOT_EVERY) != 0:
                continue
            radius = 2 if j < len(path) - 1 else 3  # current point a tad bigger
            cv2.circle(annotated, p, radius, col, -1, cv2.LINE_AA)

        # Current bbox + label in the same ID color
        cv2.rectangle(annotated, (x1, y1), (x2, y2), col, 2)
        label = f"ID {tid}  {conf:.2f}"
        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.55, 2)
        y_lab = max(0, y1 - 6)
        cv2.rectangle(annotated, (x1, y_lab - th - 4), (x1 + tw + 6, y_lab + 2), col, cv2.FILLED)
        cv2.putText(annotated, label, (x1 + 3, y_lab),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.55, (0, 0, 0), 1, cv2.LINE_AA)

    return annotated, f"Person tracking ({tracker_name}): persistent per-ID colors with dot paths and lines."

# ---------- Level 9 (pose) ----------
def visualize_level_9(frame, model9):
    """Level 9 view: run the pose model on the frame and plot the result.

    The model is called with imgsz=640 and verbose=False under
    torch.inference_mode(); the first result's plot() output is returned
    together with the explanation string.
    """
    with torch.inference_mode():
        poses = model9(frame, imgsz=640, verbose=False)
    first = poses[0]
    return first.plot(), "Pose estimation tracks joints to understand motion."

# ---------- NEW Level 10: Left vs Right people counter ----------
def _draw_dashed_vline(img, x, color=(255,255,255), thickness=2, dash=12, gap=8):
    h = img.shape[0]
    y = 0
    while y < h:
        y2 = min(y + dash, h)
        cv2.line(img, (x, y), (x, y2), color, thickness)
        y = y2 + gap

def visualize_level_10_people_sides(frame, model8, left_label="Left", right_label="Right", conf_th=0.25):
    """Level 10 view: count detected people left and right of the midline.

    The model is run with classes=0 and imgsz=640. Detections below `conf_th`
    are ignored; each remaining box is assigned to a side by the x position
    of its centre, drawn in a side-specific colour and counted. The counts
    are written in the top corners using `left_label` / `right_label`.

    Returns:
        (annotated copy of the frame, explanation string)
    """
    frame_w = frame.shape[1]
    center_x = frame_w // 2
    with torch.inference_mode():
        detections = model8(frame, classes=0, imgsz=640, verbose=False)
    people = detections[0].boxes
    annotated = frame.copy()

    # dashed midline separating the two halves
    _draw_dashed_vline(annotated, center_x, color=(255,255,255), thickness=2, dash=14, gap=10)

    left_count, right_count = 0, 0
    if people is not None and len(people) > 0:
        for box in people:
            if float(box.conf[0]) < conf_th:
                continue
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            on_left = int((x1 + x2) * 0.5) < center_x
            if on_left:
                cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 200, 255), 2)
                left_count += 1
            else:
                cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 128), 2)
                right_count += 1

    white = (255,255,255)
    put_text(annotated, f"{left_label}: {left_count}", (20, 40), font_scale=1.0, color=white, thickness=2)
    right_text = f"{right_label}: {right_count}"
    # Measure the sanitized text so the right label ends 20 px from the edge.
    text_size, _baseline = cv2.getTextSize(_sanitize_ascii(right_text), cv2.FONT_HERSHEY_SIMPLEX, 1.0, 2)
    put_text(annotated, right_text, (frame_w - text_size[0] - 20, 40), font_scale=1.0, color=white, thickness=2)
    return annotated, "Count people on the left and right side of the screen."
# ---------- NEW Level 11: Live YOLO-World with classes.txt ----------
class ClassFileWatcher:
    """Reloads a list of class names from a text file when the file changes.

    The file holds one class name per line; blank lines and lines starting
    with '#' are ignored. Change detection is based on the file's
    modification time.
    """

    def __init__(self, path):
        self.path = path
        self.last_mtime = None   # mtime of the last successful read
        self.classes = []        # class names from the last successful read

    def load(self):
        """Re-read the file if it is new or its mtime changed.

        Returns:
            True when the class list was (re)loaded, False when the file is
            missing, unchanged, or could not be read (a warning is printed).
        """
        try:
            if not os.path.isfile(self.path):
                return False
            mtime = os.path.getmtime(self.path)
            if self.last_mtime is not None and mtime == self.last_mtime:
                return False
            # utf-8-sig reads plain UTF-8 as before but also strips a leading
            # BOM, which would otherwise end up glued to the first line.
            with open(self.path, "r", encoding="utf-8-sig") as f:
                stripped = [ln.strip() for ln in f]
            self.classes = [ln for ln in stripped if ln and not ln.startswith("#")]
            self.last_mtime = mtime
            return True
        except Exception as e:
            print(f"[warn] failed reading classes file: {e}")
        return False

    def get(self):
        """Return a copy of the current class list."""
        return self.classes[:]

def live_world_loop(cap, model_live, watcher: ClassFileWatcher, pause: PauseController,
                    footer_text: str, level_name: str, autoplay: bool, should_advance_cb):
    """Run the live camera level: detection with a class list reloaded from disk.

    Every iteration grabs and horizontally mirrors a camera frame (unless
    paused), re-applies the class list whenever ``watcher`` reports a change,
    runs the model on fresh frames, draws the overlays and polls the keyboard.

    Args:
        cap: Capture object providing ``read()`` and ``get()``.
        model_live: Callable model that also exposes ``set_classes(list)``.
        watcher: Source of the class list; ``load()`` is polled every iteration.
        pause: Pause state; ``pause.paused`` is read and ``pause.toggle()`` is
            called when Space is pressed.
        footer_text: Text handed to ``_draw_footer``.
        level_name: Caption drawn near the bottom-left corner.
        autoplay: Initial autoplay flag.
            NOTE(review): 'p' only flips this local copy; the caller's flag
            (which ``should_advance_cb`` presumably consults) is not updated,
            so the toggle has no visible effect inside this level. Fixing it
            needs a coordinated change with the caller.
        should_advance_cb: Zero-argument callable; a truthy result ends the level.

    Returns:
        str: ``"quit"`` (q), ``"prev"`` (a) or ``"next"`` (d, or when
        ``should_advance_cb()`` is truthy).
    """
    def _apply_classes_if_changed():
        # Push the class list to the model only when the file actually changed.
        if not watcher.load():
            return
        try:
            model_live.set_classes(watcher.get())
            print(f"[live] set classes: {watcher.get()}")
        except Exception as e:
            # Keep the loop alive if the model rejects the list.
            print(f"[live] set_classes failed: {e}")

    last_frame = None
    _apply_classes_if_changed()  # initial load

    while True:
        # A failed read must not skip the key polling below, otherwise a
        # disconnected camera spins this loop with no way to quit.
        fresh = False
        if not pause.paused:
            ok, grabbed = cap.read()
            if ok:
                last_frame = cv2.flip(grabbed, 1)
                fresh = True
        if last_frame is not None:
            frame = last_frame
        else:
            # No frame yet: black image sized from the capture's reported
            # height (property 4) and width (property 3).
            frame = np.zeros((int(cap.get(4)), int(cap.get(3)), 3), np.uint8)

        # reload classes if file changed
        _apply_classes_if_changed()

        # Run detection only on a freshly grabbed frame; otherwise show the
        # frozen frame without re-running the model.
        if fresh:
            with torch.inference_mode():
                # Ultralytics expects imgsz as (height, width); frame.shape is
                # (height, width, channels).
                res = model_live(frame, imgsz=(frame.shape[0], frame.shape[1]), verbose=False, conf=0.1)
            annotated = res[0].plot()
        else:
            annotated = frame.copy()

        # show list of classes on screen (top-left)
        current = watcher.get()
        classes_txt = ", ".join(current) if current else "(no classes)"
        put_text(annotated, f"Classes: {classes_txt}", (10, 30), font_scale=0.7, color=(255,255,255), thickness=2)

        annotated = _draw_footer(annotated, footer_text, paused=pause.paused)
        put_text(annotated, level_name, (10, annotated.shape[0] - 45), font_scale=1.0, thickness=2)
        cv2.imshow(WIN_NAME, annotated)  # same title as the window created by init_window()

        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'): return "quit"
        elif key == ord('a'): return "prev"
        elif key == ord('d'): return "next"
        elif key == ord('p'): autoplay = not autoplay  # local only, see docstring note
        elif key == ord(' '): pause.toggle()

        if should_advance_cb():
            return "next"

# ---------- Process loop (idle → record → replay/manual or autoplay) ----------
def process_video(cap, model_world, fw, fh, cfg: Config):
    """Drive the interactive demo: idle preview, recording, then one loop per level.

    The level list comes from ``T(cfg.language, "levels")``. Index 0 is the
    idle preview, index 1 records 10 seconds of mirrored camera frames,
    index 3 is the diagonal-sweep step, index 12 is the live YOLO-World level
    and the remaining indices are routed to the ``visualize_level_*`` helpers.
    Recorded frames are replayed in a loop by the later levels; when nothing
    was recorded those levels read the camera directly.

    Keys (handled in every inner loop unless noted): q quits, a/d go to the
    previous/next level (idle only handles d), p toggles autoplay, Space
    toggles pause.

    Args:
        cap: Capture object providing ``read()``.
        model_world: Model handed to ``visualize_level_7``.
        fw, fh: Frame width and height used for black placeholder frames.
        cfg: Settings; ``language``, ``autoplay`` and ``seconds_per_step`` are read.

    Returns:
        bool: False when the user presses 'q' (the only way out of the loop,
        apart from the live level reporting "quit").
    """
    levels = T(cfg.language, "levels")
    total = len(levels)

    # longer duration when autoplay for detection/tracking/live steps
    # NOTE(review): given the routing below (0-based 8..12 = detection .. live),
    # these 1-based numbers correspond to 0-based 7..11, so the live level
    # (0-based 12) is NOT extended while 0-based 7 is. Possible off-by-one;
    # confirm against the levels list before changing.
    EXTENDED = {8, 9, 10, 11, 12}  # 1-based indices in levels list

    # Durations are kept finite regardless of cfg.autoplay: should_advance()
    # already gates on the live `autoplay` flag, so turning autoplay on with
    # 'p' after a manual start really advances. (Deriving them from
    # cfg.autoplay left them at infinity for the whole session.)
    try:
        step_secs = float(cfg.seconds_per_step)
    except (TypeError, ValueError):
        step_secs = float("inf")  # no usable duration configured: never auto-advance
    DEFAULT_SECS = step_secs
    EXTENDED_SECS = max(20, step_secs)

    index = 0
    frames = []
    autoplay = cfg.autoplay
    pause = PauseController()

    # models used later (loaded lazily on first use)
    model8 = None                 # YOLOv8n for people/tracking
    model9 = None                 # yolo11n-pose for pose
    model_live_world = None       # separate YOLO-World instance for live level
    watcher = ClassFileWatcher(os.path.join('templates', 'classes.txt'))
    track_history = defaultdict(lambda: [])

    # step 2 diagonal state
    diag_positions = None
    pos_i = 0
    time_per_position = 1.0

    def elapsed_since(t0, paused0):
        # Wall time since t0 minus the change in pause.mark() since paused0
        # (presumably the accumulated paused time; confirm in PauseController).
        return time.monotonic() - t0 - (pause.mark() - paused0)

    def _load_yolo(weights):
        # Load a YOLO model and move it to the GPU when one is available.
        m = YOLO(weights)
        if torch.cuda.is_available():
            m = m.to('cuda')
        return m

    while True:
        name = levels[index % total]
        level_t0 = time.monotonic()
        paused0 = pause.mark()

        def should_advance():
            # True once autoplay is on and this level has run for its duration.
            if not autoplay: return False
            step_num = (index % total) + 1
            dur = EXTENDED_SECS if step_num in EXTENDED else DEFAULT_SECS
            return elapsed_since(level_t0, paused0) >= dur

        # ---------- Idle preview ----------
        if name == levels[0]:
            if autoplay:
                index = (index + 1) % total
                continue
            last_disp = None
            while True:
                if not pause.paused:
                    ok, frame = cap.read()
                    # A failed read keeps the previous frame and still falls
                    # through to the key polling so 'q' keeps working.
                    if ok:
                        last_disp = cv2.flip(frame, 1)
                # Draw on a copy so overlays never accumulate on the cached frame.
                disp = last_disp.copy() if last_disp is not None else np.zeros((fh, fw, 3), np.uint8)
                disp = _draw_footer(disp, T(cfg.language, "ui", "footer"), paused=pause.paused)
                put_text(disp, T(cfg.language, "ui", "idle_hint"), (10, 30), font_scale=0.8, thickness=2)
                cv2.imshow(WIN_NAME, disp)
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'): return False
                elif key == ord('d'): index = (index + 1) % total; break
                elif key == ord('p'): autoplay = not autoplay
                elif key == ord(' '): pause.toggle()
            continue

        # ---------- Recording ----------
        if name == levels[1]:
            frames = []
            record_secs = 10
            last_frame = None
            while elapsed_since(level_t0, paused0) < record_secs:
                if not pause.paused:
                    ok, frame = cap.read()
                    # A failed read is treated as a dropped frame. Breaking out
                    # here restarted this level immediately, discarding the
                    # recording and (with a dead camera) looping without ever
                    # polling the keyboard.
                    if ok:
                        frame = cv2.flip(frame, 1)
                        frames.append(frame)
                        last_frame = frame
                # Overlays go on a copy so the recorded frame stays clean.
                disp = last_frame.copy() if last_frame is not None else np.zeros((fh, fw, 3), np.uint8)
                disp = _draw_footer(disp, T(cfg.language, "ui", "footer"), paused=pause.paused)
                rec_txt = f"● {T(cfg.language,'ui','record_caption')}  {int(elapsed_since(level_t0, paused0)):>2}s"
                put_text(disp, rec_txt, (10, 40), font_scale=1.0, color=(0,0,255), thickness=3)
                cv2.imshow(WIN_NAME, disp)
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'): return False
                elif key == ord('a'): index = (index - 1) % total; break
                elif key == ord('d'): index = (index + 1) % total; break
                elif key == ord('p'): autoplay = not autoplay
                elif key == ord(' '): pause.toggle()
            else:
                # Recording time elapsed without a key-triggered break.
                index = (index + 1) % total
            continue

        # ---------- Step 2 (diagonal sweep) ----------
        if name == levels[3]:
            if not frames:
                # No recording: take one live snapshot. It is stored in
                # `frames`, so the later levels replay it as well.
                ok, live = cap.read()
                if ok: frames = [cv2.flip(live, 1)]
            # If even the snapshot failed, sweep over a black frame instead of
            # raising IndexError on frames[0].
            diag_frames = frames if frames else [np.zeros((fh, fw, 3), np.uint8)]
            h, w = diag_frames[0].shape[:2]
            ksize = min(500, min(w, h) // 2 if min(w, h) >= 600 else 300)
            stride = max(1, ksize // 3)
            if not diag_positions:
                # Window origins on a grid, ordered along anti-diagonals
                # (all positions with the same grid-index sum s are grouped).
                xs = list(range(0, max(1, w - ksize + 1), stride))
                ys = list(range(0, max(1, h - ksize + 1), stride))
                diags = []
                for s in range(len(xs) + len(ys) - 1):
                    diag = []
                    i_min = max(0, s - (len(ys) - 1))
                    i_max = min(s, len(xs) - 1)
                    for i in range(i_min, i_max + 1):
                        j = s - i
                        diag.append((xs[i], ys[j]))
                    diags.append(diag)
                diag_positions = [p for d in diags for p in d]
            processed_mask = np.zeros((h, w), dtype=np.uint8)
            pos_i = 0
            pos_t0 = time.monotonic(); pos_paused0 = pause.mark()
            frame_idx = 0
            while True:
                frame = diag_frames[frame_idx % len(diag_frames)]
                filtered = enhance_saturation(frame)
                # Move the window to the next position once its time is up.
                if not pause.paused and (time.monotonic() - pos_t0 - (pause.mark() - pos_paused0)) >= time_per_position:
                    pos_i = (pos_i + 1) % len(diag_positions)
                    if pos_i == 0: processed_mask.fill(0)  # sweep wrapped: start over
                    pos_t0 = time.monotonic(); pos_paused0 = pause.mark()
                out, processed_mask, _ = visualize_level_2(
                    frame, filtered, processed_mask, diag_positions[pos_i], ksize, block_size=25
                )
                out = _draw_footer(out, T(cfg.language, "ui", "footer"), paused=pause.paused)
                put_text(out, name, (10, h - 45), font_scale=1.0, thickness=2)
                cv2.imshow(WIN_NAME, out)
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'): return False
                elif key == ord('a'): index = (index - 1) % total; break
                elif key == ord('d'): index = (index + 1) % total; break
                elif key == ord('p'): autoplay = not autoplay
                elif key == ord(' '): pause.toggle()
                if should_advance(): index = (index + 1) % total; break
                if not pause.paused: frame_idx += 1
            continue

        # ---------- All other steps ----------
        step_idx = (index % total)
        # Special LIVE level (0-based index 12 in the levels list)
        if step_idx == 12:
            if model_live_world is None:
                model_live_world = YOLOWorld("yolov8s-worldv2.pt")
                if torch.cuda.is_available(): model_live_world = model_live_world.to('cuda')
                # set initial classes if file exists
                if watcher.load():
                    try: model_live_world.set_classes(watcher.get())
                    except Exception as e: print(f"[live] set_classes failed: {e}")
            # NOTE(review): live_world_loop receives `autoplay` by value, so a
            # 'p' press inside the live level does not change the flag here.
            ret = live_world_loop(cap, model_live_world, watcher, pause,
                                  footer_text=T(cfg.language, "ui", "footer"),
                                  level_name=name, autoplay=autoplay, should_advance_cb=should_advance)
            if ret == "quit": return False
            elif ret == "prev": index = (index - 1) % total
            elif ret == "next": index = (index + 1) % total
            continue

        frame_i = 0
        last_live = None     # last good camera frame when there is no recording
        prev_mod_l8 = None   # previous replay position, used to detect clip wrap-around
        while True:
            if frames:
                frame = frames[frame_i % len(frames)]
                if not pause.paused: frame_i += 1
            else:
                if not pause.paused:
                    ok, live = cap.read()
                    # On a failed read reuse the last frame and keep polling
                    # keys instead of spinning without a way to quit.
                    if ok:
                        last_live = cv2.flip(live, 1)
                # Paused or failed read: freeze on the last live frame (a copy,
                # so in-place drawing cannot accumulate); black only before
                # the first successful read.
                frame = last_live.copy() if last_live is not None else np.zeros((fh, fw, 3), np.uint8)

            # route per level
            if step_idx == 2:
                processed, expl = visualize_level_1(frame)
            elif step_idx == 4:
                processed, expl = visualize_level_3(frame)
            elif step_idx == 5:
                processed, expl = visualize_level_4(frame)
            elif step_idx == 6:
                processed, expl = visualize_level_5(frame)
            elif step_idx == 7:
                processed, expl = visualize_level_6(frame)
            elif step_idx == 8:
                processed, expl = visualize_level_7(frame, model_world)
            elif step_idx == 9:  # Step 8: Person tracking
                if model8 is None:
                    model8 = _load_yolo("yolov8n.pt")

                # Reset trails when the recorded clip loops back to the start
                reset_paths = False
                if frames:  # only relevant when replaying the recorded clip
                    cur_mod = frame_i % len(frames)
                    reset_paths = (cur_mod == 0 and prev_mod_l8 not in (None, 0))
                    # Kept in a local rather than a function attribute so the
                    # state does not leak across levels or calls.
                    prev_mod_l8 = cur_mod

                # Call the tracking visualizer with the PERSISTENT history and the reset flag
                processed, expl = visualize_level_8(
                    frame,           # current frame (live or from replay)
                    model8,          # YOLOv8n model
                    track_history,   # <-- persistent dict defined above (DO NOT recreate it here)
                    reset=reset_paths
                )
            elif step_idx == 10:
                if model9 is None:
                    model9 = _load_yolo("yolo11n-pose.pt")
                processed, expl = visualize_level_9(frame, model9)
            elif step_idx == 11:  # NEW: people left/right
                if model8 is None:
                    model8 = _load_yolo("yolov8n.pt")
                processed, expl = visualize_level_10_people_sides(
                    frame, model8,
                    left_label=T(cfg.language, "ui", "left"),
                    right_label=T(cfg.language, "ui", "right")
                )
            else:
                processed, expl = frame, ""

            processed = _draw_footer(processed, T(cfg.language, "ui", "footer"), paused=pause.paused)
            put_text(processed, name, (10, processed.shape[0] - 45), font_scale=1.0, thickness=2)
            cv2.imshow(WIN_NAME, processed)

            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'): return False
            elif key == ord('a'): index = (index - 1) % total; break
            elif key == ord('d'): index = (index + 1) % total; break
            elif key == ord('p'): autoplay = not autoplay
            elif key == ord(' '): pause.toggle()

            if should_advance():
                index = (index + 1) % total
                break

def main():
    """Collect settings, open model and camera, and run the demo until the user quits.

    The camera and all OpenCV windows are released even when the loop exits
    through an exception.
    """
    settings = prompt_user_settings(Config())
    world_model = initialize_model()
    camera, frame_w, frame_h = initialize_camera()
    print(T(settings.language, "ui", "footer"))
    try:
        # process_video reports False when the user asked to quit.
        keep_running = True
        while keep_running:
            outcome = process_video(camera, world_model, frame_w, frame_h, settings)
            keep_running = outcome is not False
    finally:
        camera.release()
        cv2.destroyAllWindows()
        print("Application closed.")

# Entry point: start the application only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()


Using GPU (NVIDIA GeForce RTX 4070 Ti) for inference.
Camera 0 opened successfully.
Desired 1920x1200 not supported. Using 1920x1080.
q: quit • A: previous • D: next • P: toggle autoplay • Space: pause


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


[live] set classes: ['Smart', '...']
[live] set classes: ['Cute', '...']
[live] set classes: ['Student', '...']
[live] set classes: ['Girl', '...']
[live] set classes: ['Girl', 'Boy', '...']
[live] set classes: ['Girl', 'Beard', '...']
[live] set classes: ['Girl', 'Boy,Beard', '...']
[live] set classes: ['Girl', 'Boy', '...']
[live] set classes: ['Girl', 'Boy', 'eyes', '...']
[live] set classes: ['Girl', 'Boy', 'eye', '...']
[live] set classes: ['Girl', 'Boy', 'penis', '...']
[live] set classes: ['Girl', 'Boy', 'penis', 'boobs', '...']
[live] set classes: ['Girl', 'Boy', 'penis', 'boobs', 'bra', '...']
[live] set classes: ['Girl', 'Boy', 'penis', 'boobs', 'bra', '...']
[live] set classes: ['Eyes', 'nose']
[live] set classes: ['Eyes']
[live] set classes: ['Eyes,']
[live] set classes: ['Person']
Application closed.
