# Pose Estimation with camera in real time

In [1]:
import cv2
from ultralytics import YOLO

# Minimal real-time demo: run the body-pose model on every camera frame and
# show the annotated frame until ESC is pressed or the camera stops delivering.
model = YOLO("yolov8n-pose.pt")
cap = cv2.VideoCapture(1)

try:
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        r = model(frame, verbose=False)[0]
        cv2.imshow("pose", r.plot())
        # Mask to the low byte so the ESC check matches the key handling used
        # in the second cell of this notebook.
        if (cv2.waitKey(1) & 0xFF) == 27:  # ESC
            break
finally:
    # Always free the camera and close the preview window, even when the loop
    # is left through an exception or a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()


# Showing only Keypoints

Hinweis: Keypoints lassen sich vor dem Zeichnen ausblenden, indem ihre Konfidenz auf 0 gesetzt wird – z. B. die Kopf-Keypoints (COCO-Indizes 0–4: Nase, linkes/rechtes Auge, linkes/rechtes Ohr) mit `HIDE_GROUPS = ["head"]`.

Gruppen-Definitionen (COCO 17 KP):
- head: [0–4]
- shoulders: [5,6]
- arms: [5–10]
- wrists: [9,10]
- hips: [11,12]
- legs: [11–16]
- knees: [13,14]
- ankles: [15,16]
- torso: [5,6,11,12]
- all: [0–16]

Steuerung: Passe in der folgenden Zelle die Variable HIDE_GROUPS an, z. B. ["head"], ["head","arms"], ["legs"], ["all"].

# Hand- und Finger-Keypoints (21 KPs)

Wir erweitern die Visualisierung um Hand- und Finger-Keypoints mit einem separaten Hand-Pose-Modell (`yolo11n-pose -hand.pt`).

- Pro Hand werden 21 Keypoints erwartet (0 = Handgelenk, dann je 4 Punkte für Daumen, Zeige-, Mittel-, Ring- und kleinen Finger).
- Finger-Skelette (Standard-Konvention, ggf. je nach Modell leicht unterschiedlich):
  - Daumen: 0–1–2–3–4
  - Zeigefinger: 0–5–6–7–8
  - Mittelfinger: 0–9–10–11–12
  - Ringfinger: 0–13–14–15–16
  - Kleiner Finger: 0–17–18–19–20
- Die Finger-Linien werden farbig gezeichnet (jeder Finger in eigener Farbe), die Keypoints selbst als einfarbige Punkte; beides wird auf denselben schwarzen Canvas gelegt wie die Körper-Keypoints.

Hinweise
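- Die Datei hat ein Leerzeichen im Namen (`yolo11n-pose -hand.pt`). Das ist in Ordnung, da der Pfad als normaler String übergeben wird. Welche Gewichte tatsächlich geladen werden, legt `HAND_MODEL_PATH` in der Code-Zelle fest.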
- Performance: Zwei Modelle pro Frame (Körper + Hände) sind rechenintensiver. Bei Bedarf die Auflösung/FPS reduzieren oder `conf` erhöhen.
- Steuerung: Über `DRAW_HANDS`, `HAND_MIN_CONF` und `HAND_MODEL_PATH` kann das Verhalten angepasst werden.

In [1]:
import os

import cv2
import numpy as np
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator

# optional: pyvirtualcam for virtual webcam output
try:
    import pyvirtualcam
    from pyvirtualcam import PixelFormat
    HAVE_PYVIRTUALCAM = True
except Exception:
    HAVE_PYVIRTUALCAM = False

# optional: mss for desktop screen capture
try:
    from mss import mss
    HAVE_MSS = True
except Exception:
    HAVE_MSS = False
    print("mss not installed. Install with: pip install mss")

# -------------------- configurable options --------------------
BODY_MODEL_PATH = "yolov8n-pose.pt"
# Hand-pose weights. The default is a machine-specific absolute path; set the
# HAND_MODEL_PATH environment variable to point at the weights on another
# machine without editing the notebook.
HAND_MODEL_PATH = os.environ.get(
    "HAND_MODEL_PATH",
    r"C:\Users\adria\Documents\HandPose\runs\hand\overnight_run\weights\best.pt",
)
CAM_INDEX = 1

# Toggles & thresholds
DRAW_BODY = False        # show hands only (do not render the body)
DRAW_HANDS = True
USE_WRIST_CROPS = True   # faster: run hand inference only around the wrists
HAND_MIN_CONF = 0.25     # keypoint confidence required to draw a hand point
HAND_PRED_CONF = 0.25    # inference confidence passed to the hand model
BODY_MIN_CONF = 0.30     # keypoint confidence for the body (e.g. elbow/wrist)

# Wrist-crop parameters
HAND_CROP_SCALE = 2.0    # crop side length ~ scale * forearm length
HAND_CROP_MIN = 96       # lower bound for the crop side length (pixels)
HAND_CROP_MAX = 320      # upper bound for the crop side length (pixels)

# -------------------- load models & camera --------------------
body_model = YOLO(BODY_MODEL_PATH)
hand_model = YOLO(HAND_MODEL_PATH) if DRAW_HANDS else None

cap = cv2.VideoCapture(CAM_INDEX)
if not cap.isOpened():
    # Free the failed handle, then fall back to camera 0.
    cap.release()
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        cap.release()
        raise RuntimeError("No camera available")

# Grab a first frame to determine the output size for the virtual camera.
ok, frame = cap.read()
if not ok:
    cap.release()
    raise RuntimeError("Can't read from camera")

height, width = frame.shape[:2]

# Initialize screen capture. Like the virtual camera below, this is optional:
# a failure is reported and desktop capture is simply left unavailable instead
# of aborting the cell with the camera still open.
sct = None
screen_monitor = None
if HAVE_MSS:
    try:
        sct = mss()
        # monitor 0 is all monitors combined, 1 is the primary monitor
        screen_monitor = sct.monitors[1]
    except Exception as e:
        print('Failed to initialize screen capture:', e)
        if sct is not None:
            sct.close()
        sct = None
        screen_monitor = None

# Source mode: 'camera' or 'desktop'
source_mode = 'camera'

cam = None
if HAVE_PYVIRTUALCAM:
    try:
        # use RGB pixel format (pyvirtualcam expects RGB frames)
        cam = pyvirtualcam.Camera(width=width, height=height, fps=20, fmt=PixelFormat.RGB)
        print('pyvirtualcam opened:', cam.device)
    except Exception as e:
        print('Failed to open virtual camera:', e)
        cam = None

# COCO 17 keypoints: names and index groups
KEYPOINT_NAMES = [
    "nose", "left_eye", "right_eye", "left_ear", "right_ear",
    "left_shoulder", "right_shoulder", "left_elbow", "right_elbow", "left_wrist", "right_wrist",
    "left_hip", "right_hip", "left_knee", "right_knee", "left_ankle", "right_ankle"
]

# Named groups of keypoint indices (positions in KEYPOINT_NAMES).
GROUP_IDXS = {
    # head and facial
    "head": [0, 1, 2, 3, 4],
    # upper body
    "shoulders": [5, 6],
    "arms": [5, 6, 7, 8, 9, 10],
    "wrists": [9, 10],
    # lower body
    "hips": [11, 12],
    "legs": [11, 12, 13, 14, 15, 16],
    "knees": [13, 14],
    "ankles": [15, 16],
    # torso rectangle (shoulders + hips)
    "torso": [5, 6, 11, 12],
    # convenience
    "all": list(range(17)),
}

# Which groups to hide (zero confidence before drawing). Empty list = hide nothing.
# Examples: ["head"], ["head", "arms"], ["legs"], ["all"]
HIDE_GROUPS = []

def merged_idxs(groups):
    """Return the sorted, de-duplicated keypoint indices covered by `groups`.

    groups: iterable of group names (keys of GROUP_IDXS).
    Empty names are skipped silently. Names that are not in GROUP_IDXS
    contribute no indices and are reported via print, so a typo in
    HIDE_GROUPS does not silently hide nothing.
    """
    selected = set()
    unknown = []
    for name in groups:
        if not name:
            continue
        if name in GROUP_IDXS:
            selected.update(GROUP_IDXS[name])
        else:
            unknown.append(name)
    if unknown:
        print(f"Unknown keypoint group(s) ignored: {unknown}; valid groups: {sorted(GROUP_IDXS)}")
    return sorted(selected)

# Flat index list derived from HIDE_GROUPS; empty when nothing is hidden.
HIDE_IDXS = merged_idxs(HIDE_GROUPS)

# ---------- drawing helpers ----------
# Ensure points are within image bounds for drawing
def _clip_point(pt, w, h):
    x, y = int(round(pt[0])), int(round(pt[1]))
    return (max(0, min(w - 1, x)), max(0, min(h - 1, y)))

def draw_cube(img, center, u, wv, side, depth_scale=0.6, color=(0, 255, 0), thickness=2):
    """
    Draws a faux-3D cube at 'center' using oriented basis vectors u (arm direction) and
    wv (its image-plane perpendicular). 'side' is the base square side length in pixels.
    depth_scale controls the offset between the two squares.

    img is drawn on in place with cv2.line; nothing is returned. The vectors
    u and wv are normalized internally on copies, so arrays passed in by the
    caller are left unmodified.
    """
    # Normalize the basis just in case. Use out-of-place division: np.asarray
    # hands back the caller's own array when it is already a float ndarray, and
    # an in-place `/=` would then overwrite the caller's data.
    u = np.asarray(u, dtype=float)
    wv = np.asarray(wv, dtype=float)
    u = u / (np.linalg.norm(u) or 1.0)      # zero-length vector: divide by 1
    wv = wv / (np.linalg.norm(wv) or 1.0)

    a = side / 2.0
    c = np.asarray(center, dtype=float)
    # base square corners (counter-clockwise)
    base = [
        c - a * u - a * wv,
        c + a * u - a * wv,
        c + a * u + a * wv,
        c - a * u + a * wv,
    ]

    # top square offset (simple isometric-ish offset)
    off = (u + wv) * (a * depth_scale)
    top = [p + off for p in base]

    h, w = img.shape[:2]
    base_i = [_clip_point(p, w, h) for p in base]
    top_i = [_clip_point(p, w, h) for p in top]

    # draw both squares
    for pts in (base_i, top_i):
        for i in range(4):
            cv2.line(img, pts[i], pts[(i + 1) % 4], color, thickness)
    # connect corresponding corners
    for p_base, p_top in zip(base_i, top_i):
        cv2.line(img, p_base, p_top, color, thickness)

# ----- Hand drawing: 21 keypoints, 5 fingers -----
# Finger definitions (common 21-KP scheme): each list is the chain of keypoint
# indices for one finger, starting at index 0 (the wrist).
THUMB  = [0, 1, 2, 3, 4]
INDEX  = [0, 5, 6, 7, 8]
MIDDLE = [0, 9, 10, 11, 12]
RING   = [0, 13, 14, 15, 16]
PINKY  = [0, 17, 18, 19, 20]
HAND_FINGERS = [THUMB, INDEX, MIDDLE, RING, PINKY]
# One line color per finger, in the same order as HAND_FINGERS. The tuples are
# passed to cv2.line on a BGR image, so they are (B, G, R).
FINGER_COLORS = [
    (0, 255, 255),   # thumb - yellow
    (0, 255, 0),     # index - green
    (255, 0, 255),   # middle - magenta
    (255, 255, 0),   # ring - cyan
    (0, 128, 255),   # pinky - orange
]

def draw_hand_skeleton(img, kp_any, min_conf=HAND_MIN_CONF):
    """Draws hand keypoints and finger bones with colored lines.

    img:      image to draw on (modified in place via cv2.circle / cv2.line).
    kp_any:   keypoints as a torch.Tensor or NumPy array of shape (N, 2) or
              (N, 3) with columns x, y[, confidence]. N can be 21 (hands) or
              anything else; finger indices beyond N are skipped.
    min_conf: keypoints below this confidence are not drawn, and a bone is
              drawn only if both of its end points reach it. Without a
              confidence column every point counts as confidence 1.
    """
    # Coerce to a NumPy array once; tensors expose .detach(), arrays/lists do not.
    if hasattr(kp_any, 'detach'):
        arr = kp_any.detach().cpu().numpy()
    else:
        arr = np.asarray(kp_any)

    pts = arr[:, :2]
    if arr.shape[1] >= 3:
        conf = arr[:, 2]
    else:
        conf = np.ones((pts.shape[0],), dtype=np.float32)

    n = pts.shape[0]
    h, w = img.shape[:2]

    # draw keypoints as small dots
    for (x, y), c in zip(pts, conf):
        if c < min_conf:
            continue
        cv2.circle(img, _clip_point((x, y), w, h), 3, (0, 200, 255), -1)

    # draw finger lines (only indices inside range)
    for fi, finger in enumerate(HAND_FINGERS):
        color = FINGER_COLORS[fi % len(FINGER_COLORS)]
        valid = [j for j in finger if 0 <= j < n]
        for a, b in zip(valid[:-1], valid[1:]):
            if conf[a] >= min_conf and conf[b] >= min_conf:
                cv2.line(img, _clip_point(pts[a], w, h), _clip_point(pts[b], w, h), color, 2)


def _square_roi(center_xy, side, w, h):
    """Return clipped integer ROI (x0,y0,x1,y1) for a square centered at center_xy."""
    cx, cy = center_xy
    a = side / 2.0
    x0 = int(round(cx - a)); x1 = int(round(cx + a))
    y0 = int(round(cy - a)); y1 = int(round(cy + a))
    x0 = max(0, min(w - 1, x0)); x1 = max(0, min(w, x1))
    y0 = max(0, min(h - 1, y0)); y1 = max(0, min(h, y1))
    if x1 <= x0 or y1 <= y0:
        return None
    return x0, y0, x1, y1

print("Controls:")
print("  Press '1' to switch to camera")
print("  Press '2' to switch to desktop screen")
print("  Press 'ESC' to quit")

# Forearm length in pixels assumed for the wrist crop when the elbow keypoint
# is missing or below BODY_MIN_CONF.
FALLBACK_FOREARM_PX = 80.0

# Main loop: grab a frame from the active source, run body pose (and optionally
# hand pose), draw the result on a black canvas, then send it to the virtual
# camera and the preview window. The try/finally guarantees that the camera,
# virtual camera, screen grabber and windows are released even when the loop
# ends through an exception or a kernel interrupt.
try:
    while True:
        # Capture frame based on current source mode
        if source_mode == 'camera':
            ok, frame = cap.read()
            if not ok:
                break
        elif source_mode == 'desktop':
            if sct and screen_monitor:
                # Capture desktop screen (BGRA) and convert to BGR
                screenshot = sct.grab(screen_monitor)
                frame = cv2.cvtColor(np.array(screenshot), cv2.COLOR_BGRA2BGR)
                # Resize to match camera resolution for consistency
                frame = cv2.resize(frame, (width, height))
            else:
                print("Desktop capture not available (mss not installed)")
                source_mode = 'camera'
                continue
        else:
            # unknown source mode: stop
            break

        # black canvas for drawing
        canvas = np.zeros_like(frame)
        ann = Annotator(canvas, line_width=2)

        # --------- BODY POSE ---------
        res_body = body_model(frame, verbose=False)[0]

        # Wrist/elbow landmarks of the first person (used for the hand crops)
        left_wrist = right_wrist = None
        left_elbow = right_elbow = None

        body_kpts = getattr(res_body, 'keypoints', None)
        if body_kpts is not None:
            for kp in body_kpts.data:  # one tensor of (x, y, conf) rows per person
                # extract wrists/elbows (first person only for speed)
                if left_wrist is None:
                    left_wrist = tuple(kp[9].tolist())    # left_wrist
                    right_wrist = tuple(kp[10].tolist())  # right_wrist
                    left_elbow = tuple(kp[7].tolist())    # left_elbow
                    right_elbow = tuple(kp[8].tolist())   # right_elbow

                if not DRAW_BODY:
                    # Nothing is drawn per person, and the landmarks above come
                    # from the first person only, so the rest can be skipped.
                    break

                kpf = kp.clone()
                if HIDE_IDXS:
                    kpf[HIDE_IDXS, 2] = 0.0  # hide selected groups by zeroing confidence
                ann.kpts(kpf, radius=3)      # draw remaining kpts + bones

                # draw cube on left arm (elbow->wrist)
                ex, ey, ec = kp[7].tolist()
                wx, wy, wc = kp[9].tolist()
                if ec >= BODY_MIN_CONF and wc >= BODY_MIN_CONF:
                    v = np.array([wx - ex, wy - ey], dtype=float)
                    L = np.linalg.norm(v)
                    if L > 1.0:
                        u = v / L
                        wv = np.array([-u[1], u[0]], dtype=float)  # 90 degree rotation in the image plane
                        center = ((ex + wx) / 2.0, (ey + wy) / 2.0)
                        side = float(np.clip(0.6 * L, 20.0, 80.0))
                        draw_cube(canvas, center, u, wv, side, depth_scale=0.7, color=(0, 255, 255), thickness=2)

        # --------- HAND POSE (optional) ---------
        if DRAW_HANDS and hand_model is not None:
            if USE_WRIST_CROPS and left_wrist is not None and right_wrist is not None:
                h, w = frame.shape[:2]
                crops = []
                # Same crop rule for the left and the right arm: a square around
                # the wrist whose side scales with the forearm length.
                for wrist, elbow in ((left_wrist, left_elbow), (right_wrist, right_elbow)):
                    wx, wy, wc = wrist
                    if wc >= BODY_MIN_CONF:
                        forearm = FALLBACK_FOREARM_PX
                        if elbow is not None:
                            ex, ey, ec = elbow
                            if ec >= BODY_MIN_CONF:
                                forearm = np.linalg.norm([wx - ex, wy - ey])
                        side = float(np.clip(HAND_CROP_SCALE * forearm, HAND_CROP_MIN, HAND_CROP_MAX))
                        roi = _square_roi((wx, wy), side, w, h)
                        if roi is not None:
                            crops.append(roi)

                for (x0, y0, x1, y1) in crops:
                    roi_img = frame[y0:y1, x0:x1]
                    if roi_img.size == 0:
                        continue
                    res_hand = hand_model(roi_img, conf=HAND_PRED_CONF, verbose=False)[0]
                    hand_kpts = getattr(res_hand, 'keypoints', None)
                    if hand_kpts is not None:
                        for hkpt in hand_kpts.data:  # (N,3) where N may be 21
                            # shift crop-local coordinates back into frame coordinates
                            hk = hkpt.detach().cpu().numpy().copy()
                            hk[:, 0] += x0
                            hk[:, 1] += y0
                            draw_hand_skeleton(canvas, hk, min_conf=HAND_MIN_CONF)
            else:
                # full-frame hand inference
                res_hand = hand_model(frame, conf=HAND_PRED_CONF, verbose=False)[0]
                hand_kpts = getattr(res_hand, 'keypoints', None)
                if hand_kpts is not None:
                    for hkpt in hand_kpts.data:  # (N,3)
                        draw_hand_skeleton(canvas, hkpt, min_conf=HAND_MIN_CONF)

        out = ann.result()

        # Add source indicator text
        text = f"Source: {source_mode.upper()}"
        cv2.putText(out, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

        # send to virtual camera (pyvirtualcam expects RGB order)
        if HAVE_PYVIRTUALCAM and cam is not None:
            try:
                rgb = cv2.cvtColor(out, cv2.COLOR_BGR2RGB)
                cam.send(rgb)
                cam.sleep_until_next_frame()
            except Exception as e:
                print('pyvirtualcam send failed:', e)
                # stop sending for the rest of the run; the preview keeps working
                HAVE_PYVIRTUALCAM = False

        # local preview
        cv2.imshow("pose", out)

        # Handle keyboard input
        key = cv2.waitKey(1) & 0xFF
        if key == 27:  # ESC
            break
        elif key == ord('1'):
            source_mode = 'camera'
            print("Switched to camera")
        elif key == ord('2'):
            if HAVE_MSS:
                source_mode = 'desktop'
                print("Switched to desktop screen")
            else:
                print("Desktop capture not available. Install mss: pip install mss")
finally:
    cap.release()
    if cam is not None:
        cam.close()
    if sct is not None:
        sct.close()
    cv2.destroyAllWindows()


pyvirtualcam opened: OBS Virtual Camera
Controls:
  Press '1' to switch to camera
  Press '2' to switch to desktop screen
  Press 'ESC' to quit
