In [63]:
def get_main_person_box(img, model):
    """Return the largest-area person detection in ``img``, or ``None``.

    ``model`` is called on the image and only its first result is used.
    Detections whose class id is 0 (person) are kept, and the one with the
    greatest ``w * h`` is returned as an ``[x, y, w, h]`` row taken from
    ``boxes.xywh``. If several boxes share the largest area, the one listed
    first wins. ``None`` is returned when no person is detected.
    """
    detection = model(img)[0]
    xywh = detection.boxes.xywh.cpu().numpy()
    class_ids = detection.boxes.cls.cpu().numpy()

    # Keep only the rows labelled as class 0 (person).
    candidates = [row for row, cls_id in zip(xywh, class_ids) if int(cls_id) == 0]
    if len(candidates) == 0:
        return None

    # The main athlete is taken to be the box covering the most area.
    return max(candidates, key=lambda row: row[2] * row[3])

def crop_person(img, box):
    """Crop the area described by a centre-format bounding box from ``img``.

    Args:
        img: Image array indexed as ``img[row, col, ...]``; only its first
            two dimensions (height, width) are used.
        box: ``(x, y, w, h)`` where ``(x, y)`` is the box centre and
            ``(w, h)`` its width and height, in pixels.

    Returns:
        The slice ``img[y1:y2, x1:x2]`` with all four corners clamped to the
        image bounds. The result has zero size when the box lies completely
        outside the image.
    """
    x, y, w, h = box
    h_img, w_img = img.shape[:2]

    def _clamp(value, upper):
        # Truncate to int, then clamp into [0, upper]. Clamping BOTH ends
        # matters: a negative x2/y2 would otherwise be read by NumPy as a
        # from-the-end index and return unrelated pixels.
        return min(max(int(value), 0), upper)

    x1, x2 = _clamp(x - w / 2, w_img), _clamp(x + w / 2, w_img)
    y1, y2 = _clamp(y - h / 2, h_img), _clamp(y + h / 2, h_img)

    return img[y1:y2, x1:x2]

In [64]:
import os
import cv2
import numpy as np
from ultralytics import YOLO
import mediapipe as mp
import os

In [65]:
def extract_frames(video_path, num_frames=20):
    """Extract up to `num_frames` evenly spaced frames from the video.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Desired number of frames. Capped at the frame count the
            video reports, so short videos yield fewer frames.

    Returns:
        List of frames as returned by ``cap.read()``. Frames that fail to
        decode are skipped, so the list may be shorter than requested. An
        empty list is returned (with a warning) when the video cannot be
        opened, and also when the reported frame count or `num_frames` is
        not positive.
    """
    import warnings

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            warnings.warn(f"Could not open video: {video_path!r}")
            return []

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        num_frames = min(num_frames, total_frames)  # fallback if video is too short
        if num_frames <= 0:
            # np.linspace rejects a negative sample count; nothing to sample anyway.
            return []

        # Evenly spaced frame indices from the first to the last frame.
        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)

        frames = []
        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            success, frame = cap.read()
            if success:
                frames.append(frame)
        return frames
    finally:
        # Always release the capture handle, even if seeking/decoding raised.
        cap.release()

In [66]:
# MediaPipe pose estimator. static_image_mode=True makes it treat every input
# as an independent image, which suits frames sampled far apart in the video.
mp_pose = mp.solutions.pose
pose_model = mp_pose.Pose(static_image_mode=True)

# YOLO detector loaded from the "yolov8n.pt" weights; used to find person boxes.
yolo_model = YOLO("yolov8n.pt")

In [67]:
# Input clip; extract_frames samples its default number of evenly spaced frames.
VIDEO_PATH = "input.mp4"
frames = extract_frames(VIDEO_PATH)


In [68]:
def compute_joint_angles(landmarks):
    """Placeholder joint-angle extractor: always returns six zeros.

    ``landmarks`` is currently ignored.
    NOTE(review): the real angle computation is not implemented yet; the
    six-element length is what the downstream feature vector expects.
    """
    # landmarks: shape (33, 4)
    n_angles = 6
    return np.zeros(n_angles)

In [76]:
# Per-frame feature layout (237 values in total):
#   33 landmarks x (x, y, z, visibility) = 132 values
#   6 joint angles
#   33 landmarks x (dx, dy, dz) relative to the previous valid frame = 99 values
NUM_LANDMARKS = 33
NUM_JOINT_ANGLES = 6
NUM_VELOCITIES = NUM_LANDMARKS * 3
FEATURE_DIM = NUM_LANDMARKS * 4 + NUM_JOINT_ANGLES + NUM_VELOCITIES


def _detect_pose_keypoints(img):
    """Return the (33, 4) landmark array for the main person, or None.

    None is returned when no person is detected, the crop is empty, MediaPipe
    finds no pose, or the landmark array has an unexpected shape.
    """
    box = get_main_person_box(img, yolo_model)
    if box is None:
        return None

    # Crop the main person and feed the crop to MediaPipe.
    cropped = crop_person(img, box)
    if cropped.size == 0:
        # cv2.cvtColor raises on an empty image, so bail out early.
        return None

    results = pose_model.process(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
    if not results.pose_landmarks:
        return None

    # NOTE(review): MediaPipe landmarks are presumably normalised to the crop,
    # not the full frame, so velocities mix coordinate frames - confirm.
    kps = np.array(
        [[lm.x, lm.y, lm.z, lm.visibility] for lm in results.pose_landmarks.landmark]
    )
    if kps.shape != (NUM_LANDMARKS, 4):
        return None
    return kps


sequence = []
prev_kps = None
for img in frames:
    kps = _detect_pose_keypoints(img)

    if kps is None:
        # Pad with zeros for EVERY kind of failure so that `sequence` keeps one
        # entry per frame and stays aligned with `frames`.
        sequence.append(np.zeros(FEATURE_DIM))
        continue

    joint_angles = compute_joint_angles(kps)

    # Velocity = change in (x, y, z) since the last frame that had a valid pose.
    if prev_kps is None:
        velocities = np.zeros(NUM_VELOCITIES)
    else:
        velocities = (kps[:, :3] - prev_kps[:, :3]).flatten()
    prev_kps = kps.copy()

    feat_vec = np.concatenate([kps.flatten(), joint_angles, velocities])  # (237,)
    sequence.append(feat_vec)


0: 384x640 (no detections), 104.9ms
Speed: 2.8ms preprocess, 104.9ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 baseball bat, 98.5ms
Speed: 4.2ms preprocess, 98.5ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 baseball bat, 81.5ms
Speed: 3.7ms preprocess, 81.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 baseball bat, 74.2ms
Speed: 3.0ms preprocess, 74.2ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 baseball bat, 115.3ms
Speed: 3.6ms preprocess, 115.3ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 136.2ms
Speed: 4.6ms preprocess, 136.2ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 149.8ms
Speed: 4.3ms preprocess, 149.8ms inference, 2.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 157.8ms
Sp

In [77]:
# Sanity check: print the length of every per-frame feature vector.
for vec_len in map(len, sequence):
    print(vec_len)

237
237
237
237
237
237
237
237
237
237
237
237
237
237
237
237
