In [1]:
import os
import cv2
import numpy as np
from ultralytics import YOLO
import mediapipe as mp

In [2]:
# Input video and output array locations.
# Raw strings keep backslashes literal, so each separator is written once;
# doubling them inside r"..." would put two backslashes into the path.
video_path = r"D:\bha\app\mybacked\input.mp4"
output_X_path = r"D:\bha\app\mybacked\output\x.npy"
output_y_path = r"D:\bha\app\mybacked\output\y.npy"

In [3]:
def extract_frames(video_path, max_frames=None):
    """Decode frames from a video file and return them as a list.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        max_frames: Optional upper bound on the number of frames to decode.
            ``None`` (the default) reads until the capture stops returning
            frames.

    Returns:
        A list with the decoded frames in order. The list is empty when the
        video cannot be opened or yields no frames.
    """
    import warnings

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            # Keep the historical "empty list" result, but make the failure
            # visible instead of letting it pass silently.
            warnings.warn(
                f"Could not open video: {video_path!r}",
                RuntimeWarning,
                stacklevel=2,
            )
            return frames

        while max_frames is None or len(frames) < max_frames:
            success, frame = cap.read()
            if not success:
                break
            frames.append(frame)
    finally:
        # Release the capture even if reading raises part-way through.
        cap.release()
    return frames

In [4]:
# Decode the input video into an in-memory list of frames.
frames = extract_frames(video_path)

In [9]:
# Step 2: Load models
# YOLO detector built from the "yolov8n.pt" weights; used to find people in each frame.
yolo_model = YOLO("yolov8n.pt")
mp_pose = mp.solutions.pose
# MediaPipe pose estimator; it is run on the cropped person region of each frame.
# NOTE(review): static_image_mode=True presumably makes every input be treated as an
# independent image (no cross-frame tracking) — confirm against the MediaPipe docs.
pose_model = mp_pose.Pose(static_image_mode=True)

In [10]:
def get_main_person_box(img, model):
    """Return the largest class-0 (person) detection in ``img``.

    Args:
        img: Image passed straight to ``model``.
        model: Callable detector; the first element of its result must expose
            ``boxes.xywh`` and ``boxes.cls`` tensors.

    Returns:
        The ``[x, y, w, h]`` row of the person box with the greatest
        ``w * h`` area, or ``None`` when no class-0 box was detected.
    """
    detections = model(img)[0]
    xywh = detections.boxes.xywh.cpu().numpy()
    class_ids = detections.boxes.cls.cpu().numpy()

    # Keep only rows whose (truncated) class id is 0, i.e. "person".
    is_person = class_ids.astype(int) == 0
    if not is_person.any():
        return None

    candidates = xywh[is_person]
    areas = candidates[:, 2] * candidates[:, 3]

    # argmax picks the first maximum, so ties resolve to the earliest detection.
    return candidates[np.argmax(areas)]


In [11]:
def crop_person(img, box):
    """Crop a centre-format bounding box out of an image.

    Args:
        img: Array whose first two dimensions are (height, width).
        box: ``(x, y, w, h)`` with ``x, y`` the box centre and ``w, h`` its
            size, in pixels.

    Returns:
        The slice ``img[y1:y2, x1:x2]`` with all four corners clamped to the
        image. The result is empty when the box lies entirely outside the
        image or has no area after clamping.
    """
    cx, cy, w, h = box
    h_img, w_img = img.shape[:2]

    def _clamp(value, upper):
        # Clamp to [0, upper]. A negative end index would otherwise be
        # read by NumPy as "from the end" and return the wrong region.
        return min(max(0, value), upper)

    x1 = _clamp(int(cx - w / 2), w_img)
    y1 = _clamp(int(cy - h / 2), h_img)
    x2 = _clamp(int(cx + w / 2), w_img)
    y2 = _clamp(int(cy + h / 2), h_img)

    return img[y1:y2, x1:x2]

In [12]:
def extract_pose_sequence(frames, yolo_model, pose_model, seq_len=20):
    """Build a fixed-length sequence of pose keypoints for the main person.

    For each frame the largest person box is located with
    ``get_main_person_box``, cropped with ``crop_person``, converted from BGR
    to RGB and passed to ``pose_model.process``. Every detected landmark
    contributes ``x, y, z, visibility`` as reported by the pose model for the
    cropped image.

    Args:
        frames: Iterable of BGR images.
        yolo_model: Detector forwarded to ``get_main_person_box``.
        pose_model: Object with a ``process(rgb_image)`` method whose result
            has a ``pose_landmarks`` attribute.
        seq_len: Number of frames in the returned sequence (default 20).

    Returns:
        A list of exactly ``seq_len`` keypoint lists. Frames without a person,
        with an empty crop, or without detected landmarks are all zeros
        (33 * 4 values), as are the frames padded on when ``frames`` is
        shorter than ``seq_len``.
    """
    from itertools import islice

    num_landmarks = 33  # size of the zero-filled vector used when no pose is found
    values_per_landmark = 4  # x, y, z, visibility
    feature_len = num_landmarks * values_per_landmark

    def _blank():
        # A fresh list each time so no two frames share one object.
        return [0] * feature_len

    sequence = []

    # Only the first seq_len frames can appear in the result, so the models
    # are not run on the rest.
    for img in islice(frames, seq_len):
        keypoints = _blank()

        box = get_main_person_box(img, yolo_model)
        if box is not None:
            cropped = crop_person(img, box)
            # An empty crop would make cv2.cvtColor raise; treat it as "no pose".
            if cropped.size > 0:
                img_rgb = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
                results = pose_model.process(img_rgb)
                if results.pose_landmarks:
                    keypoints = [
                        value
                        for lm in results.pose_landmarks.landmark
                        for value in (lm.x, lm.y, lm.z, lm.visibility)
                    ]

        sequence.append(keypoints)

    # Pad short inputs with zero frames up to seq_len.
    sequence.extend(_blank() for _ in range(seq_len - len(sequence)))
    return sequence


In [13]:
# Run person detection + pose estimation over the frames. The result is a list of
# 20 frames, each a list of 33 * 4 keypoint values (all zeros where nothing was found).
sequence = extract_pose_sequence(frames, yolo_model, pose_model)
    


0: 384x640 (no detections), 1239.2ms
Speed: 311.8ms preprocess, 1239.2ms inference, 61.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 127.4ms
Speed: 73.6ms preprocess, 127.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 87.5ms
Speed: 7.4ms preprocess, 87.5ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 92.9ms
Speed: 6.7ms preprocess, 92.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 umbrella, 115.3ms
Speed: 6.1ms preprocess, 115.3ms inference, 47.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 87.1ms
Speed: 5.8ms preprocess, 87.1ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 94.3ms
Speed: 5.1ms preprocess, 94.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 88.9ms
Speed: 6.0ms preprocess, 

KeyboardInterrupt: 