## 1️⃣ Extracting Landmarks from a Single Video (with World Landmarks)

### Method 1: Using the Command Line (CLI)

In [None]:
# Convert a single video to a .pose file with the `video_to_pose` CLI
# -i is the input video and -o the output .pose file; replace both placeholder paths with your own
!video_to_pose --format mediapipe -i "path/to/your/video.mp4" -o "output.pose"

In [None]:
# With additional settings for higher accuracy
# --additional-config takes a comma-separated list of key=value options; the keys used here
# (model_complexity, smooth_landmarks, refine_face_landmarks) mirror the holistic config
# dictionary used in the Python method below
!video_to_pose --format mediapipe -i "path/to/your/video.mp4" -o "output_hq.pose" \
  --additional-config="model_complexity=2,smooth_landmarks=false,refine_face_landmarks=true"

### Method 2: Using Python Directly

In [2]:
import os
from pose_format import Pose
from pose_format.utils.holistic import load_holistic
from simple_video_utils.metadata import video_metadata
from simple_video_utils.frames import read_frames_exact

In [7]:

def extract_pose_from_video(video_path: str, output_path: str, model_complexity: int = 1):
    """
    Extract landmarks from a video and save them as a .pose file.

    The file is written to ``<output_path>/<video file name without extension>.pose``.

    Parameters
    ----------
    video_path : str
        Path to the input video
    output_path : str
        Directory to save the output .pose file (created if it does not exist)
    model_complexity : int
        Model complexity (0, 1, or 2). 2 = highest accuracy but slower

    Returns
    -------
    The pose object returned by ``load_holistic`` (also written to disk).
    """
    print(f"üé¨ Loading video: {video_path}")

    # Resolve the destination and create its directory up front, so a bad
    # output location fails before the (slow) landmark extraction runs.
    name, _ = os.path.splitext(os.path.basename(video_path))
    pose_file = os.path.join(output_path, f"{name}.pose")
    if output_path:  # "" means the current directory; os.makedirs("") would raise
        os.makedirs(output_path, exist_ok=True)

    # Read video metadata
    metadata = video_metadata(video_path)
    width = metadata.width
    height = metadata.height
    fps = metadata.fps

    print(f"üìê Dimensions: {width}x{height}")
    print(f"üéûÔ∏è Frame rate: {fps} FPS")

    # Read video frames
    frames = read_frames_exact(video_path)

    # MediaPipe Holistic settings
    holistic_config = {
        'model_complexity': model_complexity,
        'smooth_landmarks': True,
        'refine_face_landmarks': True  # for iris landmarks
    }

    print("‚öôÔ∏è Extracting landmarks...")

    # Extract pose
    pose = load_holistic(
        frames,
        fps=fps,
        width=width,
        height=height,
        progress=True,  # Show progress bar
        additional_holistic_config=holistic_config
    )

    # Save result; report the actual file path rather than just the directory
    print(f"üíæ Saving to: {pose_file}")
    with open(pose_file, 'wb') as f:
        pose.write(f)

    print("‚úÖ Done!")
    return pose

In [8]:
# Example: extract the landmarks of a single video.
# Point both constants at your own files; OUTPUT_PATH is a directory, not a file name.
VIDEO_PATH = "../data/original_videos/example.mp4"
OUTPUT_PATH = "../data/pose_files/"

# model_complexity=2 is the most accurate (and slowest) setting
pose = extract_pose_from_video(
    video_path=VIDEO_PATH,
    output_path=OUTPUT_PATH,
    model_complexity=2,
)

üé¨ Loading video: ../data/original_videos/example.mp4
üìê Dimensions: 640x480
üéûÔ∏è Frame rate: 24.166666666666668 FPS
‚öôÔ∏è Extracting landmarks...


133it [00:40,  3.28it/s]

üíæ Saving to: ../data/pose_files/
‚úÖ Done!





## 2️⃣ Extracting Landmarks from a Folder of Videos (without World Landmarks)

### Method 1: Using the CLI

In [None]:
# Batch process a folder with multiple workers (parallel processing)
# Replace "/path/to/videos" with your own folder; --num-workers sets how many workers run in parallel
# NOTE(review): --recursive presumably also descends into sub-folders — confirm with `videos_to_poses --help`
!videos_to_poses --format mediapipe --directory "/path/to/videos" --num-workers 4 --recursive

### Method 2: Using Python

In [3]:
from pathlib import Path
import cv2
import mediapipe as mp

from pose_format import Pose
from pose_format.utils.holistic import load_holistic
from pose_format.utils.openpose import load_openpose_directory

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [4]:
# MediaPipe Holistic solution module
mp_holistic = mp.solutions.holistic

# Unique landmark indices that appear in any face-mesh contour connection,
# in ascending numeric order, stored as strings.
FACEMESH_CONTOURS_POINTS = list(map(str, sorted({
    point
    for connection in mp_holistic.FACEMESH_CONTOURS
    for point in connection
})))

# File extensions (lower-case, with leading dot) treated as videos
SUPPORTED_VIDEO_FORMATS = [
    ".mp4",
    ".mov",
    ".avi",
    ".mkv",
    ".flv",
    ".wmv",
    ".webm",
]


In [5]:
def load_video_frames(cap: cv2.VideoCapture):
    """Generator: yields frames as RGB numpy arrays from an open VideoCapture.

    Frames are read until ``cap.read()`` reports failure (normally end of
    stream). The capture is released once iteration has started and the
    generator ends for any reason: exhaustion, early ``close()`` by the
    consumer, or an exception raised mid-iteration. A generator that is never
    iterated does not release the capture.
    """
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV decodes to BGR; convert so consumers receive RGB
            yield cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    finally:
        # Runs on normal exhaustion as well as on early exit / errors
        cap.release()

def find_videos(directory: str, recursive: bool = False):
    """
    Return a sorted list of video files found in ``directory``.

    A file counts as a video when its name ends with one of
    SUPPORTED_VIDEO_FORMATS. The comparison is case-insensitive, so
    ``clip.MP4`` is found on case-sensitive filesystems as well.

    Parameters
    ----------
    directory : str
        Folder to search.
    recursive : bool
        If True, sub-folders are searched too.

    Returns
    -------
    list of pathlib.Path
        Matching files in sorted order; empty if nothing matches.
    """
    root = Path(directory)
    # Walk the tree once and filter, instead of one glob per extension
    candidates = root.rglob("*") if recursive else root.glob("*")
    extensions = tuple(ext.lower() for ext in SUPPORTED_VIDEO_FORMATS)
    return sorted(
        path
        for path in candidates
        # is_file() drops directories that happen to be named like a video
        if path.name.lower().endswith(extensions) and path.is_file()
    )

In [6]:
def process_single_video(video_path: Path, output_dir: Path, model_complexity: int = 1, reduce: bool = False):
    """
    Process a single video and save the result in the specified folder.

    Opens the video with OpenCV, streams its frames through ``load_holistic``
    and writes ``<output_dir>/<video stem>.pose``.

    Parameters
    ----------
    video_path : Path
        Video to process (a plain string path is accepted as well).
    output_dir : Path
        Folder for the .pose file; created if missing.
    model_complexity : int
        Passed to the holistic model as 'model_complexity'.
    reduce : bool
        If True, FACE_LANDMARKS is restricted to FACEMESH_CONTOURS_POINTS.

    Returns
    -------
    str
        A status line beginning with the done / skipped / failed marker.
        Exceptions are caught and reported through this string, never raised.
    """
    video_path = Path(video_path)
    cap = None
    tmp_path = None
    try:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / f"{video_path.stem}.pose"

        # Skip if file exists
        if output_path.exists():
            return f"‚è≠Ô∏è Skipped (exists): {video_path.name}"

        # Open with cv2.VideoCapture and stream frames to load_holistic
        cap = cv2.VideoCapture(str(video_path))
        if not cap.isOpened():
            return f"‚ùå Failed {video_path.name}: cannot open video"

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Fall back to 30 FPS when the container reports 0
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0

        additional_holistic_config = {
            'model_complexity': model_complexity,
            'smooth_landmarks': True,
            'refine_face_landmarks': True
        }

        pose = load_holistic(
            load_video_frames(cap),
            fps=fps,
            width=width,
            height=height,
            progress=False,
            additional_holistic_config=additional_holistic_config
        )

        # Keep only these four components; with reduce=True the face is
        # additionally limited to the contour points. One call does both.
        kept_components = ["POSE_LANDMARKS", "FACE_LANDMARKS", "LEFT_HAND_LANDMARKS", "RIGHT_HAND_LANDMARKS"]
        if reduce:
            pose = pose.get_components(kept_components, {"FACE_LANDMARKS": FACEMESH_CONTOURS_POINTS})
        else:
            pose = pose.get_components(kept_components)

        # Write to a temporary file and rename it into place, so an interrupted
        # or failed write never leaves a partial .pose file that later runs
        # would skip as "exists".
        tmp_path = output_path.with_name(output_path.name + ".tmp")
        with open(tmp_path, "wb") as f:
            pose.write(f)
        tmp_path.replace(output_path)

        return f"‚úÖ Done: {video_path.name}"

    except Exception as e:
        return f"‚ùå Failed {video_path.name}: {e}"

    finally:
        # Release the capture on every path; releasing twice is harmless
        if cap is not None:
            cap.release()
        # Remove a leftover temporary file after a failed write
        if tmp_path is not None and tmp_path.exists():
            tmp_path.unlink()

In [7]:

def batch_process_videos(directory: str, output_dir: str, recursive: bool = False, num_workers: int = 1, model_complexity: int = 1, reduce: bool = False):
    """
    Batch process a folder of videos in parallel.

    Finds the videos with ``find_videos``, runs ``process_single_video`` on
    each one in a thread pool, prints one status line per video and finally a
    summary of how many succeeded, were skipped, or failed.

    Parameters
    ----------
    directory : str
        Folder containing the videos.
    output_dir : str
        Folder for the .pose files; created if missing.
    recursive : bool
        Passed to ``find_videos``.
    num_workers : int
        Number of worker threads.
    model_complexity : int
        Passed to ``process_single_video``.
    reduce : bool
        Passed to ``process_single_video``.

    Returns
    -------
    None
    """
    videos = find_videos(directory, recursive)
    print(f"üìÅ Found {len(videos)} videos in {directory}")
    if not videos:
        print("‚ö†Ô∏è No videos found.")
        return

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    results = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Map each future back to its video so a failure can be attributed
        futures = {executor.submit(process_single_video, v, output_dir, model_complexity, reduce): v for v in videos}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            try:
                result = future.result()
            except Exception as e:
                # One unexpected error must not abort the rest of the batch
                result = f"‚ùå Failed {Path(futures[future]).name}: {e}"
            results.append(result)
            # tqdm.write keeps status lines from breaking up the progress bar
            tqdm.write(result)

    # Status lines are classified by their leading marker
    success = sum(1 for r in results if r.startswith("‚úÖ"))
    skipped = sum(1 for r in results if r.startswith("‚è≠Ô∏è"))
    failed = sum(1 for r in results if r.startswith("‚ùå"))

    print("\nüìä Statistics:")
    print(f"   ‚úÖ Success: {success}")
    print(f"   ‚è≠Ô∏è Skipped: {skipped}")
    print(f"   ‚ùå Failed: {failed}")


In [9]:
# Input folder with the videos and output folder for the .pose files
VIDEOS_DIRECTORY = "../data/original_videos"
OUTPUT_DIR = "../data/pose_files"

# Videos whose .pose file already exists in OUTPUT_DIR are skipped, not re-processed
batch_process_videos(
    directory=VIDEOS_DIRECTORY,
    output_dir=OUTPUT_DIR,
    recursive=True,        # also search sub-folders
    num_workers=2,         # number of worker threads
    model_complexity=2,
    reduce=True            # keep only the face-contour points of FACE_LANDMARKS
)

üìÅ Found 14 videos in ../data/original_videos


Processing:   0%|          | 0/14 [00:00<?, ?it/s]

‚è≠Ô∏è Skipped (exists): 554365435.mp4
‚è≠Ô∏è Skipped (exists): SGB_FSS_rotation_left.mp4
‚è≠Ô∏è Skipped (exists): SGB_FSS_original.mp4
‚è≠Ô∏è Skipped (exists): SGB_FSS_rotation_right.mp4
‚è≠Ô∏è Skipped (exists): SGB_FSS_skew_left .mp4
‚è≠Ô∏è Skipped (exists): SGB_FSS_skew_right.mp4
‚è≠Ô∏è Skipped (exists): SGB_FSS_zoomin.mp4
‚è≠Ô∏è Skipped (exists): SGB_FSS_zoomout .mp4


Processing:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 8/14 [00:00<00:00, 69.37it/s]

‚è≠Ô∏è Skipped (exists): ÿ≠ÿ®.mp4
‚è≠Ô∏è Skipped (exists): ÿ≠ŸÖŸÑ.mp4
‚è≠Ô∏è Skipped (exists): ÿ≤ŸÅÿßŸÅ.mp4
‚è≠Ô∏è Skipped (exists): ÿ∑ŸÑÿßŸÇ.mp4
‚è≠Ô∏è Skipped (exists): ŸÇÿ±Ÿäÿ®.mp4


Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [00:52<00:00,  3.72s/it]

‚úÖ Done: SGB_FSS.mp4

üìä Statistics:
   ‚úÖ Success: 1
   ‚è≠Ô∏è Skipped: 13
   ‚ùå Failed: 0





## 3️⃣ Understanding the Extracted Data Structure

### Inspecting what MediaPipe Holistic extracted

In [19]:
# to understand the extracted data
from pose_format import Pose

def analyze_pose_file(pose_path: str):
    """
    Load a .pose file, print a summary of its contents, and return it.

    The summary covers the header dimensions, the FPS and data shape of the
    body, and for every component its name, point count, limb count and format.

    Parameters
    ----------
    pose_path : str
        Path to the .pose file to read.

    Returns
    -------
    The object produced by ``Pose.read``.
    """
    with open(pose_path, 'rb') as pose_file:
        raw_bytes = pose_file.read()
    loaded = Pose.read(raw_bytes)

    rule = "=" * 60
    print(rule)
    print("üìä Pose File Information")
    print(rule)

    # Header: dimensions
    header = loaded.header
    print("\nüìê Dimensions:")
    print(f"   Width: {header.dimensions.width}")
    print(f"   Height: {header.dimensions.height}")
    print(f"   Depth: {header.dimensions.depth}")

    # Body: frame rate and raw array shape
    body = loaded.body
    print("\nüéûÔ∏è Body Information:")
    print(f"   FPS: {body.fps} FPS")
    print(f"   Data Shape: {body.data.shape}")
    print("   (Frames, People, Points, Dimensions)")

    n_frames, n_people, n_points, n_dims = body.data.shape
    print(f"\n   üìπ Number of Frames: {n_frames}")
    print(f"   üë• Number of People: {n_people}")
    print(f"   üìç Total Number of Points: {n_points}")
    print(f"   üìè Number of Dimensions: {n_dims} (X, Y, Z)")

    # Components: one numbered entry each, then the summed point count
    print("\nüß© Components:")
    point_total = 0
    for number, component in enumerate(header.components, start=1):
        n_component_points = len(component.points)
        print(f"\n   {number}. {component.name}")
        print(f"      Points: {n_component_points}")
        print(f"      Limbs: {len(component.limbs)}")
        print(f"      Format: {component.format}")
        point_total += n_component_points

    print(f"\n   üìç Total Points: {point_total}")

    return loaded

In [20]:
# Analyze a pose file
# Change the path if needed
# NOTE(review): no cell shown in this notebook writes "example.3.reduce.pose" — the extraction
# cells name their output "<video stem>.pose" — so confirm this file exists before running
FILE_PATH = "../data/pose_files/example.3.reduce.pose"

# Keeps the loaded pose in `pose` for the cells that follow
pose = analyze_pose_file(FILE_PATH)

üìä Pose File Information

üìê Dimensions:
   Width: 640
   Height: 480
   Depth: 0

üéûÔ∏è Body Information:
   FPS: 24.0 FPS
   Data Shape: (133, 1, 203, 3)
   (Frames, People, Points, Dimensions)

   üìπ Number of Frames: 133
   üë• Number of People: 1
   üìç Total Number of Points: 203
   üìè Number of Dimensions: 3 (X, Y, Z)

üß© Components:

   1. POSE_LANDMARKS
      Points: 33
      Limbs: 35
      Format: XYZC

   2. FACE_LANDMARKS
      Points: 128
      Limbs: 196
      Format: XYZC

   3. LEFT_HAND_LANDMARKS
      Points: 21
      Limbs: 21
      Format: XYZC

   4. RIGHT_HAND_LANDMARKS
      Points: 21
      Limbs: 21
      Format: XYZC

   üìç Total Points: 203


## 4️⃣ Accessing Specific Landmarks

In [4]:
import numpy as np

def get_specific_landmarks(pose: Pose):
    """
    Demonstrate the ways to pull specific landmarks out of a pose.

    Prints the data shapes of a body-only and a hands-only selection, the
    index of the nose landmark and the shape of its coordinates, and the
    right-wrist coordinates of the first five frames.

    Returns
    -------
    tuple
        ``(pose.body.data, pose.body.confidence)``
    """
    coords = pose.body.data          # indexed below as (frames, people, points, dims)
    scores = pose.body.confidence    # confidence values belonging to `coords`

    # Selecting whole components: body only
    body_only = pose.get_components(["POSE_LANDMARKS"])
    print(f"Body landmarks only: {body_only.body.data.shape}")

    # Selecting whole components: both hands
    hand_components = ["LEFT_HAND_LANDMARKS", "RIGHT_HAND_LANDMARKS"]
    hands_only = pose.get_components(hand_components)
    print(f"Hand landmarks only: {hands_only.body.data.shape}")

    # Looking up a single point by component and point name
    header = pose.header
    nose_idx = header.get_point_index("POSE_LANDMARKS", "NOSE")
    print(f"\nIndex of the nose landmark: {nose_idx}")

    # Nose of the first person across all frames
    nose_track = coords[:, 0, nose_idx, :]
    print(f"Shape of nose coordinates: {nose_track.shape}")

    # Same lookup for the right wrist, showing the first frames only
    wrist_idx = header.get_point_index("POSE_LANDMARKS", "RIGHT_WRIST")
    wrist_track = coords[:, 0, wrist_idx, :]
    print("\nRight wrist coordinates in the first 5 frames:")
    print(wrist_track[:5])

    return coords, scores

In [5]:
# Example usage
# `pose` must already be loaded by an earlier cell (e.g. via analyze_pose_file)
data, confidence = get_specific_landmarks(pose)

Body landmarks only: (42, 1, 33, 3)
Hand landmarks only: (42, 1, 42, 3)

Index of the nose landmark: 0
Shape of nose coordinates: (42, 3)

Right wrist coordinates in the first 5 frames:
[[762.5533447265625 833.9669799804688 -0.924421489238739]
 [750.7291870117188 827.2061157226562 -0.9257319569587708]
 [736.3405151367188 819.6056518554688 -0.932299017906189]
 [726.1067504882812 815.055908203125 -0.9344683885574341]
 [723.3048706054688 811.894775390625 -0.8845396041870117]]


## 5️⃣ List of MediaPipe Holistic Points

In [6]:
# Body points (33 points), grouped by body region
BODY_POINTS = [
    # Head
    "NOSE",
    "LEFT_EYE_INNER", "LEFT_EYE", "LEFT_EYE_OUTER",
    "RIGHT_EYE_INNER", "RIGHT_EYE", "RIGHT_EYE_OUTER",
    "LEFT_EAR", "RIGHT_EAR",
    "MOUTH_LEFT", "MOUTH_RIGHT",
    # Arms
    "LEFT_SHOULDER", "RIGHT_SHOULDER",
    "LEFT_ELBOW", "RIGHT_ELBOW",
    "LEFT_WRIST", "RIGHT_WRIST",
    # Hand points of the body model
    "LEFT_PINKY", "RIGHT_PINKY",
    "LEFT_INDEX", "RIGHT_INDEX",
    "LEFT_THUMB", "RIGHT_THUMB",
    # Legs
    "LEFT_HIP", "RIGHT_HIP",
    "LEFT_KNEE", "RIGHT_KNEE",
    "LEFT_ANKLE", "RIGHT_ANKLE",
    # Feet
    "LEFT_HEEL", "RIGHT_HEEL",
    "LEFT_FOOT_INDEX", "RIGHT_FOOT_INDEX",
]

# Hand points (21 points per hand): the wrist, then one row per finger
HAND_POINTS = [
    "WRIST",
    "THUMB_CMC", "THUMB_MCP", "THUMB_IP", "THUMB_TIP",
    "INDEX_FINGER_MCP", "INDEX_FINGER_PIP", "INDEX_FINGER_DIP", "INDEX_FINGER_TIP",
    "MIDDLE_FINGER_MCP", "MIDDLE_FINGER_PIP", "MIDDLE_FINGER_DIP", "MIDDLE_FINGER_TIP",
    "RING_FINGER_MCP", "RING_FINGER_PIP", "RING_FINGER_DIP", "RING_FINGER_TIP",
    "PINKY_MCP", "PINKY_PIP", "PINKY_DIP", "PINKY_TIP",
]

# Face points (468 or 478 points with iris) are not listed by name
print(f"Body points: {len(BODY_POINTS)}")
print(f"Hand points (one hand): {len(HAND_POINTS)}")
print("Face points: 468 (or 478 with iris)")
# NOTE(review): the trailing "+ 33" in the total is presumably the world-landmark copy of the body points — confirm
print("\nTotal: 33 + 468 + 21 + 21 + 33 = 576 (without iris) or 586 (with iris)")

Body points: 33
Hand points (one hand): 21
Face points: 468 (or 478 with iris)

Total: 33 + 468 + 21 + 21 + 33 = 576 (without iris) or 586 (with iris)
