In [2]:
import os
import cv2
import json
import xml.etree.ElementTree as ET
import mediapipe as mp
from tqdm import tqdm


In [3]:
# Paths (relative, so they resolve against the kernel's working directory).
# DATA_DIR is the dataset root; the extraction loop joins it with the split
# names "train", "test" and "val" and reads the files found in each.
DATA_DIR = "workspace/images"
# OUTPUT_FILE is where the collected landmark records are written as JSON.
OUTPUT_FILE = "hand_landmarks.json"

In [4]:
# Build the MediaPipe Hands detector once and reuse it for every image.
# Options, as described in the MediaPipe Hands documentation:
#   static_image_mode=True        -> treat inputs as unrelated still images
#   max_num_hands=2               -> report at most two hands per input
#   min_detection_confidence=0.5  -> minimum score for a detection to count
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=True,
    max_num_hands=2,
    min_detection_confidence=0.5,
)


In [None]:
# Extract hand landmarks for every annotated image and save them as JSON.
#
# For each split directory, every "<name>.jpg" that has a sibling "<name>.xml"
# is cropped to the first annotated bounding box, passed through the MediaPipe
# detector (`hands`), and one record per detected hand is appended to
# `results_data`. Landmark x/y are stored as pixel positions in the ORIGINAL
# (uncropped) image; z is stored exactly as MediaPipe reports it.


def _read_annotation(xml_path):
    """Read the first <object> of an annotation file.

    Returns:
        (label, [xmin, ymin, xmax, ymax]) with integer coordinates, or None
        when the file cannot be parsed or lacks the object/bndbox/coordinate
        elements. `label` is the text of <name> (None if the tag is absent).
    """
    try:
        root = ET.parse(xml_path).getroot()
    except ET.ParseError:
        return None

    obj = root.find("object")
    if obj is None:
        return None
    bndbox = obj.find("bndbox")
    if bndbox is None:
        return None

    try:
        # int(float(...)) also accepts coordinates written as "12.0".
        # findtext() returns None for a missing tag, which raises TypeError.
        coords = [
            int(float(bndbox.findtext(tag)))
            for tag in ("xmin", "ymin", "xmax", "ymax")
        ]
    except (TypeError, ValueError, OverflowError):
        return None

    return obj.findtext("name"), coords


def _landmarks_in_image_coords(hand_landmarks, x_offset, y_offset, crop_width, crop_height):
    """Convert one hand's landmarks from crop-relative to full-image pixels.

    MediaPipe reports x/y normalized by the width/height of the image it was
    given (here: the crop), so they are scaled by the crop size and shifted by
    the crop's top-left corner. z is passed through unchanged.
    """
    return [
        {
            "x": int(lm.x * crop_width) + x_offset,
            "y": int(lm.y * crop_height) + y_offset,
            "z": lm.z,
        }
        for lm in hand_landmarks.landmark
    ]


# Store results
results_data = []
skipped_annotations = 0

# Loop over train, test, val
for split in ["train", "test", "val"]:
    split_dir = os.path.join(DATA_DIR, split)
    # Sorted so the order of records in the JSON is the same on every run.
    for file in tqdm(sorted(os.listdir(split_dir)), desc=split):
        stem, ext = os.path.splitext(file)
        if ext != ".jpg":
            continue

        img_path = os.path.join(split_dir, file)
        xml_path = os.path.join(split_dir, stem + ".xml")

        # Check the annotation first: it is much cheaper than decoding the image.
        if not os.path.exists(xml_path):
            continue
        annotation = _read_annotation(xml_path)
        if annotation is None:
            skipped_annotations += 1
            continue
        label, (xmin, ymin, xmax, ymax) = annotation

        # Load image
        image = cv2.imread(img_path)
        if image is None:
            continue

        # Crop to the bounding box, clamped to the image. Without clamping, a
        # negative coordinate would wrap around in NumPy slicing and silently
        # select the wrong region.
        img_height, img_width = image.shape[:2]
        x0, y0 = max(xmin, 0), max(ymin, 0)
        x1, y1 = min(xmax, img_width), min(ymax, img_height)
        if x1 <= x0 or y1 <= y0:
            continue
        cropped = image[y0:y1, x0:x1]
        crop_height, crop_width = cropped.shape[:2]

        # Convert to RGB (OpenCV loads BGR)
        cropped_rgb = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)

        # Run MediaPipe
        result = hands.process(cropped_rgb)
        if not result.multi_hand_landmarks:
            continue

        for hand_landmarks in result.multi_hand_landmarks:
            results_data.append({
                "image": file,
                "split": split,
                "label": label,
                # The bbox is stored as annotated (not clamped).
                "bbox": [xmin, ymin, xmax, ymax],
                "landmarks": _landmarks_in_image_coords(
                    hand_landmarks, x0, y0, crop_width, crop_height
                ),
            })

# Save as JSON
with open(OUTPUT_FILE, "w") as f:
    json.dump(results_data, f, indent=2)

if skipped_annotations:
    print(f"Skipped {skipped_annotations} files with unusable annotations")
print(f"Done! Saved landmark data to {OUTPUT_FILE}")

 10%|█         | 5240/50932 [02:19<20:19, 37.46it/s]


KeyboardInterrupt: 