In [1]:
import torch
print(torch.__version__)

2.5.1+cu121


In [2]:
import numpy
numpy.version.version

'1.26.4'

In [3]:
from ultralytics import YOLO
import cv2

# Load the correct YOLOv8 model for pose estimation
model = YOLO("yolo11n-pose.pt")  # You can replace with yolov8s-pose.pt for better accuracy

# Open the webcam
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Cannot open webcam.")
    exit()

while True:
    ret, frame = cap.read()
    if not ret:
        print("Error: Can't read frame from camera.")
        break

    # Run pose estimation
    results = model(frame)

    # Plot the results (draw skeleton/keypoints)
    annotated_frame = results[0].plot()

    # Display the frame
    cv2.imshow("YOLOv8 Pose Estimation", annotated_frame)

    # Exit on pressing 'q'
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Cleanup
cap.release()
cv2.destroyAllWindows()



0: 480x640 1 person, 92.2ms
Speed: 7.7ms preprocess, 92.2ms inference, 203.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 19.4ms
Speed: 4.1ms preprocess, 19.4ms inference, 5.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 12.6ms
Speed: 2.4ms preprocess, 12.6ms inference, 5.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 11.2ms
Speed: 1.9ms preprocess, 11.2ms inference, 2.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 16.8ms
Speed: 2.5ms preprocess, 16.8ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 15.2ms
Speed: 2.9ms preprocess, 15.2ms inference, 4.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 12.7ms
Speed: 2.2ms preprocess, 12.7ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 11.8ms
Speed: 2.8ms preprocess, 11.8ms inference, 3.0ms postprocess per image at shape (1, 

In [None]:
import cv2
import mediapipe as mp

# Initialize MediaPipe Pose model
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Initialize MediaPipe drawing utilities (for visualizing keypoints)
mp_drawing = mp.solutions.drawing_utils

# Open the webcam (0 = default camera)
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Cannot open webcam.")
    exit()

while True:
    ret, frame = cap.read()
    if not ret:
        print("Error: Can't read frame from camera.")
        break

    # Flip the frame horizontally for a more natural interaction
    frame = cv2.flip(frame, 1)

    # Convert the BGR frame to RGB (MediaPipe uses RGB)
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Process the frame and get the pose landmarks
    results = pose.process(rgb_frame)

    # If landmarks are detected, draw them on the frame
    if results.pose_landmarks:
        mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)

        # Get the 3D coordinates of the left and right wrists (fists)
        left_wrist = results.pose_landmarks.landmark[15]
        right_wrist = results.pose_landmarks.landmark[16]

        # Calculate the circle radius based on the z-coordinate (depth)
        # The z-value is normalized, so we need to multiply it by a factor for better visualization
        left_radius = int(abs(left_wrist.z) * 100)  # Adjust the factor (100) for better scaling
        right_radius = int(abs(right_wrist.z) * 100)

        # Draw circles on the wrists (fists)
        left_center = (int(left_wrist.x * frame.shape[1]), int(left_wrist.y * frame.shape[0]))
        right_center = (int(right_wrist.x * frame.shape[1]), int(right_wrist.y * frame.shape[0]))

        # Draw the left wrist circle with varying radius based on z value
        cv2.circle(frame, left_center, left_radius, (0, 255, 255), 2)  # Yellow circle

        # Draw the right wrist circle with varying radius based on z value
        cv2.circle(frame, right_center, right_radius, (0, 255, 255), 2)  # Yellow circle

        # Optional: Draw the coordinates next to each wrist
        left_text = f"Left Wrist:  z={left_wrist.z:.2f}"
        right_text = f"Right Wrist:  z={right_wrist.z:.2f}"

        cv2.putText(frame, left_text, (left_center[0] + 10, left_center[1] - 10), 
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 0), 2, cv2.LINE_AA)
        cv2.putText(frame, right_text, (right_center[0] + 10, right_center[1] - 10), 
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 0), 2, cv2.LINE_AA)

    # Display the frame with the landmarks and circles
    cv2.imshow("MediaPipe Pose Estimation with Circle", frame)

    # Exit on pressing 'q'
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cap.release()
cv2.destroyAllWindows()


In [3]:
from ultralytics import YOLO
import cv2
import socket
import json
import numpy as np

model = YOLO("yolo11n-pose.pt")
cap = cv2.VideoCapture(0)

# Setup socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.bind(('localhost', 5050))
sock.listen(1)
print("Waiting for Godot to connect...")
conn, addr = sock.accept()
print(f"Connected by {addr}")

def normalize_point(x, y, width, height):
    norm_x = (x / width) * 2 - 1
    norm_y = (y / height) * 2 - 1
    return [norm_x, -norm_y]

# Store last known valid hand positions
last_left_hand = [0, 0]
last_right_hand = [0, 0]

def is_valid_point(x, y):
    return not (x == 0 and y == 0)

def get_hand_positions(keypoints, width, height):
    global last_left_hand, last_right_hand

    if len(keypoints) >= 11:
        lx, ly = keypoints[9][:2]
        rx, ry = keypoints[10][:2]

        if lx > 0 and ly > 0:
            last_left_hand = normalize_point(lx, ly, width, height)
        if rx > 0 and ry > 0:
            last_right_hand = normalize_point(rx, ry, width, height)

    return last_left_hand, last_right_hand

while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame = cv2.flip(frame, 1)
    height, width = frame.shape[:2]

    results = model(frame)
    keypoints = results[0].keypoints.xy[0].cpu().numpy() if len(results[0].keypoints.xy) > 0 else np.zeros((17, 2))

    left, right = get_hand_positions(keypoints, width, height)
    data = json.dumps({'left': left, 'right': right})

    try:
        conn.sendall(data.encode('utf-8') + b'\n')
    except BrokenPipeError:
        print("Godot disconnected.")
        break

    cv2.imshow("Pose", results[0].plot())
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

conn.close()
cap.release()
cv2.destroyAllWindows()


Waiting for Godot to connect...
Connected by ('127.0.0.1', 29695)

0: 480x640 1 person, 16.5ms
Speed: 4.8ms preprocess, 16.5ms inference, 2.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 9.0ms
Speed: 1.5ms preprocess, 9.0ms inference, 1.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 8.7ms
Speed: 1.8ms preprocess, 8.7ms inference, 1.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 11.3ms
Speed: 1.6ms preprocess, 11.3ms inference, 1.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 9.6ms
Speed: 2.0ms preprocess, 9.6ms inference, 3.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 11.9ms
Speed: 2.8ms preprocess, 11.9ms inference, 3.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 11.5ms
Speed: 1.8ms preprocess, 11.5ms inference, 2.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 10.5ms
Speed: 1.6ms preprocess, 10.5