In [1]:
!pip install mediapipe opencv-python

Collecting mediapipe
  Downloading mediapipe-0.10.11-cp311-cp311-win_amd64.whl.metadata (9.8 kB)
Collecting absl-py (from mediapipe)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting flatbuffers>=2.0 (from mediapipe)
  Downloading flatbuffers-24.3.7-py2.py3-none-any.whl.metadata (849 bytes)
Collecting jax (from mediapipe)
  Downloading jax-0.4.25-py3-none-any.whl.metadata (24 kB)
Collecting opencv-contrib-python (from mediapipe)
  Downloading opencv_contrib_python-4.9.0.80-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.4.6-py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting ml-dtypes>=0.2.0 (from jax->mediapipe)
  Downloading ml_dtypes-0.3.2-cp311-cp311-win_amd64.whl.metadata (20 kB)
Collecting opt-einsum (from jax->mediapipe)
  Using cached opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Downloading mediapipe-0.10.11-cp311-cp311-win_amd64.whl (50.8 MB)
   ---------------------------------

In [2]:
import mediapipe as mp
import cv2
import numpy as np

import os
import uuid

In [3]:
# Function to preprocess input gesture representation
def preprocess_gesture_representation(image_path, target_size=(256, 256)):
    """Load the gesture template image, resize it and convert it to grayscale.

    Args:
        image_path: Path of the image file, handed to ``cv2.imread``.
        target_size: Size handed to ``cv2.resize`` as its ``dsize`` argument.

    Returns:
        The resized image after conversion with ``cv2.COLOR_BGR2GRAY``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not read ``image_path``.
    """
    # Load the gesture representation image
    gesture_image = cv2.imread(image_path)
    # cv2.imread reports a missing/unreadable file by returning None rather
    # than raising; fail here with a clear message instead of inside resize.
    if gesture_image is None:
        raise FileNotFoundError(f"Could not read gesture image: {image_path!r}")
    # Resize the image to the target size
    gesture_image_resized = cv2.resize(gesture_image, target_size)
    # Convert the image to grayscale
    gesture_image_gray = cv2.cvtColor(gesture_image_resized, cv2.COLOR_BGR2GRAY)
    return gesture_image_gray


In [4]:
# Function to preprocess test video data
def preprocess_test_video(video_path, frame_size=(640, 480)):
    """Read every frame of a video and return the frames resized.

    Args:
        video_path: Source handed to ``cv2.VideoCapture``.
        frame_size: Size handed to ``cv2.resize`` for each frame. Defaults to
            ``(640, 480)``.

    Returns:
        A list with one resized frame per frame read, in reading order. The
        list is empty when the capture does not report itself as opened or
        the first read fails.
    """
    # Open the test video
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        # Read each frame from the video
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(cv2.resize(frame, frame_size))
    finally:
        # Release the capture even if reading or resizing raised.
        cap.release()
    return frames

In [5]:
# Input files, given as relative paths.
# Image of the gesture that is used as the matching template.
gesture_representation_path = 'test_image.jpg'
# Video whose frames are searched for that gesture.
test_video_path = 'test_video.mp4'

In [6]:
# Template: loaded, resized to the function's default target size, grayscale.
preprocessed_gesture_representation = preprocess_gesture_representation(gesture_representation_path)
# Video: every frame is read and resized; the whole list is kept in memory.
preprocessed_test_video = preprocess_test_video(test_video_path)

In [7]:
def detect_and_annotate_gesture(test_video_frames, gesture_representation, output_dir, threshold=0.8):
    """Template-match every frame against the gesture image and save the frames.

    Each frame is converted to grayscale and matched against
    ``gesture_representation`` with ``cv2.TM_CCOEFF_NORMED``. When the best
    match score exceeds ``threshold`` the saved frame carries a green
    'DETECTED' label near its top-right corner. One JPEG per frame is written
    to ``output_dir`` as ``frame_0000.jpg``, ``frame_0001.jpg``, ...

    The input frames are left untouched: annotation is drawn on a copy, so the
    function can be re-run on the same list and gives the same result.

    Args:
        test_video_frames: Iterable of BGR frames (arrays providing ``.shape``
            and ``.copy()``).
        gesture_representation: Grayscale template image.
            NOTE(review): presumably must not exceed the frame size for
            cv2.matchTemplate - confirm.
        output_dir: Directory the frames are written to; created if missing.
        threshold: Score above which a frame counts as a detection.
            Defaults to 0.8.

    Raises:
        OSError: If ``cv2.imwrite`` reports that a frame was not written.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Loop through each frame in the test video
    for i, frame in enumerate(test_video_frames):
        # Convert the current frame to grayscale
        frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Match template (gesture representation) with the current frame
        res = cv2.matchTemplate(frame_gray, gesture_representation, cv2.TM_CCOEFF_NORMED)

        annotated = frame
        # If the maximum value of the match is above the threshold, gesture is detected
        if np.max(res) > threshold:
            # Draw on a copy so the caller's frame list is not modified.
            annotated = frame.copy()
            # Annotate the frame with "DETECTED" in bright green on the top right corner
            cv2.putText(annotated, 'DETECTED', (annotated.shape[1]-200, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        # Save annotated frame; imwrite signals failure via its return value.
        frame_filename = os.path.join(output_dir, f'frame_{i:04d}.jpg')
        if not cv2.imwrite(frame_filename, annotated):
            raise OSError(f"Could not write annotated frame: {frame_filename!r}")

# Define output directory for annotated frames
output_dir = 'Annotated_Frames'
# exist_ok=True lets this cell be re-run when the folder already exists.
os.makedirs(output_dir, exist_ok=True)

# Call the function to detect gesture in frames and annotate if detected.
# One JPEG per preprocessed video frame is written into output_dir.
detect_and_annotate_gesture(preprocessed_test_video, preprocessed_gesture_representation, output_dir)

In [8]:
import shutil

# Pack the annotated-frames folder into annotated_frames.zip; the archive
# path returned by make_archive is the value this cell displays.
shutil.make_archive(base_name='annotated_frames', format='zip', root_dir='Annotated_Frames')


'C:\\Users\\ashle\\annotated_frames.zip'

In [9]:
# Shortcut to the MediaPipe hands solution module and a Hands instance built
# with the given detection/tracking confidence settings.
# NOTE(review): the hand-tracking cell further down assigns both names again
# with identical arguments, so this cell looks redundant - confirm and remove.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5)


In [10]:
# Open capture device index 0.
# NOTE(review): nothing releases this handle before the hand-tracking cell
# further down rebinds `cap` to a new VideoCapture(0); the second open may
# find the device busy - confirm and drop this cell or release first.
cap = cv2.VideoCapture(0)


In [19]:
# import cv2
# import mediapipe as mp
# import numpy as np

# # Define the get_bounding_box function
# def get_bounding_box(hand_landmarks, frame_shape):
#     landmarks_x = [lm.x for lm in hand_landmarks.landmark]
#     landmarks_y = [lm.y for lm in hand_landmarks.landmark]
#     min_x = min(landmarks_x)
#     max_x = max(landmarks_x)
#     min_y = min(landmarks_y)
#     max_y = max(landmarks_y)

#     image_height, image_width, _ = frame_shape
#     bbox = [
#         int(min_x * image_width),
#         int(min_y * image_height),
#         int(max_x * image_width),
#         int(max_y * image_height)
#     ]

#     return bbox

# # Initialize MediaPipe Hands model
# mp_hands = mp.solutions.hands
# hands = mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5)
# mp_drawing = mp.solutions.drawing_utils  # Import drawing_utils for drawing landmarks

# # Capture video from webcam
# cap = cv2.VideoCapture(0)

# # Define font and text parameters for annotation
# font = cv2.FONT_HERSHEY_SIMPLEX
# font_scale = 1
# font_thickness = 2
# text_color = (0, 255, 0)  # Bright green

# # Define variables for hand motion detection
# prev_landmarks = None
# movement_threshold = 0.02  # Adjust this threshold as needed

# while cap.isOpened():
#     ret, frame = cap.read()
#     if not ret:
#         break
    
#     frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#     results = hands.process(frame_rgb)
    
#     if results.multi_hand_landmarks:
#         for hand_landmarks in results.multi_hand_landmarks:
#             bbox = get_bounding_box(hand_landmarks, frame.shape)
#             cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 0, 0), 2)
            
#             curr_landmarks = np.array([(lm.x, lm.y) for lm in hand_landmarks.landmark])
#             # 
#             if prev_landmarks is not None:
#                 displacement = np.linalg.norm(curr_landmarks - prev_landmarks)
#                 if displacement > movement_threshold:
#                     cv2.putText(frame, 'DETECTED', (bbox[0]+bbox[2]-200, bbox[1]+30), font, font_scale, text_color, font_thickness, cv2.LINE_AA)
#                 else:
#                     cv2.putText(frame, 'NOT DETECTED', (bbox[0]+bbox[2]-250, bbox[1]+30), font, font_scale, text_color, font_thickness, cv2.LINE_AA)
            
#             prev_landmarks = curr_landmarks
    
#     cv2.imshow('Hand Tracking', frame)
    
#     if cv2.waitKey(1) & 0xFF == ord('q') or not ret:
#         break

# cap.release()
# cv2.destroyAllWindows()


In [35]:
def get_bounding_box(hand_landmarks, frame_shape):
    """Compute the box enclosing all landmarks of one hand, in frame pixels.

    Args:
        hand_landmarks: Object whose ``landmark`` attribute is a sequence of
            points exposing ``x`` and ``y``. The values are scaled by the
            frame width/height, so they are presumably fractions of the
            frame size - confirm against the landmark source.
        frame_shape: Three-element shape ``(height, width, channels)``.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` as a list of ints (truncated).
    """
    points = hand_landmarks.landmark
    xs = tuple(point.x for point in points)
    ys = tuple(point.y for point in points)
    x_extent = (min(xs), max(xs))
    y_extent = (min(ys), max(ys))

    rows, cols, _ = frame_shape

    def to_pixel(fraction, size):
        # int() truncates toward zero, same as a plain int(value * size).
        return int(fraction * size)

    return [
        to_pixel(x_extent[0], cols),
        to_pixel(y_extent[0], rows),
        to_pixel(x_extent[1], cols),
        to_pixel(y_extent[1], rows),
    ]

# Live hand-motion demo: read webcam frames, run MediaPipe Hands on each one,
# draw a box around every detected hand and label it DETECTED / NOT DETECTED
# depending on how far its landmarks moved since the previous frame.
# Press 'q' in the preview window to stop.

# Initialize MediaPipe Hands model
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils  # Drawing helpers; not used by the loop below

# Capture video from webcam
cap = cv2.VideoCapture(0)

# Define font and text parameters for annotation
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 1
font_thickness = 2
text_color = (0, 255, 0)  # Bright green

# Define variables for hand motion detection
# Landmarks of the previous frame, keyed by the hand's position in
# results.multi_hand_landmarks, so one hand is never compared with another.
# NOTE(review): assumes that ordering is stable between consecutive frames - confirm.
prev_landmarks = {}
detected = False
movement_threshold = 0.02  # Adjust this threshold as needed

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # MediaPipe is fed RGB; OpenCV delivers BGR.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        # Rebuilt every frame: a hand that disappears drops out of the state,
        # so a reappearing hand is not compared against stale landmarks.
        landmarks_this_frame = {}

        for hand_index, hand_landmarks in enumerate(results.multi_hand_landmarks or []):
            bbox = get_bounding_box(hand_landmarks, frame.shape)
            cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 0, 0), 2)

            curr_landmarks = np.array([(lm.x, lm.y) for lm in hand_landmarks.landmark])

            # A hand with no previous sample cannot have moved yet.
            previous = prev_landmarks.get(hand_index)
            if previous is None:
                detected = False
            else:
                displacement = np.linalg.norm(curr_landmarks - previous)
                detected = bool(displacement > movement_threshold)

            # bbox is [x_min, y_min, x_max, y_max], so the label is anchored
            # to the box's right edge (x_max), shifted left by the label
            # offset and clamped so it never starts left of the frame.
            if detected:
                label, label_offset = 'DETECTED', 200
            else:
                label, label_offset = 'NOT DETECTED', 250
            label_origin = (max(bbox[2] - label_offset, 0), bbox[1] + 30)
            cv2.putText(frame, label, label_origin, font, font_scale, text_color, font_thickness, cv2.LINE_AA)

            landmarks_this_frame[hand_index] = curr_landmarks

        prev_landmarks = landmarks_this_frame

        cv2.imshow('Hand Tracking', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Free the camera and close the preview window even when the loop is
    # interrupted (e.g. kernel interrupt) or raises.
    cap.release()
    cv2.destroyAllWindows()