In [None]:
import cv2
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from sklearn.preprocessing import MinMaxScaler
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import pickle
import numpy as np
import xgboost as xgb
import time
from datetime import datetime
import joblib
from collections import deque


# Load the saved gesture classifier and the label encoder that turns its
# integer class indices back into gesture names.
model = xgb.XGBClassifier()
model.load_model('models/best_xgb_model.json')
# model = joblib.load('models/rf_model.pkl')
# NOTE: unpickling executes arbitrary code; only load encoder files you trust.
# The context manager closes the file handle as soon as the encoder is read.
with open('models/label_encoder.pkl', 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)
# scaler = pickle.load(open('models/scaler.pkl', 'rb'))

# Styling for the text label that is drawn next to each detected hand.
MARGIN = 10  # pixels; gap between the top-most landmark of a hand and its label
FONT_SIZE = 1  # font scale handed to cv2.putText
FONT_THICKNESS = 1  # stroke thickness handed to cv2.putText
HANDEDNESS_TEXT_COLOR = (88, 205, 54) # vibrant green (RGB order; the label is drawn on the RGB frame)

def draw_landmarks_on_image(rgb_image, detection_result):
    """Return a copy of ``rgb_image`` annotated with every detected hand.

    For each hand, the landmark skeleton is drawn with MediaPipe's default
    styles and the name of the hand's first category is written just above
    the hand's top-left corner.

    Args:
        rgb_image: Image array whose first two dimensions are
            (height, width). The input itself is never modified.
        detection_result: Detection result exposing parallel
            ``hand_landmarks`` and ``handedness`` lists (one entry per hand).

    Returns:
        The annotated copy of ``rgb_image``.
    """
    annotated_image = np.copy(rgb_image)
    # The frame size does not change between hands, so read it once.
    height, width = annotated_image.shape[:2]

    # Walk the two per-hand lists in lockstep instead of indexing by position.
    for hand_landmarks, handedness in zip(detection_result.hand_landmarks,
                                          detection_result.handedness):
        # Draw the hand landmarks. The drawing utility expects a protobuf
        # landmark list, so the detected landmarks are copied into one.
        hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
        hand_landmarks_proto.landmark.extend([
            landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z)
            for landmark in hand_landmarks
        ])
        solutions.drawing_utils.draw_landmarks(
            annotated_image,
            hand_landmarks_proto,
            solutions.hands.HAND_CONNECTIONS,
            solutions.drawing_styles.get_default_hand_landmarks_style(),
            solutions.drawing_styles.get_default_hand_connections_style())

        # Top-left corner of the hand's bounding box, in pixels.
        left = int(min(landmark.x for landmark in hand_landmarks) * width)
        top = int(min(landmark.y for landmark in hand_landmarks) * height)

        label = f"{handedness[0].category_name}"
        (_, label_height), _ = cv2.getTextSize(
            label, cv2.FONT_HERSHEY_DUPLEX, FONT_SIZE, FONT_THICKNESS)

        # cv2.putText anchors text at its bottom-left corner. Clamp the anchor
        # so the label stays inside the frame when the hand touches the top or
        # left edge instead of being drawn off-screen.
        text_x = max(left, 0)
        text_y = max(top - MARGIN, label_height)

        # Draw the hand's category label on the image.
        cv2.putText(annotated_image, label,
                    (text_x, text_y), cv2.FONT_HERSHEY_DUPLEX,
                    FONT_SIZE, HANDEDNESS_TEXT_COLOR, FONT_THICKNESS, cv2.LINE_AA)

    return annotated_image


def extract_xy_coords(hand_landmarks, eps=1e-6):
    """Build a position- and scale-normalised feature vector for one hand.

    Landmark 0 is used as the origin. Every landmark's x and y offset from it
    is divided by the corresponding offset of landmark 12, so landmark 12
    always maps to (1, 1). The z value is passed through unchanged.
    (In MediaPipe's hand model, index 0 is the wrist and index 12 the middle
    fingertip.)

    Args:
      hand_landmarks: A sequence of landmark objects exposing ``x``, ``y`` and
        ``z``. Must contain at least 13 entries.
      eps: Value substituted for a scale that is exactly zero (landmark 12
        sharing an x or y coordinate with landmark 0), which would otherwise
        raise ``ZeroDivisionError``. Non-zero scales are used as-is.

    Returns:
      A 1-D NumPy array laid out as [x0, y0, z0, x1, y1, z1, ...].

    Raises:
      IndexError: If fewer than 13 landmarks are supplied.
    """
    origin = hand_landmarks[0]
    reference = hand_landmarks[12]
    x_center, y_center = origin.x, origin.y

    x_scale = reference.x - x_center
    y_scale = reference.y - y_center
    # Guard only the exact-zero case so every input that used to work still
    # yields the same features.
    if x_scale == 0:
        x_scale = eps
    if y_scale == 0:
        y_scale = eps

    xy_coords = []
    for landmark in hand_landmarks:
        xy_coords.extend([
            (landmark.x - x_center) / x_scale,
            (landmark.y - y_center) / y_scale,
            landmark.z,
        ])

    return np.array(xy_coords)

class GestureSmoother:
    """Majority-vote filter over the most recent predictions."""

    def __init__(self, window_size=5):
        # A bounded deque discards its oldest entry once it holds
        # ``window_size`` predictions.
        self.window = deque(maxlen=window_size)

    def smooth(self, new_prediction):
        """Record ``new_prediction`` and return the window's most common value.

        When several values are equally common, the one that occurs earliest
        in the current window wins. If the window holds nothing (a zero-sized
        window), ``new_prediction`` itself is returned.
        """
        recent = self.window
        recent.append(new_prediction)

        # Tally how often each value occurs among the retained predictions.
        votes = {}
        for label in recent:
            if label in votes:
                votes[label] += 1
            else:
                votes[label] = 1

        # Strict comparison keeps the first value that reaches the top count.
        winner, winner_votes = new_prediction, 0
        for label, tally in votes.items():
            if tally > winner_votes:
                winner, winner_votes = label, tally
        return winner


# The shared smoother has to exist before the main loop starts predicting.
smoother = GestureSmoother(window_size=7)  # Adjust window_size as needed


def process_image_and_predict(image):
    """Detect hands in ``image``, classify the first hand and annotate the frame.

    The first detected hand's landmarks are turned into a feature vector,
    classified by the module-level ``model``, and the predicted class index
    is passed through the module-level ``smoother``. The smoothed class name
    then replaces the first hand's top handedness category so that it is the
    label drawn on the frame.

    Args:
        image: The frame in the form accepted by ``detector.detect``; it must
            also provide ``numpy_view()``.

    Returns:
        The annotated image array, or the unmodified ``image.numpy_view()``
        when no hand is detected.
    """
    detection_result = detector.detect(image)

    # Nothing to classify or draw: hand the frame back untouched.
    if not detection_result.hand_landmarks:
        return image.numpy_view()

    features = extract_xy_coords(detection_result.hand_landmarks[0])

    # Class probabilities for the single sample, and the raw winning class.
    probabilities = model.predict_proba(features.reshape(1, -1))[0]
    encoded_label = int(probabilities.argmax())

    # Apply temporal smoothing across recent frames.
    smoothed_encoded = smoother.smooth(encoded_label)

    # Convert the class index back to its class name.
    prediction = label_encoder.inverse_transform([smoothed_encoded])[0]

    # Overwrite the first hand's top category with the smoothed prediction.
    # The score is the probability of the label actually shown, which can
    # differ from the frame's raw maximum while the smoother is overriding it.
    top_category = detection_result.handedness[0][0]
    top_category.display_name = prediction
    top_category.category_name = prediction
    top_category.score = float(probabilities[smoothed_encoded])

    return draw_landmarks_on_image(image.numpy_view(), detection_result)


# STEP 2: Create a HandLandmarker object.
base_options = python.BaseOptions(model_asset_path='models/hand_landmarker.task')  # Replace with your model path
options = vision.HandLandmarkerOptions(base_options=base_options,
                                       num_hands=2)
detector = vision.HandLandmarker.create_from_options(options)

# Prepare webcam capture (you might need to adjust based on your environment)
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # Without this message the loop below is skipped silently.
    print("Could not open the camera.")
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
if fps <= 0:
    # Some capture backends report 0 for FPS; a writer opened with that rate
    # does not produce a playable file, so fall back to a common default.
    fps = 30

# Create a VideoWriter object
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f'hand_gesture_recording_{timestamp}.mp4'
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # or 'XVID' for .avi
out = cv2.VideoWriter(output_filename, fourcc, fps, (frame_width, frame_height))

recording = False  # Flag to track recording state

# try/finally guarantees the camera, the video file and the window are released
# even when the loop raises or the kernel is interrupted.
try:
    while cap.isOpened():
        success, image = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue

        # Convert the OpenCV image (BGR) to MediaPipe image format (RGB)
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        # Process and predict
        annotated_image = process_image_and_predict(mp_image)

        # Convert back to BGR for OpenCV display
        output_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)

        # Add recording indicator if recording
        if recording:
            cv2.circle(output_image, (30, 30), 10, (0, 0, 255), -1)  # Red circle indicator
            out.write(output_image)  # Write the frame to the video file

        # Display the annotated image
        cv2.imshow('Hand Gesture Recognition', output_image)

        # Handle key presses
        key = cv2.waitKey(5) & 0xFF
        if key == ord('q'):  # Quit
            break
        elif key == ord('r'):  # Toggle recording
            recording = not recording
            if recording:
                print("Recording started...")
            else:
                print("Recording stopped.")
finally:
    # Release everything when done
    cap.release()
    out.release()
    cv2.destroyAllWindows()

Recording started...
Recording stopped.
