# In [None]:
import cv2
import numpy as np
import json
import mediapipe as mp
import tensorflow as tf

class GestureRecognizer:
    """Recognize hand gestures from video frames.

    Hand landmarks are detected with MediaPipe Hands, normalized relative to
    the wrist and the wrist-to-middle-MCP distance, and classified with a
    TFLite model.  The instance owns a MediaPipe graph; call :meth:`close`
    (or use the instance as a context manager) to release it when done.
    """

    # Landmark indices used for normalization.
    _WRIST_INDEX = 0
    _MIDDLE_MCP_INDEX = 9

    def __init__(self, model_path="gesture_model.tflite", labels_path="gesture_model_labels.json"):
        """Load the TFLite model, the class labels and the hand detector.

        Args:
            model_path: Path to the ``.tflite`` classifier.
            labels_path: Path to a JSON file holding the class labels; the
                loaded object is indexed by the predicted class index.
        """
        # Load the TFLite model and allocate tensors
        self.interpreter = tf.lite.Interpreter(model_path=model_path)
        self.interpreter.allocate_tensors()

        # Get input and output tensors info
        self.input_details = self.interpreter.get_input_details()
        self.output_details = self.interpreter.get_output_details()

        # Load label encoder classes.  The encoding is explicit so that
        # non-ASCII labels decode the same way on every platform.
        with open(labels_path, 'r', encoding='utf-8') as f:
            self.classes = json.load(f)

        # Initialize MediaPipe Hands
        self.mp_hands = mp.solutions.hands
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )

    def close(self):
        """Release the resources held by the MediaPipe Hands solution."""
        self.hands.close()

    def __enter__(self):
        """Return the recognizer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the recognizer on leaving the ``with`` block."""
        self.close()
        # Returning False lets any exception from the block propagate.
        return False

    def extract_landmarks(self, hand_landmarks):
        """Extract and normalize landmarks with scale invariance.

        Each landmark is translated so the wrist becomes the origin and then
        divided by the distance from the wrist to the middle finger MCP
        joint (knuckle).

        Args:
            hand_landmarks: Object whose ``landmark`` sequence holds items
                with ``x``, ``y`` and ``z`` attributes.

        Returns:
            A list with one ``{"x", "y", "z"}`` dict of floats per landmark,
            in the input order.  All coordinates are zero when the reference
            distance is not positive.

        Raises:
            IndexError: If the wrist or middle MCP landmark is missing.
        """
        points = np.array(
            [(lm.x, lm.y, lm.z) for lm in hand_landmarks.landmark],
            dtype=np.float64,
        )

        # Wrist is the origin; wrist -> middle MCP distance is the hand scale.
        wrist = points[self._WRIST_INDEX]
        scale = np.sqrt(np.sum((points[self._MIDDLE_MCP_INDEX] - wrist) ** 2))

        # The scale is the same for every landmark, so test it only once.
        if scale > 0:
            normalized = (points - wrist) / scale
        else:
            normalized = np.zeros_like(points)

        return [
            {"x": float(x), "y": float(y), "z": float(z)}
            for x, y, z in normalized
        ]

    def predict_from_landmarks(self, landmarks):
        """Predict gesture from preprocessed landmarks using TFLite model.

        Args:
            landmarks: Sequence of ``{"x", "y", "z"}`` dicts, as returned by
                :meth:`extract_landmarks`.

        Returns:
            A ``(label, confidence)`` tuple: the entry of ``self.classes`` at
            the highest-scoring output index and that score as a float.
        """
        # Flatten to [x0, y0, z0, x1, ...] as a float32 batch of one.
        features = np.array(
            [[lm['x'], lm['y'], lm['z']] for lm in landmarks],
            dtype=np.float32,
        ).reshape(1, -1)

        # Set input tensor, run inference and read the single output row
        self.interpreter.set_tensor(self.input_details[0]['index'], features)
        self.interpreter.invoke()
        prediction = self.interpreter.get_tensor(self.output_details[0]['index'])[0]

        # Extract prediction info
        predicted_class = int(np.argmax(prediction))
        confidence = float(prediction[predicted_class])
        predicted_label = self.classes[predicted_class]

        return predicted_label, confidence

    def process_frame(self, frame):
        """Process a video frame and return gesture prediction.

        The hand skeleton and the predicted label are drawn onto ``frame``
        in place.

        Args:
            frame: BGR image (it is converted to RGB before detection).

        Returns:
            ``(frame, label, confidence)``; ``label`` and ``confidence`` are
            ``None`` when no hand is detected.
        """
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.hands.process(rgb_frame)

        if not results.multi_hand_landmarks:
            return frame, None, None

        # Only the first detected hand is classified.
        hand_landmarks = results.multi_hand_landmarks[0]

        self.mp_drawing.draw_landmarks(
            frame,
            hand_landmarks,
            self.mp_hands.HAND_CONNECTIONS,
            self.mp_drawing_styles.get_default_hand_landmarks_style(),
            self.mp_drawing_styles.get_default_hand_connections_style()
        )

        landmarks = self.extract_landmarks(hand_landmarks)
        label, confidence = self.predict_from_landmarks(landmarks)

        cv2.putText(
            frame,
            f"{label}: {confidence:.2f}",
            (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (0, 255, 0),
            2
        )

        return frame, label, confidence

    def start_webcam(self, camera_index=0):
        """Start webcam feed and recognize gestures in real-time.

        Frames are shown in a window until ``q`` is pressed or a frame can
        no longer be read.

        Args:
            camera_index: Device index passed to ``cv2.VideoCapture``.
        """
        cap = cv2.VideoCapture(camera_index)

        try:
            if not cap.isOpened():
                print("Failed to open webcam")
                return

            while cap.isOpened():
                success, frame = cap.read()
                if not success:
                    print("Failed to read from webcam")
                    break

                processed_frame, _label, _confidence = self.process_frame(frame)
                cv2.imshow('Gesture Recognition', processed_frame)

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Always free the camera and window, even if processing raised.
            cap.release()
            cv2.destroyAllWindows()
