In [1]:
import cv2
import numpy as np
import tensorflow as tf
import mediapipe as mp
from collections import deque
import time

# ── CONFIG ─────────────────────────────
# Number of consecutive frames that make up one model input sequence; also the
# capacity of the rolling landmark buffer.
SEQUENCE_LENGTH = 60
# Length of one flattened landmark row. The capture loop stores (x, y, z) per
# hand landmark, so 63 corresponds to 21 landmarks × 3 coordinates.
LANDMARKS = 63
# Class names, indexed by the model's output position (argmax index → label).
GESTURE_LABELS = np.load("models/labels.npy").tolist()
# NOTE(review): not referenced anywhere else in the code visible here.
NUM_CLASSES = len(GESTURE_LABELS)
# Gestures whose sequence preprocess_sequence() collapses to a single mean pose.
# NOTE(review): the label list printed by the recorded run does not contain
# "peace" — confirm whether this entry is stale.
STATIC_GESTURES = ["like", "dislike", "peace"]
# A gesture is reported only when the averaged confidence of the winning class
# is strictly greater than this value.
MIN_CONFIDENCE = 0.80  # For high precision
# Seconds during which inference is skipped after a gesture has been reported.
COOLDOWN_SECONDS = 2.0  # For faster gesture cycling
# Number of most recent per-frame predictions that take part in the vote.
PREDICTION_WINDOW = 7  # For stable predictions
# Minimum share of the voting window the winning class must hold.
MAJORITY_THRESHOLD = 0.7  # At least 70% agreement

# ── INIT MEDIAPIPE ────────────────────
# Hand-landmark detector configured for a video stream (not independent still
# images) and at most one hand per frame.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.75,
    min_tracking_confidence=0.6
)
# Helper used by the main loop to draw the detected landmarks onto the frame.
mp_draw = mp.solutions.drawing_utils

# ── LOAD TFLITE MODEL ─────────────────
# NOTE(review): the recorded output of this cell shows a deprecation warning
# pointing at the LiteRT interpreter from the ai_edge_litert package; consider
# migrating before upgrading TensorFlow.
interpreter = tf.lite.Interpreter(model_path="models/gesture_model.tflite")
interpreter.allocate_tensors()
# Tensor metadata; the main loop uses entry [0]['index'] of each to feed the
# input sequence and to read the class scores.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()


# ── INIT CAMERA ───────────────────────
# Open capture device 0 and request 640x480 at 60 FPS.
# NOTE(review): these are requests; presumably the camera/backend may not
# honour them — the return values of cap.set() are not checked here.
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
cap.set(cv2.CAP_PROP_FPS, 60)
# Rolling window of the last SEQUENCE_LENGTH landmark rows (one row per frame).
buffer = deque(maxlen=SEQUENCE_LENGTH)
# Last PREDICTION_WINDOW (class index, confidence) pairs used for voting.
prediction_history = deque(maxlen=PREDICTION_WINDOW)
# Last 30 instantaneous FPS samples, averaged for the on-screen FPS readout.
fps_counter = deque(maxlen=30)

# Cooldown and gesture state
cooldown_active = False
# Deadline on the time.monotonic() clock; only meaningful while cooldown_active.
cooldown_end_time = 0.0
# Label of the gesture that started the current (or most recent) cooldown.
current_gesture = None


def start_cooldown(gesture_name):
    """Start the cooldown period for a detected gesture.

    Records ``gesture_name`` as the current gesture and arms a deadline
    ``COOLDOWN_SECONDS`` from now.

    Args:
        gesture_name: Label of the gesture that was just detected.

    Returns:
        The cooldown duration in seconds (``COOLDOWN_SECONDS``).
    """
    global cooldown_active, cooldown_end_time, current_gesture
    cooldown_active = True
    # Monotonic clock: the deadline must not move when the wall clock is
    # adjusted (NTP sync, manual change, DST).
    cooldown_end_time = time.monotonic() + COOLDOWN_SECONDS
    current_gesture = gesture_name
    return COOLDOWN_SECONDS


def check_cooldown():
    """Report the current cooldown status.

    Returns:
        ``False`` when no cooldown is running (or it has just expired, in
        which case the active flag is cleared); otherwise the remaining time
        in seconds as a positive float, which is therefore truthy.
    """
    global cooldown_active
    if not cooldown_active:
        return False
    # Must use the same clock as start_cooldown().
    time_remaining = cooldown_end_time - time.monotonic()
    if time_remaining <= 0:
        cooldown_active = False
        return False
    return time_remaining

def preprocess_sequence(sequence, gesture_name):
    """Normalize a landmark sequence relative to its first frame.

    Every frame has the first frame subtracted from it. When ``gesture_name``
    is listed in ``STATIC_GESTURES`` the result is additionally collapsed to
    the mean pose of the middle third of the frames, repeated
    ``SEQUENCE_LENGTH`` times.

    Args:
        sequence: Array-like of per-frame landmark rows, shape
            (frames, features).
        gesture_name: Gesture label used to select static handling; pass a
            name not in ``STATIC_GESTURES`` (e.g. ``""``) to skip it.

    Returns:
        ``np.ndarray`` of dtype float32. Shape is (frames, features) for the
        regular path and (SEQUENCE_LENGTH, features) for static gestures.

    Raises:
        ValueError: If ``sequence`` contains no frames.
    """
    sequence = np.array(sequence, dtype=np.float32)
    if sequence.size == 0:
        raise ValueError("sequence must contain at least one frame")

    # Normalize by subtracting the first frame
    sequence_norm = sequence - sequence[0]

    # For static gestures, compute mean pose over middle frames
    if gesture_name in STATIC_GESTURES:
        # Middle third of the available frames (20:40 for a 60-frame input).
        # Deriving it from the frame count avoids an empty slice, and hence a
        # NaN mean, when the sequence is shorter than the hard-coded window.
        n_frames = sequence_norm.shape[0]
        start = n_frames // 3
        stop = max((2 * n_frames) // 3, start + 1)
        middle_frames = sequence_norm[start:stop]
        mean_pose = np.mean(middle_frames, axis=0, keepdims=True)
        sequence_norm = np.repeat(mean_pose, SEQUENCE_LENGTH, axis=0)

    return sequence_norm

print("📸 Starting real-time gesture recognition...")
print(f"Available gestures: {GESTURE_LABELS}")
print("❌ Press 'q' to quit.")

# Number of trailing hand-less frames in a full buffer that triggers a reset.
NO_HAND_RESET_FRAMES = 15

# Main loop: grab a frame, extract hand landmarks, keep a rolling sequence,
# run the TFLite model on full sequences, smooth the predictions by majority
# vote, and draw the status onto the preview window.
try:
    while True:
        start_frame_time = time.perf_counter()
        ret, frame = cap.read()
        if not ret:
            print("❌ Failed to capture frame.")
            break

        # Mirror the image so the preview behaves like a mirror for the user.
        frame = cv2.flip(frame, 1)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb)

        landmark_row = None
        if result.multi_hand_landmarks:
            # Only the first detected hand is used.
            hand_landmarks = result.multi_hand_landmarks[0]
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            landmark_row = [
                coord
                for lm in hand_landmarks.landmark
                for coord in (lm.x, lm.y, lm.z)
            ]

        # ── COLLECT SEQUENCE ───────────────
        if landmark_row:
            buffer.append(landmark_row)
        else:
            # Keep the timeline contiguous by padding with an all-zero row.
            buffer.append([0] * LANDMARKS)
            cv2.putText(frame, "No hand detected", (10, 80),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            # Reset buffer if no hand detected for 15+ frames.
            # The rows are plain Python lists, so they are stacked into an
            # array before testing for zeros; comparing a list to 0 directly
            # is always False and would never trigger the reset.
            if len(buffer) == SEQUENCE_LENGTH and not np.any(
                np.asarray(list(buffer)[-NO_HAND_RESET_FRAMES:])
            ):
                buffer.clear()
                prediction_history.clear()

        # Check cooldown
        cooldown_remaining = check_cooldown()
        if cooldown_active and cooldown_remaining:
            cv2.putText(frame, f"{current_gesture} ({cooldown_remaining:.1f}s)", (10, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 128, 255), 2)

        # ── INFERENCE ──────────────────────
        elif len(buffer) == SEQUENCE_LENGTH:
            sequence = np.array(buffer)
            if sequence.shape == (SEQUENCE_LENGTH, LANDMARKS) and not np.all(sequence == 0):
                try:
                    # Preprocess sequence. The label is unknown before
                    # inference, so an empty name is passed and the
                    # static-gesture branch is not taken.
                    sequence_norm = preprocess_sequence(sequence, "")

                    # Prepare input for TFLite model (add the batch axis)
                    input_data = sequence_norm[np.newaxis, ...].astype(np.float32)
                    interpreter.set_tensor(input_details[0]['index'], input_data)

                    # Run inference
                    interpreter.invoke()
                    output_data = interpreter.get_tensor(output_details[0]['index'])

                    # Get prediction
                    pred_idx = np.argmax(output_data[0])
                    confidence = output_data[0][pred_idx]

                    # Add to prediction history
                    prediction_history.append((pred_idx, confidence))

                    # Temporal smoothing and majority voting
                    if len(prediction_history) >= PREDICTION_WINDOW:
                        counter = {}
                        for idx, _ in prediction_history:
                            counter[idx] = counter.get(idx, 0) + 1

                        total_votes = sum(counter.values())
                        final_pred_idx, top_votes = max(counter.items(), key=lambda x: x[1])
                        vote_proportion = top_votes / total_votes

                        if vote_proportion >= MAJORITY_THRESHOLD:
                            # Average only the confidences of the winning class.
                            confidences = [conf for idx, conf in prediction_history if idx == final_pred_idx]
                            avg_confidence = sum(confidences) / len(confidences)

                            if avg_confidence > MIN_CONFIDENCE:
                                pred_label = GESTURE_LABELS[final_pred_idx]

                                # Display prediction
                                text = f"Gesture: {pred_label} ({avg_confidence:.2f})"
                                cv2.putText(frame, text, (10, 40),
                                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                                # Start cooldown
                                cooldown_duration = start_cooldown(pred_label)
                                print(f"Detected: {pred_label} (Confidence: {avg_confidence:.2f}, Cooldown: {cooldown_duration}s)")

                                # Clear history after detection
                                prediction_history.clear()
                            else:
                                cv2.putText(frame, f"Low confidence ({avg_confidence:.2f})", (10, 40),
                                            cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 165, 0), 2)
                        else:
                            cv2.putText(frame, "Uncertain prediction", (10, 40),
                                        cv2.FONT_HERSHEY_SIMPLEX, 1, (200, 200, 200), 2)
                    else:
                        cv2.putText(frame, "Analyzing...", (10, 40),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (200, 200, 200), 2)
                except Exception as e:
                    # Per-frame boundary: report the failure and keep the
                    # preview running instead of aborting the session.
                    cv2.putText(frame, f"Error: {str(e)[:20]}", (10, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
                    print(f"Error during inference: {str(e)}")
            else:
                cv2.putText(frame, "Invalid sequence", (10, 40),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

        # Calculate and display FPS (averaged over the recent samples)
        frame_time = time.perf_counter() - start_frame_time
        fps = 1.0 / max(frame_time, 0.001)
        fps_counter.append(fps)
        avg_fps = sum(fps_counter) / len(fps_counter)

        cv2.putText(frame, f"FPS: {avg_fps:.1f}", (frame.shape[1] - 120, 25),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)

        cv2.imshow("Gesture Recognition", frame)

        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            break
finally:
    # ── CLEANUP ───────────────────────────
    # Runs even when the loop raises or is interrupted, so the camera and
    # the preview window are never left open.
    cap.release()
    cv2.destroyAllWindows()
    hands.close()
print("🟢 Testing stopped.")

    TF 2.20. Please use the LiteRT interpreter from the ai_edge_litert package.
    See the [migration guide](https://ai.google.dev/edge/litert/migration)
    for details.
    


📸 Starting real-time gesture recognition...
Available gestures: ['swipe_left', 'swipe_right', 'swipe_up', 'swipe_down', 'screenshot', 'drop', 'like', 'dislike', 'sos']
❌ Press 'q' to quit.
Detected: dislike (Confidence: 0.92, Cooldown: 2.0s)
Detected: swipe_up (Confidence: 0.99, Cooldown: 2.0s)
Detected: swipe_right (Confidence: 1.00, Cooldown: 2.0s)
Detected: swipe_left (Confidence: 1.00, Cooldown: 2.0s)
Detected: swipe_right (Confidence: 0.84, Cooldown: 2.0s)
Detected: screenshot (Confidence: 0.98, Cooldown: 2.0s)
Detected: drop (Confidence: 1.00, Cooldown: 2.0s)
Detected: screenshot (Confidence: 1.00, Cooldown: 2.0s)
Detected: drop (Confidence: 0.99, Cooldown: 2.0s)
Detected: screenshot (Confidence: 1.00, Cooldown: 2.0s)
Detected: swipe_down (Confidence: 0.97, Cooldown: 2.0s)
Detected: drop (Confidence: 1.00, Cooldown: 2.0s)
Detected: screenshot (Confidence: 0.96, Cooldown: 2.0s)
Detected: drop (Confidence: 1.00, Cooldown: 2.0s)
🟢 Testing stopped.
