# In [None]:  (notebook cell marker left over from the Jupyter export)
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
from collections import deque
import time
import os
import matplotlib.pyplot as plt
from datetime import datetime

# Configuration: model artefacts and input geometry
MODEL_PATH = "gesture_model.tflite"
LABELS_PATH = "labels.npy"
SEQUENCE_LENGTH = 60
LANDMARKS = 21 * 3  # 21 landmarks, each with (x, y, z) coordinates -> 63 values

# Configuration: recognition timing, all in seconds (adjust to preference)
INITIAL_DELAY = 1.0  # Startup delay before detecting gestures
DETECTION_DURATION = 2.0  # Time a gesture must be held before confirming
IDLE_COOLDOWN = 1.5  # Time before detecting another gesture

# Configuration: confidence thresholds (adjust to preference)
CONFIDENCE_THRESHOLD = 0.70  # Default confidence threshold
STATIC_GESTURE_THRESHOLD = 0.85  # Higher threshold for static gestures
DYNAMIC_GESTURE_THRESHOLD = 0.65  # Lower threshold for dynamic gestures

# Create the working directories if they don't exist yet.
# "logs" receives the per-day detection CSV files written by log_detection().
os.makedirs("models", exist_ok=True)
os.makedirs("logs", exist_ok=True)

# Abort early when a model artefact is missing. The message reports the paths
# that were actually checked (MODEL_PATH / LABELS_PATH) so it cannot disagree
# with where the script really looks.
missing_files = [path for path in (MODEL_PATH, LABELS_PATH) if not os.path.exists(path)]
if missing_files:
    print(f"⚠️ Model files not found: {', '.join(missing_files)}. "
          "Please ensure they exist at the configured MODEL_PATH / LABELS_PATH.")
    print("If you're running this for the first time, please run the training script first.")
    # SystemExit works everywhere; the exit() builtin is injected by the
    # `site` module for interactive use and is not guaranteed to exist.
    raise SystemExit(1)

# Load the TFLite model
print("Loading model...")
interpreter = tf.lite.Interpreter(model_path=MODEL_PATH)
# Tensors are allocated once up front; the main loop then only calls
# set_tensor / invoke / get_tensor on this interpreter.
interpreter.allocate_tensors()

# Get input and output tensor details
# (only element [0] of each list is used by the inference code below)
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Load gesture labels
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects from the file - only load label files from a trusted source.
labels = np.load(LABELS_PATH, allow_pickle=True).tolist()
print(f"Loaded {len(labels)} gesture labels: {labels}")

# Identify static and dynamic gestures (assuming the same classification as in training)
# STATIC_GESTURES selects which confidence threshold applies to a prediction;
# every other label found in `labels` is treated as dynamic.
STATIC_GESTURES = ["like", "dislike", "peace"]
DYNAMIC_GESTURES = [g for g in labels if g not in STATIC_GESTURES]

# Initialize MediaPipe Hands
# Configured for a video stream (static_image_mode=False) and a single hand.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.6
)
# Drawing helper used to render the detected hand skeleton onto each frame
mp_draw = mp.solutions.drawing_utils

# Set up video capture
print("Initializing camera...")
cap = cv2.VideoCapture(0)

# Check that the camera opened before configuring it
if not cap.isOpened():
    print("❌ Error: Could not open video capture device")
    cap.release()
    # Exit with a non-zero status so callers can tell the start-up failed.
    # SystemExit is used instead of exit(), which is a `site`-module helper
    # meant for interactive sessions and would report status 0 here.
    raise SystemExit(1)

# Best-effort capture settings; the return values of cap.set are not checked
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
cap.set(cv2.CAP_PROP_FPS, 30)  # Try to get 30fps if possible

# Sequence buffers: three independent fixed-length deques, each capped at
# SEQUENCE_LENGTH entries (appending to a full deque drops its oldest entry)
raw_sequence, norm_sequence, velocity_sequence = (
    deque(maxlen=SEQUENCE_LENGTH) for _ in range(3)
)

# Feature extraction functions (matching the training preprocessing)
def extract_features(landmarks_sequence):
    """Build position and velocity features from a landmark sequence.

    Positions are expressed relative to the first frame of the sequence;
    velocities are the frame-to-frame differences of those positions, with
    an all-zero first row.

    Returns:
        A ``(positions, velocities)`` pair of float32 arrays, or
        ``(None, None)`` when fewer than SEQUENCE_LENGTH frames were given.
    """
    if len(landmarks_sequence) < SEQUENCE_LENGTH:
        return None, None

    frames = np.array(landmarks_sequence, dtype=np.float32)

    # Shift every frame by the first one so the sequence starts at zero
    origin = frames[0]
    relative = frames - origin

    # Temporal derivative: difference between consecutive frames; the first
    # frame has no predecessor, so its row stays zero
    step = relative[1:] - relative[:-1]
    deltas = np.zeros_like(relative)
    deltas[1:] = step

    return relative, deltas

# State tracking
# Nothing has been seen yet: no gesture in progress, no hold timer, and no
# "hand lost" timestamp.
current_gesture = gesture_start_time = no_hand_detected_time = None

# Counters / timestamps that start at zero
last_detection_time = total_frames = 0

# Confirmed detections are appended here as the session runs
detection_history = []

# Rolling windows
fps_history = deque(maxlen=30)
prediction_history = deque(maxlen=10)  # Recent predictions kept for smoothing

# Wall-clock references: program start (for the start-up delay) and the
# previous frame's timestamp (for the FPS estimate)
program_start_time = time.time()
start_time = time.time()

# Visualization colors
# The frames these colors are drawn on come straight from cv2.VideoCapture
# and are shown with cv2.imshow, i.e. they are in OpenCV's BGR channel order.
# Every tuple below is therefore (blue, green, red); writing them in RGB
# order would swap the red and blue channels on screen.
COLORS = {
    'background': (50, 50, 50),    # Dark gray
    'text': (255, 255, 255),       # White
    'detecting': (50, 205, 50),    # Green
    'confirmed': (255, 191, 0),    # Deep Sky Blue
    'waiting': (0, 165, 255),      # Orange
    'no_hand': (128, 128, 128),    # Gray
    'high_conf': (50, 205, 50),    # Green
    'medium_conf': (0, 165, 255),  # Orange
    'low_conf': (0, 0, 255),       # Red
}

# Smoothing function for predictions
def smooth_predictions(new_pred, history=None):
    """Blend the newest prediction list with earlier ones.

    Args:
        new_pred: List of ``{"label", "confidence"}`` dicts for the current frame.
        history: Mutable sequence of earlier prediction lists. ``new_pred`` is
            appended to it in place. When ``None``, no smoothing is done.

    Returns:
        ``new_pred`` unchanged when ``history`` is ``None``; otherwise up to
        three ``{"label", "confidence"}`` dicts sorted by descending
        weighted confidence.
    """
    if history is None:
        return new_pred

    # The caller's history is updated in place with the current prediction
    history.append(new_pred)

    # Linearly increasing weights (oldest 0.5 -> newest 1.0), normalised to sum to 1
    recency_weights = np.linspace(0.5, 1.0, len(history))
    recency_weights /= recency_weights.sum()

    # Accumulate each label's weighted confidence across the whole history
    totals = {}
    for weight, past in zip(recency_weights, history):
        for entry in past:
            name = entry["label"]
            weighted = entry["confidence"] * weight
            totals[name] = totals[name] + weighted if name in totals else weighted

    # Highest combined confidence first; keep the top 3
    ranked = sorted(
        ({"label": name, "confidence": score} for name, score in totals.items()),
        key=lambda entry: entry["confidence"],
        reverse=True,
    )
    return ranked[:3]

# Function to create visualization of confidence scores
def draw_confidence_bars(frame, predictions, x=20, y=80, width=200, height=20, gap=30):
    """Draw one horizontal confidence bar per prediction onto ``frame``.

    At most the first three entries of ``predictions`` are drawn, stacked
    vertically ``gap`` pixels apart starting at ``(x, y)``. Each bar is
    ``width`` x ``height`` pixels, filled in proportion to the confidence,
    and followed by a "label: NN.N%" caption. ``frame`` is modified in place.
    """
    right = x + width

    for row, entry in enumerate(predictions[:3]):
        score = entry["confidence"]
        name = entry["label"]

        # Vertical extent of this row's bar
        top = y + row*gap
        bottom = y + height + row*gap

        # Bar color depends on the confidence band
        if score > 0.8:
            band = 'high_conf'
        elif score > 0.5:
            band = 'medium_conf'
        else:
            band = 'low_conf'
        bar_color = COLORS[band]

        # Dark backdrop, then the filled portion, then a light outline
        cv2.rectangle(frame, (x, top), (right, bottom), (50, 50, 50), -1)
        fill_right = x + int(width * score)
        cv2.rectangle(frame, (x, top), (fill_right, bottom), bar_color, -1)
        cv2.rectangle(frame, (x, top), (right, bottom), (200, 200, 200), 1)

        # Caption to the right of the bar
        caption = f"{name}: {score*100:.1f}%"
        caption_origin = (right + 10, y + height//2 + row*gap + 5)
        cv2.putText(frame, caption, caption_origin,
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS['text'], 1)

# Create gesture overlay with more information
def create_gesture_overlay(frame, text, predictions=None, fps=0, progress=None, status="idle"):
    """Render the status header and optional widgets onto ``frame``.

    Args:
        frame: Image to draw on; it is modified in place and also returned.
        text: Main message shown in the header.
        predictions: Optional list of prediction dicts; when non-empty they
            are passed to ``draw_confidence_bars``.
        fps: Frames-per-second value shown in the header.
        progress: Optional fraction used to fill a small progress bar; no
            bar is drawn when ``None``.
        status: One of "detecting", "confirmed", "waiting" or "no_hand";
            any other value uses the background color for the header.

    Returns:
        The same ``frame`` object, after drawing.
    """
    _, frame_w = frame.shape[:2]
    header_h = 60

    # Header color follows the status; unknown statuses fall back to background
    known_statuses = ("detecting", "confirmed", "waiting", "no_hand")
    header_color = COLORS[status if status in known_statuses else 'background']

    # Paint the header on a copy, then blend the copy back at 70% opacity
    tinted = frame.copy()
    cv2.rectangle(tinted, (0, 0), (frame_w, header_h), header_color, -1)
    opacity = 0.7
    cv2.addWeighted(tinted, opacity, frame, 1 - opacity, 0, frame)

    # Main message on the left, FPS counter on the right
    cv2.putText(frame, text, (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.8, COLORS['text'], 2)
    cv2.putText(frame, f"FPS: {fps:.1f}", (frame_w - 120, 40),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, COLORS['text'], 1)

    # Optional thin progress bar in the top-right corner
    if progress is not None:
        track_w, track_h = 200, 5
        left = frame_w - track_w - 20
        top = 20
        bottom = top + track_h

        # Gray track, then the white filled portion
        cv2.rectangle(frame, (left, top), (left + track_w, bottom), (100, 100, 100), -1)
        fill_w = int(track_w * progress)
        cv2.rectangle(frame, (left, top), (left + fill_w, bottom), (255, 255, 255), -1)

    # Optional confidence bars
    if predictions:
        draw_confidence_bars(frame, predictions)

    return frame

# Logging function
def log_detection(gesture, confidence):
    """Append one detection to the current day's CSV log.

    A ``timestamp,gesture,confidence`` line is appended to
    ``logs/gesture_log_YYYYMMDD.csv``. The ``logs`` directory must already
    exist.

    Args:
        gesture: Label of the detected gesture.
        confidence: Confidence score; written with four decimal places.
    """
    # Read the clock once so the row's timestamp and the file's date can
    # never disagree (two separate now() calls could straddle midnight).
    now = datetime.now()
    log_path = f"logs/gesture_log_{now.strftime('%Y%m%d')}.csv"
    log_entry = f"{now.strftime('%Y-%m-%d %H:%M:%S')},{gesture},{confidence:.4f}\n"

    # Explicit encoding keeps the log readable regardless of the platform default
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(log_entry)

# Main loop
#
# Per frame: grab and mirror the image, run MediaPipe hand tracking, buffer the
# landmarks, and once SEQUENCE_LENGTH frames are buffered run the TFLite model,
# smooth its output and drive the detect -> confirm -> cooldown state machine.
print("Starting gesture recognition...\n")
print("Press 'q' to quit")
print("Press 's' to take a screenshot")
print("Press 'd' to toggle debug visualization")

show_debug = False
debug_plot = None
# Rolling window of the last 100 debug samples; deque(maxlen=...) discards
# the oldest entry automatically.
debug_frames = deque(maxlen=100)
debug_confidences = deque(maxlen=100)


def _handle_keypress(key, frame):
    """Apply one key code from cv2.waitKey; return True when the user wants to quit.

    'q' quits, 's' saves ``frame`` as a timestamped PNG, and 'd' toggles the
    debug plot (closing its figure when it is switched off).
    """
    global show_debug, debug_plot

    if key == ord('q'):
        return True
    if key == ord('s'):
        screenshot_path = f"screenshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
        cv2.imwrite(screenshot_path, frame)
        print(f"Screenshot saved to {screenshot_path}")
    elif key == ord('d'):
        show_debug = not show_debug
        if not show_debug and debug_plot is not None:
            plt.close(debug_plot)
            debug_plot = None
    return False


try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to capture frame from camera. Exiting...")
            break

        # Calculate FPS from the time since the previous frame
        current_time = time.time()
        elapsed = current_time - start_time
        start_time = current_time
        fps = 1 / elapsed if elapsed > 0 else 0
        fps_history.append(fps)
        avg_fps = sum(fps_history) / len(fps_history)

        total_frames += 1
        frame = cv2.flip(frame, 1)  # Mirror image for more intuitive feedback
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Apply initial delay before detection starts
        if current_time < program_start_time + INITIAL_DELAY:
            progress = (current_time - program_start_time) / INITIAL_DELAY
            frame = create_gesture_overlay(frame, "INITIALIZING SYSTEM...",
                                           fps=avg_fps, progress=progress, status="waiting")
            cv2.imshow("Gesture Recognition", frame)
            if _handle_keypress(cv2.waitKey(1) & 0xFF, frame):
                break
            continue

        # Process hand landmarks
        results = hands.process(rgb_frame)

        # Handle case when no hand is detected
        if not results.multi_hand_landmarks:
            if no_hand_detected_time is None:
                no_hand_detected_time = current_time
                # Clear the sequence buffers when hand disappears
                raw_sequence.clear()
                norm_sequence.clear()
                velocity_sequence.clear()
                # Also drop the smoothing history and any gesture being held.
                # Otherwise a returning hand would be blended with stale
                # predictions and could be confirmed against the old hold
                # timer without being held again.
                prediction_history.clear()
                current_gesture = None
                gesture_start_time = None

            # Only show "NO HAND DETECTED" after a short delay
            if current_time - no_hand_detected_time > 1.0:
                frame = create_gesture_overlay(frame, "NO HAND DETECTED",
                                               fps=avg_fps, status="no_hand")
            else:
                frame = create_gesture_overlay(frame, "WAITING FOR HAND...",
                                               fps=avg_fps, status="waiting")

            cv2.imshow("Gesture Recognition", frame)
            if _handle_keypress(cv2.waitKey(1) & 0xFF, frame):
                break
            continue

        # Hand is detected
        no_hand_detected_time = None

        # Process only the first detected hand
        hand_landmarks = results.multi_hand_landmarks[0]
        mp_draw.draw_landmarks(
            frame,
            hand_landmarks,
            mp_hands.HAND_CONNECTIONS,
            landmark_drawing_spec=mp_draw.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=2),
            connection_drawing_spec=mp_draw.DrawingSpec(color=(0, 255, 0), thickness=1)
        )

        # Flatten the landmarks to [x0, y0, z0, x1, y1, z1, ...] and buffer them
        landmarks = [coord for lm in hand_landmarks.landmark for coord in (lm.x, lm.y, lm.z)]
        raw_sequence.append(landmarks)

        # Feature extraction and prediction
        if len(raw_sequence) == SEQUENCE_LENGTH:
            # Extract features matching the training preprocessing; only the
            # normalized positions are fed to the model here.
            seq_normalized, _ = extract_features(raw_sequence)

            if seq_normalized is not None:
                # Prepare input tensor (add the leading batch axis)
                input_data = seq_normalized[np.newaxis, ...]

                # Run inference
                interpreter.set_tensor(input_details[0]['index'], input_data)
                interpreter.invoke()
                output_data = interpreter.get_tensor(output_details[0]['index'])

                # Get top predictions (three highest scores, best first)
                top_indices = output_data[0].argsort()[-3:][::-1]
                top_predictions = [{"label": labels[idx], "confidence": float(output_data[0][idx])}
                                   for idx in top_indices]

                # Apply smoothing
                smoothed_predictions = smooth_predictions(top_predictions, prediction_history)
                top_smoothed = smoothed_predictions[0]

                # Pick the threshold for the label that is actually being
                # tested (the smoothed winner), so a static gesture is never
                # judged against the dynamic threshold or vice versa.
                if top_smoothed["label"] in STATIC_GESTURES:
                    current_threshold = STATIC_GESTURE_THRESHOLD
                else:
                    current_threshold = DYNAMIC_GESTURE_THRESHOLD

                # For debugging, record confidence values
                if show_debug:
                    debug_frames.append(total_frames)
                    debug_confidences.append(top_smoothed["confidence"])

                # Gesture recognition logic
                if top_smoothed["confidence"] > current_threshold:
                    # A new (or different) gesture restarts the hold timer
                    if current_gesture is None or current_gesture != top_smoothed["label"]:
                        current_gesture = top_smoothed["label"]
                        gesture_start_time = current_time

                    # Calculate detection duration
                    detection_duration = current_time - gesture_start_time

                    # Handle confirmation process
                    if detection_duration < DETECTION_DURATION:
                        # Still in detection phase
                        progress = detection_duration / DETECTION_DURATION
                        overlay_text = f"DETECTING: {current_gesture}"
                        frame = create_gesture_overlay(frame, overlay_text,
                                                       predictions=smoothed_predictions,
                                                       fps=avg_fps, progress=progress,
                                                       status="detecting")
                    elif current_time - last_detection_time > IDLE_COOLDOWN:
                        # Confirm the gesture
                        last_detection_time = current_time
                        overlay_text = f"CONFIRMED: {current_gesture.upper()}"
                        frame = create_gesture_overlay(frame, overlay_text,
                                                       predictions=smoothed_predictions,
                                                       fps=avg_fps,
                                                       status="confirmed")

                        # Log the detection
                        log_detection(current_gesture, top_smoothed["confidence"])
                        detection_history.append((current_time, current_gesture))

                        # Reset current gesture after confirmation
                        current_gesture = None
                    else:
                        # In cooldown period
                        cooldown_remaining = IDLE_COOLDOWN - (current_time - last_detection_time)
                        progress = 1.0 - (cooldown_remaining / IDLE_COOLDOWN)
                        overlay_text = f"COOLDOWN: {cooldown_remaining:.1f}s"
                        frame = create_gesture_overlay(frame, overlay_text,
                                                       predictions=smoothed_predictions,
                                                       fps=avg_fps, progress=progress,
                                                       status="waiting")
                else:
                    frame = create_gesture_overlay(frame, "READY",
                                                   predictions=smoothed_predictions,
                                                   fps=avg_fps)
                    current_gesture = None
        else:
            # Not enough frames yet
            progress = len(raw_sequence) / SEQUENCE_LENGTH
            frame = create_gesture_overlay(frame, f"BUFFERING FRAMES: {len(raw_sequence)}/{SEQUENCE_LENGTH}",
                                           fps=avg_fps, progress=progress, status="waiting")

        # Display the frame
        cv2.imshow("Gesture Recognition", frame)

        # Show debug visualization if enabled
        if show_debug and len(debug_frames) > 10:
            if debug_plot is None or not plt.fignum_exists(debug_plot.number):
                debug_plot = plt.figure(figsize=(10, 4))
                plt.ion()

            plt.clf()
            plt.plot(list(debug_frames), list(debug_confidences), 'b-')
            plt.axhline(y=STATIC_GESTURE_THRESHOLD, color='r', linestyle='-', label='Static threshold')
            plt.axhline(y=DYNAMIC_GESTURE_THRESHOLD, color='g', linestyle='-', label='Dynamic threshold')
            plt.ylim(0, 1.05)
            plt.title("Gesture Confidence Over Time")
            plt.xlabel("Frame")
            plt.ylabel("Confidence")
            plt.legend()
            plt.tight_layout()
            plt.draw()
            plt.pause(0.001)

        # Check for key presses
        if _handle_keypress(cv2.waitKey(1) & 0xFF, frame):
            break
finally:
    # Clean up even when the loop exits through an exception, so the camera
    # device and the display windows are always released.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()
    if debug_plot is not None:
        plt.close(debug_plot)

print("Gesture recognition ended.")