# Hand Gesture Recognition System  
### Computer Vision Project using MediaPipe Hands

In [1]:
import cv2
import mediapipe as mp
import numpy as np
import time
import pandas as pd
from collections import defaultdict

from datetime import datetime

# Shared MediaPipe handles used by every cell below:
# `mp_drawing` renders landmarks onto frames, `mp_hands` exposes the hand model.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# A single tracker instance is created once and reused for all frames.
# It tracks at most one hand and is configured for a video stream
# (static_image_mode=False) rather than independent still images.
hands = mp_hands.Hands(
    min_tracking_confidence=0.5,
    min_detection_confidence=0.7,
    max_num_hands=1,
    static_image_mode=False,
)


In [2]:
def is_finger_extended(landmarks, finger_tip_id, finger_pip_id):
    """Report whether a finger's tip lies above its PIP joint.

    Args:
        landmarks: indexable collection of points exposing a ``y`` attribute.
        finger_tip_id: index of the fingertip landmark.
        finger_pip_id: index of the same finger's PIP joint landmark.

    Returns:
        True when the tip's ``y`` is strictly smaller than the joint's ``y``
        (a smaller ``y`` is treated as "higher up"); equal values count as
        not extended.
    """
    tip_y = landmarks[finger_tip_id].y
    joint_y = landmarks[finger_pip_id].y
    return tip_y < joint_y

def is_thumb_extended(landmarks):
    """Report whether the thumb is extended sideways away from the wrist.

    The thumb folds across the palm rather than curling downwards, so the
    vertical tip-vs-joint test used for the other fingers does not apply.
    Instead the horizontal distance of the thumb tip from the wrist is
    compared with that of the thumb's MCP joint.

    Args:
        landmarks: indexable collection of hand landmarks exposing ``x``,
            indexed by ``mp_hands.HandLandmark`` members.

    Returns:
        True when the thumb tip is strictly farther from the wrist along the
        x axis than the thumb MCP joint is.
    """
    thumb_tip = landmarks[mp_hands.HandLandmark.THUMB_TIP]
    thumb_mcp = landmarks[mp_hands.HandLandmark.THUMB_MCP]

    # The wrist landmark serves as the horizontal reference point.
    wrist_x = landmarks[mp_hands.HandLandmark.WRIST].x
    return abs(thumb_tip.x - wrist_x) > abs(thumb_mcp.x - wrist_x)


In [3]:
def classify_gesture(hand_landmarks):
    """Map one detected hand to a gesture label using fixed finger rules.

    Args:
        hand_landmarks: object whose ``landmark`` attribute is the indexable
            landmark collection passed on to ``is_thumb_extended`` and
            ``is_finger_extended``.

    Returns:
        One of "FIST", "OPEN_PALM", "THUMBS_UP", "PEACE_SIGN", "POINTING",
        "THREE", "FOUR", or "UNKNOWN" when no rule matches.
    """
    points = hand_landmarks.landmark
    joint = mp_hands.HandLandmark

    thumb = is_thumb_extended(points)
    index, middle, ring, pinky = (
        is_finger_extended(points, tip_id, pip_id)
        for tip_id, pip_id in (
            (joint.INDEX_FINGER_TIP, joint.INDEX_FINGER_PIP),
            (joint.MIDDLE_FINGER_TIP, joint.MIDDLE_FINGER_PIP),
            (joint.RING_FINGER_TIP, joint.RING_FINGER_PIP),
            (joint.PINKY_TIP, joint.PINKY_PIP),
        )
    )
    fingers = (index, middle, ring, pinky)

    # All four fingers curled: the thumb alone decides the label.
    if not any(fingers):
        return "THUMBS_UP" if thumb else "FIST"

    # All four fingers raised: again only the thumb distinguishes the two.
    if all(fingers):
        return "OPEN_PALM" if thumb else "FOUR"

    # Every remaining rule requires a raised index finger and a lowered
    # pinky; the thumb is ignored from here on.
    if not index or pinky:
        return "UNKNOWN"

    if middle:
        return "THREE" if ring else "PEACE_SIGN"
    if not ring:
        return "POINTING"

    return "UNKNOWN"


In [4]:
def run_gesture_recognition(duration=60, show_landmarks=True):
    """Run the live webcam demo, printing each change of detected gesture.

    Frames are read from camera 0, mirrored, classified with
    ``classify_gesture`` and shown in a window with the current label
    overlaid. The loop stops when ``duration`` elapses, when a frame cannot
    be read, or when 'q' is pressed in the window.

    Args:
        duration: maximum run time in seconds.
        show_landmarks: when True, draw the hand skeleton on each frame.
    """
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        # Fail loudly instead of silently skipping the loop.
        cap.release()
        print("Error: could not open camera 0")
        return

    # Monotonic clock: elapsed time is unaffected by system clock changes.
    start_time = time.monotonic()
    current_gesture = "NONE"

    print("Starting gesture recognition...")
    print("Press 'q' to quit\n")

    try:
        while cap.isOpened() and (time.monotonic() - start_time) < duration:
            ret, frame = cap.read()
            if not ret:
                break

            # Mirror the image so on-screen motion matches the user's motion.
            frame = cv2.flip(frame, 1)
            # OpenCV delivers BGR frames; the hand tracker is fed RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(rgb_frame)

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    if show_landmarks:
                        mp_drawing.draw_landmarks(
                            frame, hand_landmarks, mp_hands.HAND_CONNECTIONS
                        )

                    gesture = classify_gesture(hand_landmarks)
                    # Only report transitions to keep the log readable.
                    if gesture != current_gesture:
                        current_gesture = gesture
                        print("Detected:", current_gesture)
            elif current_gesture != "NONE":
                current_gesture = "NONE"
                print("No hand detected")

            cv2.putText(frame, f"Gesture: {current_gesture}",
                        (10, 30), cv2.FONT_HERSHEY_SIMPLEX,
                        1, (0, 255, 0), 2)

            cv2.imshow("Hand Gesture Recognition", frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera and close the window, even if the loop is
        # interrupted (e.g. kernel interrupt) or a frame raises.
        cap.release()
        cv2.destroyAllWindows()


In [5]:
# Launch the live webcam demo for at most 60 seconds (press 'q' in the
# window to stop early); gesture changes are printed below.
run_gesture_recognition(duration=60)

Starting gesture recognition...
Press 'q' to quit





Detected: UNKNOWN
Detected: THUMBS_UP
No hand detected
Detected: THUMBS_UP
No hand detected
Detected: THUMBS_UP
No hand detected
Detected: THREE
Detected: OPEN_PALM
Detected: PEACE_SIGN
Detected: THREE
Detected: FOUR
Detected: OPEN_PALM
Detected: THUMBS_UP
Detected: FIST
Detected: THUMBS_UP
Detected: FIST
Detected: THUMBS_UP
Detected: FIST
Detected: THUMBS_UP
Detected: FIST
Detected: UNKNOWN
Detected: FIST
No hand detected
Detected: THUMBS_UP
No hand detected
Detected: UNKNOWN
Detected: OPEN_PALM
No hand detected
Detected: FOUR
Detected: THUMBS_UP
Detected: OPEN_PALM
Detected: THUMBS_UP
Detected: FIST
Detected: THUMBS_UP
Detected: FIST
Detected: THUMBS_UP
Detected: FIST
Detected: THUMBS_UP
No hand detected
Detected: FIST
Detected: THREE
Detected: FOUR
Detected: UNKNOWN
Detected: THREE
Detected: POINTING
Detected: FOUR
Detected: FIST
Detected: FOUR
Detected: FIST
Detected: PEACE_SIGN
Detected: FIST
Detected: THREE
Detected: FIST
Detected: THREE
Detected: FIST
Detected: THUMBS_UP
Detecte

In [6]:
class GestureEvaluator:
    """Collect labelled webcam samples and score ``classify_gesture`` on them.

    Every recorded sample is a dict appended to ``evaluation_data`` with the
    keys ``timestamp``, ``ground_truth``, ``predicted``, ``correct``,
    ``distance``, ``background`` and ``confidence``.
    """

    def __init__(self):
        # One dict per recorded sample (keys listed in the class docstring).
        self.evaluation_data = []
        # Gestures iterated by run_full_evaluation.
        self.gesture_names = ["FIST", "OPEN_PALM", "THUMBS_UP", "PEACE_SIGN",
                              "POINTING", "THREE", "FOUR"]

    def collect_evaluation_data(self, gesture_name, num_samples=30,
                                distance="medium", background="clean"):
        """Record classifier predictions while the user holds one gesture.

        Opens camera 0 and shows a preview window. Recording starts once 's'
        is pressed and ends after ``num_samples`` samples, when a frame
        cannot be read, or when 'q' is pressed. Frames without a detected
        hand are not recorded.

        Args:
            gesture_name: ground-truth label stored with every sample.
            num_samples: number of samples to record.
            distance: free-form condition tag stored with every sample.
            background: free-form condition tag stored with every sample.
        """
        cap = cv2.VideoCapture(0)
        samples_collected = 0
        collecting = False

        print(f"\nCollecting data for: {gesture_name}")
        print(f"Distance: {distance}, Background: {background}")
        print("Press 's' to start, 'q' to quit\n")

        try:
            while cap.isOpened() and samples_collected < num_samples:
                ret, frame = cap.read()
                if not ret:
                    break

                # Mirror the preview and convert BGR -> RGB for the tracker.
                frame = cv2.flip(frame, 1)
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(rgb_frame)

                predicted = "NONE"

                if results.multi_hand_landmarks:
                    # Pair each hand with its own handedness entry rather
                    # than always reading the first one.
                    detections = zip(results.multi_hand_landmarks,
                                     results.multi_handedness)
                    for hand_landmarks, handedness in detections:
                        mp_drawing.draw_landmarks(
                            frame, hand_landmarks, mp_hands.HAND_CONNECTIONS
                        )
                        predicted = classify_gesture(hand_landmarks)
                        # NOTE: this is the tracker's handedness score, not a
                        # confidence for the gesture label itself.
                        confidence_score = handedness.classification[0].score

                        # Never record more than the requested sample count.
                        if collecting and samples_collected < num_samples:
                            self.evaluation_data.append({
                                'timestamp': datetime.now().isoformat(),
                                'ground_truth': gesture_name,
                                'predicted': predicted,
                                'correct': predicted == gesture_name,
                                'distance': distance,
                                'background': background,
                                'confidence': confidence_score
                            })
                            samples_collected += 1

                status = "COLLECTING" if collecting else "READY - Press 's'"
                cv2.putText(frame, f"{status} ({samples_collected}/{num_samples})",
                            (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

                cv2.putText(frame, f"Ground Truth: {gesture_name}",
                            (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2)

                cv2.putText(frame, f"Predicted: {predicted}",
                            (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)

                cv2.imshow("Data Collection", frame)
                key = cv2.waitKey(1) & 0xFF

                if key == ord('s'):
                    collecting = True
                    print("Started collecting...")
                elif key == ord('q'):
                    break
        finally:
            # Release the camera and window even if the loop is interrupted.
            cap.release()
            cv2.destroyAllWindows()

        print(f"Collected {samples_collected} samples for {gesture_name}\n")

    def run_full_evaluation(self, samples_per_condition=30):
        """Collect samples for every gesture/distance/background combination.

        Iterates ``gesture_names`` x {close, medium, far} x {clean,
        cluttered}, pausing one second between conditions, then writes all
        collected samples with ``save_results``.

        Args:
            samples_per_condition: samples to record for each combination.
        """
        distances = ["close", "medium", "far"]
        backgrounds = ["clean", "cluttered"]

        print("=" * 50)
        print(" FULL GESTURE EVALUATION ")
        print("=" * 50)

        for gesture in self.gesture_names:
            for distance in distances:
                for background in backgrounds:
                    print(f"Testing {gesture} | {distance} | {background}")
                    self.collect_evaluation_data(
                        gesture,
                        num_samples=samples_per_condition,
                        distance=distance,
                        background=background
                    )
                    # Short pause between conditions.
                    time.sleep(1)

        self.save_results()

    def calculate_metrics(self):
        """Print and return accuracy statistics for the collected samples.

        Returns:
            None when no samples have been collected. Otherwise a dict with
            ``overall_accuracy`` (percent), per-group accuracy dicts
            ``gesture_accuracy``, ``distance_accuracy`` and
            ``background_accuracy`` (percent), and ``confusion``: the
            row-normalised confusion matrix in percent, as produced by
            ``DataFrame.to_dict()`` (keyed by predicted label, then by
            ground-truth label).
        """
        if not self.evaluation_data:
            print("No evaluation data found.")
            return None

        df = pd.DataFrame(self.evaluation_data)

        # The mean of a boolean column is the fraction of True values.
        overall_accuracy = df["correct"].mean() * 100
        gesture_accuracy = df.groupby("ground_truth")["correct"].mean() * 100
        distance_accuracy = df.groupby("distance")["correct"].mean() * 100
        background_accuracy = df.groupby("background")["correct"].mean() * 100

        # Each ground-truth row sums to 100 %.
        confusion = pd.crosstab(df["ground_truth"], df["predicted"],
                                normalize="index") * 100

        print("\nOverall Accuracy:", round(overall_accuracy, 2), "%")
        print("\nGesture Accuracy:\n", gesture_accuracy)
        print("\nDistance Accuracy:\n", distance_accuracy)
        print("\nBackground Accuracy:\n", background_accuracy)
        print("\nConfusion Matrix:\n", confusion.round(2))

        return {
            "overall_accuracy": overall_accuracy,
            "gesture_accuracy": gesture_accuracy.to_dict(),
            "distance_accuracy": distance_accuracy.to_dict(),
            "background_accuracy": background_accuracy.to_dict(),
            "confusion": confusion.to_dict()
        }

    def save_results(self, filename="evaluation_results.csv"):
        """Write all collected samples to ``filename`` as CSV (no index)."""
        df = pd.DataFrame(self.evaluation_data)
        df.to_csv(filename, index=False)
        print(f"Results saved to {filename}")

    def load_results(self, filename="evaluation_results.csv"):
        """Replace ``evaluation_data`` with the rows read from ``filename``."""
        df = pd.read_csv(filename)
        self.evaluation_data = df.to_dict("records")
        print(f"Loaded {len(self.evaluation_data)} samples from {filename}")


In [8]:
def test_robustness():
    """Run a short interactive robustness check over eight fixed conditions.

    For each (gesture, distance, background) scenario ten samples are
    collected from the webcam via a fresh ``GestureEvaluator``; afterwards
    ``calculate_metrics`` is invoked and the samples are saved to
    ``robustness_test.csv``.
    """
    harness = GestureEvaluator()

    print("Running quick robustness test...")

    # (ground-truth gesture, distance tag, background tag)
    scenarios = (
        ("FIST", "close", "clean"),
        ("FIST", "far", "cluttered"),
        ("OPEN_PALM", "medium", "clean"),
        ("THUMBS_UP", "close", "cluttered"),
        ("PEACE_SIGN", "far", "clean"),
        ("POINTING", "medium", "cluttered"),
        ("THREE", "close", "clean"),
        ("FOUR", "far", "cluttered"),
    )

    for label, range_tag, scene_tag in scenarios:
        conditions = {"num_samples": 10, "distance": range_tag, "background": scene_tag}
        harness.collect_evaluation_data(label, **conditions)

    harness.calculate_metrics()
    harness.save_results("robustness_test.csv")

# test_robustness()
