In [3]:
import os
import cv2
import pickle
import mediapipe as mp
import numpy as np
import tkinter as tk
from tkinter import simpledialog
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
# Step 1: Data Collection
DATA_DIR = './data'
dataset_size = 100  # Number of images per gesture

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

cap = cv2.VideoCapture(0)

while True:
    # Open a GUI window to get the label
    root = tk.Tk()
    root.withdraw()  # Hide the main Tkinter window
    label = simpledialog.askstring("Input", "Enter the label for this gesture (or type 'exit' to quit):")
    
    if label is None or label.lower() == 'exit':
        break  # Exit if the user closes the dialog or types "exit"

    class_dir = os.path.join(DATA_DIR, label)
    if not os.path.exists(class_dir):
        os.makedirs(class_dir)
    
    print(f'Collecting data for class: {label}')
    
    while True:
        ret, frame = cap.read()
        cv2.putText(frame, f'Ready for {label}? Press "Q"!', (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 3)
        cv2.imshow('frame', frame)
        if cv2.waitKey(25) == ord('q'):
            break
    
    counter = 0
    while counter < dataset_size:
        ret, frame = cap.read()
        cv2.imshow('frame', frame)
        cv2.imwrite(os.path.join(class_dir, f'{counter}.jpg'), frame)
        counter += 1
        cv2.waitKey(25)

cap.release()
cv2.destroyAllWindows()

Collecting data for class: hello
Collecting data for class: okay
Collecting data for class: no


In [7]:
# Step 2: Data Preprocessing
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)
data, labels = [], []

for dir_ in os.listdir(DATA_DIR):
    for img_path in os.listdir(os.path.join(DATA_DIR, dir_)):
        data_aux, x_, y_ = [], [], []
        img = cv2.imread(os.path.join(DATA_DIR, dir_, img_path))
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        results = hands.process(img_rgb)
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                for i in range(len(hand_landmarks.landmark)):
                    x_.append(hand_landmarks.landmark[i].x)
                    y_.append(hand_landmarks.landmark[i].y)
                for i in range(len(hand_landmarks.landmark)):
                    data_aux.append(hand_landmarks.landmark[i].x - min(x_))
                    data_aux.append(hand_landmarks.landmark[i].y - min(y_))
            data.append(data_aux)
            labels.append(dir_)  # Store the label as a string (e.g., "A", "B", "C")

# Save the dataset
pickle.dump({'data': data, 'labels': labels}, open('data.pickle', 'wb'))

In [11]:

# Step 3: Model Training
data_dict = pickle.load(open('data.pickle', 'rb'))
data, labels = np.asarray(data_dict['data']), np.asarray(data_dict['labels'])

# Ensure labels remain as strings
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, stratify=labels)

model = RandomForestClassifier()
model.fit(x_train, y_train)

y_predict = model.predict(x_test)
print(f'{accuracy_score(y_predict, y_test) * 100:.2f}% of samples were classified correctly!')

# Save the trained model
pickle.dump({'model': model}, open('model.p', 'wb'))

100.00% of samples were classified correctly!


In [18]:
# Step 4: Live Gesture Recognition
model_dict = pickle.load(open('model.p', 'rb'))
model = model_dict['model']

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)

cap = cv2.VideoCapture(0)

while True:
    data_aux, x_, y_ = [], [], []
    ret, frame = cap.read()
    H, W, _ = frame.shape
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            for i in range(len(hand_landmarks.landmark)):
                x_.append(hand_landmarks.landmark[i].x)
                y_.append(hand_landmarks.landmark[i].y)
            for i in range(len(hand_landmarks.landmark)):
                data_aux.append(hand_landmarks.landmark[i].x - min(x_))
                data_aux.append(hand_landmarks.landmark[i].y - min(y_))

        x1, y1 = int(min(x_) * W) - 10, int(min(y_) * H) - 10
        x2, y2 = int(max(x_) * W) - 10, int(max(y_) * H) - 10

        # Predict the gesture
        prediction = model.predict([np.asarray(data_aux)])

        # Display prediction on the screen
        predicted_label = prediction[0]  # Directly using predicted string label

        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), 4)
        cv2.putText(frame, predicted_label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 0), 3)
        print(predicted_label,end=" ")

    cv2.imshow('frame', frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay no no no no no no no no no no no no no okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello no no okay okay okay okay okay okay okay okay okay okay okay okay no no no no no no no no no no no no no no no Hello Hello Hello Hello Hello Hello Hello Hello Hello no no no no no no no no no okay okay okay Hello Hello Hello Hello no no no no no no no no okay okay okay okay okay okay okay okay okay no no no no no no no no no no no no no no no no no no okay okay okay okay no no no no no n

KeyboardInterrupt: 