In [1]:
pip install opencv-python

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip uninstall cmake

In [1]:
pip install scikit-learn pyttsx3 pandas

Collecting pyttsx3
  Downloading pyttsx3-2.90-py3-none-any.whl.metadata (3.6 kB)
Collecting comtypes (from pyttsx3)
  Downloading comtypes-1.4.2-py3-none-any.whl.metadata (4.1 kB)
Collecting pypiwin32 (from pyttsx3)
  Downloading pypiwin32-223-py3-none-any.whl.metadata (236 bytes)
Downloading pyttsx3-2.90-py3-none-any.whl (39 kB)
Downloading comtypes-1.4.2-py3-none-any.whl (201 kB)
   ---------------------------------------- 0.0/201.2 kB ? eta -:--:--
   -------------- ------------------------- 71.7/201.2 kB 2.0 MB/s eta 0:00:01
   ---------------------------- ----------- 143.4/201.2 kB 1.7 MB/s eta 0:00:01
   -------------------------------------- - 194.6/201.2 kB 2.0 MB/s eta 0:00:01
   -------------------------------------- - 194.6/201.2 kB 2.0 MB/s eta 0:00:01
   -------------------------------------- - 194.6/201.2 kB 2.0 MB/s eta 0:00:01
   -------------------------------------- - 194.6/201.2 kB 2.0 MB/s eta 0:00:01
   -------------------------------------- - 194.6/201.2 kB 2.0 MB

# mainly includes 3 steps:

* capturing hand landarks -We use MediaPipe to detect and track hand landmarks from the webcam feed.Each frame's hand landmarks are collected and saved into a CSV file for later use in model training.
* training a machine learning model: we train a K-Nearest Neighbors (KNN) classifier to recognize different hand gestures.
  We load the collected hand landmarks data.
  Split the data into training and testing sets.
  Train a KNN classifier and evaluate its accuracy.
  Save the trained model for later use in real-time recognition.
* recognising gestures in real time
  We load the trained model.
  Capture hand landmarks in real-time and use the model to predict the gesture.
  Provide audio feedback using text-to-speech and display the gesture name on the webcam feed.
  

#  step 1:Capture Hand Landmarks

In [1]:
import cv2
import mediapipe as mp
import numpy as np

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils 
mp_drawing_styles = mp.solutions.drawing_styles 

# Initialize webcam
cam = cv2.VideoCapture(0)

with mp_hands.Hands(
    model_complexity=0,  
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
) as hands:
    while cam.isOpened():
        success, image = cam.read()
        if not success:
            continue
        
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        results = hands.process(image) #rocesses the RGB image to detect and track hands.
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)#Converts the image back to BGR color space for OpenCV compatibility.
        
       
        if results.multi_hand_landmarks: #Checks if any hand landmarks are detected.
            for hand_landmarks in results.multi_hand_landmarks: #Iterates through each detected hand.
                mp_drawing.draw_landmarks( #Draws hand landmarks and connections on the image.
                    image, 
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style()
                )
             
                data = [] #Initializes an empty list to store landmark coordinat
                for point in mp_hands.HandLandmark:
                    normalizedLandmark = hand_landmarks.landmark[point] #Gets the normalized coordinates of each landmark.
                    data.append(normalizedLandmark.x) #Appends the x, y, and z coordinates of each landmark to the data list.
                    data.append(normalizedLandmark.y)
                    data.append(normalizedLandmark.z)
            
                print(len(data))
                
                
                data = str(data)[1:-1] #Converts the data list to a comma-separated string and removes the square brackets.
                with open('hello.csv', 'a') as f: #pens the CSV file in append mode
                    f.write(data + ',hello\n') #Writes the data string to the file, adding a label ("rock").
        
        cv2.imshow('Hand Tracking', image)
        
        
        if cv2.waitKey(20) & 0xFF == ord('q'):
            break

cam.release()
cv2.destroyAllWindows()




63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63


# Train the Model

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pickle

data = pd.read_csv('hello.csv')
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) #(25% for testing).

classifier = KNeighborsClassifier(n_neighbors=5) # 5 neighbours
classifier.fit(X_train, Y_train) #Trains the classifier on the training data.


accuracy = accuracy_score(Y_test, classifier.predict(X_test))
print(f'Accuracy: {accuracy}') # accuracy of model

# Save the model to a file
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file) #Serializes and saves the trained classifier to the file.


Accuracy: 1.0


#  load the trained model and run the application to recognize gestures and provide text-to-speech output:

In [4]:
import cv2
import mediapipe as mp
import pickle
import numpy as np
import pyttsx3  # the text-to-speech conversion library

model = pickle.load(open('model.pkl', 'rb'))

mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_hands = mp.solutions.hands

engine = pyttsx3.init() # Initialize text-to-speech engine

cam = cv2.VideoCapture(0)

with mp_hands.Hands(
    model_complexity=0,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
) as hands:
    while cam.isOpened():
        success, image = cam.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue
        
     
        imageWidth, imageHeight = image.shape[:2] #Retrieves height and width of img
        
        
        image.flags.writeable = False #Marks the image as non-writable to improve performance by allowing operations to pass by reference instead of making a copy.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = hands.process(image)
        
        image.flags.writeable = True # Marks the image as writable again after processing.
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    image, hand_landmarks,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style()
                )
                
                
                data = []
                for point in mp_hands.HandLandmark:
                    normalizedLandmark = hand_landmarks.landmark[point]
                    data.append(normalizedLandmark.x)
                    data.append(normalizedLandmark.y)
                    data.append(normalizedLandmark.z)
                
                #Ensures that the data list contains the correct number of coordinates (21 landmarks × 3 coordinates each = 63
                if len(data) == 63:
                    
                    out = model.predict([data])
                    gesture = out[0]  # Assuming the model returns the gesture name
                    
                    print(gesture)
                    
                   
                    engine.say(gesture) # text-to-speech engine to say the predicted gesture.
                    engine.runAndWait() #Runs the speech engine to produce the sound.
                    
                    font = cv2.FONT_HERSHEY_SIMPLEX
                    org = (50, 50) # top left corner of text
                    fontScale = 1 # size of text
                    color = (255, 0, 0) 
                    thickness = 2
                    image = cv2.putText(image, gesture, org, font, fontScale, color, thickness, cv2.LINE_AA)
        
       
        cv2.imshow('MediaPipe Hands', image)
       
        if cv2.waitKey(5) & 0xFF == ord('q'):
 
cam.release()
cv2.destroyAllWindows()




rock




rock
rock




rock
