# 1. DATA COLLECTION (Capturing hand gestures and saving them to the image directory)

In [1]:
import os
import cv2

# Root folder that receives one sub-folder of JPEG frames per gesture class.
DIRECTORY = './images_gestures'
os.makedirs(DIRECTORY, exist_ok=True)

number_of_gestures = 10   # gesture classes, stored in folders "0" .. "9"
dataset_size = 100        # frames saved per gesture class

frame_width = 890
frame_height = 620

# A single window name is used for every stage so OpenCV does not open a
# second window when switching from the prompt to the recording phase.
WINDOW_NAME = 'Frame'
ENTER_KEY = 13            # key code compared against cv2.waitKey()
FONT = cv2.FONT_HERSHEY_SIMPLEX
TEXT_COLOR = (125, 0, 255)


def _read_frame(capture):
    """Read one frame from `capture` and resize it to the working resolution.

    Raises:
        RuntimeError: if the capture device did not deliver a frame.
    """
    ok, frame = capture.read()
    if not ok or frame is None:
        raise RuntimeError('Could not read a frame from the camera.')
    return cv2.resize(frame, (frame_width, frame_height))


def _draw_centered_text(frame, text, scale, thickness):
    """Draw `text` centred on `frame` (in place)."""
    # The text is measured with the same thickness it is drawn with.
    text_w, text_h = cv2.getTextSize(text, FONT, scale, thickness)[0]
    origin = (int((frame.shape[1] - text_w) / 2),
              int((frame.shape[0] + text_h) / 2))
    cv2.putText(frame, text, origin, FONT, scale, TEXT_COLOR, thickness, cv2.LINE_AA)


cap = cv2.VideoCapture(0)
try:
    if not cap.isOpened():
        raise RuntimeError('Could not open the default camera (index 0).')

    for j in range(number_of_gestures):
        gesture_dir = os.path.join(DIRECTORY, str(j))
        os.makedirs(gesture_dir, exist_ok=True)

        print('Capturing Gesture {}'.format(j))

        # Stage 1: live preview with a prompt until the user presses Enter.
        while True:
            frame = _read_frame(cap)
            _draw_centered_text(frame, 'To create a hand gesture, Press "Enter"!', 1.3, 5)
            cv2.imshow(WINDOW_NAME, frame)
            # Mask to the low byte so platform-specific high bits are ignored.
            if cv2.waitKey(25) & 0xFF == ENTER_KEY:
                break

        # Stage 2: 3-2-1 countdown, one second per number.
        for i in range(3, 0, -1):
            frame = _read_frame(cap)
            _draw_centered_text(frame, str(i), 4, 8)
            cv2.imshow(WINDOW_NAME, frame)
            cv2.waitKey(1000)

        # Stage 3: record `dataset_size` frames as 0.jpg, 1.jpg, ...
        for counter in range(dataset_size):
            frame = _read_frame(cap)
            cv2.imshow(WINDOW_NAME, frame)
            cv2.waitKey(25)
            cv2.imwrite(os.path.join(gesture_dir, '{}.jpg'.format(counter)), frame)
finally:
    # Always free the camera and close the window, even after an error.
    cap.release()
    cv2.destroyAllWindows()


Capturing Gesture 0
Capturing Gesture 1
Capturing Gesture 2
Capturing Gesture 3
Capturing Gesture 4
Capturing Gesture 5
Capturing Gesture 6
Capturing Gesture 7
Capturing Gesture 8
Capturing Gesture 9


# 2. DATA PREPROCESSING (Using MediaPipe)

In [2]:
#DATA PREPROCESSING
import os
import csv
import mediapipe as mp
import cv2

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

DIRECTORY = './images_gestures'

# The CSV header assumes 21 landmarks per hand (42 feature columns), the same
# count the original 21 X + 21 Y header columns implied.
NUM_LANDMARKS = 21

data = []    # one feature vector per image in which a hand was detected
labels = []  # class folder name for the vector at the same index

# The context manager releases the MediaPipe resources when the loop is done.
with mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.2) as hands:
    # Sorted so the row order of the dataset is reproducible between runs.
    for dir_ in sorted(os.listdir(DIRECTORY)):
        class_dir = os.path.join(DIRECTORY, dir_)
        if not os.path.isdir(class_dir):
            continue  # ignore stray files next to the class folders

        for img_path in sorted(os.listdir(class_dir)):
            img = cv2.imread(os.path.join(class_dir, img_path))
            if img is None:
                continue  # unreadable or non-image file

            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            results = hands.process(img_rgb)
            if not results.multi_hand_landmarks:
                continue  # no hand found in this image

            # Use only the first detected hand so every row has the same
            # number of features even if a second hand appears in the image.
            hand_landmarks = results.multi_hand_landmarks[0]
            x_ = [landmark.x for landmark in hand_landmarks.landmark]
            y_ = [landmark.y for landmark in hand_landmarks.landmark]

            # Shift coordinates so the hand's top-left extent is the origin;
            # the minima are computed once instead of once per landmark.
            min_x, min_y = min(x_), min(y_)
            data_aux = []
            for x, y in zip(x_, y_):
                data_aux.append(x - min_x)
                data_aux.append(y - min_y)

            data.append(data_aux)
            labels.append(dir_)

#DATA STORING
with open('gestures.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Features are stored interleaved (x, y) per landmark, so the header is
    # interleaved as well: X_1, Y_1, X_2, Y_2, ...
    header = ['Label'] + [f'{axis}_{i}' for i in range(1, NUM_LANDMARKS + 1) for axis in ('X', 'Y')]
    csvwriter.writerow(header)
    for label, features in zip(labels, data):
        csvwriter.writerow([label] + features)




# 3. MODEL TRAINING (Using K-Nearest Neighbour Algorithm)

In [3]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pickle
import pandas as pd
import csv
import numpy as np

# Train from the CSV written by the preprocessing step rather than from
# variables left in the kernel, so this cell also works on a fresh kernel.
# 'Label' is read as a string to keep the class labels in the same form as the
# folder names they were written from.
data_dict = pd.read_csv('./gestures.csv', dtype={'Label': str})

# Plain NumPy arrays: the model is later queried with bare arrays, not frames.
features = data_dict.drop(columns=['Label']).to_numpy()
targets = data_dict['Label'].to_numpy()

x_train, x_test, y_train, y_test = train_test_split(features, targets, test_size=0.3, random_state=1)

model = KNeighborsClassifier(n_neighbors=5)

model.fit(x_train, y_train)

predict = model.predict(x_test)

# Mean accuracy on the held-out 30 % of the samples.
score = model.score(x_test, y_test)

print('{}% of samples were classified correctly !'.format(score * 100))

# Persist the fitted classifier for the recognition cells.
with open('model.p', 'wb') as f:
    pickle.dump({'model': model}, f)

98.95104895104895% of samples were classified correctly !


# 4. Real-Time Hand Gesture Recognition

In [4]:
import pickle
import pandas as pd
import cv2
import mediapipe as mp
import numpy as np
import time as t
import pyautogui
# Load the classifier saved by the training step.
# NOTE(review): pickle can execute arbitrary code on load; only open a
# model.p file that this notebook produced.
with open('./model.p', 'rb') as model_file:
    model_dict = pickle.load(model_file)
model = model_dict['model']

cap = cv2.VideoCapture(0)

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)

# Maps the predicted class id to the text shown on screen.
labels_dict = {0: 'Forward', 1: 'Backward', 2: 'Pause', 3: 'Volume Up', 4: 'Volume Down', 5: 'Forward', 6: 'Backward', 7: 'Pause', 8: 'Volume Up', 9: 'Volume Down'}

BOX_MARGIN = 10  # pixels of padding around the detected hand

try:
    while True:
        ret, frame = cap.read()
        if not ret or frame is None:
            # Camera unavailable or stream ended: leave the loop instead of
            # crashing on frame.shape.
            print('Could not read a frame from the camera.')
            break

        H, W, _ = frame.shape

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        results = hands.process(frame_rgb)
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    frame,  # image to draw
                    hand_landmarks,  # model output
                    mp_hands.HAND_CONNECTIONS,  # hand connections
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style())

            # Classify only the first detected hand: a second hand would
            # double the feature count and make model.predict() raise.
            primary_hand = results.multi_hand_landmarks[0]
            x_ = [landmark.x for landmark in primary_hand.landmark]
            y_ = [landmark.y for landmark in primary_hand.landmark]

            # Same normalisation as preprocessing: offsets from the hand's
            # minimum x / y, interleaved as (x, y) per landmark.
            min_x, min_y = min(x_), min(y_)
            data_aux = []
            for x, y in zip(x_, y_):
                data_aux.append(x - min_x)
                data_aux.append(y - min_y)

            # Bounding box padded outwards on all four sides.
            x1 = int(min_x * W) - BOX_MARGIN
            y1 = int(min_y * H) - BOX_MARGIN

            x2 = int(max(x_) * W) + BOX_MARGIN
            y2 = int(max(y_) * H) + BOX_MARGIN

            prediction = model.predict([np.asarray(data_aux)])

            predicted_character = labels_dict[int(prediction[0])]

            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), 4)
            cv2.putText(frame, predicted_character, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 0), 3,
                        cv2.LINE_AA)

        cv2.imshow('frame', frame)
        cv2.waitKey(1)

except KeyboardInterrupt:
    print("Thank You")
finally:
    # Release the camera, detector and window on every exit path.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()

Thank You


# 5. Media Controller using Hand Gesture Recognition

In [5]:
import pickle
import pandas as pd
import cv2
import mediapipe as mp
import numpy as np
import time as t
import pyautogui
# Load the classifier saved by the training step.
# NOTE(review): pickle can execute arbitrary code on load; only open a
# model.p file that this notebook produced.
with open('./model.p', 'rb') as model_file:
    model_dict = pickle.load(model_file)
model = model_dict['model']

cap = cv2.VideoCapture(0)

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)

# Maps the predicted class id to the gesture name shown on screen.
labels_dict = {0: 'Forward', 1: 'Backward', 2: 'Pause', 3: 'Volume Up', 4: 'Volume Down', 5: 'Forward', 6: 'Backward', 7: 'Pause', 8: 'Volume Up', 9: 'Volume Down'}

# Keyboard key sent through pyautogui for each recognised gesture.
GESTURE_KEYS = {
    'Forward': 'right',
    'Backward': 'left',
    'Pause': 'space',
    'Volume Up': 'up',
    'Volume Down': 'down',
}

BOX_MARGIN = 10                 # pixels of padding around the detected hand
ACTION_COOLDOWN_SECONDS = 1.0   # minimum gap between two simulated key presses
last_action_time = float('-inf')

try:
    while True:
        ret, frame = cap.read()
        if not ret or frame is None:
            # Camera unavailable or stream ended: leave the loop instead of
            # crashing on frame.shape.
            print('Could not read a frame from the camera.')
            break

        H, W, _ = frame.shape

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        results = hands.process(frame_rgb)
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    frame,  # image to draw
                    hand_landmarks,  # model output
                    mp_hands.HAND_CONNECTIONS,  # hand connections
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style())

            # Classify only the first detected hand: a second hand would
            # double the feature count and make model.predict() raise.
            primary_hand = results.multi_hand_landmarks[0]
            x_ = [landmark.x for landmark in primary_hand.landmark]
            y_ = [landmark.y for landmark in primary_hand.landmark]

            # Same normalisation as preprocessing: offsets from the hand's
            # minimum x / y, interleaved as (x, y) per landmark.
            min_x, min_y = min(x_), min(y_)
            data_aux = []
            for x, y in zip(x_, y_):
                data_aux.append(x - min_x)
                data_aux.append(y - min_y)

            # Bounding box padded outwards on all four sides.
            x1 = int(min_x * W) - BOX_MARGIN
            y1 = int(min_y * H) - BOX_MARGIN

            x2 = int(max(x_) * W) + BOX_MARGIN
            y2 = int(max(y_) * H) + BOX_MARGIN

            prediction = model.predict([np.asarray(data_aux)])

            predicted_character = labels_dict[int(prediction[0])]

            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), 4)
            cv2.putText(frame, predicted_character, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 0), 3,
                        cv2.LINE_AA)

            # Send at most one key press per cooldown window. A timestamp is
            # used instead of sleeping so the preview keeps updating.
            key = GESTURE_KEYS.get(predicted_character)
            now = t.monotonic()
            if key is not None and now - last_action_time >= ACTION_COOLDOWN_SECONDS:
                pyautogui.press(key)
                last_action_time = now

        cv2.imshow('frame', frame)
        cv2.waitKey(1)

except KeyboardInterrupt:
    print("Thank You")
finally:
    # Release the camera, detector and window on every exit path.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()

Thank You
