# Training an ASL Classifier on Custom Hand Data

### 1. Creating dataset

_Building a dataset of hand-landmark points from images_

In [1]:
import os
import cv2
import mediapipe as mp

import matplotlib.pyplot as plt

In [2]:
# Short aliases for the MediaPipe solution modules used in this notebook.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

In [3]:
# Hand detector used by the dataset-extraction loop, configured with
# static_image_mode=True and a 0.3 minimum detection confidence.
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)

In [9]:
# Root folder of the custom dataset; the extraction loop uses each
# sub-directory name as the class label of the images inside it.
DATA_PATH = '../../Data/Hand-Gesture-data/custom_dataset'

In [10]:
# Build the training set: one row of 42 features (21 landmarks x (x, y), each
# offset by the hand's minimum x / y) per usable image, labelled with the name
# of the folder the image lives in.
data = []
labels = []
for dir_ in os.listdir(DATA_PATH):
    class_dir = os.path.join(DATA_PATH, dir_)
    # Stray files sitting next to the class folders are not classes.
    if not os.path.isdir(class_dir):
        continue

    for img_path in os.listdir(class_dir):
        img = cv2.imread(os.path.join(class_dir, img_path))
        # cv2.imread signals an unreadable / non-image file by returning None
        # rather than raising, which would otherwise crash cvtColor below.
        if img is None:
            print(f"Skipping image {img_path}: could not be read as an image")
            continue

        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        results = hands.process(img_rgb)  # type: ignore
        if not results.multi_hand_landmarks:
            # No hand detected: nothing to extract from this image.
            continue

        data_aux = []
        x_ = []
        y_ = []
        for hand_landmarks in results.multi_hand_landmarks:
            for landmark in hand_landmarks.landmark:
                x_.append(landmark.x)
                y_.append(landmark.y)

            # The minima do not change while the features are emitted, so
            # compute them once instead of once per landmark.
            min_x = min(x_)
            min_y = min(y_)
            for landmark in hand_landmarks.landmark:
                data_aux.append(landmark.x - min_x)
                data_aux.append(landmark.y - min_y)

        # Append data only if the expected number of values is present:
        # 21 landmarks with x and y values (21*2=42). Features accumulate
        # across every detected hand, so a multi-hand image is rejected here.
        if len(data_aux) == 42:
            data.append(data_aux)  # Append as a full row of features
            labels.append(dir_)
        else:
            print(f"Skipping image {img_path} due to incomplete landmarks")


In [None]:
# saving the data
# f = open('../Data/data.pickle', 'wb')
# pickle.dump({'data':data, 'labels':labels}, f)
# f.close()

### 2. Training the classifier

_Using RandomForestClassifier initially; may switch to an ANN later._

In [11]:
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [12]:
# Convert the collected Python lists into NumPy arrays before splitting.
data = np.array(data)  # Should be 2D (n_samples, n_features)
labels = np.array(labels)  # Should be 1D (n_samples,)

In [13]:
# Sanity check: the first dimension of both arrays should match.
print("Data shape:", data.shape)  
print("Labels shape:", labels.shape) 

Data shape: (944, 42)
Labels shape: (944,)


In [14]:
# splitting
# Hold out 20% of the samples for testing, stratified on the labels. A fixed
# random_state makes the split itself repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    shuffle=True,
    stratify=labels,
    random_state=42,
)

In [15]:
# Random forest classifier constructed with no arguments (library defaults).
model = RandomForestClassifier()

In [16]:
# Fit the classifier on the training split.
model.fit(x_train, y_train)

RandomForestClassifier()

In [17]:
# Evaluate on the held-out split. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the ground-truth labels are passed first.
y_predict = model.predict(x_test)
score = accuracy_score(y_test, y_predict)
print(f'{score * 100}% of samples were classified correctly !')

100.0% of samples were classified correctly !


In [20]:
from sklearn.metrics import classification_report

# Print scikit-learn's classification report for the held-out split.
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         ILY       1.00      1.00      1.00        40
   Open palm       1.00      1.00      1.00        40
 Thumbs down       1.00      1.00      1.00        39
   Thumbs up       1.00      1.00      1.00        32
     Victory       1.00      1.00      1.00        38

    accuracy                           1.00       189
   macro avg       1.00      1.00      1.00       189
weighted avg       1.00      1.00      1.00       189



In [21]:
# saving the model
# The `with` block closes the file even if pickling raises part-way through.
with open('../../Models/HandGesture models/RFmodel/model_v3.p', 'wb') as f:
    pickle.dump({'model': model}, f)

### 3. Live Predictions

In [6]:
# Map a running integer index to each entry found under DATA_PATH.
# NOTE(review): os.listdir returns entries in arbitrary order, so these
# indices are not guaranteed to be stable across runs or machines; confirm
# before relying on them.
label_dict = dict(enumerate(os.listdir(DATA_PATH)))

In [10]:
import cv2
from time import time
from math import hypot
import mediapipe as mp
import matplotlib.pyplot as plt
import numpy as np
from time import sleep
import pickle

In [11]:
# Load the trained classifier from disk.
# NOTE(review): this path differs from the one the training section saves to
# ('../../Models/HandGesture models/RFmodel/model_v3.p'); confirm which file
# is intended.
# SECURITY: unpickling executes code embedded in the file; only load model
# files from a trusted source.
with open('../Models/RFmodel/model.p', 'rb') as model_file:
    model_dict = pickle.load(model_file)
model = model_dict['model']

In [12]:
# MediaPipe module aliases for the live-prediction section.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Single-hand detector with 0.8 detection / 0.5 tracking confidence.
hand_image = mp_hands.Hands(
    max_num_hands=1,
    min_detection_confidence=0.8,
    min_tracking_confidence=0.5,
)

# Single-hand detector passed to detectHands by the webcam loop.
hand_video = mp_hands.Hands(
    static_image_mode=False,
    model_complexity=1,
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7,
)

In [13]:
def detectHands(image, hand_fn, draw=False, display=False):
    """Run hand-landmark detection on a BGR image.

    Args:
        image: Image array in BGR channel order; it is copied, not modified.
        hand_fn: Object whose ``process(rgb_image)`` method returns the
            detection results (the notebook passes ``mp_hands.Hands``
            instances).
        draw: When true, draw the detected landmarks and their connections on
            the output copy.
        display: When true, plot the original and the output image side by
            side with matplotlib instead of returning them.

    Returns:
        ``(output_image, results)`` when ``display`` is false, otherwise
        ``None``.
    """
    # Work on a copy so the caller's image stays untouched.
    output_image = image.copy()

    # Convert the image from BGR into RGB format.
    imageRGB = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Perform the hand-landmark detection.
    results = hand_fn.process(imageRGB)

    if results.multi_hand_landmarks and draw:
        # The drawing specs are identical for every hand, so build them once.
        landmark_spec = mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4)
        connection_spec = mp_drawing.DrawingSpec(color=(250, 44, 250), thickness=2, circle_radius=2)
        for hand in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(
                output_image,
                hand,
                mp_hands.HAND_CONNECTIONS,
                landmark_spec,
                connection_spec,
            )

    if display:
        # [:, :, ::-1] reverses the channel axis (BGR -> RGB) for matplotlib.
        plt.figure(figsize=[22, 22])
        plt.subplot(121)
        plt.imshow(image[:, :, ::-1])
        plt.title("Original Image")
        plt.axis('off')
        plt.subplot(122)
        plt.imshow(output_image[:, :, ::-1])
        plt.title("Output Image")
        plt.axis('off')
        return None

    return output_image, results

In [21]:
# Live prediction: grab webcam frames, extract the hand landmarks, classify
# them with the loaded model and overlay the predicted label. Press ESC to quit.
camera_video = cv2.VideoCapture(0)
camera_video.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
camera_video.set(cv2.CAP_PROP_FRAME_HEIGHT, 960)

cv2.namedWindow('ASL!!!', cv2.WINDOW_NORMAL)

try:
    while camera_video.isOpened():
        ok, frame = camera_video.read()

        if not ok:
            # Keep polling the keyboard so ESC still exits while the camera is
            # not delivering frames (otherwise this loop could never be left).
            if cv2.waitKey(1) & 0xFF == 27:
                break
            continue

        # Mirror the frame horizontally.
        frame = cv2.flip(frame, 1)
        frame_height, frame_width, _ = frame.shape

        frame, results = detectHands(frame, hand_video, draw=True)

        if results.multi_hand_landmarks:
            data_aux = []
            x_ = []
            y_ = []

            for hand_landmarks in results.multi_hand_landmarks:
                for landmark in hand_landmarks.landmark:
                    x_.append(landmark.x)
                    y_.append(landmark.y)

                # Same feature layout as the training data: every coordinate
                # is offset by the hand's minimum x / y.
                min_x = min(x_)
                min_y = min(y_)
                for landmark in hand_landmarks.landmark:
                    data_aux.append(landmark.x - min_x)
                    data_aux.append(landmark.y - min_y)

            # Only predict when exactly 21 landmarks x (x, y) were collected.
            if len(data_aux) == 42:
                # Landmark coordinates are scaled by the frame size to get
                # pixels; pad the box by 10 px outward on every side.
                x1 = int(min(x_) * frame_width) - 10
                y1 = int(min(y_) * frame_height) - 10
                x2 = int(max(x_) * frame_width) + 10
                y2 = int(max(y_) * frame_height) + 10

                prediction = model.predict([np.asarray(data_aux)])
                # cv2.putText needs text, so convert whatever label type the
                # model returns.
                predicted_character = str(prediction[0])
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), 4)
                cv2.putText(frame, predicted_character, (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 0), 3, cv2.LINE_AA)

        cv2.imshow('ASL!!!', frame)

        # 27 is the ESC key code.
        if cv2.waitKey(1) & 0xFF == 27:
            break
finally:
    # Always free the camera and close the window, even if a frame fails.
    camera_video.release()
    cv2.destroyAllWindows()