In [None]:
%pip install mediapipe opencv-python

Note: you may need to restart the kernel to use updated packages.


The filename, directory name, or volume label syntax is incorrect.


In [22]:
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


The filename, directory name, or volume label syntax is incorrect.


In [23]:
import mediapipe as mp # mediapipe by Google as our ML solution for live and streaming media data
import cv2 # openCV for webcam
import numpy as np # math mod|ule for mediapipe
import uuid # uniform unique identifier creates a unique string for images
import os 
from matplotlib import pyplot as plt
import tkinter
from tkinter import messagebox # alert once a gesture is detected

In [2]:
mp_drawing = mp.solutions.drawing_utils # able to create red dots, which are landmarks
mp_hands = mp.solutions.hands # a hands model with default landmarks

In [4]:
# create folder for saved webcam frames
os.mkdir("Output Images")

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'Output Images'

# Step 1: Build Hand Detection using MediaPipe

Let's explore the capabilities of MediaPipe by Google!

<img src=https://mediapipe.dev/images/mobile/hand_landmarks.png>

In [20]:
# index refers to the hand result; dependent on number of hands on screen
# hand refers to the actual hand landmarks
# results refers to our hand axis data
def get_label(index, hand, results):
    output = None
    for idx, classification in enumerate(results.multi_handedness): 
        if idx == index: # checking for correct hand
            # Extract data
            label = classification.classification[0].label
            score = classification.classification[0].score
            text = "{} {}".format(label, round(score, 2))

            # Extract coordinates 
            # [640, 480] refers to the dimensions of your webcam, change it accordingly
            coords = tuple(
                np.multiply(np.array((hand.landmark[mp_hands.HandLandmark.WRIST].x, hand.landmark[mp_hands.HandLandmark.WRIST].y)),
                [640, 480]).astype(int))

            output = text, coords
        
    return output


In [31]:
# 6 angles shown - 5 for each finger, 1 for wrist 
joint_list = [[4, 3, 2], [8, 7, 6], [12, 11, 10], [16, 15, 14], [20, 19, 18], [1, 0, 5]] # joints of fingers we will calculate angle from

In [53]:
def draw_finger_angles(image, results, joint_list):
    angles = []
    # Loop through hands
    for hand in results.multi_hand_landmarks:
        #Loop through joint sets 
        for joint in joint_list:
            a = np.array([hand.landmark[joint[0]].x, hand.landmark[joint[0]].y]) # First coord
            b = np.array([hand.landmark[joint[1]].x, hand.landmark[joint[1]].y]) # Second coord
            c = np.array([hand.landmark[joint[2]].x, hand.landmark[joint[2]].y]) # Third coord
            
            # formula to calculate angle
            radians = np.arctan2(c[1] - b[1], c[0]-b[0]) - np.arctan2(a[1]-b[1], a[0]-b[0])
            angle = np.abs(radians*180.0/np.pi)
            
            if angle > 180.0: # reference to upright finger
                angle = 360 - angle
            
            angles.append(angle)
            
            # print to image
            cv2.putText(image, str(round(angle, 2)), tuple(np.multiply(b, [640, 480]).astype(int)),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2, cv2.LINE_AA)
    return image

In [66]:
cap = cv2.VideoCapture(0) # the input number 0 depends on your machine, it could be 1 or 2 as well

# Detection Confidence: Threshold for initial detection to be successful
# Tracking Confidence: Threshold for tracking after initial detection
with mp_hands.Hands(min_detection_confidence = 0.5, min_tracking_confidence = 0.5, max_num_hands = 4) as hands:
    while cap.isOpened():
        ret, frame = cap.read() # returns a return value and frame which is the image

        # Get hand axis data from each frame; default 2 hands detectable
        # In case your webcam is so old that it uses the color format BGR instead of RGB, uncomment the appropriate lines
        image = cv2.flip(frame, 1) # flip on horizontal
        # image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # webcam frame colour format is in BGR, but mediapipe takes in RGB
        image.flags.writeable = False # lock the frame for us to process its data
        results = hands.process(image) # get axis data
        image.flags.writeable = True # unlock the frame for us to draw landmarks and connections
        # image = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) # convert mediapipe image back to BGR for webcam
        
        cv2.putText(image, "Status: Safe", (0, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        if results.multi_hand_landmarks: # if we detect a hand, we draw the landmarks and the connections
            for num, hand in enumerate(results.multi_hand_landmarks):
                mp_drawing.draw_landmarks(image, hand, mp_hands.HAND_CONNECTIONS,
                                        mp_drawing.DrawingSpec(color=(0, 0, 0), thickness=2, circle_radius=4), # color = BGR
                                        mp_drawing.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2))

                # output hand left or right + accuracy score at coords calculated (wrist)
                if get_label(num, hand, results): 
                    text, coord = get_label(num, hand, results)
                    cv2.putText(image, text, coord, cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            
                if (True): # need machine learning to train to detect the gesture we want 
                    cv2.putText(image, "Status: DANGER", (0, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
                    # root = tkinter.Tk()
                    # root.withdraw()
                    # messagebox.showinfo("Title", "Message") # alert box once a hand is detected
                    # break                
        
            # draw_finger_angles(image, results, joint_list)

        cv2.imshow("Live Webcam Feed", image) # render the image to the screen, and name it "Hand Tracking"

        # Save images
        # cv2.imwrite(os.path.join("Output Images", "{}.jpg".format(uuid.uuid1())), image)

        if cv2.waitKey(10) & 0xFF == ord("q"): # exit the webcam feed in 10ms by pressing q 
            break

cap.release()
cv2.destroyAllWindows()

# Step 2: Extract Landmarks and Angle Data & Export to CSV

For our project, our goal is to detect a certain hand gesture among a crowd to signal an SOS.

Thus, the most logical data to collect for this purpose is the hand landmarks. This will be our main variable.

For more improvements, we could add in the angle of our joints.

For further developement of such SOS gestures beyond our hands, we could add in face and pose landmarks, which are supported by MediaPipe.

In [34]:
import csv

In [90]:
# total number of hand landmarks = 21
# total number of x, y, z coordinates = 21 * 3 = 63
variables = len(hand.landmark)

In [91]:
# form our column headers in csv file
landmarks = ['class'] # class refers to the result, i.e., whether the hand shows the gesture or not
for i in range(1, variables+1):
    landmarks += ["x{}".format(i), "y{}".format(i), "z{}".format(i)]

In [92]:
# create csv file
with open("var.csv", mode = "w", newline = '') as f:
    csv_writer = csv.writer(f, delimiter = ",", quotechar = '"', quoting = csv.QUOTE_MINIMAL)
    csv_writer.writerow(landmarks)

In [95]:
class_name = "SOS"

In [97]:
cap = cv2.VideoCapture(0) # the input number 0 depends on your machine, it could be 1 or 2 as well

# Detection Confidence: Threshold for initial detection to be successful
# Tracking Confidence: Threshold for tracking after initial detection
with mp_hands.Hands(min_detection_confidence = 0.5, min_tracking_confidence = 0.5, max_num_hands = 4) as hands:
    while cap.isOpened():
        ret, frame = cap.read() # returns a return value and frame which is the image

        # Get hand axis data from each frame; default 2 hands detectable
        # In case your webcam is so old that it uses the color format BGR instead of RGB, uncomment the appropriate lines
        image = cv2.flip(frame, 1) # flip on horizontal
        # image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # webcam frame colour format is in BGR, but mediapipe takes in RGB
        image.flags.writeable = False # lock the frame for us to process its data
        results = hands.process(image) # get axis data
        image.flags.writeable = True # unlock the frame for us to draw landmarks and connections
        # image = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) # convert mediapipe image back to BGR for webcam
        
        cv2.putText(image, "Status: Safe", (0, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        if results.multi_hand_landmarks: # if we detect a hand, we draw the landmarks and the connections
            for num, hand in enumerate(results.multi_hand_landmarks):
                mp_drawing.draw_landmarks(image, hand, mp_hands.HAND_CONNECTIONS,
                                        mp_drawing.DrawingSpec(color=(0, 0, 0), thickness=2, circle_radius=4), # color = BGR
                                        mp_drawing.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2))

                # output hand left or right + accuracy score at coords calculated (wrist)
                if get_label(num, hand, results): 
                    text, coord = get_label(num, hand, results)
                    cv2.putText(image, text, coord, cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            
                if (True): # need machine learning to train to detect the gesture we want 
                    cv2.putText(image, "Status: DANGER", (0, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
                    # root = tkinter.Tk()
                    # root.withdraw()
                    # messagebox.showinfo("Title", "Message") # alert box once a hand is detected
                    # break                

                # Export training data
                # For better accuracy, take more samples
                try:
                    hand_landmark = hand.landmark
                    hand_row = list(np.array([[landmark.x, landmark.y, landmark.z] for landmark in hand_landmark]).flatten())

                    hand_row.insert(0, class_name)

                    with open("var.csv", mode = "a", newline = '') as f:
                        csv_writer = csv.writer(f, delimiter = ",", quotechar = '"', quoting = csv.QUOTE_MINIMAL)
                        csv_writer.writerow(hand_row)
                except:
                    pass

            # draw_finger_angles(image, results, joint_list)


        cv2.imshow("Live Webcam Feed", image) # render the image to the screen, and name it "Hand Tracking"

        # Save images
        # cv2.imwrite(os.path.join("Output Images", "{}.jpg".format(uuid.uuid1())), image)

        if cv2.waitKey(10) & 0xFF == ord("q"): # exit the webcam feed in 10ms by pressing q 
            break

cap.release()
cv2.destroyAllWindows()

In [80]:
hand_row

[0.5858122706413269,
 0.6939601302146912,
 2.2907808272520924e-07,
 0.527172327041626,
 0.6743758916854858,
 -0.017742620781064034,
 0.4777282774448395,
 0.6202595233917236,
 -0.026190297678112984,
 0.4417157471179962,
 0.5779957175254822,
 -0.0341699980199337,
 0.40599194169044495,
 0.5589165091514587,
 -0.042631637305021286,
 0.5123770833015442,
 0.5076427459716797,
 -0.009192545898258686,
 0.4823743402957916,
 0.43902283906936646,
 -0.02386436052620411,
 0.4639066755771637,
 0.39624032378196716,
 -0.03798253834247589,
 0.45025795698165894,
 0.36031728982925415,
 -0.04888857528567314,
 0.5487779378890991,
 0.4913095533847809,
 -0.014613251201808453,
 0.5304022431373596,
 0.406402587890625,
 -0.026121854782104492,
 0.520290195941925,
 0.35374170541763306,
 -0.038568440824747086,
 0.5126468539237976,
 0.31152933835983276,
 -0.04809936508536339,
 0.5848333835601807,
 0.4945409893989563,
 -0.02359580807387829,
 0.5789993405342102,
 0.4121286869049072,
 -0.03821095451712608,
 0.5750751495

# Step 3: CNN Models

It is time to train our dataset of landmarks to detect our SOS gesture.  

It seems like there are 2 popular open source solutions for neural network models other than building your own:
Scikit-Learn 
 
    - Scikit-Learn
    
    - Tensorflow  
    
We shall be exploring and learning both of them since it is just a matter of applying rather than building an algorithm from scratch. At least, it is just a few hyperparameters.

## Model 1: Scikit-Learn

In [None]:
%pip install -U scikit-learn scipy matplotlib

In [112]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pickle

In [99]:
# read dataset
df = pd.read_csv("var.csv")
df.head()

Unnamed: 0,class,x1,y1,z1,x2,y2,z2,x3,y3,z3,...,z18,x19,y19,z19,x20,y20,z20,x21,y21,z21
0,SOS,0.659763,0.988639,4.275413e-07,0.592645,0.962583,-0.042312,0.542857,0.90145,-0.07247,...,-0.070238,0.793026,0.736141,-0.094796,0.811189,0.684645,-0.105867,0.824053,0.636357,-0.112448
1,SOS,0.663425,0.931848,5.699486e-07,0.58807,0.861756,-0.026669,0.531753,0.772755,-0.041814,...,-0.036531,0.77846,0.604706,-0.050955,0.79639,0.552505,-0.057252,0.810432,0.500476,-0.061279
2,SOS,0.655319,0.862647,5.972269e-07,0.57516,0.805765,-0.030462,0.520258,0.72063,-0.047605,...,-0.047771,0.764324,0.544507,-0.069305,0.78156,0.491967,-0.079103,0.793906,0.439828,-0.085596
3,SOS,0.642661,0.810859,5.846537e-07,0.562953,0.772985,-0.034952,0.505502,0.686717,-0.052515,...,-0.048999,0.755159,0.508332,-0.07387,0.773151,0.455497,-0.086857,0.78492,0.403252,-0.095704
4,SOS,0.627298,0.801663,5.177911e-07,0.546639,0.761923,-0.034101,0.486315,0.67954,-0.051714,...,-0.056262,0.729574,0.496518,-0.081211,0.749899,0.444681,-0.092765,0.76556,0.394522,-0.100449


In [108]:
# Split dataset into train and set
x = df.drop("class", axis = 1)
y = df["class"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=1)

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)


In [110]:
# We are using MLPClassifer from Scikit-learn for this implementation
# There are a few parameters in MLPClassifier that we have to hypertune, which is one of the disadvantages of this neural network model,
# but this is unavoided because we do not have the computing power for a more advanced one
# Here are the more important parameters:
    # solver: {'lbfgs', 'sgd', 'adam'}; adam works well for large datasets (>10000), lbfgs works better for smaller datasets
    # alpha: strength of L2 regularization
    # hidden_layer_sizes: number of neurons in the ith hidden layer; default = (100,)
clf = MLPClassifier(solver = 'lbfgs', alpha = 1e-5, hidden_layer_sizes = (63, 31, 16, 8, 4,), random_state = 1)
clf.fit(x_train, y_train)

In [116]:
yhat = clf.predict(x_test)
print("Accuracy of MLPClassifier = " + str(accuracy_score(y_test, yhat) * 100.0) + "%")

Accuracy of MLPClassifier = 100.0%


In [117]:
# save our model as pickle file
# pickle file is recommended in the data science field, but you can export it as a python file too for other editors
with open("sos-gesture.pkl", "wb") as f:
    pickle.dump(clf, f)

### Advantages of MLP
- Capability to learn non-linear models.
- Capability to learn models in real-time (on-line learning)

### Disadvantages of MLP
- MLP with hidden layers have a non-convex loss function where there exists more than one local minimum. Therefore different random weight initializations can lead to different validation accuracy.
- MLP requires tuning a number of hyperparameters such as the number of hidden neurons, layers, and iterations.
- MLP is sensitive to feature scaling.

## Model 2: Tensorflow


## Step 4: Make Detections with our Models in Real-time

Instead of appending our live hand gesture data into our csv, we will load that as rows into our model to predict whether it is the gesture we want or not.

In [118]:
with open("sos-gesture.pkl", "rb") as f:
    model = pickle.load(f)

In [119]:
model

In [None]:
cap = cv2.VideoCapture(0) # the input number 0 depends on your machine, it could be 1 or 2 as well

# Detection Confidence: Threshold for initial detection to be successful
# Tracking Confidence: Threshold for tracking after initial detection
with mp_hands.Hands(min_detection_confidence = 0.5, min_tracking_confidence = 0.5, max_num_hands = 4) as hands:
    while cap.isOpened():
        ret, frame = cap.read() # returns a return value and frame which is the image

        # Get hand axis data from each frame; default 2 hands detectable
        # In case your webcam is so old that it uses the color format BGR instead of RGB, uncomment the appropriate lines
        image = cv2.flip(frame, 1) # flip on horizontal
        # image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # webcam frame colour format is in BGR, but mediapipe takes in RGB
        image.flags.writeable = False # lock the frame for us to process its data
        results = hands.process(image) # get axis data
        image.flags.writeable = True # unlock the frame for us to draw landmarks and connections
        # image = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) # convert mediapipe image back to BGR for webcam
        
        cv2.putText(image, "Status: Safe", (0, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        if results.multi_hand_landmarks: # if we detect a hand, we draw the landmarks and the connections
            for num, hand in enumerate(results.multi_hand_landmarks):
                mp_drawing.draw_landmarks(image, hand, mp_hands.HAND_CONNECTIONS,
                                        mp_drawing.DrawingSpec(color=(0, 0, 0), thickness=2, circle_radius=4), # color = BGR
                                        mp_drawing.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2))

                # output hand left or right + accuracy score at coords calculated (wrist)
                if get_label(num, hand, results): 
                    text, coord = get_label(num, hand, results)
                    cv2.putText(image, text, coord, cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            
                if (True): # need machine learning to train to detect the gesture we want 
                    cv2.putText(image, "Status: DANGER", (0, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
                    # root = tkinter.Tk()
                    # root.withdraw()
                    # messagebox.showinfo("Title", "Message") # alert box once a hand is detected
                    # break                

                # Export training data
                # For better accuracy, take more samples
                try:
                    hand_landmark = hand.landmark
                    hand_row = list(np.array([[landmark.x, landmark.y, landmark.z] for landmark in hand_landmark]).flatten())
                    
                    # Pass in our live data into our model
                    X = pd.DataFrame([hand_row])
                    sos_gesture_class = model.predict(X)[0] # classify whether it is our gesture or not
                    sos_gesture_prob = model.predict_proba(X)[0] # probability of the class; this will be an array of multiple values
                    
                    # Print the result onto the screen
                    
                except:
                    pass

            # draw_finger_angles(image, results, joint_list)


        cv2.imshow("Live Webcam Feed", image) # render the image to the screen, and name it "Hand Tracking"

        # Save images
        # cv2.imwrite(os.path.join("Output Images", "{}.jpg".format(uuid.uuid1())), image)

        if cv2.waitKey(10) & 0xFF == ord("q"): # exit the webcam feed in 10ms by pressing q 
            break

cap.release()
cv2.destroyAllWindows()