In [25]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import mediapipe as mp
import time
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# BUILD THE MODEL AND LOAD THE WEIGHTS


In [26]:
# classes

# no need for the list of classes, it's just necessary to describe HOW MANY CLASSES do we have
# because that is the number of nodes in the very last layer of the model

actions = np.array(['Hi', 'Yes', 'No', 'ThankYou', 'ILoveYou'])
number_of_classes = actions.shape[0]

# number_of_classes = 3

In [27]:
# parameters and layer details need to be adjusted in case a new model is to be tested

activation_function = "tanh" # was recommended by CheatGPT for this kind of a model
activation_function = "relu" # was chosen by Nick and described by CheatGPT as suboptimal for this kind of a model
facemesh_included = True

if facemesh_included:
  coefficient = 1 # if coefficient is 1, the model structure is the same as it is in the original model by Nick
  number_of_keypoints = 1662 # 258 if no facemesh, 1662 if facemesh is included
else:
  number_of_keypoints = 258 # 258 if no facemesh, 1662 if facemesh is included
  coefficient = 0.5 # coefficient 0.5 means that the number of neurons in 2nd, the 3rd and the 4th layer will be half - so as to account for the smaller input shape WHEN FACEMESH is REMOVED

model = Sequential()
model.add(LSTM(64, return_sequences=True, activation=activation_function, input_shape=(30, number_of_keypoints)))
model.add(LSTM(int(128*coefficient), return_sequences=True, activation=activation_function))
model.add(LSTM(int(64*coefficient), return_sequences=False, activation=activation_function))
model.add(Dense(int(64*coefficient), activation=activation_function))
model.add(Dense(32, activation=activation_function))
model.add(Dense(number_of_classes, activation='softmax')) 

In [28]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

## load the weights instead of training the model

In [29]:
model.load_weights('weights_with_facemesh.keras')

# PROBABILITY VIZUALIZATION

In [36]:
colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        # cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        # ensure that the colors list has at least as many elements as the res list. If the colors list is supposed to contain a specific set of colors that cycles, you can use the modulo operator to cycle back to the start of the colors list when the end is reached.
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num % len(colors)], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

    return output_frame

In [31]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [32]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

In [33]:
def draw_styled_landmarks(image, results):
    # Draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                             mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             )
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             )
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    # Draw right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

In [34]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose,
                           face,
                           lh,
                           rh,
                           ])

# REAL TIME DETECTION

In [39]:
# 1. Detection variables: to be adjusted by the user
threshold = 0.8
# Number of videos/sequences
no_sequences = 30
# length of each video/sequence in frames per sequence
sequence_length = 30

sequence = []
sentence = []
predictions = []


cap = cv2.VideoCapture(0)
# Set mediapipe model to detect keypoints
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)

        # Draw landmarks
        draw_styled_landmarks(image, results)

        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]

        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))


        #3. Visualization of detected phrases 
            if np.unique(predictions[-10:])[0]==np.argmax(res):

                if res[np.argmax(res)] > threshold:
                    if len(sentence) > 0:
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])
            
            # Maintain only the last 5 actions in sentence
            if len(sentence) > 5:
                sentence = sentence[-5:]

            # Visualize the probabilities of the actions on the image
            image = prob_viz(res, actions, image, colors)

        # Draw rectangles and text on the image to display recognized actions
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        # cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num % len(colors)], -1)
        cv2.putText(image, ' '.join(sentence), (3,30),
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('t'):
            break

    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti