In [28]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import mediapipe as mp
import time
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# BUILD THE MODEL AND LOAD THE WEIGHTS


In [29]:
# Labels of the gestures the model was trained to recognise.
# The order matters: index i here corresponds to output node i of the model.
actions = np.array([
    'Hi',
    'Yes',
    'No',
    'ThankYou',
    'ILoveYou',
    'background',
    'NoHands',
])

# One output node per label in the final (softmax) layer.
number_of_classes = len(actions)

# Index of the capture device to read from.
camera_index = 0

In [30]:
# Architecture settings. These have to be adjusted whenever a different model is tested,
# because the layer sizes built below must match the weights file that is loaded afterwards.

# "relu" was the original choice (by Nick) and was described by CheatGPT as suboptimal
# for this kind of model; "tanh" is what CheatGPT recommended instead.
activation_function = "tanh"
neural_multiplicator = 15   # scales the width of every layer after the first LSTM
facemesh_included = False   # whether the face-mesh keypoints are part of the input

if facemesh_included:
    # Input vector: 1662 values per frame with the facemesh (258 without it).
    number_of_keypoints = 1662
    # coefficient 1 keeps the layer structure of the original model by Nick.
    coefficient = 1
    weights_name = "weights_w_facemesh_NM" + str(neural_multiplicator)
else:
    # Input vector: 258 values per frame without the facemesh (1662 with it).
    number_of_keypoints = 258
    # coefficient 0.5 halves the 2nd, 3rd and 4th layer to account for the smaller
    # input when the facemesh is removed.
    coefficient = 0.5
    weights_name = "weights_wo_facemesh_NM" + str(neural_multiplicator)

# Three stacked LSTMs over 30-frame sequences, followed by two dense layers and a
# softmax over the gesture classes.
model = Sequential([
    LSTM(64, return_sequences=True, activation=activation_function,
         input_shape=(30, number_of_keypoints)),
    LSTM(int(128 * coefficient * neural_multiplicator), return_sequences=True,
         activation=activation_function),
    LSTM(int(64 * coefficient * neural_multiplicator), return_sequences=False,
         activation=activation_function),
    Dense(int(64 * coefficient * neural_multiplicator), activation=activation_function),
    Dense(32 * neural_multiplicator, activation=activation_function),
    Dense(number_of_classes, activation='softmax'),
])

In [31]:
# Compile with the optimizer, loss and metric listed below.
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

## Load the weights instead of training the model

In [32]:
# Load the weights for the architecture built above.
# NOTE(review): the recorded output of this cell is a shape-mismatch ValueError
# (model expects 258 inputs, file holds 1662) - presumably the file on disk was saved
# from a different configuration; confirm facemesh_included / layer sizes match it.
model.load_weights(f"{weights_name}.keras")

ValueError: Cannot assign value to variable ' lstm_21/lstm_cell/kernel:0': Shape mismatch.The variable shape (258, 256), and the assigned value shape (1662, 3840) are incompatible.

# PROBABILITY VISUALIZATION

In [None]:
# Probabilities at or above this value are highlighted in prob_viz; the detection
# loop also reads this name when deciding whether to accept a sign.
threshold = 0.5
colors = (245,117,16)      # bar colour handed to prob_viz by the detection loop
one_color = (245,117,16)   # fallback bar colour when prob_viz gets colors=None

def prob_viz(res, actions, input_frame, colors):
    """Draw one probability bar and label per class onto a copy of the frame.

    Args:
        res: iterable of class probabilities (values are scaled by 100 for display).
        actions: class labels, indexed the same way as ``res``.
        input_frame: image array; it is copied and left untouched.
        colors: colour used for bars whose probability is below ``threshold``.
            Either a single 3-tuple, or a sequence of such tuples which is then
            cycled per class. ``None`` falls back to the global ``one_color``.
            Tuples are passed straight to OpenCV (presumably B, G, R order, since
            the frames are converted back to BGR before drawing).

    Returns:
        The annotated copy of ``input_frame``.
    """
    output_frame = input_frame.copy()

    if colors is None:
        colors = one_color
    # A sequence whose first item is itself a sequence is treated as a per-class palette.
    is_palette = len(colors) > 0 and isinstance(colors[0], (tuple, list))

    for num, prob in enumerate(res):
        if prob < threshold:
            color = colors[num % len(colors)] if is_palette else colors
        else:
            # Highlight classes that reached the threshold.
            color = (0,0,255)

        top = 60 + num * 40
        cv2.rectangle(output_frame, (0, top), (int(prob * 100), top + 30), color, -1)
        cv2.putText(output_frame, f"{int(prob * 100)}% {actions[num]}", (0, top + 25),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

    return output_frame

In [None]:
# Short aliases for the two MediaPipe solution modules used throughout the notebook.
mp_drawing = mp.solutions.drawing_utils  # helpers for drawing landmarks onto frames
mp_holistic = mp.solutions.holistic      # the Holistic model and its connection sets

In [None]:
def mediapipe_detection(image, model):
    """Run ``model.process`` on a BGR frame and return the frame with the results.

    The frame is converted to RGB for the model, flagged read-only for the duration
    of the call, then made writeable again and converted back to BGR.

    Args:
        image: BGR image array.
        model: object exposing ``process(rgb_image)`` (e.g. a Holistic instance).

    Returns:
        Tuple ``(bgr_image, results)`` where ``results`` is whatever ``process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    rgb_frame.flags.writeable = False      # read-only while the model looks at it
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True       # allow drawing on the frame afterwards
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [None]:
def draw_styled_landmarks(image, results):
    """Draw face, pose and both hand landmark sets onto ``image`` with fixed styles.

    Args:
        image: frame handed to ``mp_drawing.draw_landmarks`` for every landmark group.
        results: object exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.

    Each entry below is
    ``(landmarks, connections, (landmark colour, thickness, radius),
    (connection colour, thickness, radius))``.
    """
    layers = (
        # Face (the second colour used to contain 256, above the 8-bit channel maximum)
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
         ((80,110,10), 1, 1), ((80,255,121), 1, 1)),
        # Pose
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         ((80,22,10), 2, 4), ((80,44,121), 2, 2)),
        # Left hand
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         ((121,22,76), 2, 4), ((121,44,250), 2, 2)),
        # Right hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         ((245,117,66), 2, 4), ((245,66,230), 2, 2)),
    )

    for landmarks, connections, landmark_style, connection_style in layers:
        lm_color, lm_thickness, lm_radius = landmark_style
        cn_color, cn_thickness, cn_radius = connection_style
        mp_drawing.draw_landmarks(
            image, landmarks, connections,
            mp_drawing.DrawingSpec(color=lm_color, thickness=lm_thickness, circle_radius=lm_radius),
            mp_drawing.DrawingSpec(color=cn_color, thickness=cn_thickness, circle_radius=cn_radius),
        )

In [None]:
def extract_keypoints(results):
    """Flatten the detected landmarks of one frame into a single 1-D vector.

    Layout: pose (33 points x [x, y, z, visibility]), left hand (21 x [x, y, z]),
    right hand (21 x [x, y, z]) and, only when the global ``facemesh_included`` is
    truthy, face (468 x [x, y, z]). A landmark group that is missing from
    ``results`` contributes zeros of the corresponding length.

    Args:
        results: object exposing ``pose_landmarks``, ``left_hand_landmarks``,
            ``right_hand_landmarks`` (and ``face_landmarks`` when the facemesh is used);
            each is either falsy or has a ``landmark`` iterable.

    Returns:
        1-D numpy array: 258 values without the facemesh, 1662 with it.
    """
    xyz = ("x", "y", "z")

    def _flat(group, n_points, fields):
        # Zeros stand in for a landmark group that was not detected.
        if group:
            return np.array([[getattr(point, name) for name in fields]
                             for point in group.landmark]).flatten()
        return np.zeros(n_points * len(fields))

    parts = [
        _flat(results.pose_landmarks, 33, xyz + ("visibility",)),
        _flat(results.left_hand_landmarks, 21, xyz),
        _flat(results.right_hand_landmarks, 21, xyz),
    ]

    # include face only if facemesh_included
    if facemesh_included:
        parts.append(_flat(results.face_landmarks, 468, xyz))

    return np.concatenate(parts)

# REAL TIME DETECTION

In [None]:
# # 1. Detection variables: to be adjusted by the user
# threshold = 0.8
# # Number of videos/sequences
# no_sequences = 30
# # length of each video/sequence in frames per sequence
# sequence_length = 30

# sequence = []
# sentence = []
# predictions = []


# cap = cv2.VideoCapture(0)
# # Set mediapipe model to detect keypoints
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
#     while cap.isOpened():

#         # Read feed
#         ret, frame = cap.read()

#         # Make detections
#         image, results = mediapipe_detection(frame, holistic)
#         print(results)

#         # Draw landmarks
#         draw_styled_landmarks(image, results)

#         # 2. Prediction logic
#         keypoints = extract_keypoints(results)
#         sequence.append(keypoints)
#         sequence = sequence[-30:]

#         if len(sequence) == 30:
#             res = model.predict(np.expand_dims(sequence, axis=0))[0]
#             print(actions[np.argmax(res)])
#             predictions.append(np.argmax(res))


#         #3. Visualization of detected phrases 
#             if np.unique(predictions[-10:])[0]==np.argmax(res):

#                 if res[np.argmax(res)] > threshold:
#                     if len(sentence) > 0:
#                         if actions[np.argmax(res)] != sentence[-1]:
#                             sentence.append(actions[np.argmax(res)])
#                     else:
#                         sentence.append(actions[np.argmax(res)])
            
#             # Maintain only the last 5 actions in sentence
#             if len(sentence) > 5:
#                 sentence = sentence[-5:]

#             # Visualize the probabilities of the actions on the image
#             image = prob_viz(res, actions, image, colors)

#         # Draw rectangles and text on the image to display recognized actions
#         cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
#         # cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num % len(colors)], -1)
#         cv2.putText(image, ' '.join(sentence), (3,30),
#                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

#         # Show to screen
#         cv2.imshow('OpenCV Feed', image)

#         # Break gracefully
#         if cv2.waitKey(10) & 0xFF == ord('t'):
#             break

#     cap.release()
#     cv2.destroyAllWindows()
#     cv2.waitKey(1)

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

KeyboardInterrupt: 

: 

In [None]:
# Real-time detection: read camera frames, extract keypoints, pad the most recent frames
# to the model's input length, predict, and show the accepted signs on screen.
# Press 't' in the OpenCV window to stop.

# load static frame (no hands); it is used to pad short sequences
static_frame = np.load('_.npy')
if not facemesh_included:
    # keep only the pose + hand keypoints when the facemesh is not part of the input
    static_frame = static_frame[0:258]

# 1. New detection variables
sequence = []      # most recent keypoint vectors, newest last
sentence = []      # accepted signs shown in the top banner
predictions = []   # predicted class index of every prediction made so far

# Settings for the sequence
model_sequence_length = 30  # frames per sample expected by the model (input_shape=(30, ...))
length_of_sequence = 15     # frames grabbed before predicting - 30 is the maximum, fewer are padded
no_of_reps_to_accept = 5    # how many times in a row the same sign must be predicted before it is accepted

# for visualising the sequence pace
frame_counter = 0


def pad_sequence(frames, filler, target_length=30):
    """Centre ``frames`` between copies of ``filler`` so the result has ``target_length`` items.

    When the number of missing frames is odd, the extra filler frame goes on the left.
    If more than ``target_length`` frames are given, only the newest ones are kept.
    """
    frames = list(frames)[-target_length:]
    missing = target_length - len(frames)
    left = (missing + 1) // 2
    right = missing - left
    return [filler] * left + frames + [filler] * right


def is_stable_prediction(history, class_index, repetitions):
    """Return True when the last ``repetitions`` entries of ``history`` all equal ``class_index``."""
    repetitions = max(1, repetitions)
    recent = history[-repetitions:]
    return len(recent) == repetitions and all(p == class_index for p in recent)


cap = cv2.VideoCapture(camera_index)
# cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640) # set the width and height of the capture
# cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480) # set the width and height of the capture
# Set mediapipe model
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():

            # Read feed; stop instead of passing an empty frame on to OpenCV
            ret, frame = cap.read()
            if not ret:
                print("No frame received from the camera - stopping.")
                break

            frame_counter += 1

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks - not necessary for the recognition
            #draw_styled_landmarks(image, results)

            # 2. Prediction logic: keep only the newest length_of_sequence frames
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-length_of_sequence:]

            if frame_counter == length_of_sequence:
                # Display the sign
                # NOTE(review): x=800 is outside a 640-px-wide frame - confirm the capture width
                cv2.putText(image, f'{length_of_sequence} Frames Reached', (800,50),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)
                # Reset the counter
                frame_counter = 0

            if len(sequence) == length_of_sequence:

                # Pad left and right with the static posture so the model gets its full input
                # length. To pad with all-zero frames instead, pass np.zeros_like(sequence[0]).
                padded_sequence = pad_sequence(sequence, static_frame, model_sequence_length)

                res = model.predict(np.expand_dims(padded_sequence, axis=0))[0]
                best = int(np.argmax(res))
                print(actions[best], round(max(res),2))
                predictions.append(best)

                # 3. Viz logic: accept a sign only when the last no_of_reps_to_accept
                # predictions all agree and the probability exceeds the threshold
                if is_stable_prediction(predictions, best, no_of_reps_to_accept) and res[best] > threshold:
                    # do not repeat the sign that is already last in the sentence
                    if not sentence or actions[best] != sentence[-1]:
                        sentence.append(actions[best])

                # show at most the last 5 accepted signs
                if len(sentence) > 5:
                    sentence = sentence[-5:]

                # Viz probabilities
                image = prob_viz(res, actions, image, colors)

            cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('t'):
                break

finally:
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)  # additional waitKey to ensure cleanup
    print("Cleanup complete")

# Additional checks for leftover windows - there were some issues on Martin's machine with the windows not closing properly
try:
    cap.release()
except Exception:
    print("Capture device already released or not available.")

try:
    cv2.destroyAllWindows()
except Exception as e:
    print("Error in destroying all windows:", e)

try:
    # Use a try-except to handle the potential error from getWindowProperty
    if cv2.getWindowProperty('OpenCV Feed', 0) >= 0:
        cv2.destroyWindow('OpenCV Feed')
except cv2.error as e:
    print("Window already destroyed or never existed:", e)

print("Cleanup complete")