In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time

In [ ]:
# name of the weights file (without ".keras" ending) 
weights_name = "weights_wo_facemesh_LSTM_tanh_softmax_NM20_testsize0.3"

# BUILD THE MODEL

In [3]:
# WARNING: updates of config may require restart of kernel if reload is unsuccessful
import config
# reload config without restarting the kernel
import importlib
importlib.reload(config)
from config import actions, no_sequences, sequence_length, number_of_classes, DATA_PATH, WEIGHTS_PATH, facemesh_included

from model import create_model

In [4]:
# OPTIONAL model() arguments
# available model_types: "LSTM", "Conv2D"
# tested activation functions: "tanh", "relu"; OTHERS: see below 
# tested activations for last neural layer: "softmax", "sigmoid"; OTHERS: "linear", "tanh", "relu", "softplus", "softsign", "selu", "elu", "exponential"
# neural multiplicator: multiply neurons per layer by this amount (default = 1); the higher NM, the longer the computation
# output will change according if facemesh_included = True or False

# reLU = Rectified Linear Activation Function: returns 0 if it receives any negative input, but for any positive value x it returns that value back.  It helps the model to account for non-linearity.
# Sigmoid: Outputs a value between 0 and 1, making it suitable for binary classification or multilabel classification tasks where each label is predicted independently.
model_type, activation_function, activation, neural_multiplicator = "LSTM", "tanh", "softmax", 20
model = create_model(model_type, activation_function, activation, neural_multiplicator)

  super().__init__(**kwargs)


In [15]:
model.load_weights(os.path.join(WEIGHTS_PATH, weights_name + ".keras"))

# PROBABILITY VIZUALIZATION

In [16]:
import mediapipe as mp

In [17]:
threshold = 0.8

colors = [(245,117,16), (117,245,16), (16,117,245)]
colors = (245,117,16)
one_color = (245,117,16)
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        color = one_color if prob < threshold else (0,0,255)
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), color, -1) #colors[num%len(colors)]
        cv2.putText(output_frame, str(int(prob * 100)) + "%" + " " + actions[num] , (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame

In [18]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [19]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

In [20]:
def draw_styled_landmarks(image, results):
    # Draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS, 
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             ) 
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

In [21]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    # face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    if facemesh_included: 
      return np.concatenate([pose,
                              lh, 
                              rh,
                              face, 
                              ])
    else:    # no facemesh keypoints returned
      return np.concatenate([pose,
                              lh, 
                              rh,
                              ])

In [23]:
# Actions that we try to detect
# actions = np.array(["Hi", "ILoveYou", "_"])


In [22]:
# picture of a static person using no hands as fall-back option for model
static_frame = np.load(os.path.join(DATA_PATH, "_.npy"))

In [23]:
from scipy.stats import mode 

In [28]:
# 1. New detection variables
sequence = []
sentence = []
predictions = []
threshold = 0.4 # 40% 
frame_counter = 0
last_word = ""
num_of_words = 5

length_of_sequence = 30
no_of_reps_to_accept = 5
silent_sign_name = "NoHands"

cap = cv2.VideoCapture(0) # I have changed the 1 to 0 in order to open successfully the camera

# Set mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        frame_counter += 1

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)

        # Draw landmarks
        draw_styled_landmarks(image, results)

        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        # dynamically adjust the length of the sequence using the variable from above
        sequence = sequence[-length_of_sequence:]

        if len(sequence) == length_of_sequence:

            zero_frame = np.zeros_like(sequence[0])  # Creates a zero array with the same shape as a frame

            # overwrites zero_frame with the static
            zero_frame = static_frame

            padding = [zero_frame] * (int((30 - len(sequence))/2))

            # pad the actual sequence from left and right
            padded_sequence = padding + sequence + padding

            # Ensure that the padded sequence is exactly 30 frames long
            padded_sequence = padded_sequence[:30]

            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)], round(max(res),2))
            predictions.append(np.argmax(res))
            #  my_predictions = list(dict.fromke<ys(predictions)) --> another way how to get rid of duplicates and keep the order of the elements in the list


        #3. Viz logic
            # if np.unique(predictions[-2:])[0]==np.argmax(res): <-- wrong line of code since .unique sorts the elements in the list alphabetically!!!
            
            # if mode(predictions[-3:])[0]==np.argmax(res):


            is_silent_sign = actions[np.argmax(res)] == silent_sign_name

            if (np.unique(predictions[-no_of_reps_to_accept:])[0]==np.argmax(res) 
                and (np.unique(predictions[-no_of_reps_to_accept:]).shape[0]==1) # additional condition to check if the returned unique list has only one element
                and not is_silent_sign
                ): 


                if (res[np.argmax(res)] > threshold) and (actions[np.argmax(res)] != last_word): 
                    if len(sentence) >= num_of_words:
                        time.sleep(3) 
                        sentence = []
                        sentence.append(actions[np.argmax(res)])
                        last_word = actions[np.argmax(res)]
                        frame_counter = 0
                    elif len(sentence) > 0: 
                        sentence.append(actions[np.argmax(res)])
                        last_word = actions[np.argmax(res)]
                        frame_counter = 0
                    elif len(sentence) == 0:
                        sentence.append(actions[np.argmax(res)])
                        last_word = actions[np.argmax(res)]
                        frame_counter = 0
                    # the same thing:
                    # else:
                    #     sentence.append(actions[np.argmax(res)])
                        
                        
            # if len(sentence) > 5:
            #     sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)
       
        if frame_counter == 9:
            sentence = []
            last_word = ""
            frame_counter = 0   

        cv2.putText(image, f"{frame_counter}, last word {last_word}", (50,50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        cv2.rectangle(image, (0, 430), (640, 480), (128, 128, 128), -1) # TODO : make it more transperant
        cv2.putText(image, ', '.join(sentence), (50,465),
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        

        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('t'):
            break

    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)

I0000 00:00:1715509767.550902       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 83.1), renderer: Apple M1


<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti