In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time

In [2]:
import importlib # for reloading .py files
# WARNING: updates of config may require restart of kernel if reload is unsuccessful

import config
# reload config without restarting the kernel
importlib.reload(config)
from config import actions, no_sequences, sequence_length, number_of_classes, DATA_PATH, WEIGHTS_PATH, facemesh_included, is_martin

import utils
# reload utils without restarting the kernel
importlib.reload(utils)
from utils import mediapipe_detection, extract_keypoints, draw_styled_landmarks, prob_viz, set_camera_settings

import env
# reload utils without restarting the kernel
importlib.reload(env)
from env import SCIENTIST, cam_num_martin


import model
# reload model without restarting the kernel
importlib.reload(model)
from model import create_model, build_which_model

# CHOOSE MODEL / WEIGHTS

In [3]:
# name of the weights file (without ".keras" ending) 
# also used as model name - dot not allowed, thus deleted, and file renamed
model_name = "model_19"
# weights_name = model_name
weights_name = "weights_wo_facemesh_LSTM_tanh_softmax_NM20_testsize0.3"

# BUILD THE MODEL - CHRIS

In [4]:
facemesh_included = False

In [5]:
# # OPTIONAL model() arguments
# # available model_types: "LSTM", "Conv2D"
# # tested activation functions: "tanh", "relu"; OTHERS: see below 
# # tested activations for last neural layer: "softmax", "sigmoid"; OTHERS: "linear", "tanh", "relu", "softplus", "softsign", "selu", "elu", "exponential"
# # neural multiplicator: multiply neurons per layer by this amount (default = 1); the higher NM, the longer the computation
# # output will change according if facemesh_included = True or False

# reLU = Rectified Linear Activation Function: returns 0 if it receives any negative input, but for any positive value x it returns that value back.  It helps the model to account for non-linearity.
# Sigmoid: Outputs a value between 0 and 1, making it suitable for binary classification or multilabel classification tasks where each label is predicted independently.

importlib.reload(config)
from config import model_type, activation_function, activation, neural_factor, metrics
model = create_model(model_type, activation_function, activation, neural_factor, metrics)

  super().__init__(**kwargs)


['accuracy', 'categorical_accuracy', 'Precision', 'Recall']


# CALL THE MODEL - Martin

In [6]:
if SCIENTIST == "Martin" or SCIENTIST == "Vero": 
    model_name_martin = "model_bowmore"
    model = build_which_model(model_name_martin)
    model.load_weights("weights/model_bowmore.keras")


  saveable.load_own_variables(weights_store.get(inner_path))


In [7]:
model.predict

<bound method TensorFlowTrainer.predict of <Sequential name=sequential_1, built=True>>

## Loading weights

In [8]:
if SCIENTIST != "Martin": 
    model.load_weights(os.path.join(WEIGHTS_PATH, weights_name + ".keras"))

# PROBABILITY VIZUALIZATION - REAL TIME TESTING

In [9]:
import mediapipe as mp
from scipy.stats import mode 

In [10]:
from utils import mp_holistic, mp_drawing, set_camera_settings, default_fps
import datetime

In [11]:
### FPS FIXIN


In [12]:

def prob_viz(res, actions, input_frame, threshold):
    fontScale = 1

    one_color = (245,117,16)
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        color = one_color if prob < threshold else (0,0,255)
        cv2.rectangle(output_frame, (0,60+num*30), (int(prob*100), 90+num*30), color, -1) #colors[num%len(colors)]
        cv2.putText(output_frame, str(int(prob * 100)) + "%" + " " + actions[num], 
                    (0, 85+num*30), cv2.FONT_HERSHEY_SIMPLEX, fontScale, (0,0,0), 4, cv2.LINE_AA)
        # Then draw the text again with the original color (e.g., white)
        cv2.putText(output_frame, str(int(prob * 100)) + "%" + " " + actions[num], 
                    (0, 85+num*30), cv2.FONT_HERSHEY_SIMPLEX, fontScale, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame

In [19]:
import time
# 1. New detection variables
sequence = []
sentence = []
predictions = []
all_words = [] 
threshold = 0.4 # 40% 
fps_list = []
last_word = ""
num_of_words = 5
fps_during_training = 6

frame_counter = 0
actual_fps_live = 0


cam_num = 0 if SCIENTIST != "Martin" else cam_num_martin

frame_count_for_fps = 0
start_time = time.time()

length_of_sequence = 30
no_of_reps_to_accept = 5
silent_sign_name = "NoHands"

# picture of a static person using no hands as fall-back option for model
static_frame = np.load(os.path.join(DATA_PATH, "_.npy"))



cap = cv2.VideoCapture(cam_num)  # Adjust device index as needed
actual_res, actual_fps = set_camera_settings(cap) # setting camera resolution and fps + printing the actual values
desired_fps = 7
frame_time = 1.0 / desired_fps
last_time = time.time()



#TODO: put recording in subfolder
# recording
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
video_file_name = f"recording_{timestamp}.avi"
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter(video_file_name, fourcc, 10.0, (int(cap.get(3)), int(cap.get(4))))
# /recording


# Set mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    
    while cap.isOpened():
        start_process_time = time.time()

        frame_counter += 1

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)

        # Draw landmarks
        draw_styled_landmarks(image, results)

        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        # dynamically adjust the length of the sequence using the variable from above
        sequence = sequence[-length_of_sequence:]

        if len(sequence) == length_of_sequence:

            zero_frame = np.zeros_like(sequence[0])  # Creates a zero array with the same shape as a frame

            # overwrites zero_frame with the static
            zero_frame = static_frame

            padding = [zero_frame] * (int((30 - len(sequence))/2))

            # pad the actual sequence from left and right
            padded_sequence = padding + sequence + padding

            # Ensure that the padded sequence is exactly 30 frames long
            padded_sequence = padded_sequence[:30]

            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)], round(max(res),2))
            predictions.append(np.argmax(res))
            #  my_predictions = list(dict.fromke<ys(predictions)) --> another way how to get rid of duplicates and keep the order of the elements in the list


        #3. Viz logic
            # if np.unique(predictions[-2:])[0]==np.argmax(res): <-- wrong line of code since .unique sorts the elements in the list alphabetically!!!
            
            # if mode(predictions[-3:])[0]==np.argmax(res):


            is_silent_sign = actions[np.argmax(res)] == silent_sign_name

            if (np.unique(predictions[-no_of_reps_to_accept:])[0]==np.argmax(res) 
                and (np.unique(predictions[-no_of_reps_to_accept:]).shape[0]==1) # additional condition to check if the returned unique list has only one element
                and not is_silent_sign
                ): 


                if (res[np.argmax(res)] > threshold) and (actions[np.argmax(res)] != last_word):
                    all_words.append(actions[np.argmax(res)]) 
                    if len(sentence) >= num_of_words:
                        time.sleep(3) 
                        sentence = []
                        sentence.append(actions[np.argmax(res)])
                        last_word = actions[np.argmax(res)]
                        frame_counter = 0
                    elif len(sentence) > 0: 
                        sentence.append(actions[np.argmax(res)])
                        last_word = actions[np.argmax(res)]
                        frame_counter = 0
                    elif len(sentence) == 0:
                        sentence.append(actions[np.argmax(res)])
                        last_word = actions[np.argmax(res)]
                        frame_counter = 0
                    # the same thing:
                    # else:
                    #     sentence.append(actions[np.argmax(res)])
                        
                        
            # if len(sentence) > 5:
            #     sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, threshold)
       
        if frame_counter == 15:
            sentence = []
            last_word = ""
            frame_counter = 0   


        # Get the width of the image
        image_width = image.shape[1]

        # Set the x coordinate for the text
        text_x = image_width - 200  # 200 is an estimate of the text width, adjust as needed

        # cv2.putText(image, f"frames: {frame_counter}", (text_x,50),
        #             cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)



        # cv2.putText(image, f"last word: {last_word}", (50,50),
        #             cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA) # cv2.putText(image, f"{frame_counter}, last word: {last_word}", (50,50),
        #                                                                                 # cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        # cv2.putText(image, f"frames: {frame_counter}", (50,50),
        #             cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 1, cv2.LINE_AA)
        # cv2.putText(image, f"All detected words: {', '.join(all_words)}", (0,50),
        #             cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        cv2.rectangle(image, (0, 660), (1280, 720), (158, 128, 107), -1) # TODO : make it more transperant
        cv2.putText(image, ', '.join(sentence), (50,710),
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        
        # FPS fixin
        process_time = time.time() - start_process_time
        print(f"Processing time: {process_time:.2f}s")


        frame_count_for_fps += 1


        if frame_count_for_fps % 10 == 0:
            end_time = time.time()
            elapsed = end_time - start_time
            actual_fps_live = frame_count_for_fps / elapsed
            fps_list.append(actual_fps_live)
            print("Actual FPS:", actual_fps_live)
            # Reset timer and counter
            start_time = time.time()
            frame_count_for_fps = 0


        #### FPS ####
        # Display the calculated FPS on the frame

        # frame_counter_text = f"{frame_counter}"
        # cv2.putText(image, frame_counter_text, (1170, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)

        fps_during_training_text = f"training FPS ~{fps_during_training:.2f}"     
        cv2.putText(image, fps_during_training_text, (950, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 4, cv2.LINE_AA)
        cv2.putText(image, fps_during_training_text, (950, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        

        fps_text = f"live FPS  {actual_fps_live:.2f}"     
        cv2.putText(image, fps_text, (1020, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 4, cv2.LINE_AA)
        cv2.putText(image, fps_text, (1020, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        

        fps_difference = ((actual_fps_live - fps_list[-2]) / fps_list[-2]) * 100 if len(fps_list) > 1 else 0
        fps_difference = ((actual_fps_live - fps_during_training) / fps_during_training) * 100 if len(fps_list) > 1 else 0
        
        if round(fps_difference) == 0: 
            fps_difference_text = f" 0.0%"
        elif fps_difference < 0: 
            fps_difference_text = f"{fps_difference:.1f}%"
        else: 
            fps_difference_text = f"+{fps_difference:.1f}%"

        cv2.putText(image, fps_difference_text, (1140, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 4, cv2.LINE_AA)
        cv2.putText(image, fps_difference_text, (1140, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)


        # Show to screen
        cv2.imshow('OpenCV Feed', image)
        # write the frame in the video recording
        out.write(image)

        # FIXIN FPS
        # Calculate the time to wait
        elapsed = time.time() - last_time
        wait_time = max(0, frame_time - elapsed)
        print(f"Elapsed time: {elapsed:.2f}s, Waiting for: {wait_time:.2f}s")
        time.sleep(wait_time)
        last_time = time.time()


        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('t'):
            break
    
    #release the recording
    out.release()

    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)

Requested Resolution: 1280x720, Actual Resolution: 1280x720
Requested FPS: 30, Actual FPS: 30.0
<class 'mediapipe.python.solution_base.SolutionOutputs'>
Processing time: 0.62s
Elapsed time: 0.80s, Waiting for: 0.00s
<class 'mediapipe.python.solution_base.SolutionOutputs'>
Processing time: 0.15s
Elapsed time: 0.20s, Waiting for: 0.00s
<class 'mediapipe.python.solution_base.SolutionOutputs'>
Processing time: 0.13s
Elapsed time: 0.16s, Waiting for: 0.00s
<class 'mediapipe.python.solution_base.SolutionOutputs'>
Processing time: 0.15s
Elapsed time: 0.20s, Waiting for: 0.00s
<class 'mediapipe.python.solution_base.SolutionOutputs'>
Processing time: 0.14s
Elapsed time: 0.17s, Waiting for: 0.00s
<class 'mediapipe.python.solution_base.SolutionOutputs'>
Processing time: 0.14s
Elapsed time: 0.19s, Waiting for: 0.00s
<class 'mediapipe.python.solution_base.SolutionOutputs'>
Processing time: 0.12s
Elapsed time: 0.16s, Waiting for: 0.00s
<class 'mediapipe.python.solution_base.SolutionOutputs'>
Process

In [14]:
fps_list

[2.4258580314369684]

In [15]:
# print the detected words
print(f"All the words that have been detected are [{len(all_words)}] and they are: {', '.join(all_words)}")
print(f"The last words that have been detected and printed out are [{len(sentence)}] and they are: {', '.join(sentence)}")

All the words that have been detected are [3] and they are: No, Me, Happy
The last words that have been detected and printed out are [0] and they are: 
