# 1. Import and Install Dependencies

In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

# 2. Keypoints using MP Holistic

In [2]:
def mediapipe_detection(image, model):
    """Run a MediaPipe model on a single OpenCV frame.

    The frame is converted from BGR to RGB before being handed to
    ``model.process`` and converted back to BGR afterwards.

    Args:
        image: Frame in OpenCV's BGR channel order.
        model: Object exposing ``process(image)`` (e.g. a Holistic instance).

    Returns:
        Tuple ``(bgr_image, results)``: the frame converted back to BGR and
        whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Mark the array read-only for the duration of the model call,
    # then make it writeable again so it can be drawn on later.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [3]:
# Short aliases for the MediaPipe solution modules used in this notebook.
mp_drawing = mp.solutions.drawing_utils  # draw_landmarks / DrawingSpec helpers
mp_holistic = mp.solutions.holistic  # Holistic model and HAND_CONNECTIONS
mp_face_mesh = mp.solutions.face_mesh  # for face connections (not referenced by the cells shown here)

def draw_styled_landmarks(image, results):
    """Draw the detected left- and right-hand landmarks onto ``image``.

    A hand is skipped when its landmarks attribute on ``results`` is falsy.
    Nothing is returned; drawing is delegated to ``mp_drawing.draw_landmarks``.

    Args:
        image: Frame to draw on.
        results: Detection results exposing ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    # (landmarks, colour of first DrawingSpec, colour of second DrawingSpec);
    # left hand is drawn first, then the right hand.
    hand_styles = (
        (results.left_hand_landmarks, (121,22,76), (121,44,250)),
        (results.right_hand_landmarks, (245,117,66), (245,66,230)),
    )
    for hand_landmarks, first_color, second_color in hand_styles:
        if not hand_landmarks:
            continue
        mp_drawing.draw_landmarks(
            image, hand_landmarks, mp_holistic.HAND_CONNECTIONS,
            mp_drawing.DrawingSpec(color=first_color, thickness=2, circle_radius=4),
            mp_drawing.DrawingSpec(color=second_color, thickness=2, circle_radius=2)
        )


# 3. Extract Keypoint Values

In [4]:
def extract_keypoints(results):
    """Flatten both hands' landmarks into one 1-D feature vector.

    Each detected hand contributes its landmarks as consecutive (x, y, z)
    values; a hand whose landmarks attribute is falsy contributes 21*3 zeros
    instead. The left hand's values come first, followed by the right hand's.

    Args:
        results: Object exposing ``left_hand_landmarks`` and
            ``right_hand_landmarks``; each is either falsy or has a
            ``landmark`` iterable of points with ``x``, ``y`` and ``z``.

    Returns:
        1-D numpy array: left-hand values followed by right-hand values.
    """
    def hand_vector(hand):
        # Undetected hand -> zero placeholder so the vector layout stays fixed.
        if not hand:
            return np.zeros(21*3)
        return np.array([[pt.x, pt.y, pt.z] for pt in hand.landmark]).flatten()

    return np.concatenate([
        hand_vector(results.left_hand_landmarks),
        hand_vector(results.right_hand_landmarks),
    ])

# 4. Setup Folders for Collection

In [40]:
# Root directory for the exported data (numpy arrays and frame images)
DATA_PATH = os.path.join('MP_Data') 

# Actions (gesture labels) that we try to detect
actions = np.array(['Hello', 'All The Best', 'Peace', 'Call me', 'Nice'])

# Sixty videos (sequences) worth of data per action
no_sequences = 60

# Videos are going to be 30 frames in length
sequence_length = 30

In [16]:
# Create one directory per (action, sequence) pair under DATA_PATH.
# exist_ok=True keeps the cell safe to re-run when the folders already exist,
# while real failures (permissions, a file in the way, full disk) still raise
# instead of being silently swallowed.
for action in actions:
    for sequence in range(no_sequences):
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

In [67]:
# List the configured gesture labels, one per line.
for gesture in actions:
    print(gesture)

Hello
All The Best
Peace
Call me
Nice


# 5. Collect Keypoint Values for Training and Testing

In [12]:
# Actions to record in this run. Defaults to every configured action; narrow
# it (e.g. actions[4:5]) to re-record a single gesture. Any index used here
# must be within 0..len(actions)-1.
# NOTE: re-running overwrites previously saved frames for the selected actions.
actions_to_collect = actions

cap = cv2.VideoCapture(0)
# Set when the user presses 'q' or the camera stops delivering frames, so that
# every enclosing loop (not just the innermost one) is left.
stop_requested = False
try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        # Loop through actions
        for action in actions_to_collect:
            # Loop through sequences aka videos
            for sequence in range(no_sequences):
                sequence_dir = os.path.join(DATA_PATH, action, str(sequence))
                # Make sure the target folder exists even if the setup cell was skipped
                os.makedirs(sequence_dir, exist_ok=True)

                # Loop through video length aka sequence length
                for frame_num in range(sequence_length):
                    # Read feed; a failed read must not reach cvtColor
                    ret, frame = cap.read()
                    if not ret:
                        print('Camera returned no frame; stopping collection.')
                        stop_requested = True
                        break

                    # Make detections
                    image, results = mediapipe_detection(frame, holistic)

                    # Draw landmarks
                    draw_styled_landmarks(image, results)

                    # The first frame of each sequence gets a banner and a
                    # 2-second pause so the signer can get into position.
                    if frame_num == 0:
                        cv2.putText(image, 'STARTING COLLECTION', (120,200),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                    if frame_num == 0:
                        cv2.waitKey(2000)

                    # Export keypoints (np.save appends the .npy extension)
                    keypoints = extract_keypoints(results)
                    np.save(os.path.join(sequence_dir, str(frame_num)), keypoints)
                    # Save the annotated frame as an image
                    cv2.imwrite(os.path.join(sequence_dir, f"frame_{frame_num}.jpg"), image)

                    # Break gracefully
                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        stop_requested = True
                        break

                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    # Always free the camera and close the preview window, even on errors
    cap.release()
    cv2.destroyAllWindows()

# 6. Preprocess Data and Create Labels and Features

In [55]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [56]:
# Map each action label to its integer class index (its position in `actions`).
label_map = dict(zip(actions, range(len(actions))))

In [65]:
# Display the label -> class-index mapping
label_map

{'Hello': 0, 'All The Best': 1, 'Peace': 2, 'Call me': 3, 'Nice': 4}

In [68]:
# Load every saved frame vector from disk: `sequences` receives one list of
# per-frame arrays per recorded video, `labels` the matching class index.
sequences, labels = [], []
for action in actions:
    action_dir = os.path.join(DATA_PATH, action)
    for sequence in range(no_sequences):
        window = [
            np.load(os.path.join(action_dir, str(sequence), "{}.npy".format(frame_num)))
            for frame_num in range(sequence_length)
        ]
        sequences.append(window)
        labels.append(label_map[action])

In [59]:
# Sanity check: (number of sequences, frames per sequence, values per frame)
np.array(sequences).shape

(300, 30, 126)

In [60]:
# Sanity check: one label per loaded sequence
np.array(labels).shape

(300,)

In [69]:
# Features: stacked sequences; targets: one-hot encoded class indices.
X = np.array(sequences)
y = to_categorical(labels).astype(int)
# A fixed random_state makes the split (and every metric reported below)
# reproducible across Restart & Run All.
# NOTE(review): test_size=0.05 leaves a very small hold-out set, so the
# accuracy measured on it is coarse.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)


In [70]:
# Confirm the train/test partition sizes for features and one-hot labels
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(285, 30, 126) (15, 30, 126) (285, 5) (15, 5)


# 7. Build and Train LSTM Neural Network

In [71]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [72]:
# TensorBoard callback writing its logs to the 'Logs' directory (relative path)
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [73]:
# Take the input shape from the data itself rather than repeating it as magic
# numbers, so the model stays in sync if sequence_length or the per-frame
# feature count changes.
n_timesteps, n_features = X.shape[1:]

# Three stacked LSTM layers (the last one returns only its final output)
# followed by a dense head with one softmax unit per action.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(n_timesteps, n_features)),
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(actions.shape[0], activation='softmax'),
])

In [74]:
# Categorical cross-entropy matches the one-hot targets built with to_categorical
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [84]:
# Time the training run with a monotonic, high-resolution clock:
# time.perf_counter() is unaffected by system clock adjustments, unlike time.time().
# NOTE(review): the recorded log starts at loss ~0.26 in epoch 1, which suggests
# this cell was last run on an already-trained model — rebuild and recompile
# the model before re-running for a clean measurement.
# NOTE(review): X_test/y_test double as validation data here, so the
# "validation" metrics are computed on the test split.
start_train = time.perf_counter()
history = model.fit(X_train, y_train, epochs=2000, callbacks=[tb_callback], validation_data=(X_test, y_test), verbose=2)
end_train = time.perf_counter()

Epoch 1/2000
9/9 - 0s - loss: 0.2643 - categorical_accuracy: 0.8842 - val_loss: 0.5177 - val_categorical_accuracy: 0.8000 - 294ms/epoch - 33ms/step
Epoch 2/2000
9/9 - 0s - loss: 0.2670 - categorical_accuracy: 0.8842 - val_loss: 0.3643 - val_categorical_accuracy: 0.8000 - 249ms/epoch - 28ms/step
Epoch 3/2000
9/9 - 0s - loss: 0.2385 - categorical_accuracy: 0.8947 - val_loss: 0.3173 - val_categorical_accuracy: 0.9333 - 248ms/epoch - 28ms/step
Epoch 4/2000
9/9 - 0s - loss: 0.2851 - categorical_accuracy: 0.8596 - val_loss: 0.4606 - val_categorical_accuracy: 0.8000 - 239ms/epoch - 27ms/step
Epoch 5/2000
9/9 - 0s - loss: 0.2957 - categorical_accuracy: 0.8526 - val_loss: 0.5365 - val_categorical_accuracy: 0.8000 - 242ms/epoch - 27ms/step
Epoch 6/2000
9/9 - 0s - loss: 0.3378 - categorical_accuracy: 0.8491 - val_loss: 0.2781 - val_categorical_accuracy: 0.9333 - 245ms/epoch - 27ms/step
Epoch 7/2000
9/9 - 0s - loss: 0.2720 - categorical_accuracy: 0.8561 - val_loss: 0.2234 - val_categorical_accurac

In [86]:
# Time the training process
total_train_time = end_train - start_train
# Count the epochs that actually ran from the recorded history instead of
# repeating the epochs= value by hand, so the average stays right if the
# epoch count changes or training stops early.
epochs_run = len(history.history['loss'])
epoch_time = total_train_time / epochs_run if epochs_run else float('nan')
print(f"Total training time: {total_train_time:.2f} seconds")
print(f"Average latency per epoch: {epoch_time:.2f} seconds")

Total training time: 533.52 seconds
Average latency per epoch: 0.27 seconds


In [87]:
# Report the accuracy recorded for the last training epoch.
# The validation figure is computed on the test split passed to fit().
train_acc, val_acc = (
    history.history[metric][-1]
    for metric in ('categorical_accuracy', 'val_categorical_accuracy')
)
print(
    f"Final Train Accuracy: {train_acc*100:.2f}%\n"
    f"Final Validation (Test) Accuracy: {val_acc*100:.2f}%"
)

Final Train Accuracy: 97.54%
Final Validation (Test) Accuracy: 100.00%


In [89]:
# Inference latency on test set, measured one sample at a time (batch of 1)
inference_latencies = []
correct = 0

# Warm-up: the first predict() call includes one-off setup cost that would
# otherwise inflate the first measured sample.
if len(X_test):
    model.predict(X_test[:1], verbose=0)

for sample, one_hot in zip(X_test, y_test):
    x = np.expand_dims(sample, axis=0)  # add the batch dimension
    true_label = np.argmax(one_hot)
    # perf_counter is monotonic and high-resolution, which matters for
    # millisecond-scale intervals; time.time() can be too coarse.
    t0 = time.perf_counter()
    pred = model.predict(x, verbose=0)
    t1 = time.perf_counter()
    inference_latencies.append((t1-t0)*1000)  # ms
    if np.argmax(pred) == true_label:
        correct += 1

avg_inf_latency = np.mean(inference_latencies)
test_acc = correct / len(X_test)
print(f"Average inference latency per sample: {avg_inf_latency:.2f} ms")
print(f"Test set accuracy (manual check): {test_acc*100:.2f}%")

Average inference latency per sample: 39.06 ms
Test set accuracy (manual check): 100.00%


In [90]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_9 (LSTM)               (None, 30, 64)            48896     
                                                                 
 lstm_10 (LSTM)              (None, 30, 128)           98816     
                                                                 
 lstm_11 (LSTM)              (None, 64)                49408     
                                                                 
 dense_9 (Dense)             (None, 64)                4160      
                                                                 
 dense_10 (Dense)            (None, 32)                2080      
                                                                 
 dense_11 (Dense)            (None, 5)                 165       
                                                                 
Total params: 203525 (795.02 KB)
Trainable params: 203

In [None]:
# Save the trained model to 'actions.h5' (relative path)
model.save('actions.h5')