In [40]:
!pip install scikit-learn



# 1) Import and install dependencies

In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp


# 2) Keypoints using MP Hands

In [2]:
mp_hands = mp.solutions.hands # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [3]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

In [4]:
def draw_landmarks(image, results):
    if not results.multi_hand_landmarks:
        return
    
    for hand_landmarks in results.multi_hand_landmarks:
        mp_drawing.draw_landmarks(
            image, 
            hand_landmarks, 
            mp_hands.HAND_CONNECTIONS
        )


In [11]:
import cv2
import time
import mediapipe as mp

# Initialize mediapipe hands
mp_hands = mp.solutions.hands

# Open webcam
cap = cv2.VideoCapture(0)

# For FPS calculation
prev_time = 0

with mp_hands.Hands(min_detection_confidence=0.5, 
                    min_tracking_confidence=0.2, 
                    max_num_hands=1) as hands: 
    
    while cap.isOpened(): 
        ret, frame = cap.read()
        if not ret:
            break
        
        # make detection 
        image, results = mediapipe_detection(frame, hands)
        
        # Draw landmarks
        draw_landmarks(image, results)
        
        # Calculate FPS
        curr_time = time.time()
        fps = 1 / (curr_time - prev_time) if prev_time != 0 else 0
        prev_time = curr_time
        
        # Put FPS text on image
        cv2.putText(image, f'FPS: {int(fps)}', (10, 40), 
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        
        # show to screen
        cv2.imshow('OpenCV Feed', image)
        
        # break gracefully 
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    
    cap.release()
    cv2.destroyAllWindows()


I0000 00:00:1757560628.357924 13657757 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1 Pro
W0000 00:00:1757560628.369072 13677977 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1757560628.379111 13677977 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1757560628.434910 13677978 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


# 3) Extract Keypoints

In [5]:
def extract_keypoints_normalized(results):
    """
    Extract keypoints and normalize them for position and scale invariance.
    Returns a 1D numpy array of length 21*4 per hand (x, y, z, visibility).
    """
    if results.multi_hand_landmarks:
        hands = []
        for hand_landmarks in results.multi_hand_landmarks:
            # Extract raw keypoints
            kp = np.array([[res.x, res.y, res.z, res.visibility] for res in hand_landmarks.landmark])
            
            # Use wrist (landmark 0) as origin
            origin = kp[0, :3]  # x, y, z
            kp[:, :3] -= origin  # subtract wrist position
            
            # Optional: scale normalization (divide by max distance)
            max_dist = np.max(np.linalg.norm(kp[:, :3], axis=1))
            if max_dist > 0:
                kp[:, :3] /= max_dist
            
            hands.append(kp.flatten())
        
        return np.concatenate(hands)
    else:
        # Return zeros if no hand detected
        return np.zeros(21*4)


# 4) Setup Folders for Collection

In [42]:
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('MP_Data') 

# Actions that we try to detect
actions = np.array(['swiperight', 'swipeleft', 'zoom'])

# Thirty videos worth of data
no_sequences = 15

# Videos are going to be 30 frames in length
sequence_length = 20

In [32]:
for action in actions: 
    dirmax = np.max(np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int))
    for sequence in range(1,no_sequences+1):
        try: 
            os.makedirs(os.path.join(DATA_PATH, action, str(dirmax+sequence)))
        except:
            pass

ValueError: invalid literal for int() with base 10: '.DS_Store'

In [33]:
for action in actions: 
    for sequence in range(no_sequences):
        try: 
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

# 5) Collect Keypoint Values for Training and Testing

In [34]:
# Folder start
start_folder = 14
# Action start
action_start = 'swiperight'

In [81]:
cap = cv2.VideoCapture(0)
prev_time = 0

action = action_start

with mp_hands.Hands(min_detection_confidence=0.5, 
                    min_tracking_confidence=0.2, 
                    max_num_hands=1) as hands: 

    # loop through sequences 
    for sequence in range(no_sequences): 
        # loop through video length
        for frame_num in range(sequence_length): 
            
            ret, frame = cap.read()
            if not ret:
                break
            
            # make detection 
            image, results = mediapipe_detection(frame, hands)
            
            # Draw landmarks
            draw_landmarks(image, results)
            
            # Calculate FPS
            curr_time = time.time()
            fps = 1 / (curr_time - prev_time) if prev_time != 0 else 0
            prev_time = curr_time
            
            # Put FPS text on image
            cv2.putText(image, f'FPS: {int(fps)}', (10, 40), 
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Apply collection logic
            if frame_num == 0: 
                cv2.putText(image, "Starting Collection", (120, 200), 
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
                cv2.putText(image, f"Collecting frames for {action} video number {sequence}", (15, 80), 
                            cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 4, cv2.LINE_AA)
                cv2.imshow('OpenCV Feed', image)
                cv2.waitKey(2000)  # wait 2 seconds at first frame
            else: 
                cv2.putText(image, f"Collecting frames for {action} video number {sequence}", (15, 80), 
                            cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 4, cv2.LINE_AA)
            
            # Export Keypoints
            keypoints = extract_keypoints_normalized(results)
            npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
            np.save(npy_path, keypoints) 
            
            # Show to screen
            cv2.imshow('OpenCV Feed', image)
            
            # Exit if 'q' pressed
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break

cap.release()
cv2.destroyAllWindows()


I0000 00:00:1757603357.911565 14427533 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1 Pro
W0000 00:00:1757603357.921275 14668486 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1757603357.929746 14668486 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [66]:
keypoints.shape

(84,)

## Fix individual sample if needed

In [80]:
cap = cv2.VideoCapture(0)
prev_time = 0

action = action_start
sequence = 14

with mp_hands.Hands(min_detection_confidence=0.5, 
                    min_tracking_confidence=0.2, 
                    max_num_hands=1) as hands: 

    # loop through sequences 
    for frame_num in range(sequence_length): 
        
        ret, frame = cap.read()
        if not ret:
            break
        
        # make detection 
        image, results = mediapipe_detection(frame, hands)
        
        # Draw landmarks
        draw_landmarks(image, results)
        
        # Calculate FPS
        curr_time = time.time()
        fps = 1 / (curr_time - prev_time) if prev_time != 0 else 0
        prev_time = curr_time
        
        # Put FPS text on image
        cv2.putText(image, f'FPS: {int(fps)}', (10, 40), 
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Apply collection logic
        if frame_num == 0: 
            cv2.putText(image, "Starting Collection", (120, 200), 
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
            cv2.putText(image, f"Collecting frames for {action} video number {sequence}", (15, 80), 
                        cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 4, cv2.LINE_AA)
            cv2.imshow('OpenCV Feed', image)
            cv2.waitKey(2000)  # wait 2 seconds at first frame
        else: 
            cv2.putText(image, f"Collecting frames for {action} video number {sequence}", (15, 80), 
                        cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 4, cv2.LINE_AA)
        
        # Export Keypoints
        keypoints = extract_keypoints_normalized(results)
        npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
        np.save(npy_path, keypoints) 
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)
        
        # Exit if 'q' pressed
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()


I0000 00:00:1757603345.265083 14427533 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1 Pro
W0000 00:00:1757603345.276945 14667955 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1757603345.285833 14667955 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


# 6) Preprocess Data and Create labels and features

In [55]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [56]:
label_map = {label:num for num, label in enumerate(actions)}

In [57]:
label_map

{'swiperight': 0, 'swipeleft': 1, 'zoom': 2}

In [58]:
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [59]:
np.array(sequences).shape

(45, 20, 84)

In [60]:
np.array(labels).shape

(45,)

In [61]:
X = np.array(sequences)

In [62]:
X.shape

(45, 20, 84)

In [63]:
y = to_categorical(labels).astype(int)

In [64]:
y

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1]])

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.08)

# 7) Build and Train LSTM Neural Network

In [66]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [67]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [68]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(20,84)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [29]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [30]:
model.fit(X_train, y_train, epochs=2000, callbacks=[tb_callback])

Epoch 1/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - categorical_accuracy: 0.2680 - loss: 1.0966
Epoch 2/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - categorical_accuracy: 1.0000 - loss: 1.0732
Epoch 3/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - categorical_accuracy: 1.0000 - loss: 1.0472
Epoch 4/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - categorical_accuracy: 0.9675 - loss: 1.0023
Epoch 5/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - categorical_accuracy: 0.7482 - loss: 0.9206
Epoch 6/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - categorical_accuracy: 1.0000 - loss: 0.7180
Epoch 7/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - categorical_accuracy: 1.0000 - loss: 0.2382
Epoch 8/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/ste

<keras.src.callbacks.history.History at 0x34fb44ee0>

# 8) Make Predictions

In [69]:
res = model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step


In [70]:
actions[np.argmax(res[0])]

'swiperight'

In [71]:
actions[np.argmax(y_test[0])]

'zoom'

# 9) Save Weights

In [72]:
model.save('action.h5')



In [73]:
del model

In [74]:
model.load_weights('action.h5')

NameError: name 'model' is not defined

# 10) Evaluation using confusion matrix and accuracy

In [50]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [51]:
yhat = model.predict(X_test)

NameError: name 'X_test' is not defined

In [52]:
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

NameError: name 'y_test' is not defined

In [53]:
multilabel_confusion_matrix(ytrue, yhat)

NameError: name 'ytrue' is not defined

# 11) Test in real time

In [28]:
from scipy import stats

In [40]:
colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame

In [41]:
plt.figure(figsize=(18,18))
plt.imshow(prob_viz(res, actions, image, colors))

IndexError: list index out of range

<Figure size 1800x1800 with 0 Axes>

In [25]:
#new detection variable
sequence = []
sentence = []
threshold = 0.7

# Open webcam
cap = cv2.VideoCapture(0)

# For FPS calculation
prev_time = 0

with mp_hands.Hands(min_detection_confidence=0.5, 
                    min_tracking_confidence=0.2, 
                    max_num_hands=1) as hands: 
    
    while cap.isOpened(): 
        ret, frame = cap.read()
        if not ret:
            break
        
        # make detection 
        image, results = mediapipe_detection(frame, hands)
        
        # Draw landmarks
        draw_landmarks(image, results)
        
        # Calculate FPS
        curr_time = time.time()
        fps = 1 / (curr_time - prev_time) if prev_time != 0 else 0
        prev_time = curr_time
        
        # Put FPS text on image
        cv2.putText(image, f'FPS: {int(fps)}', (10, 40), 
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # 2. Prediction logic
        keypoints = extract_keypoints_normalized(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))

        #3. Viz logic
            if np.unique(predictions[-10:])[0]==np.argmax(res): 
                if res[np.argmax(res)] > threshold: 
                    
                    if len(sentence) > 0: 
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)
            
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # show to screen
        cv2.imshow('OpenCV Feed', image)
        
        # break gracefully 
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    
    cap.release()
    cv2.destroyAllWindows()


I0000 00:00:1757647475.431399 15686477 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1 Pro
W0000 00:00:1757647475.440946 15695913 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1757647475.448481 15695913 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step
swiperight


IndexError: list index out of range