In [1]:
import cv2
import numpy as np
from keras import models
import pandas as pd
from sklearn.preprocessing import StandardScaler
import mediapipe as mp
import io
import time
from collections import Counter
from itertools import combinations
from sklearn.discriminant_analysis import StandardScaler

In [2]:



model = models.load_model("data/lstm_v3.keras")
class_labels = pd.read_csv("data/class_labels.csv")['gesture'].tolist()

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2)
mp_drawing = mp.solutions.drawing_utils

# feature engineering 
def calculate_hand_motion_features(df, landmark_cols):
    new_cols = {}

    for col in landmark_cols:
        new_cols[f"velocity_{col}"] = df[col].diff().fillna(0)
        new_cols[f"acceleration_{col}"] = new_cols[f"velocity_{col}"].diff().fillna(0)
        
    # Calculate pairwise distances between all landmarks
    landmark_pairs = list(combinations(landmark_cols, 2))
    for (col1, col2) in landmark_pairs:
        idx1 = col1[1:]  # Get index part from 'x0', 'y0', etc.
        idx2 = col2[1:]
        if idx1 == idx2:
            continue
        x1, y1, z1 = f'x{idx1}', f'y{idx1}', f'z{idx1}'
        x2, y2, z2 = f'x{idx2}', f'y{idx2}', f'z{idx2}'
        distance_col = f'distance_{idx1}_{idx2}'
        new_cols[distance_col] = np.sqrt((df[x1] - df[x2])**2 + (df[y1] - df[y2])**2 + (df[z1] - df[z2])**2)
    
    new_df = pd.DataFrame(new_cols)

    return pd.concat([df, new_df], axis=1)

def predict_gesture(landmarks_seq, frame_rate, frame_width, frame_height, gesture_action=""):
    gesture_index = int(time.time())

    header = ['frame'] + [f'{coord}_{i}' for i in range(21) for coord in ('x', 'y', 'z')] + ['frame_rate', 'frame_width', 'frame_height', 'gesture', 'gesture_index', 'distance_0_1']
    data = [[i] + frame_data + [frame_rate, frame_width, frame_height, gesture_action, gesture_index, 0] for i, frame_data in enumerate(landmarks_seq)]  # Initialize distance_0_1 as 0
    
    df = pd.DataFrame(data, columns=header)
    landmark_cols = [col for col in df.columns if col.startswith(("x", "y", "z"))]

    dataframe = calculate_hand_motion_features(df.copy(), landmark_cols)

    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, columns = [*dataframe])
    csv_buffer.seek(0)

    input_df = pd.read_csv(csv_buffer)

    features = [col for col in input_df.columns if col not in ["frame", "gesture"]]
    scaler = StandardScaler()

    input_df[features] = scaler.fit_transform(input_df[features])

    X_new = input_df[features].values.reshape((1, input_df.shape[0], len(features)))

    prediction = model.predict(X_new)
    predicted_labels = [class_labels[np.argmax(pred)] for pred in prediction]

    gesture_counts = Counter(predicted_labels)

    most_common_gesture = gesture_counts.most_common(1)[0][0]

    return most_common_gesture




def record():
    cap = cv2.VideoCapture(0)
    recording = False
    landmarks_seq = []

    frame_rate = cap.get(cv2.CAP_PROP_FPS)
    frame_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    frame_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb_frame)
        frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                if recording:
                    # Extract landmarks
                    landmarks = [lm for lm in hand_landmarks.landmark]
                    landmarks_flat = [coord for lm in landmarks for coord in (lm.x, lm.y, lm.z)]
                    landmarks_seq.append(landmarks_flat)

        cv2.imshow("TEST", frame)

        key = cv2.waitKey(5) & 0xFF

        # Start recording on 'r' key press
        if key == ord('r'):
            recording = True
            print("Recording gestures...")
        
        # Stop recording and predict on 's' key press
        elif key == ord('s'):
            recording = False
            if landmarks_seq:
                # Predict gesture from recorded landmarks sequence
                predicted_gesture = predict_gesture(landmarks_seq, frame_rate, frame_width, frame_height)
                print(f"Predicted Gesture: {predicted_gesture}")
            else:
                print("No gestures recorded.")

        # Exit on 'q' key press
        elif key == ord('q'):
            break
    
    cap.release()
    cv2.destroyAllWindows()

record()


Recording gestures...


ValueError: Exception encountered when calling LSTMCell.call().

[1mDimensions must be equal, but are 405 and 404 for '{{node sequential_1/lstm_1/lstm_cell_1/MatMul}} = MatMul[T=DT_FLOAT, grad_a=false, grad_b=false, transpose_a=false, transpose_b=false](sequential_1/lstm_1/strided_slice_1, sequential_1/lstm_1/lstm_cell_1/Cast/ReadVariableOp)' with input shapes: [1,405], [404,256].[0m

Arguments received by LSTMCell.call():
  • inputs=tf.Tensor(shape=(1, 405), dtype=float32)
  • states=('tf.Tensor(shape=(1, 64), dtype=float32)', 'tf.Tensor(shape=(1, 64), dtype=float32)')
  • training=False