# RECORD ACTION TO TRAIN MODEL

In [None]:
import cv2
import mediapipe as mp
import numpy as np
import time

# --- Recording state, mutated by the capture loop further down ---
start_time = None   # set when the '1' key starts a clip
record_data = []    # normalized landmark frames of the clip in progress
recording = False   # True while a clip is being captured

# --- MediaPipe hand tracker (single hand) and the camera at device index 0 ---
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    min_detection_confidence=0.7,
    max_num_hands=1,
)
cap = cv2.VideoCapture(0)

def normalize_hand(positions):
    """Normalize one hand's landmarks for gesture recognition.

    - Origin  = wrist (landmark 0)
    - Scale   = distance wrist -> middle fingertip (landmark 0 -> 12)
    - z is dropped; only (x, y) is kept

    Args:
        positions: sequence of ``(x, y, z)`` triples, indexable at 0 and 12.

    Returns:
        A list with one ``(x, y)`` tuple per input landmark, translated so the
        wrist sits at the origin and divided by the wrist-to-fingertip distance.
    """
    origin = np.array(positions[0][:2])
    reach = np.linalg.norm(np.array(positions[12][:2]) - origin)

    # A collapsed hand (wrist and fingertip coincide) would divide by ~0;
    # fall back to a unit scale so the output stays finite.
    divisor = 1.0 if reach < 1e-6 else reach

    origin_x, origin_y = origin[0], origin[1]
    normalized = []
    for x, y, _z in positions:
        normalized.append(((x - origin_x) / divisor, (y - origin_y) / divisor))
    return normalized

# Length of one recorded clip, in seconds.  Shared by the stop condition and
# the on-screen hint so the two cannot disagree.
RECORD_SECONDS = 0.3

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # MediaPipe expects RGB; OpenCV delivers BGR.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb)

        # Raw (x, y, z) landmarks of the detected hand, or None when no hand is visible.
        positions = None
        if result.multi_hand_landmarks:
            h, w, _ = frame.shape
            for hand_landmarks in result.multi_hand_landmarks:
                positions = [(lm.x, lm.y, lm.z) for lm in hand_landmarks.landmark]

                # Landmarks are normalized to [0, 1]; scale to pixels for drawing.
                for x, y, _z in positions:
                    cv2.circle(frame, (int(x * w), int(y * h)), 5, (0, 255, 0), -1)

        if recording:
            if positions is not None:
                record_data.append(normalize_hand(positions))

            # The deadline is checked on every frame, not only on frames with a
            # hand; otherwise losing the hand would keep the recording open
            # indefinitely and splice unrelated motion into the clip.
            if time.monotonic() - start_time >= RECORD_SECONDS:
                if record_data:
                    clip = np.array(record_data)
                    filename = f"hand_record_{int(time.time()*1000)}.npy"
                    np.save(filename, clip)
                    print(f"Saved: {filename}, shape={clip.shape}")
                else:
                    # Nothing was captured: do not write an empty array to disk.
                    print("No hand detected while recording; nothing saved.")
                record_data = []
                recording = False

        cv2.putText(frame, f"Press '1' to record {RECORD_SECONDS}s hand motion", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)

        cv2.imshow("Hand Tracking", frame)

        key = cv2.waitKey(1) & 0xFF
        if key == 27:  # ESC to exit
            break
        elif key == ord('1') and not recording:
            print("Start recording...")
            recording = True
            # Monotonic clock: elapsed time is immune to wall-clock adjustments.
            start_time = time.monotonic()
            record_data = []
finally:
    # Always free the camera and the tracker, even if a frame raised.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()

# Rename the recorded files inside the chosen folder so that each folder forms one category for training the model

In [None]:
import os

folder_path = r"D:\Github\Machine-Learning-Studies\Handtracking\command\swiperight"  # PICK YOUR FOLDER
command_name = "swiperight"  # CHANGE TO YOUR COMMAND

# Sorted so the numbering is reproducible between runs.
files = sorted(f for f in os.listdir(folder_path) if f.endswith(".npy"))

# Renaming straight to "<command><idx>.npy" is unsafe when the folder already
# contains files with such names (e.g. on a second run): os.rename silently
# overwrites the target on POSIX and raises FileExistsError on Windows.
# Phase 1 therefore parks every file under a temporary name first.  The
# temporary names still end in ".npy", so an interrupted run can simply be
# repeated.
staged = []
for idx, filename in enumerate(files, 1):
    temp_name = f"__renaming_{idx}__{filename}"
    os.rename(os.path.join(folder_path, filename),
              os.path.join(folder_path, temp_name))
    staged.append(temp_name)

# Phase 2: no final name is occupied any more, so assign them in order.
for idx, temp_name in enumerate(staged, 1):
    os.rename(os.path.join(folder_path, temp_name),
              os.path.join(folder_path, f"{command_name}{idx}.npy"))

print(f"Renamed {len(files)} files to {command_name}{{x}}.npy")

## Categorize, resize and shape the datasets for training

In [None]:
import os
import numpy as np
from sklearn.utils import resample
from scipy.interpolate import interp1d
from tensorflow.keras.utils import to_categorical

def load_all_sequences(base_folder):
    """Load every ``.npy`` sequence under *base_folder*, one sub-folder per class.

    Only sub-directories count as classes.  Stray files next to the class
    folders are ignored, so label ids are always the contiguous range
    ``0 .. len(class_names) - 1`` and ``class_names[label]`` is the folder name.

    Args:
        base_folder: directory whose sub-folders each hold the clips of one class.

    Returns:
        ``(sequences, labels, class_names)`` where ``sequences`` is an
        object-dtype array of the loaded clips, ``labels`` is an integer array
        of class indices aligned with it, and ``class_names`` is the sorted
        list of class folder names.
    """
    class_names = sorted(
        name for name in os.listdir(base_folder)
        if os.path.isdir(os.path.join(base_folder, name))
    )
    sequences = []
    labels = []

    for idx, class_name in enumerate(class_names):
        class_folder = os.path.join(base_folder, class_name)
        # Sorted: os.listdir order is arbitrary, and a stable order keeps any
        # later seeded shuffling or splitting reproducible.
        for f in sorted(os.listdir(class_folder)):
            if not f.endswith(".npy"):
                continue
            sequences.append(np.load(os.path.join(class_folder, f)))
            labels.append(idx)  # label = index of the class folder

    return np.array(sequences, dtype=object), np.array(labels), class_names


# Root folder holding one sub-folder per command; each sub-folder contains that
# command's recorded .npy clips.
folder_path = r"D:\Github\Machine-Learning-Studies\Handtracking\command" # PICK YOUR FOLDER
# Unbalanced load of every clip with its integer class label.
# NOTE(review): the later preprocessing lines read X_bal / y_bal rather than
# X / y — confirm which pair is meant to feed training.
X, y, class_names = load_all_sequences(folder_path)

def load_all_sequences(base_folder, balance=False, method="undersample"):
    """Load every ``.npy`` sequence under *base_folder*, optionally class-balanced.

    Only sub-directories count as classes.  Stray files next to the class
    folders are ignored, so label ids are always the contiguous range
    ``0 .. len(class_names) - 1``.

    Args:
        base_folder: directory whose sub-folders each hold the clips of one class.
        balance: when True, equalize the number of clips per class.
        method: ``"undersample"`` trims every class to the smallest class size
            (random subset without replacement); ``"oversample"`` grows every
            class to the largest class size by adding random duplicates.
            Ignored when ``balance`` is False.

    Returns:
        ``(sequences, labels, class_names)`` where ``sequences`` is an
        object-dtype array of clips, ``labels`` is an integer array of class
        indices aligned with it, and ``class_names`` is the sorted list of
        class folder names.

    Raises:
        ValueError: if ``balance`` is True and ``method`` is unknown, or a
            class has no clips to balance against.
    """
    class_names = sorted(
        name for name in os.listdir(base_folder)
        if os.path.isdir(os.path.join(base_folder, name))
    )

    # class index -> list of clips; file order is sorted so that the seeded
    # resampling below is reproducible across machines.
    class_data = {}
    for idx, class_name in enumerate(class_names):
        class_folder = os.path.join(base_folder, class_name)
        class_data[idx] = [
            np.load(os.path.join(class_folder, f))
            for f in sorted(os.listdir(class_folder))
            if f.endswith(".npy")
        ]

    sequences = []
    labels = []

    if balance:
        if method not in ("undersample", "oversample"):
            raise ValueError(
                f"method must be 'undersample' or 'oversample', got {method!r}"
            )
        empty = [class_names[idx] for idx, samples in class_data.items() if not samples]
        if empty or not class_data:
            raise ValueError(
                f"cannot balance: no .npy sequences found for classes {empty}"
            )

        class_sizes = [len(samples) for samples in class_data.values()]
        target_size = min(class_sizes) if method == "undersample" else max(class_sizes)

        for idx, samples in class_data.items():
            if method == "undersample":
                selected = resample(samples, replace=False,
                                    n_samples=target_size, random_state=42)
            else:
                # Keep every original clip and only draw the missing ones with
                # replacement; bootstrapping the whole class would randomly
                # drop real recordings, even from the largest class.
                selected = list(samples)
                extra = target_size - len(samples)
                if extra > 0:
                    selected.extend(resample(samples, replace=True,
                                             n_samples=extra, random_state=42))

            sequences.extend(selected)
            labels.extend([idx] * target_size)
    else:
        for idx, samples in class_data.items():
            sequences.extend(samples)
            labels.extend([idx] * len(samples))

    return np.array(sequences, dtype=object), np.array(labels), class_names

def resize_frames(seq, target_len=12):
    """Resample one sequence along its frame axis to exactly ``target_len`` frames.

    Frames are treated as evenly spaced over [0, 1] and linearly interpolated
    at ``target_len`` evenly spaced positions; the first and last frame are
    preserved.

    Args:
        seq: array-like of shape (T, J, C) — frames, joints, coordinates.
        target_len: number of frames in the result.

    Returns:
        Floating-point array of shape (target_len, J, C).

    Raises:
        ValueError: if ``seq`` contains no frames.
    """
    seq = np.asarray(seq)
    # Clips taken from an object-dtype container (or integer data) are promoted
    # to float so both return paths below yield a numeric float array.
    if not np.issubdtype(seq.dtype, np.floating):
        seq = seq.astype(np.float64)

    old_len = seq.shape[0]
    if old_len == 0:
        raise ValueError("cannot resize an empty sequence (0 frames)")

    # A single frame cannot be interpolated; hold it for the whole clip.
    if old_len == 1:
        return np.repeat(seq, target_len, axis=0)

    x_old = np.linspace(0, 1, old_len)
    x_new = np.linspace(0, 1, target_len)

    return interp1d(x_old, seq, axis=0)(x_new)

# Number of frames every clip is resampled to before it reaches the GRU.
TARGET_LEN = 10

# X_bal / y_bal were read here without ever being assigned, so a fresh run
# failed with NameError.
# NOTE(review): presumably they came from a class-balanced load; undersampling
# (the function's default method) is assumed — confirm the intended method.
X_bal, y_bal, class_names = load_all_sequences(folder_path, balance=True, method="undersample")

X_resized = np.array([resize_frames(seq, TARGET_LEN) for seq in X_bal])

# Collapse everything after the frame axis into one feature axis:
# (samples, frames, features).
X_gru = X_resized.reshape(X_resized.shape[0], TARGET_LEN, -1)
# One-hot width follows the classes actually loaded instead of a hard-coded 4.
y_categorical = to_categorical(y_bal, num_classes=len(class_names))

## Train the model

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, Bidirectional

# Hold out 20% of the clips; the held-out part is also used as validation data
# during fitting.
X_train, X_test, y_train, y_test = train_test_split(
    X_gru,
    y_categorical,
    test_size=0.2,
    random_state=42,
)

# Two stacked bidirectional GRU layers followed by a small dense classifier
# head with one softmax output per one-hot column.
model = Sequential()
model.add(Bidirectional(GRU(128, return_sequences=True),
                        input_shape=(X_gru.shape[1], X_gru.shape[2])))
model.add(Bidirectional(GRU(64)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(y_categorical.shape[1], activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=8,
    validation_data=(X_test, y_test),
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x2302b489c00>

## Save the model

In [None]:
# Persist the trained network as an HDF5 file; the camera-test cell loads this
# same filename through MODEL_PATH.
# NOTE(review): the truncated cell output (`saving_api.save_model(`) looks like
# Keras' legacy-HDF5 warning — confirm, and if switching to the native `.keras`
# format, update MODEL_PATH in the test cell as well.
model.save("action_gru_model.h5") 

  saving_api.save_model(


# Test the model on camera

In [None]:
import cv2
import numpy as np
import mediapipe as mp
from tensorflow.keras.models import load_model
from collections import deque
import time
import os

# ========= CONFIG =========
MODEL_PATH = "action_gru_model.h5"

ACTIONS = ['activate', 'nocommand', 'swipeleft', 'swiperight'] # RENAME TO YOUR COMMANDS

USE_Z = False   # IF YOU WANT TO USE Z-AXIS, SET TO TRUE

# MediaPipe Hands reports 21 landmarks per hand (42 features in 2D, 63 in 3D).
NUM_LANDMARKS = 21

# ========= LOAD MODEL =========
model = load_model(MODEL_PATH)

# Input shape: (None, T, F)
TIMESTEPS = model.input_shape[1]
FEATURES = model.input_shape[2]
print(f"[INFO] Model input shape: T={TIMESTEPS}, F={FEATURES}")

# Fail fast on a configuration that cannot work with this model.  Otherwise the
# mismatch only surfaces inside the camera loop, as an obscure shape error the
# first time a hand is detected.
_expected_features = NUM_LANDMARKS * (3 if USE_Z else 2)
if FEATURES != _expected_features:
    raise ValueError(
        f"Model expects {FEATURES} features per frame but USE_Z={USE_Z} "
        f"produces {_expected_features}; retrain the model or flip USE_Z."
    )

# Only a warning: the loop falls back to printing the raw class index when the
# predicted index has no entry in ACTIONS.
if model.output_shape[-1] != len(ACTIONS):
    print(f"[WARN] Model has {model.output_shape[-1]} outputs but ACTIONS "
          f"lists {len(ACTIONS)} names")

# ========= HANDS INIT =========
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

# ========= FEATURE EXTRACT =========
def normalize_positions(landmarks, use_z=False):
    """Normalize hand landmark coordinates into one flat feature vector.

    - The wrist (id=0) becomes the origin
    - Coordinates are scaled by the wrist -> middle fingertip (id=12) distance
    - With 21 landmarks the result is a (42,) vector, or (63,) when use_z=True

    Args:
        landmarks: sequence of objects exposing ``.x``, ``.y`` and ``.z``.
        use_z: include the z coordinate in the origin, scale and output.

    Returns:
        1-D float32 array of per-landmark normalized coordinates, laid out as
        x, y[, z] for landmark 0, then landmark 1, and so on.
    """
    if use_z:
        coords = np.array([[lm.x, lm.y, lm.z] for lm in landmarks])
    else:
        coords = np.array([[lm.x, lm.y] for lm in landmarks])

    origin = coords[0]
    reach = np.linalg.norm(coords[12] - origin)
    # Degenerate hand (wrist and fingertip coincide): avoid dividing by ~0.
    divisor = 1.0 if reach < 1e-6 else reach

    return ((coords - origin) / divisor).reshape(-1).astype(np.float32)


def extract_features(results):
    """Turn one MediaPipe result into the model's per-frame feature vector.

    Returns the normalized landmarks of the first detected hand, or an
    all-zero float32 vector of length FEATURES when no hand was detected.
    """
    detected = results.multi_hand_landmarks
    if not detected:
        return np.zeros((FEATURES,), dtype=np.float32)
    return normalize_positions(detected[0].landmark, use_z=USE_Z)


# ========= SEQ BUFFER =========
# Sliding window holding the most recent TIMESTEPS feature vectors.
seq_buffer = deque(maxlen=TIMESTEPS)

# ========= CAMERA =========
cap = cv2.VideoCapture(0)

# ========= LOOP =========
# perf_counter is monotonic and high-resolution, so the frame time is never
# measured as a negative interval by a wall-clock adjustment.
fps_time = time.perf_counter()
last_pred = None
last_prob = 0

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        feats = extract_features(results)
        seq_buffer.append(feats)

        # Predict only once the window is full; afterwards, on every frame.
        if len(seq_buffer) == TIMESTEPS:
            seq_input = np.expand_dims(np.asarray(seq_buffer), axis=0)  # (1, T, F)
            preds = model.predict(seq_input, verbose=0)[0]
            max_idx = np.argmax(preds)
            # Fall back to the raw index if the model has more outputs than names.
            last_pred = ACTIONS[max_idx] if max_idx < len(ACTIONS) else str(max_idx)
            last_prob = preds[max_idx]

        # ===== DISPLAY =====
        now = time.perf_counter()
        elapsed = now - fps_time
        # Guard against a zero interval, which would raise ZeroDivisionError.
        fps = 1.0 / elapsed if elapsed > 0 else 0.0
        fps_time = now

        cv2.putText(frame, f"Pred: {last_pred} ({last_prob:.2f})", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(frame, f"FPS: {fps:.1f}", (10, 70),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)

        cv2.imshow("Hand Gesture Recognition", frame)
        if cv2.waitKey(1) & 0xFF == 27:  # ESC to EXIT
            break
finally:
    # Release the camera, the MediaPipe tracker and the window even when a
    # frame raised.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()
