In [1]:
# -------------------------------
# Part F: Real-time Webcam Demo (.h5 LSTM) with Landmarks
# -------------------------------

import cv2
import numpy as np
from collections import deque
import time
import tensorflow as tf
import mediapipe as mp
import os
import pickle

# --- Configuration
# The model consumes windows of SEQUENCE_LENGTH frames with FEATURE_SIZE
# values per frame. All artefacts live under PROCESSED_DIR.
PROCESSED_DIR = "processed_dataset"
MODEL_SAVE_PATH = os.path.join(PROCESSED_DIR, "isl_gesture_model.h5")
SEQUENCE_LENGTH = 32
FEATURE_SIZE = 138

# --- Label lookup tables
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only point this at a label map you generated yourself.
with open(os.path.join(PROCESSED_DIR, "label_map.pkl"), "rb") as f:
    label_to_idx = pickle.load(f)
# Reverse mapping: class index -> label, used to name the predicted class.
idx_to_label = dict((index, name) for name, index in label_to_idx.items())

# --- Trained Keras model
model = tf.keras.models.load_model(MODEL_SAVE_PATH)
print(f"✅ Loaded model: {MODEL_SAVE_PATH}")

# --- MediaPipe Holistic (one instance, reused for every frame)
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic(
    static_image_mode=False,
    model_complexity=1,
    refine_face_landmarks=False,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

# --- Feature extraction function (138 features)
def extract_features(frame, holistic):
    """Run MediaPipe Holistic on one BGR frame and flatten it to a feature vector.

    Layout of the vector (x, y, z per landmark, in this order):
      * pose landmarks 11 and 12 (the shoulders)          ->   6 values
      * every landmark of the left hand, then the right   -> 126 values
      * landmark 0 of the left hand, then the right
        (called the "palm" in this notebook)              ->   6 values

    Any part that was not detected is filled with zeros.

    Args:
        frame: BGR image as delivered by OpenCV.
        holistic: object exposing ``process(rgb_image)``.

    Returns:
        ``(features, results)``. ``features`` is a 1-D float array of length
        ``FEATURE_SIZE``; it is all zeros if the assembled vector does not have
        exactly ``FEATURE_SIZE`` entries. ``results`` is whatever
        ``holistic.process`` returned, so the caller can draw the landmarks.
    """
    def _xyz(point):
        return [point.x, point.y, point.z]

    results = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    pose = results.pose_landmarks
    both_hands = (results.left_hand_landmarks, results.right_hand_landmarks)

    features = []

    # Shoulders
    if pose:
        features += _xyz(pose.landmark[11]) + _xyz(pose.landmark[12])
    else:
        features += [0.0] * 6

    # Full hands, left first
    for hand in both_hands:
        if not hand:
            features += [0.0] * 63
            continue
        for point in hand.landmark:
            features += _xyz(point)

    # Landmark 0 of each hand, left first
    for hand in both_hands:
        features += _xyz(hand.landmark[0]) if hand else [0.0] * 3

    # Guard against an unexpected landmark count: hand back zeros instead of
    # a vector the model cannot consume.
    if len(features) != FEATURE_SIZE:
        return np.zeros(FEATURE_SIZE, dtype=float), results
    return np.array(features, dtype=float), results

# --- Sliding window buffer
# Keeps only the newest SEQUENCE_LENGTH feature vectors. Once it is full, each
# new frame pushes the oldest one out, so a prediction runs on every frame.
buffer = deque(maxlen=SEQUENCE_LENGTH)
last_pred_text = None
last_time = time.time()  # refreshed after every prediction; nothing in this cell reads it

# --- Drawing styles
# Built once up front: the specs are the same for every frame, so re-creating
# them inside the loop is wasted work. Colours are in OpenCV's BGR order.
SHOULDER_COLOR = (0, 255, 0)
hand_styles = [
    (
        color,
        mp_drawing.DrawingSpec(color=color, thickness=2, circle_radius=3),
        mp_drawing.DrawingSpec(color=color, thickness=2),
    )
    for color in [(0, 0, 255), (255, 0, 0)]  # left hand, right hand
]

# --- Open webcam
# Try device 0, then device 1. A handle that failed to open is released before
# moving on, and Holistic is closed if no camera is available, because the
# `finally` below is never reached in that case.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    cap.release()
    cap = cv2.VideoCapture(1)
    if not cap.isOpened():
        cap.release()
        holistic.close()
        raise RuntimeError("Cannot open webcam")

print("Webcam opened. Press 'q' to quit.")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to read frame.")
            break

        # Extract features and get landmarks
        lm, results = extract_features(frame, holistic)
        buffer.append(lm)

        # Slice instead of unpacking into `_`, which would overwrite IPython's
        # "last output" variable in the notebook namespace.
        h, w = frame.shape[:2]

        # --- Draw pose landmarks (shoulders) ---
        # Landmark coordinates are multiplied by the frame size to get pixels.
        if results.pose_landmarks:
            left_shoulder = results.pose_landmarks.landmark[11]
            right_shoulder = results.pose_landmarks.landmark[12]
            cv2.circle(frame, (int(left_shoulder.x * w), int(left_shoulder.y * h)), 8, SHOULDER_COLOR, -1)
            cv2.circle(frame, (int(right_shoulder.x * w), int(right_shoulder.y * h)), 8, SHOULDER_COLOR, -1)

        # --- Draw hand landmarks + palms ---
        for hand_landmarks, (color, landmark_spec, connection_spec) in zip(
            [results.left_hand_landmarks, results.right_hand_landmarks],
            hand_styles,
        ):
            if hand_landmarks:
                # Draw all hand landmarks
                mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    mp_holistic.HAND_CONNECTIONS,
                    landmark_drawing_spec=landmark_spec,
                    connection_drawing_spec=connection_spec,
                )
                # Highlight palm (index 0)
                lm0 = hand_landmarks.landmark[0]
                cv2.circle(frame, (int(lm0.x * w), int(lm0.y * h)), 12, color, -1)

        # Predict gesture if buffer full
        if len(buffer) == SEQUENCE_LENGTH:
            seq_input = np.array(buffer, dtype=np.float32).reshape(1, SEQUENCE_LENGTH, FEATURE_SIZE)
            # Call the model directly instead of model.predict(): predict() is
            # designed for large batched inputs and adds per-call overhead,
            # which hurts when a single window is scored on every frame.
            pred = np.asarray(model(seq_input, training=False))[0]
            idx = int(np.argmax(pred))
            conf = float(pred[idx])
            label = idx_to_label.get(idx, "Unknown")
            last_pred_text = f"{label} ({conf*100:.1f}%)"
            last_time = time.time()

        # Display prediction (the last one stays on screen until replaced)
        if last_pred_text:
            cv2.rectangle(frame, (10, 10), (450, 60), (0, 0, 0), -1)
            cv2.putText(frame, last_pred_text, (20, 45),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 0), 2, cv2.LINE_AA)

        # Show webcam frame
        cv2.imshow("ISL Gesture Real-Time Demo", frame)

        # Quit on 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

except KeyboardInterrupt:
    print("Interrupted by user.")

finally:
    # Always give back the camera, the window and the MediaPipe graph, even
    # when the loop ends through an exception.
    cap.release()
    cv2.destroyAllWindows()
    holistic.close()
    print("✅ Real-time demo stopped.")




✅ Loaded model: processed_dataset\isl_gesture_model.h5
Webcam opened. Press 'q' to quit.
✅ Real-time demo stopped.


In [1]:
# -------------------------------
# Real-time Webcam Demo using .tflite LSTM model (with face landmarks to hide face)
# -------------------------------
import cv2
import numpy as np
from collections import deque
import time
import tensorflow as tf
import mediapipe as mp
import os
import pickle

# --- Configuration ---
# The model consumes windows of SEQUENCE_LENGTH frames with FEATURE_SIZE
# values per frame. All artefacts live under PROCESSED_DIR.
PROCESSED_DIR = "processed_dataset"
MODEL_PATH = os.path.join(PROCESSED_DIR, "isl_gesture_model.tflite")
SEQUENCE_LENGTH = 32
FEATURE_SIZE = 138

# --- Label lookup tables ---
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only point this at a label map you generated yourself.
with open(os.path.join(PROCESSED_DIR, "label_map.pkl"), "rb") as f:
    label_to_idx = pickle.load(f)
# Reverse mapping: class index -> label, used to name the predicted class.
idx_to_label = dict((index, name) for name, index in label_to_idx.items())

# --- TensorFlow Lite interpreter ---
# Tensors are allocated before the input/output descriptors are queried;
# those descriptors are used later to feed the interpreter and read its output.
interpreter = tf.lite.Interpreter(model_path=MODEL_PATH)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
print(f"Loaded TFLite model: {MODEL_PATH}")

# --- MediaPipe Holistic (one instance, reused for every frame) ---
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic(
    static_image_mode=False,
    model_complexity=1,
    refine_face_landmarks=False,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

# --- Feature extraction function (138 features) ---
def extract_features(frame, holistic):
    """Turn one BGR frame into the per-frame feature vector fed to the model.

    The frame is converted to RGB, passed through ``holistic.process`` and the
    resulting landmarks are concatenated as (x, y, z) triples in this order:

      1. pose landmarks 11 and 12 (the shoulders),
      2. all landmarks of the left hand,
      3. all landmarks of the right hand,
      4. landmark 0 of the left hand (the "palm" in this notebook),
      5. landmark 0 of the right hand.

    A missing pose or hand contributes zeros of the expected length
    (6, 63 and 3 values respectively).

    Args:
        frame: BGR image as delivered by OpenCV.
        holistic: object exposing ``process(rgb_image)``.

    Returns:
        ``(features, results)`` where ``features`` is a 1-D float array with
        ``FEATURE_SIZE`` entries (all zeros if the concatenated vector has any
        other length) and ``results`` is the raw output of
        ``holistic.process``.
    """
    def flatten(points):
        # (x, y, z) of every landmark, laid end to end.
        return [value for p in points for value in (p.x, p.y, p.z)]

    results = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    body = results.pose_landmarks
    left = results.left_hand_landmarks
    right = results.right_hand_landmarks

    shoulders = flatten([body.landmark[11], body.landmark[12]]) if body else [0.0] * 6
    left_hand = flatten(left.landmark) if left else [0.0] * 63
    right_hand = flatten(right.landmark) if right else [0.0] * 63
    left_palm = flatten([left.landmark[0]]) if left else [0.0] * 3
    right_palm = flatten([right.landmark[0]]) if right else [0.0] * 3

    vec = shoulders + left_hand + right_hand + left_palm + right_palm

    # An unexpected landmark count yields zeros rather than a malformed vector.
    if len(vec) == FEATURE_SIZE:
        features = np.array(vec, dtype=float)
    else:
        features = np.zeros(FEATURE_SIZE, dtype=float)
    return features, results

# --- Sliding window buffer ---
# Keeps only the newest SEQUENCE_LENGTH feature vectors. Once it is full, each
# new frame pushes the oldest one out, so inference runs on every frame.
buffer = deque(maxlen=SEQUENCE_LENGTH)
last_pred_text = None
last_time = time.time()  # refreshed after every inference; nothing in this cell reads it

# --- Per-frame invariants, computed once ---
# Drawing specs and tensor indices are the same for every frame, so they are
# prepared here instead of being rebuilt inside the loop.
# Colours are in OpenCV's BGR order.
SHOULDER_COLOR = (0, 255, 0)
face_mesh_style = mp_drawing.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1)
hand_styles = [
    (
        color,
        mp_drawing.DrawingSpec(color=color, thickness=2, circle_radius=3),
        mp_drawing.DrawingSpec(color=color, thickness=2),
    )
    for color in [(0, 0, 255), (255, 0, 0)]  # left hand, right hand
]
input_index = input_details[0]['index']
output_index = output_details[0]['index']

# --- Open webcam ---
# Try device 0, then device 1. A handle that failed to open is released before
# moving on, and Holistic is closed if no camera is available, because the
# `finally` below is never reached in that case.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    cap.release()
    cap = cv2.VideoCapture(1)
    if not cap.isOpened():
        cap.release()
        holistic.close()
        raise RuntimeError("Cannot open webcam")

print("Webcam opened. Press 'q' to quit.")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to read frame.")
            break

        # Extract features and landmarks
        lm, results = extract_features(frame, holistic)
        buffer.append(lm)

        # Slice instead of unpacking into `_`, which would overwrite IPython's
        # "last output" variable in the notebook namespace.
        h, w = frame.shape[:2]

        # --- Draw pose landmarks (shoulders) ---
        # Landmark coordinates are multiplied by the frame size to get pixels.
        if results.pose_landmarks:
            left_shoulder = results.pose_landmarks.landmark[11]
            right_shoulder = results.pose_landmarks.landmark[12]
            cv2.circle(frame, (int(left_shoulder.x * w), int(left_shoulder.y * h)), 8, SHOULDER_COLOR, -1)
            cv2.circle(frame, (int(right_shoulder.x * w), int(right_shoulder.y * h)), 8, SHOULDER_COLOR, -1)

        # --- Draw face landmarks (to hide face) ---
        # Only the mesh connections are drawn; individual landmark dots are
        # disabled by passing landmark_drawing_spec=None.
        if results.face_landmarks:
            mp_drawing.draw_landmarks(
                frame,
                results.face_landmarks,
                mp_holistic.FACEMESH_TESSELATION,
                landmark_drawing_spec=None,
                connection_drawing_spec=face_mesh_style,
            )

        # --- Draw hand landmarks + palms ---
        for hand_landmarks, (color, landmark_spec, connection_spec) in zip(
            [results.left_hand_landmarks, results.right_hand_landmarks],
            hand_styles,
        ):
            if hand_landmarks:
                mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    mp_holistic.HAND_CONNECTIONS,
                    landmark_drawing_spec=landmark_spec,
                    connection_drawing_spec=connection_spec,
                )
                # Highlight palm (index 0)
                lm0 = hand_landmarks.landmark[0]
                cv2.circle(frame, (int(lm0.x * w), int(lm0.y * h)), 12, color, -1)

        # --- Run inference when buffer full ---
        if len(buffer) == SEQUENCE_LENGTH:
            seq_input = np.array(buffer, dtype=np.float32).reshape(1, SEQUENCE_LENGTH, FEATURE_SIZE)
            interpreter.set_tensor(input_index, seq_input)
            interpreter.invoke()
            pred = interpreter.get_tensor(output_index)[0]
            idx = int(np.argmax(pred))
            conf = float(pred[idx])
            label = idx_to_label.get(idx, "Unknown")
            last_pred_text = f"{label} ({conf*100:.1f}%)"
            last_time = time.time()

        # --- Display prediction text (the last one stays until replaced) ---
        if last_pred_text:
            cv2.rectangle(frame, (10, 10), (450, 60), (0, 0, 0), -1)
            cv2.putText(frame, last_pred_text, (20, 45),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 0), 2, cv2.LINE_AA)

        # Show frame
        cv2.imshow("ISL Gesture Real-Time Demo (.tflite)", frame)

        # Quit on 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

except KeyboardInterrupt:
    print("Interrupted by user.")

finally:
    # Always give back the camera, the window and the MediaPipe graph, even
    # when the loop ends through an exception.
    cap.release()
    cv2.destroyAllWindows()
    holistic.close()
    print("Real-time demo stopped.")


    TF 2.20. Please use the LiteRT interpreter from the ai_edge_litert package.
    See the [migration guide](https://ai.google.dev/edge/litert/migration)
    for details.
    


Loaded TFLite model: processed_dataset\isl_gesture_model.tflite
Webcam opened. Press 'q' to quit.
Real-time demo stopped.
