In [20]:
import os
import cv2
import numpy as np
import mediapipe as mp
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score


In [21]:
# ================================
# 2. Configurations
# ================================
# Root folder that holds the saved keypoint files, one sub-folder per action.
DATA_PATH = "MP_Data"

# Gesture vocabulary; append a label here to add another gesture.
actions = np.array([
    "hello",
    "thanks",
    "iloveyou",
    "yes",
    "no",
])

no_sequences = 30        # recordings captured per action
sequence_length = 30     # frames stored for each recording
start_folder = 0         # index of the first sequence folder
threshold = 0.5          # minimum predicted probability for accepting a gesture

In [22]:
# 3. Mediapipe Setup
# Module-level aliases for the MediaPipe solutions used by the detection and
# drawing helpers in this notebook.
import mediapipe as mp  # NOTE(review): already imported in the first cell; this re-import is redundant

mp_holistic = mp.solutions.holistic          # Holistic model (pose + hands + face landmarks detection)
mp_drawing = mp.solutions.drawing_utils     # Drawing utilities
# Face mesh module; its FACEMESH_TESSELATION connection set is used when drawing the face
mp_face_mesh = mp.solutions.face_mesh
def mediapipe_detection(image, model):
    """Run ``model`` on a BGR frame and return the frame plus the detections.

    Args:
        image: Frame in OpenCV's BGR channel order.
        model: Object exposing ``process(rgb_image)`` (a MediaPipe Holistic
            instance in this notebook).

    Returns:
        tuple: ``(bgr_image, results)`` where ``bgr_image`` is the frame
        converted to RGB and back to BGR, and ``results`` is whatever
        ``model.process`` returned.
    """
    # MediaPipe works on RGB input, OpenCV delivers BGR.
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Flag the buffer read-only while the model runs (performance hint),
    # then make it writeable again afterwards.
    rgb_frame.flags.writeable = False
    detections = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # Hand the caller a BGR frame again so it can be drawn on / shown with OpenCV.
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, detections

def draw_styled_landmarks(image, results):
    """Draw the detected face, pose and hand landmarks onto ``image``.

    Each landmark group is drawn only when the corresponding attribute of
    ``results`` is truthy. Groups are drawn in the order face, pose, left
    hand, right hand, each with its own pair of colours (landmark colour,
    connection colour).

    Args:
        image: Frame passed straight through to ``mp_drawing.draw_landmarks``.
        results: Detection results exposing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    # NOTE(review): 256 in the face connection colour exceeds the 8-bit channel
    # range — confirm whether 255 was intended.
    palette = {
        "face": [(80, 110, 10), (80, 256, 121)],
        "pose": [(80, 22, 10), (80, 44, 121)],
        "left_hand": [(121, 22, 76), (121, 44, 250)],
        "right_hand": [(245, 117, 66), (245, 66, 230)],
    }

    # One row per landmark group:
    # (palette key, attribute on `results`, module owning the connection set,
    #  connection set name, line thickness, landmark radius, connection radius)
    layers = (
        ("face", "face_landmarks", mp_face_mesh, "FACEMESH_TESSELATION", 1, 1, 1),
        ("pose", "pose_landmarks", mp_holistic, "POSE_CONNECTIONS", 2, 4, 2),
        ("left_hand", "left_hand_landmarks", mp_holistic, "HAND_CONNECTIONS", 2, 4, 2),
        ("right_hand", "right_hand_landmarks", mp_holistic, "HAND_CONNECTIONS", 2, 4, 2),
    )

    for key, attr, owner, connections_name, thickness, point_radius, line_radius in layers:
        landmarks = getattr(results, attr)
        if not landmarks:
            continue  # nothing detected for this group in the current frame

        point_color, line_color = palette[key]
        mp_drawing.draw_landmarks(
            image,
            landmarks,
            getattr(owner, connections_name),
            mp_drawing.DrawingSpec(color=point_color, thickness=thickness, circle_radius=point_radius),
            mp_drawing.DrawingSpec(color=line_color, thickness=thickness, circle_radius=line_radius),
        )
def extract_keypoints(results):
    """Flatten the pose, face and hand landmarks of ``results`` into one 1-D array.

    The groups are concatenated in this order:

    * pose       - (x, y, z, visibility) per landmark, zero-filled to 33*4 if missing
    * face       - (x, y, z) per landmark, zero-filled to 468*3 if missing
    * left hand  - (x, y, z) per landmark, zero-filled to 21*3 if missing
    * right hand - (x, y, z) per landmark, zero-filled to 21*3 if missing

    A group counts as missing when its attribute on ``results`` is falsy.

    Args:
        results: Detection results exposing ``pose_landmarks``,
            ``face_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``; each present group has a ``landmark``
            iterable.

    Returns:
        numpy.ndarray: The concatenated keypoints (1662 values when every
        group has the landmark counts listed above).
    """
    def _flat(group, fields, count):
        # Zeros keep the output layout fixed when the group was not detected.
        if not group:
            return np.zeros(count * len(fields))
        rows = [[getattr(point, name) for name in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ("x", "y", "z")
    parts = [
        _flat(results.pose_landmarks, xyz + ("visibility",), 33),
        _flat(results.face_landmarks, xyz, 468),
        _flat(results.left_hand_landmarks, xyz, 21),
        _flat(results.right_hand_landmarks, xyz, 21),
    ]
    return np.concatenate(parts)


In [23]:
def collect_data():
    """Record keypoint sequences for every action from webcam device 0.

    For each label in ``actions`` this captures ``no_sequences`` recordings of
    ``sequence_length`` frames, runs MediaPipe Holistic on every frame and
    saves the flattened keypoints to
    ``DATA_PATH/<action>/<sequence>/<frame_num>.npy``. The first frame of each
    recording shows a "STARTING" banner and pauses for one second. Pressing
    "q" in the preview window stops the collection early.

    The capture device and the preview window are always released, including
    when an error occurs.

    Raises:
        RuntimeError: If the webcam cannot be opened or stops delivering frames.
    """
    # Create folders for all actions & sequences
    for action in actions:
        for sequence in range(start_folder, start_folder + no_sequences):
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            raise RuntimeError("Could not open webcam (device 0)")

        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            for action in actions:
                for sequence in range(start_folder, start_folder + no_sequences):
                    for frame_num in range(sequence_length):
                        ret, frame = cap.read()
                        if not ret:
                            # Without this check a failed read hands None to
                            # cv2.cvtColor and dies with an opaque OpenCV error.
                            raise RuntimeError(
                                f"Webcam stopped delivering frames "
                                f"(action={action}, sequence={sequence}, frame={frame_num})"
                            )

                        image, results = mediapipe_detection(frame, holistic)
                        draw_styled_landmarks(image, results)

                        if frame_num == 0:
                            cv2.putText(image, f"STARTING {action} Seq:{sequence}", (20,200),
                                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 3, cv2.LINE_AA)
                            cv2.imshow("OpenCV Feed", image)
                            # Give the user a second to get into position.
                            cv2.waitKey(1000)
                        else:
                            cv2.putText(image, f"{action} Seq:{sequence}", (20,40),
                                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2, cv2.LINE_AA)
                            cv2.imshow("OpenCV Feed", image)

                        # Save keypoints
                        keypoints = extract_keypoints(results)
                        npy_path = os.path.join(DATA_PATH, action, str(sequence))
                        np.save(os.path.join(npy_path, str(frame_num)), keypoints)

                        if cv2.waitKey(10) & 0xFF == ord("q"):
                            return  # cleanup happens in the finally block
    finally:
        cap.release()
        cv2.destroyAllWindows()


In [24]:
# Opens the webcam and records the training sequences; press "q" in the preview window to stop early.
# NOTE(review): this runs on every "Run All" and re-saves the frame files under DATA_PATH — confirm that is intended.
collect_data()


In [25]:
# 5. Preprocess Data
# ================================
def preprocess_data(test_size=0.05, random_state=None):
    """Load the saved keypoint sequences and split them into train/test sets.

    Reads ``DATA_PATH/<action>/<sequence>/<frame>.npy`` for every label in
    ``actions`` and every frame index in ``range(sequence_length)``, stacks the
    frames of each sequence into one sample and one-hot encodes the labels.

    Args:
        test_size: Forwarded to ``train_test_split`` (default 0.05, the value
            previously hard-coded).
        random_state: Forwarded to ``train_test_split``; pass an int for a
            reproducible split. ``None`` keeps the previous unseeded behaviour.

    Returns:
        The result of ``train_test_split(X, y, ...)``, unpackable as
        ``X_train, X_test, y_train, y_test``.
    """
    label_map = {label: num for num, label in enumerate(actions)}
    sequences, labels = [], []
    for action in actions:
        action_dir = os.path.join(DATA_PATH, action)
        # Only numerically named sub-folders are recordings. Stray entries such
        # as ".DS_Store" or ".ipynb_checkpoints" would otherwise break the
        # integer conversion. Sorting makes the load order deterministic.
        sequence_dirs = sorted(
            (
                entry for entry in os.listdir(action_dir)
                if entry.isdecimal() and os.path.isdir(os.path.join(action_dir, entry))
            ),
            key=int,
        )
        for sequence in sequence_dirs:
            window = [
                np.load(os.path.join(action_dir, sequence, f"{frame_num}.npy"))
                for frame_num in range(sequence_length)
            ]
            sequences.append(window)
            labels.append(label_map[action])

    X = np.array(sequences)
    # Fix the width of the one-hot encoding to the number of actions so it does
    # not shrink when the last label(s) have no recordings yet.
    y = to_categorical(labels, num_classes=len(actions)).astype(int)
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [26]:
 #6. Build Model
# ================================
def build_model(num_features=1662):
    """Build and compile the LSTM gesture classifier.

    The network is three stacked LSTM layers (64, 128, 64 units) followed by
    two dense layers (64, 32 units) and a softmax output with one unit per
    entry in ``actions``.

    Args:
        num_features: Number of values per frame. The default 1662 is the
            length of the vector produced by ``extract_keypoints``
            (33*4 + 468*3 + 21*3 + 21*3).

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, categorical accuracy metric).
    """
    model = Sequential([
        # The time dimension follows the configured sequence_length, so the
        # model stays in step with the collected data if that constant changes.
        LSTM(64, return_sequences=True, activation="relu",
             input_shape=(sequence_length, num_features)),
        LSTM(128, return_sequences=True, activation="relu"),
        LSTM(64, return_sequences=False, activation="relu"),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(actions.shape[0], activation="softmax"),
    ])
    model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["categorical_accuracy"])
    return model


In [27]:
# 7. Train Model
# ================================
def train_model(model, X_train, y_train, epochs=500, validation_data=None):
    """Fit ``model`` on the training data with logging and early stopping.

    Callbacks: TensorBoard logging to the ``Logs`` directory, early stopping
    (patience 20, best weights restored) and learning-rate halving on plateau
    (patience 10).

    Args:
        model: Compiled Keras model to train.
        X_train: Training samples.
        y_train: Training labels.
        epochs: Maximum number of epochs (default 500, the value previously
            hard-coded).
        validation_data: Optional validation data forwarded to ``model.fit``.
            When given, the callbacks monitor ``val_loss``; otherwise they
            monitor the training ``loss`` as before.

    Returns:
        The same ``model`` instance after fitting.
    """
    # Monitor validation loss when held-out data is supplied, otherwise keep
    # the original behaviour of watching training loss.
    monitor = "loss" if validation_data is None else "val_loss"
    callbacks = [
        TensorBoard(log_dir="Logs"),
        EarlyStopping(monitor=monitor, patience=20, restore_best_weights=True),
        ReduceLROnPlateau(monitor=monitor, factor=0.5, patience=10),
    ]
    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        validation_data=validation_data,
        callbacks=callbacks,
    )
    return model


In [28]:
# 8. Real-Time Prediction
# ================================
def live_prediction(model):
    """Run real-time gesture recognition on webcam device 0.

    Keeps a sliding window of the last ``sequence_length`` keypoint vectors.
    Once the window is full, every new frame is classified with ``model``. A
    gesture is appended to the on-screen sentence when the recent predictions
    (up to the last 10) all agree, its probability exceeds ``threshold`` and it
    differs from the last word shown. The sentence keeps at most 5 words.
    Press "q" in the preview window to quit.

    Args:
        model: Trained model whose ``predict`` accepts an array of shape
            ``(1, sequence_length, n_features)`` and returns one probability
            per entry in ``actions``.
    """
    sequence, sentence, predictions = [], [], []
    colors = [(245,117,16), (117,245,16), (16,117,245), (255,0,0), (0,255,255)]

    cap = cv2.VideoCapture(0)
    try:
        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    # Camera unplugged / stream ended: stop instead of passing
                    # None to OpenCV.
                    break

                image, results = mediapipe_detection(frame, holistic)
                draw_styled_landmarks(image, results)

                sequence.append(extract_keypoints(results))
                sequence = sequence[-sequence_length:]

                if len(sequence) == sequence_length:
                    # verbose=0 suppresses the per-call progress bar.
                    res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                    best = int(np.argmax(res))

                    # Only the recent history is needed; trimming keeps the
                    # list from growing for the lifetime of the session.
                    predictions.append(best)
                    predictions = predictions[-10:]

                    # Stable means every recent prediction is the same class.
                    # (np.unique(...)[0] only yields the smallest class index
                    # seen, which is not an agreement check.)
                    stable = len(set(predictions)) == 1
                    if stable and res[best] > threshold:
                        if not sentence or actions[best] != sentence[-1]:
                            sentence.append(actions[best])

                    sentence = sentence[-5:]

                    # Probability bars, one per action.
                    for num, prob in enumerate(res):
                        cv2.rectangle(image, (0,60+num*40), (int(prob*200), 90+num*40), colors[num % len(colors)], -1)
                        cv2.putText(image, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2)

                # Sentence banner across the top of the frame.
                cv2.rectangle(image, (0,0), (640, 40), (245,117,16), -1)
                cv2.putText(image, " ".join(sentence), (3,30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2)

                cv2.imshow("OpenCV Feed", image)

                if cv2.waitKey(10) & 0xFF == ord("q"):
                    break
    finally:
        # Always free the camera and close the window, even on errors.
        cap.release()
        cv2.destroyAllWindows()

In [29]:
# Run each stage exactly once: load the recorded data, build and train the
# model, then start the live webcam demo. (Repeating these four statements
# retrains from scratch and opens the demo a second time.)
X_train, X_test, y_train, y_test = preprocess_data()
model = build_model()
model = train_model(model, X_train, y_train)
live_prediction(model)


Epoch 1/500
5/5 ━━━━━━━━━━━━━━━━━━━━ 3s 48ms/step - categorical_accuracy: 0.2183 - loss: 3.1055 - learning_rate: 0.0010
Epoch 2/500
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 46ms/step - categorical_accuracy: 0.1901 - loss: 3.2476 - learning_rate: 0.0010
Epoch 3/500
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 46ms/step - categorical_accuracy: 0.2113 - loss: 1.8647 - learning_rate: 0.0010
Epoch 4/500
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 51ms/step - categorical_accuracy: 0.3239 - loss: 1.5295 - learning_rate: 0.0010
Epoch 5/500
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 45ms/step - categorical_accuracy: 0.2606 - loss: 1.5524 - learning_rate: 0.0010
Epoch 6/500
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 48ms/step - categorical_accuracy: 0.2958 - loss: 1.5715 - learning_rate: 0.0010
Epoch 7/500
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 47

In [30]:
# Persist the trained model to "sign_lang_model.h5" in the working directory.
# NOTE(review): the .h5 suffix presumably selects Keras' legacy HDF5 format — confirm whether the native ".keras" format is preferred.
model.save("sign_lang_model.h5")

