In [2]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout, Conv1D, MaxPooling1D, GRU, TimeDistributed, Flatten ,Conv2D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

import mediapipe as mp
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

# ---- Dataset location, label vocabulary and recording sizes ----

# Root directory that holds the recorded videos and their keypoint files.
DATA_PATH = 'DATA'
# DATA_PATH = 'NEW_DATA'  # alternative root: new videos/keypoints for fine-tuning the model

# Accumulated label set. A label's position in this array is the class index
# used when a prediction is decoded with ACTIONS[argmax].
ACTIONS = np.array([
    'Hi',
    'Saya Sayang Awak',
    'Makan',
    'Selamat Malam',
    'Terima Kasih',
    'Apa Khabar',
    'Awak',
    'Saya',
    'Minum',
    'Salah',
    'Betul',
    'Minta Maaf',
    'Tolong',
    'Hijau',
    'Kita',
    'Mereka',
    'Ini',
    'Itu',
    'Apa',
    'Siapa',
    'Ini Di Luar Pengetahuan Saya',
])

# Alternative label set when only the newly added actions are processed:
# ACTIONS = np.array(['Selamat Malam', 'Terima Kasih', 'Apa Khabar'])

# Label subsets kept separately for the original and the later-added actions.
OLD_ACTIONS = np.array([
    'Hi',
    'Saya Sayang Awak',
    'Makan',
])
NEW_ACTIONS = np.array([
    'Selamat Malam',
    'Terima Kasih',
    'Apa Khabar',
])

# Number of videos per action.
NO_SEQUENCES = 90
# Frames per video.
SEQUENCE_LENGTH = 90

# Mediapipe setup
# Short aliases for the MediaPipe solution modules; mp_holistic is what the
# functions below use to open their Holistic context managers.
mp_holistic = mp.solutions.holistic  # Holistic model
mp_drawing = mp.solutions.drawing_utils  # Drawing utilities

def mediapipe_detection(image, model):
    """Run ``model.process`` on a colour-converted copy of a frame.

    The frame is converted with ``cv2.COLOR_BGR2RGB``, marked read-only while
    ``model.process`` runs, and made writeable again before returning.

    Args:
        image: Frame passed to ``cv2.cvtColor`` (BGR channel order expected
            by the conversion code used).
        model: Object exposing a ``process(image)`` method.

    Returns:
        tuple: ``(converted_image, results)`` where ``converted_image`` is the
        RGB-converted frame and ``results`` is whatever ``model.process``
        returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Toggle the writeable flag around the model call only.
    rgb_frame.flags.writeable = False
    detection = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return rgb_frame, detection

def extract_keypoints(results):
    """Flatten pose and hand landmarks into a single 1-D feature vector.

    Layout of the returned array, in order:
      * pose:       33 landmarks x (x, y, z, visibility) = 132 values
      * left hand:  21 landmarks x (x, y, z)             =  63 values
      * right hand: 21 landmarks x (x, y, z)             =  63 values

    Any landmark group that is missing (falsy) on ``results`` is replaced by
    zeros of the corresponding length.

    Args:
        results: Object with ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks`` attributes; each is either falsy or has a
            ``landmark`` iterable of points.

    Returns:
        numpy.ndarray: Concatenation of the three flattened groups.
    """
    def flatten_group(group, fields, expected_points):
        # Missing detection -> zero block of the size this group normally has.
        if not group:
            return np.zeros(expected_points * len(fields))
        rows = [[getattr(point, name) for name in fields] for point in group.landmark]
        return np.array(rows).flatten()

    hand_fields = ('x', 'y', 'z')
    parts = [
        flatten_group(results.pose_landmarks, ('x', 'y', 'z', 'visibility'), 33),
        flatten_group(results.left_hand_landmarks, hand_fields, 21),
        flatten_group(results.right_hand_landmarks, hand_fields, 21),
    ]
    return np.concatenate(parts)

def extract_keypoints_from_video(video_path):
    """Extract one keypoint vector per frame of a video file.

    Every frame that can be read is passed through ``mediapipe_detection`` and
    ``extract_keypoints``; the per-frame vectors are stacked in reading order.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.

    Returns:
        numpy.ndarray: Array built from the list of per-frame keypoint
        vectors. If no frame could be read (e.g. the file could not be
        opened) the array is empty.
    """
    cap = cv2.VideoCapture(video_path)
    keypoints = []
    try:
        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    # End of stream or read failure: stop collecting frames.
                    break

                _, results = mediapipe_detection(frame, holistic)
                keypoints.append(extract_keypoints(results))
    finally:
        # Always free the capture handle, even when detection or keypoint
        # extraction raises part-way through the video.
        cap.release()

    return np.array(keypoints)

def process_videos_to_keypoints(start_sequence=0, end_sequence=None):
    """
    Process videos to extract keypoints and save as .npy files,
    keeping .npy indices aligned with video indices.

    For every action in ``ACTIONS`` the ``.mp4`` files inside
    ``DATA_PATH/<action>`` are sorted by the number embedded in their file
    name, the slice ``[start_sequence:end_sequence]`` of that sorted list is
    processed, and each result is written next to the video as
    ``<video number>.npy``.

    Args:
        start_sequence: Position (in the numerically sorted file list) of the
            first video to process for each action.
        end_sequence: Position one past the last video to process for each
            action. ``None`` means "up to the last video of that action";
            this is resolved separately for every action, so actions with
            different numbers of videos are all processed completely.
    """
    def video_index(filename):
        # Number formed by the digits of the file name without its extension;
        # names without any digit map to 0.
        stem = os.path.splitext(filename)[0]
        return int(''.join(filter(str.isdigit, stem)) or 0)

    for action in ACTIONS:
        action_path = os.path.join(DATA_PATH, action)

        # All videos of this action, ordered by their numeric index.
        video_files = sorted(
            (name for name in os.listdir(action_path) if name.endswith(".mp4")),
            key=video_index,
        )

        # end_sequence is deliberately NOT reassigned here: a None bound must
        # stay open-ended for the next action instead of being frozen to the
        # video count of the first one. Slicing with None already means
        # "through the end of the list".
        for video_file in video_files[start_sequence:end_sequence]:
            index = video_index(video_file)

            video_path = os.path.join(action_path, video_file)
            keypoints = extract_keypoints_from_video(video_path)

            # Save under the video's own number so .npy and .mp4 stay aligned.
            npy_path = os.path.join(action_path, f"{index}.npy")
            np.save(npy_path, keypoints)
            print(f"Processed and saved keypoints for Action({action}) {video_file} as {index}.npy")
            
# Model Name Declaration
# File name of the saved Keras model that this cell loads for the real-time test.
model_codename = "video_model_CNN_LSTM_v10.keras"

# Test the model in real-time
def real_time_test(model, threshold=0.45, camera_index=0):
    """Run live recognition on a camera feed and show the latest label.

    Frames are read from the camera, converted to keypoint vectors and kept in
    a sliding window of the last ``SEQUENCE_LENGTH`` frames. Once the window
    is full, every new frame triggers a prediction; when the best class
    probability exceeds ``threshold`` its label from ``ACTIONS`` becomes the
    text drawn at the bottom of the preview window. Press ``q`` in the window
    to stop.

    Args:
        model: Object with a ``predict`` method that accepts a batch made of
            one window of keypoint vectors and returns one probability vector
            per batch item.
        threshold: Probability the best class must exceed before the
            displayed label is replaced.
        camera_index: Device index handed to ``cv2.VideoCapture``.
    """
    sequence = []
    # Only the most recent accepted label is ever displayed, so a single
    # string is kept instead of an ever-growing history list.
    current_label = ''

    cap = cv2.VideoCapture(camera_index)
    try:
        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # Process frame and keep only the newest SEQUENCE_LENGTH vectors.
                _, results = mediapipe_detection(frame, holistic)
                sequence.append(extract_keypoints(results))
                sequence = sequence[-SEQUENCE_LENGTH:]

                # Predict action once the window is full.
                if len(sequence) == SEQUENCE_LENGTH:
                    res = model.predict(np.expand_dims(sequence, axis=0))[0]
                    print(res)  # raw class probabilities, useful while tuning the threshold

                    best = int(np.argmax(res))
                    if res[best] > threshold:
                        current_label = str(ACTIONS[best])

                # Display prediction on a white banner at the bottom of the frame.
                frame_height, frame_width, _ = frame.shape
                cv2.rectangle(frame, (0, frame_height - 40), (frame_width, frame_height), (255, 255, 255), -1)
                cv2.putText(frame, current_label, (10, frame_height - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (13, 13, 13), 2, cv2.LINE_AA)
                cv2.imshow("Sign Language Recognition", frame)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
    finally:
        # Free the camera and close the preview window even if prediction or
        # drawing raises, so the device is not left locked for the next run.
        cap.release()
        cv2.destroyAllWindows()

# Load the saved model named by model_codename and start the webcam demo
# (the loop inside real_time_test ends when 'q' is pressed or no frame is read).
model = tf.keras.models.load_model(model_codename)
real_time_test(model)

[6.3678954e-04 1.8761620e-04 1.0408169e-01 3.1999725e-09 2.9302348e-04
 2.3368656e-14 7.1641548e-06 9.4858450e-07 8.8490701e-01 1.1510877e-03
 3.1043237e-06 1.8248217e-11 6.0040672e-10 2.3771022e-03 1.5862769e-07
 1.2301875e-07 9.6351378e-08 6.6014296e-09 2.5464869e-12 6.3539390e-03
 7.6428783e-09]
[6.3477969e-04 1.9479786e-04 1.0674720e-01 3.1879219e-09 2.9133871e-04
 2.2754676e-14 7.2198318e-06 9.6111603e-07 8.8170236e-01 1.1945649e-03
 3.1890515e-06 1.7958142e-11 5.9380112e-10 2.4185220e-03 1.6013760e-07
 1.2015467e-07 9.9218369e-08 6.8186226e-09 2.4701690e-12 6.8047512e-03
 7.6131084e-09]
[6.0201995e-04 2.3212717e-04 1.1582631e-01 3.5093581e-09 3.1009529e-04
 2.2040004e-14 8.4353551e-06 1.1232208e-06 8.6958921e-01 1.5345332e-03
 3.8774529e-06 1.8081812e-11 6.3146433e-10 2.7623337e-03 1.9940573e-07
 1.3441174e-07 1.1402665e-07 8.6032408e-09 2.4789723e-12 9.1295484e-03
 8.7601837e-09]
[6.31629897e-04 2.73573474e-04 1.23366557e-01 3.67955444e-09
 3.31036514e-04 2.02845214e-14 8.765135

In [1]:
import tensorflow as tf
# Load the model in TensorFlow 2.16.1 environment
# NOTE(review): the export log recorded under this cell reports an input of
# shape (None, 30, 258) for this model, while SEQUENCE_LENGTH in the first cell
# is 90 -- confirm which window length applies before using it for inference.
model = tf.keras.models.load_model("video_model_TCN_v1.keras")

# Earlier attempts, kept for reference:
# Save the model in HDF5 format (more compatible with older versions)
# model.save("video_model_CNN_LSTM_v10.h5")
# model.save("model")
# Export the loaded model to the 'tcn_model' directory.
model.export('tcn_model')





INFO:tensorflow:Assets written to: tcn_model\assets


INFO:tensorflow:Assets written to: tcn_model\assets


Saved artifact at 'tcn_model'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 30, 258), dtype=tf.float32, name='input_1')
Output Type:
  TensorSpec(shape=(None, 21), dtype=tf.float32, name=None)
Captures:
  2569537641760: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2569537641056: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2569520470800: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2569537647392: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2569538735264: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2569537639120: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2569537636832: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2569537644752: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2569538735616: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2569538739664: TensorSpec(shape=(), dtype=tf.resource, name=None)


In [2]:
!pip list

Package                      Version
---------------------------- --------------
absl-py                      2.1.0
anyio                        4.7.0
argon2-cffi                  23.1.0
argon2-cffi-bindings         21.2.0
arrow                        1.3.0
asttokens                    3.0.0
astunparse                   1.6.3
async-lru                    2.0.4
attrs                        24.2.0
babel                        2.16.0
beautifulsoup4               4.12.3
bleach                       6.2.0
cachetools                   5.5.0
certifi                      2024.8.30
cffi                         1.17.1
charset-normalizer           3.4.0
colorama                     0.4.6
comm                         0.2.2
contourpy                    1.3.1
cycler                       0.12.1
debugpy                      1.8.9
decorator                    5.1.1
defusedxml                   0.7.1
exceptiongroup               1.2.2
executing                    2.1.0
fastjsonschema               2.21


[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import tensorflow as tf
# Load the model in TensorFlow 2.16.1 environment
# NOTE(review): this inspects the v9 file, whereas model_codename in the first
# cell points to video_model_CNN_LSTM_v10.keras -- confirm which one is intended.
model = tf.keras.models.load_model("video_model_CNN_LSTM_v9.keras")
# Print the layer-by-layer architecture (output shapes and parameter counts).
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_8 (Conv1D)           (None, 90, 64)            49600     
                                                                 
 max_pooling1d_8 (MaxPoolin  (None, 45, 64)            0         
 g1D)                                                            
                                                                 
 dropout_8 (Dropout)         (None, 45, 64)            0         
                                                                 
 conv1d_9 (Conv1D)           (None, 45, 128)           24704     
                                                                 
 max_pooling1d_9 (MaxPoolin  (None, 22, 128)           0         
 g1D)                                                            
                                                                 
 dropout_9 (Dropout)         (None, 22, 128)          