# Notebook Explanation and Important Link
This notebook follows the same data format as Aditya's, so you should be able to use the data by simply changing the directory or link, if necessary. <br>

There are 3 main changes that have been made:
1.   Instead of using B's videos (which were highly optimized for CNNs), Wiame's videos are used. They keep the original resolution but have a standardized length (113 frames). If the sequence still seems too long, you can simply keep 1 frame out of every 2 or 3 to reduce it.
2.   Face Landmark Removal. Previously, in addition to Pose (33 landmarks) and Hands (21x2 landmarks), the Face model with 468 landmarks was included. However, due to the imbalance in the number of landmarks and the limited contribution of facial data to sign language recognition, it was removed.
3.   When the pose or a hand is not detected, the previously detected coordinates are used as a filler instead of a zero array, to maintain continuity.

Data Format<br>
The output numpy array has the shape (113, 75, 3):<br>
113 = Frame count <br>
75 = Key points (0-32 pose, 33-53 left hand, 54-74 right hand). You can select specific hand indices if needed.<br>
3 = Coordinates (x,y,z).

Links:
1.   [All data for and from this notebook, drive](https://drive.google.com/drive/folders/1rTRZxMkvAyf805AuPoVvrfw8KnB3Ttod?usp=share_link)
2.   [Aditya original notebook, slack post](https://omdenaindones-9mu9399.slack.com/archives/C07MH4C0YLF/p1732443924936359)
3.   [Wiame processed videos, slack post](https://omdenaindones-9mu9399.slack.com/archives/C07N05MQNCC/p1732105984337299)




# Future Improvement

1.   **Landmark-Level Augmentation.** Similar to video augmentation, but applied only to the coordinates. This includes mirroring, scaling, rotation, and adding noise.
2.   **Model Result Comparison (Zero vs. Non-Zero).** A reference for future extraction, comparing results when using zero-filled coordinates versus using previously detected coordinates.
3.   **Specific Hand and Pose Detection (vs. Holistic).** Focusing on specific hands and poses rather than holistic detection could allow for more flexible parameters, improving extraction performance and reducing landmark extraction duration.
4.   **Confidence Parameter Adjustment.** Instead of using the default confidence threshold of 0.5, adjust it depending on the hand detection frequency. Lower it if hands are often not detected, or increase it for more precise results.
5. **GPU Version?**




# Install and Import Dependencies

In [None]:
!pip install -q mediapipe

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.1/36.1 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import cv2
import time
import torch
import numpy as np
import mediapipe as mp
from matplotlib import pyplot as plt

In [None]:
from google.colab import drive

# Attach Google Drive at the location that the DATA_DIR / SAVE_DIR paths
# used later in this notebook start with.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Extract and Save Keypoints

## Non-Zero Extraction

When the pose or hand is not detected, instead of using [0, 0, 0] to fill the coordinates, the previously detected coordinates are used. This way, the continuity of movement is preserved.

In [None]:
# Create the MediaPipe Holistic solution object, with static_image_mode
# disabled and both confidence thresholds set to 0.5.
mp_holistic = mp.solutions.holistic
_holistic_options = dict(
    static_image_mode=False,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)
holistic = mp_holistic.Holistic(**_holistic_options)

def _landmarks_or(landmark_list, fallback):
    """Return the x/y/z rows of ``landmark_list``, or ``fallback`` when it is missing."""
    if not landmark_list:
        return fallback
    return np.array([[lm.x, lm.y, lm.z] for lm in landmark_list.landmark])


def extract_keypoints(video_path, expected_shape=(113, 75, 3),
                      min_detection_confidence=0.5,
                      min_tracking_confidence=0.5):
    """Extract pose and hand keypoints from a video, carrying detections forward.

    For every frame the pose, left-hand and right-hand landmarks are stacked
    (in that order) into one array. When a part is not detected in a frame,
    the most recent detection of that part is reused; frames that precede
    the first detection of a part keep the all-zero initial filler.

    Args:
        video_path: Path of the video file to read with OpenCV.
        expected_shape: Shape the stacked result must have. Pass ``None`` to
            accept any number of frames. Defaults to ``(113, 75, 3)``.
        min_detection_confidence: Forwarded to ``Holistic``.
        min_tracking_confidence: Forwarded to ``Holistic``.

    Returns:
        Array of shape ``(num_frames, total_keypoints, 3)``, or ``None``
        (after printing a message) when the shape differs from
        ``expected_shape`` - which includes videos that cannot be read.
    """
    # Fillers used until each part has been detected at least once.
    last_pose = np.zeros((33, 3))
    last_left = np.zeros((21, 3))
    last_right = np.zeros((21, 3))

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        # A fresh tracker per video: with static_image_mode=False MediaPipe
        # tracks landmarks from frame to frame (see the MediaPipe Holistic
        # docs), so a shared instance would start each video with state left
        # over from the previous one and make results depend on file order.
        with mp.solutions.holistic.Holistic(
                static_image_mode=False,
                min_detection_confidence=min_detection_confidence,
                min_tracking_confidence=min_tracking_confidence) as tracker:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                results = tracker.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                # Each call either yields the new detection or keeps the old one.
                last_pose = _landmarks_or(results.pose_landmarks, last_pose)
                last_left = _landmarks_or(results.left_hand_landmarks, last_left)
                last_right = _landmarks_or(results.right_hand_landmarks, last_right)

                frames.append(np.concatenate([last_pose, last_left, last_right]))
    finally:
        # Release the capture even if decoding or inference raised.
        cap.release()

    keypoints_sequence = np.array(frames)
    if expected_shape is not None and keypoints_sequence.shape != tuple(expected_shape):
        print("abort mission, wrong shape")
        return None

    return keypoints_sequence

In [None]:
# Walk DATA_DIR/<word>/<video> and write one keypoint array per video to
# SAVE_DIR/<word>/<video stem>.npy, skipping videos already converted.
DATA_DIR = '/content/drive/MyDrive/Omdena/sign_language_recognition/processed_videos_1'
SAVE_DIR = '/content/drive/MyDrive/Omdena/sign_language_recognition/landmark_non_zero'

os.makedirs(SAVE_DIR, exist_ok=True)

for word in os.listdir(DATA_DIR):
    word_dir = os.path.join(DATA_DIR, word)
    # Ignore stray files sitting next to the per-word folders.
    if not os.path.isdir(word_dir):
        continue
    save_word_dir = os.path.join(SAVE_DIR, word)
    os.makedirs(save_word_dir, exist_ok=True)

    for video_file in os.listdir(word_dir):
        # Swap only the real extension for '.npy'.
        stem, _ = os.path.splitext(video_file)
        save_path = os.path.join(save_word_dir, stem + '.npy')

        # Skip if the keypoints file already exists
        if os.path.exists(save_path):
            continue

        print("Processing" , word, video_file)
        video_path = os.path.join(word_dir, video_file)
        keypoints = extract_keypoints(video_path)
        if keypoints is None:
            # Do not write a file for a failed extraction: np.save would store
            # a pickled None and the exists-check above would then keep it.
            print("Skipping", word, video_file, "- no keypoints extracted")
            continue
        np.save(save_path, keypoints)  # Save as .npy

Processing lihat 1_processed.mp4
Processing lihat kenji_processed.mp4
Processing lihat dina_lihat_processed.mp4
Processing lihat Anastasiia_processed.mp4
Processing lihat nadya_processed.mp4
Processing lihat Arun_processed.mp4
Processing lihat Deepa_processed.mp4
Processing lihat shikha_processed.mp4
Processing ibu 1_processed.mp4
Processing ibu kenji_processed.mp4
Processing ibu Roshan_Thapa_processed.mp4
Processing ibu louis_10_processed.mp4
Processing ibu Viorelia_processed.mp4
Processing ibu vaishnavi_desai_processed.mp4
Processing ibu mahsa_processed.mp4
Processing ibu Bhavitha_Bojja_processed.mp4
Processing ibu kenzo_10_processed.mp4
Processing selamat 1_processed.mp4
Processing selamat kenji_processed.mp4
Processing selamat louis_7_processed.mp4
Processing selamat Marouf_selemat_processed.mp4
Processing selamat thant_htoo_san_processed.mp4
Processing selamat selamat_processed.mp4
Processing selamat Rebecca_processed.mp4
Processing selamat kenzo_18_processed.mp4
Processing guru 1

Note: A full run takes about 46 minutes.

## Zero-Fill Extraction

When the pose or hand is not detected, [0, 0, 0] is used as a placeholder to maintain the shape. However, during modeling, it might be better to mask the filler coordinates for improved performance.

In [None]:
# Create the MediaPipe Holistic solution object, with static_image_mode
# disabled and both confidence thresholds set to 0.5.
mp_holistic = mp.solutions.holistic
_holistic_options = dict(
    static_image_mode=False,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)
holistic = mp_holistic.Holistic(**_holistic_options)

def _landmarks_or(landmark_list, fallback):
    """Return the x/y/z rows of ``landmark_list``, or ``fallback`` when it is missing."""
    if not landmark_list:
        return fallback
    return np.array([[lm.x, lm.y, lm.z] for lm in landmark_list.landmark])


def extract_keypoints(video_path, expected_shape=(113, 75, 3),
                      min_detection_confidence=0.5,
                      min_tracking_confidence=0.5):
    """Extract pose and hand keypoints from a video, zero-filling missing parts.

    For every frame the pose, left-hand and right-hand landmarks are stacked
    (in that order) into one array. A part that is not detected in a frame is
    represented by an all-zero block of the same size.

    Args:
        video_path: Path of the video file to read with OpenCV.
        expected_shape: Shape the stacked result must have. Pass ``None`` to
            accept any number of frames. Defaults to ``(113, 75, 3)``.
        min_detection_confidence: Forwarded to ``Holistic``.
        min_tracking_confidence: Forwarded to ``Holistic``.

    Returns:
        Array of shape ``(num_frames, total_keypoints, 3)``, or ``None``
        (after printing a message) when the shape differs from
        ``expected_shape`` - which includes videos that cannot be read.
    """
    zero_pose = np.zeros((33, 3))  # filler for the 33 pose landmarks
    zero_hand = np.zeros((21, 3))  # filler for the 21 landmarks of either hand

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        # A fresh tracker per video: with static_image_mode=False MediaPipe
        # tracks landmarks from frame to frame (see the MediaPipe Holistic
        # docs), so a shared instance would start each video with state left
        # over from the previous one and make results depend on file order.
        with mp.solutions.holistic.Holistic(
                static_image_mode=False,
                min_detection_confidence=min_detection_confidence,
                min_tracking_confidence=min_tracking_confidence) as tracker:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                results = tracker.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                pose = _landmarks_or(results.pose_landmarks, zero_pose)
                left_hand = _landmarks_or(results.left_hand_landmarks, zero_hand)
                right_hand = _landmarks_or(results.right_hand_landmarks, zero_hand)

                # np.concatenate copies, so the shared zero fillers are never aliased.
                frames.append(np.concatenate([pose, left_hand, right_hand]))
    finally:
        # Release the capture even if decoding or inference raised.
        cap.release()

    keypoints_sequence = np.array(frames)
    if expected_shape is not None and keypoints_sequence.shape != tuple(expected_shape):
        print("abort mission, wrong shape")
        return None
    return keypoints_sequence

In [None]:
# Walk DATA_DIR/<word>/<video> and write one keypoint array per video to
# SAVE_DIR/<word>/<video stem>.npy.
DATA_DIR = '/content/drive/MyDrive/Omdena/sign_language_recognition/processed_videos_1'
SAVE_DIR = '/content/drive/MyDrive/Omdena/sign_language_recognition/landmark_with_zero'

# Set to True to resume an interrupted run instead of re-extracting everything.
SKIP_EXISTING = False

os.makedirs(SAVE_DIR, exist_ok=True)

for word in os.listdir(DATA_DIR):
    word_dir = os.path.join(DATA_DIR, word)
    # Ignore stray files sitting next to the per-word folders.
    if not os.path.isdir(word_dir):
        continue
    save_word_dir = os.path.join(SAVE_DIR, word)
    os.makedirs(save_word_dir, exist_ok=True)

    for video_file in os.listdir(word_dir):
        # Swap only the real extension for '.npy'.
        stem, _ = os.path.splitext(video_file)
        save_path = os.path.join(save_word_dir, stem + '.npy')

        if SKIP_EXISTING and os.path.exists(save_path):
            continue

        print("Processing" , word, video_file)
        video_path = os.path.join(word_dir, video_file)
        keypoints = extract_keypoints(video_path)
        if keypoints is None:
            # Do not write a file for a failed extraction: np.save would
            # store a pickled None instead of a keypoint array.
            print("Skipping", word, video_file, "- no keypoints extracted")
            continue
        np.save(save_path, keypoints)  # Save as .npy

Processing lihat 1_processed.mp4
Processing lihat kenji_processed.mp4
Processing lihat dina_lihat_processed.mp4
Processing lihat Anastasiia_processed.mp4
Processing lihat nadya_processed.mp4
Processing lihat Arun_processed.mp4
Processing lihat Deepa_processed.mp4
Processing lihat shikha_processed.mp4
Processing ibu 1_processed.mp4
Processing ibu kenji_processed.mp4
Processing ibu Roshan_Thapa_processed.mp4
Processing ibu louis_10_processed.mp4
Processing ibu Viorelia_processed.mp4
Processing ibu vaishnavi_desai_processed.mp4
Processing ibu mahsa_processed.mp4
Processing ibu Bhavitha_Bojja_processed.mp4
Processing ibu kenzo_10_processed.mp4
Processing selamat 1_processed.mp4
Processing selamat kenji_processed.mp4
Processing selamat louis_7_processed.mp4
Processing selamat Marouf_selemat_processed.mp4
Processing selamat thant_htoo_san_processed.mp4
Processing selamat selamat_processed.mp4
Processing selamat Rebecca_processed.mp4
Processing selamat kenzo_18_processed.mp4
Processing guru 1

Anything below has not been tested or run yet; it's just for future improvement or potential needs.

# Landmark Level Augmentation


# Push to Dagshub



In [2]:
# Install the DagsHub python client
!pip install -q dagshub

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/252.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m245.8/252.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.2/252.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.2/203.2 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.2/83.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
from dagshub.notebook import save_notebook

# Target repository, folder inside it, and branch passed to save_notebook
# (presumably where this notebook gets uploaded - confirm in the DagsHub docs).
save_notebook(
    repo="Omdena/JakartaIndonesia_SignLanguageTranslation",
    path="preprocessing",
    branch="kenji",
)

# Split Train and Test Dataset

I haven't run this cell yet, so no train/test split folders have been created.

In [None]:
import shutil

from sklearn.model_selection import train_test_split

# Copy every DATA_DIR/<word>/*.npy file into SAVE_SPLIT_DIR/train/<word> or
# SAVE_SPLIT_DIR/test/<word>, splitting each word class independently.
DATA_DIR = '/content/drive/MyDrive/Sign Language/datasets/extracted-keypoints'
SAVE_SPLIT_DIR = '/content/drive/MyDrive/Sign Language/datasets'

# Create directories to save the split data
os.makedirs(SAVE_SPLIT_DIR, exist_ok=True)
os.makedirs(os.path.join(SAVE_SPLIT_DIR, 'train'), exist_ok=True)
os.makedirs(os.path.join(SAVE_SPLIT_DIR, 'test'), exist_ok=True)

# Parameters
test_size = 0.1  # 10% of data for testing

# Process each word class
for word in os.listdir(DATA_DIR):
    word_dir = os.path.join(DATA_DIR, word)
    # Ignore stray files sitting next to the per-word folders.
    if not os.path.isdir(word_dir):
        continue

    # os.listdir returns entries in arbitrary order; sort so that the fixed
    # random_state really yields the same split on every run. Otherwise a
    # re-run could place a file in both train and test.
    videos = sorted(
        os.path.join(word_dir, file)
        for file in os.listdir(word_dir)
        if file.endswith('.npy')
    )

    if len(videos) < 2:
        # train_test_split cannot split fewer than two samples; keep whatever
        # exists for training rather than aborting the whole run.
        print("Not enough samples to split", word, "- keeping all in train")
        train_videos, test_videos = videos, []
    else:
        # Split data into train and test sets
        train_videos, test_videos = train_test_split(
            videos, test_size=test_size, random_state=42)

    for split_name, split_videos in (('train', train_videos), ('test', test_videos)):
        split_dir = os.path.join(SAVE_SPLIT_DIR, split_name, word)
        os.makedirs(split_dir, exist_ok=True)
        for source_path in split_videos:
            # Plain file copy: no need to decode and re-encode the array.
            shutil.copyfile(
                source_path,
                os.path.join(split_dir, os.path.basename(source_path)))