In [1]:
import mediapipe as mp
import cv2

from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer
from tokenizers.processors import TemplateProcessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from transformers import T5ForConditionalGeneration
from transformers import PreTrainedTokenizerFast
from transformers import Trainer, TrainingArguments

import evaluate

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation

import os
import glob
import random

2024-07-15 01:21:14.074982: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-15 01:21:14.161688: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Default number of landmark frames per sample (CustomDataset pads/truncates to this length).
MAX_FRAMES = 512
# Fixed length the tokenizer pads every encoded sentence to.
MAX_TOKENS = 512

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# landmark functions

In [4]:
# Shared MediaPipe Holistic instance (video mode: static_image_mode=False);
# it is passed to extract_keypoints by the batch-extraction loop.
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic(static_image_mode=False, min_detection_confidence=0.5)





























In [5]:
# requirements
# face
# - lips 
# [0, 267, 269, 270, 409, 291, 375, 321, 405, 314, 17, 84, 181, 91, 146, 61, 185, 40, 39, 37] outside
# [13, 312, 311, 310, 415, 308, 324, 318, 402, 317, 14, 87, 178, 88, 95, 78, 191, 80, 81, 82] inside
# - left eye 
# [386, 387, 388, 466, 263, 249, 390, 373, 374, 380, 381, 382, 362, 398, 384, 385]
# - right eye
# [159, 160, 161, 246, 33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158]
# - left eyebrow
# [336, 296, 334, 293, 300, 285, 295, 282, 283, 276]
# - right eyebrow
# [107, 66, 105, 63, 70, 55, 65, 52, 53, 46]
# - face outline
# [10, 338, 297, 332, 284, 251, 389, 356, 454, 323, 361, 288, 397, 365, 379, 378, 400, 377, 152, 148, 176, 149, 150, 136, 172, 58, 132, 93, 234, 127, 162, 21, 54, 103, 67, 109]

# pose
# [0, 8, 7]
# [11, 13, 15]
# [12, 14, 16]
# [23, 24]
# - nose
# - left ear
# - right ear
# - left shoulder
# - right shoulder
# - left elbow
# - right elbow
# - left wrist
# - right wrist
# - left hip
# - right hip

# hands
# (all)



INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [6]:
# Sanity check: number of inner-lip landmark indices.
len([13, 312, 311, 310, 415, 308, 324, 318, 402, 317, 14, 87, 178, 88, 95, 78, 191, 80, 81, 82])

20

In [7]:
# Per-frame feature width: 161 landmarks x 3 coordinates (x, y, z).
161 * 3

483

In [8]:
# Face landmark indices (into results.face_landmarks.landmark), grouped by region.
lips_outside = [0, 267, 269, 270, 409, 291, 375, 321, 405, 314, 17, 84, 181, 91, 146, 61, 185, 40, 39, 37]
lips_inside = [13, 312, 311, 310, 415, 308, 324, 318, 402, 317, 14, 87, 178, 88, 95, 78, 191, 80, 81, 82]
left_eye = [386, 387, 388, 466, 263, 249, 390, 373, 374, 380, 381, 382, 362, 398, 384, 385]
right_eye = [159, 160, 161, 246, 33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158]
left_eyebrow = [336, 296, 334, 293, 300, 285, 295, 282, 283, 276]
right_eyebrow = [107, 66, 105, 63, 70, 55, 65, 52, 53, 46]
face_outline = [10, 338, 297, 332, 284, 251, 389, 356, 454, 323, 361, 288, 397, 365, 379, 378, 400, 377, 152, 148, 176, 149, 150, 136, 172, 58, 132, 93, 234, 127, 162, 21, 54, 103, 67, 109]

# Face subset kept per frame, in this exact order.
# NOTE: `lips_outside` is defined above but is not part of the selection.
only_face = [
    *lips_inside,
    *left_eye,
    *right_eye,
    *left_eyebrow,
    *right_eyebrow,
    *face_outline,
]

In [9]:
# Pose landmark indices kept per frame: the nose, ears, shoulders, elbows,
# wrists and hips listed in the requirements notes.
only_pose = [0, 8, 7, 11, 13, 15, 12, 14, 16, 23, 24]

In [10]:
# Number of selected face landmarks (extract_keypoints pads a missing face with 108*3 zeros).
len(only_face)

108

In [11]:
# Number of selected pose landmarks.
len(only_pose)

11

In [12]:
def resize_and_pad(landmarks, original_width, original_height, target_width=854, target_height=480):
    """
    Map flattened landmarks from the source frame onto the target canvas.

    The source frame is scaled uniformly so its height equals ``target_height``
    and is then centred horizontally inside ``target_width``.

    :param landmarks: flat array laid out as x, y, z, x, y, z, ...
    :param original_width: width of the source frame
    :param original_height: height of the source frame
    :param target_width: width of the target canvas
    :param target_height: height of the target canvas
    :return: a copy of ``landmarks`` with x and y transformed; z is untouched
    """
    height_ratio = target_height / original_height

    # Width of the source frame once it has been scaled to the target height.
    scaled_width = int(original_width * height_ratio)
    # Horizontal offset that centres the scaled frame (negative when it is wider than the canvas).
    horizontal_offset = (target_width - scaled_width) // 2

    result = landmarks.copy()
    for axis in (0, 1):  # 0 -> x components, 1 -> y components
        result[axis::3] = result[axis::3] * height_ratio
    result[0::3] += horizontal_offset

    return result

In [13]:
def _flatten_landmarks(landmark_list, count, indices=None):
    """Return (flat x/y/z array of length count*3, detected flag) for one MediaPipe landmark list."""
    if landmark_list is None:
        return np.zeros(count * 3), False
    coords = np.array([[res.x, res.y, res.z] for res in landmark_list.landmark])
    if indices is not None:
        coords = coords[indices]
    return coords.flatten(), True


def extract_keypoints(video_path, mp_holistic=mp_holistic, target_width=854, target_height=480, holistic=None):
    """
    Run MediaPipe Holistic on every frame of a video and return the selected landmarks.

    Each frame becomes one flat vector: pose (``only_pose``), face (``only_face``),
    left hand, right hand, three values (x, y, z) per landmark. x and y are
    re-expressed relative to a ``target_width`` x ``target_height`` canvas onto
    which the frame is height-fitted and horizontally centred.

    A part that was not detected is stored as zeros and stays all-zero after the
    canvas transform, so downstream code can recognise it as missing.

    :param video_path: path of the video file to read
    :param mp_holistic: the ``mediapipe.solutions.holistic`` module
    :param target_width: width of the reference canvas
    :param target_height: height of the reference canvas
    :param holistic: optional Holistic instance to reuse; when omitted one is
        created for this call and closed again before returning
    :return: array of shape (num_frames, feature_dim); (0, feature_dim) when no
        frame could be read
    """
    name = os.path.basename(video_path)
    num_hand_landmarks = 21
    feature_dim = (len(only_pose) + len(only_face) + 2 * num_hand_landmarks) * 3

    owns_holistic = holistic is None
    if owns_holistic:
        holistic = mp_holistic.Holistic(static_image_mode=False, min_detection_confidence=0.5)

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Use the size of the frame that is actually processed.
            original_height, original_width = frame.shape[:2]

            results = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            components = (
                ("pose", results.pose_landmarks, len(only_pose), only_pose),
                ("face", results.face_landmarks, len(only_face), only_face),
                ("left hand", results.left_hand_landmarks, num_hand_landmarks, None),
                ("right hand", results.right_hand_landmarks, num_hand_landmarks, None),
            )

            parts = []
            detected = []
            for label, landmark_list, count, indices in components:
                values, found = _flatten_landmarks(landmark_list, count, indices)
                if not found:
                    print(name, f"No {label} landmarks")
                parts.append(values)
                detected.append(np.full(values.shape, found))

            frame_landmarks = np.concatenate(parts)
            detected_mask = np.concatenate(detected)

            # Denormalize landmarks to pixels of the source frame
            frame_landmarks[0::3] *= original_width  # x coordinates
            frame_landmarks[1::3] *= original_height  # y coordinates

            frame_landmarks = resize_and_pad(frame_landmarks, original_width, original_height, target_width, target_height)

            # Normalize landmarks to the target size
            frame_landmarks[0::3] /= target_width  # x coordinates
            frame_landmarks[1::3] /= target_height  # y coordinates

            # The horizontal padding offset would otherwise turn the x value of
            # undetected parts into a non-zero number.
            frame_landmarks[~detected_mask] = 0.0

            frames.append(frame_landmarks)
    finally:
        cap.release()
        if owns_holistic:
            holistic.close()

    return np.array(frames).reshape(-1, feature_dim)

In [14]:
def interpolate_keypoints(keypoints1, keypoints2, num_frames):
    """
    Linearly interpolate between two landmark vectors.

    :param keypoints1: start vector (returned as the first row)
    :param keypoints2: end vector (returned as the last row when num_frames >= 2)
    :param num_frames: number of rows to produce, both endpoints included
    :return: array of shape (num_frames, len(keypoints1)); a single row equal
        to ``keypoints1`` when num_frames == 1 and an empty (0, D) array when
        num_frames == 0
    """
    start = np.asarray(keypoints1)
    stop = np.asarray(keypoints2)
    # linspace handles num_frames of 0 and 1 without dividing by zero.
    return np.linspace(start, stop, num_frames)

In [15]:
def concat_keypoints(videos_keypoints, num_concat_frames=10):
    """
    Join several keypoint clips into one sequence with interpolated transitions.

    Between two consecutive clips, ``num_concat_frames`` frames are inserted
    that blend from the last frame of the first clip to the first frame of
    the next one.

    :param videos_keypoints: non-empty list of arrays shaped (num_frames, num_features)
    :param num_concat_frames: number of transition frames between two clips
    :return: one new array containing all clips and transitions in order
    """
    segments = [videos_keypoints[0]]
    for previous, current in zip(videos_keypoints, videos_keypoints[1:]):
        segments.append(interpolate_keypoints(previous[-1], current[0], num_concat_frames))
        segments.append(current)

    # Concatenate once instead of re-copying the growing result on every iteration.
    return np.concatenate(segments)

In [16]:
def convert_to_xyz(keypoints):
    """
    Unflatten per-frame landmark vectors into (x, y, z) triples.

    :param keypoints: array whose first axis is the frame axis
    :return: array of shape (num_frames, num_landmarks, 3)
    """
    num_frames = keypoints.shape[0]
    return np.reshape(keypoints, (num_frames, -1, 3))

In [17]:
def random_rotation(landmarks, max_angle=5):
    """
    Rotate all landmarks by one random angle in the x-y plane.

    The rotation is about the coordinate origin, so points at (0, 0, z) stay
    where they are; z values are never changed. Works for any number of
    landmarks as long as the data is laid out as x, y, z triples.

    :param landmarks: array of flattened (x, y, z) landmarks, e.g. (num_frames, num_landmarks * 3)
    :param max_angle: the angle is drawn uniformly from [-max_angle, max_angle] degrees
    :return: new array with the same shape as ``landmarks``
    """
    angle = np.random.uniform(-max_angle, max_angle)
    rad = np.radians(angle)
    c, s = np.cos(rad), np.sin(rad)
    rotation_matrix = np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])
    # One row per landmark, independent of how many landmarks a frame holds.
    points = landmarks.reshape(-1, 3)
    rotated = np.dot(points, rotation_matrix)
    return rotated.reshape(landmarks.shape)

def random_scale(landmarks, scale_range=(0.8, 1.1)):
    """
    Scale every frame by one random factor around that frame's own centre.

    The centre of a frame is the midpoint of the bounding box of its valid
    landmarks, where a landmark counts as valid when none of its x/y/z values
    is 0. Coordinates that are exactly 0 (missing data) are left at 0, and a
    frame without any valid landmark is returned unchanged.

    :param landmarks: (num_frames, num_landmarks * 3) array (a single flat
        frame or a (num_frames, num_landmarks, 3) array is accepted as well)
    :param scale_range: (low, high) bounds of the uniformly drawn scale factor
    :return: array of shape (num_frames, num_landmarks * 3)
    """
    scale = np.random.uniform(*scale_range)

    landmarks = np.asarray(landmarks)
    if landmarks.ndim == 3:
        frames = landmarks
    else:
        frames = landmarks.reshape(-1, landmarks.shape[-1] // 3, 3)
    features_per_frame = frames.shape[1] * 3

    scaled_frames = []
    for frame in frames:
        # Landmarks containing a zero are treated as missing and ignored for the centre.
        valid = ~np.any(frame == 0, axis=1)
        if not valid.any():
            scaled_frames.append(frame.copy())
            continue

        valid_points = frame[valid]
        center = (valid_points.min(axis=0) + valid_points.max(axis=0)) / 2

        # Move to the origin, scale, then move back to the frame's own centre.
        scaled = (frame - center) * scale + center

        # Keep missing coordinates marked as 0.
        scaled[frame == 0] = 0

        scaled_frames.append(scaled)

    return np.array(scaled_frames).reshape(-1, features_per_frame)

def random_translation(landmarks, max_translation=0.1):
    """
    Shift all landmarks by one random (dx, dy, dz) offset.

    The same offset is applied to every landmark of every frame, independent
    of how many landmarks a frame holds.
    NOTE(review): coordinates that are 0 (missing data elsewhere in this
    notebook) are shifted too and therefore stop being 0 - confirm this is intended.

    :param landmarks: array of flattened (x, y, z) landmarks
    :param max_translation: each offset component is drawn uniformly from
        [-max_translation, max_translation]
    :return: new array with the same shape as ``landmarks``
    """
    translation = np.random.uniform(-max_translation, max_translation, size=3)

    # One row per landmark so the 3-vector broadcasts over all of them.
    translated = landmarks.reshape(-1, 3) + translation

    return translated.reshape(landmarks.shape)

def add_noise(landmarks, noise_level=0.0001):
    """
    Add zero-mean Gaussian jitter to every value.

    :param landmarks: array of landmark values
    :param noise_level: standard deviation of the jitter
    :return: new array, same shape as ``landmarks``
    """
    jitter = np.random.normal(loc=0, scale=noise_level, size=landmarks.shape)
    return jitter + landmarks

def random_frame_dropout(landmarks, max_dropout_ratio=0.1):
    """
    Zero out a random subset of frames.

    The dropped frames stay in the sequence (its length does not change);
    all of their values are set to 0. The input array is not modified.

    :param landmarks: array whose first axis is the frame axis
    :param max_dropout_ratio: upper bound of the uniformly drawn fraction of
        frames to zero out
    :return: copy of ``landmarks`` with the selected frames set to 0
    """
    num_frames = landmarks.shape[0]
    num_dropout = int(num_frames * np.random.uniform(0, max_dropout_ratio))
    dropout_indices = np.random.choice(num_frames, num_dropout, replace=False)
    # Work on a copy so the caller's array is left untouched.
    dropped = landmarks.copy()
    dropped[dropout_indices] = 0
    return dropped

def augment_landmarks(landmarks):
    """
    Apply the full augmentation chain to a landmark sequence.

    Order: rotation, scale, translation, noise, frame dropout. Every step
    receives the output of the previous one.

    :param landmarks: landmark array accepted by the individual augmentations
    :return: the augmented landmarks
    """
    pipeline = (
        random_rotation,
        random_scale,
        random_translation,
        add_noise,
        random_frame_dropout,
    )
    augmented = landmarks
    for transform in pipeline:
        augmented = transform(augmented)
    return augmented

In [18]:
def forward_fill_landmarks(data):
    """
    Forward fill missing landmark data (represented by 0) with the last known value.

    Every column is filled independently. Zeros that come before the first
    non-zero value of a column have nothing to copy from and stay 0.

    :param data: numpy array of shape (num_frames, num_landmarks * 3)
    :return: new numpy array of the same shape with missing values filled
    """
    num_frames, num_features = data.shape
    if num_frames == 0:
        return np.copy(data)

    # For each cell: its own frame index when it holds a value, else 0.
    frame_index = np.arange(num_frames)[:, None]
    last_valid = np.where(data != 0, frame_index, 0)
    # A running maximum turns that into "index of the most recent valid frame".
    # Leading gaps resolve to frame 0, which is itself 0 in that case.
    np.maximum.accumulate(last_valid, axis=0, out=last_valid)

    return data[last_valid, np.arange(num_features)]

In [19]:
# Extract keypoints for two sample clips. No Holistic instance is passed, so
# extract_keypoints creates its own one for each call.
video1_keypoints = extract_keypoints("sign_language_translate/dataset/videos/โชคดี.mp4")
video2_keypoints = extract_keypoints("sign_language_translate/dataset/videos/สวัสดี.mp4")























































In [20]:
# (num_frames, feature width) of the first sample clip.
video1_keypoints.shape

(27, 483)

In [21]:
# The two frames a transition between clip 1 and clip 2 would connect.
last_frame_video1 = video1_keypoints[-1]
first_frame_video2 = video2_keypoints[0]

In [22]:
# Chain clip 1 -> clip 2 -> clip 1 with interpolated transition frames in between.
new_video = concat_keypoints([video1_keypoints, video2_keypoints, video1_keypoints])

In [23]:
# Run the augmentation chain once on a sample clip for visual inspection.
test = augment_landmarks(video1_keypoints)

In [24]:
# (num_frames, num_landmarks, 3) view used by the preview animation.
xyz = convert_to_xyz(test)

In [25]:
# # draw points in 3d space
# def update(i):
#     ax.clear()
#     ax.set_xlim(0, 854)
#     ax.set_ylim(-1, 1)
#     ax.set_zlim(0, 480)
#     ax.scatter(854 - xyz[i, :, 0] * 854, xyz[i, :, 2], 480 - xyz[i, :, 1] * 480, s=1)

# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')
# ani = animation.FuncAnimation(fig, update, frames=len(xyz), interval=1000)

# writervideo = animation.FFMpegWriter(fps=14) 
# ani.save("sign_language_translate/ani.mp4", writer=writervideo) 
# plt.close() 


In [26]:
# draw points in 2d space
def update(i):
    """Redraw frame ``i`` of the global ``xyz`` array on the global axes ``ax``."""
    ax.clear()
    ax.set_xlim(0, 854)
    ax.set_ylim(0, 480)
    # x and y are scaled to an 854x480 canvas and flipped on both axes.
    ax.scatter(854 - xyz[i, :, 0] * 854, 480 - xyz[i, :, 1] * 480, s=1)

fig = plt.figure()
ax = fig.add_subplot(111)
ani = animation.FuncAnimation(fig, update, frames=len(xyz), interval=1000)

# Render the animation to an mp4 file with the FFmpeg writer at 14 fps.
writervideo = animation.FFMpegWriter(fps=14)
ani.save("sign_language_translate/ani2d2.mp4", writer=writervideo)
plt.close()


# convert videos to landmark data and save

In [29]:
# Collect every .mp4 three directory levels below video_dataset and preview
# the first five paths together with the total count.
video_files_paths = glob.glob("sign_language_translate/dataset/video_dataset/*/*/*.mp4")
video_files_paths[:5], len(video_files_paths)

(['sign_language_translate/dataset/video_dataset/การกระทำ/กิน/กิน.mp4',
  'sign_language_translate/dataset/video_dataset/การกระทำ/ตื่น/ตื่น(1).mp4',
  'sign_language_translate/dataset/video_dataset/การกระทำ/ทะเลาะ/ทะเลาะ(1).mp4',
  'sign_language_translate/dataset/video_dataset/การกระทำ/ทำ/ทำ(1).mp4',
  'sign_language_translate/dataset/video_dataset/การกระทำ/ทำอาหาร/ทำอาหาร(1).mp4'],
 67)

In [30]:
# Display the full list of discovered video paths.
video_files_paths

['sign_language_translate/dataset/video_dataset/การกระทำ/กิน/กิน.mp4',
 'sign_language_translate/dataset/video_dataset/การกระทำ/ตื่น/ตื่น(1).mp4',
 'sign_language_translate/dataset/video_dataset/การกระทำ/ทะเลาะ/ทะเลาะ(1).mp4',
 'sign_language_translate/dataset/video_dataset/การกระทำ/ทำ/ทำ(1).mp4',
 'sign_language_translate/dataset/video_dataset/การกระทำ/ทำอาหาร/ทำอาหาร(1).mp4',
 'sign_language_translate/dataset/video_dataset/การกระทำ/นอน/นอน(1).mp4',
 'sign_language_translate/dataset/video_dataset/การกระทำ/เรียนหนังสือ/เรียนหนังสือ(1).mp4',
 'sign_language_translate/dataset/video_dataset/การกระทำ/เห็น/เห็น(1).mp4',
 'sign_language_translate/dataset/video_dataset/ความรู้สึก/คิดถึง/คิดถึง(1).mp4',
 'sign_language_translate/dataset/video_dataset/ความรู้สึก/ดีใจ/ดีใจ(1).mp4',
 'sign_language_translate/dataset/video_dataset/ความรู้สึก/รัก/รัก.mp4',
 'sign_language_translate/dataset/video_dataset/ความรู้สึก/ร้องไห้/ร้องไห้(1).mp4',
 'sign_language_translate/dataset/video_dataset/ความรู้สึก/ร

In [31]:
# Name of the directory that directly contains the video.
os.path.split(video_files_paths[1])[0].split("/")[-1]

'ตื่น'

In [96]:
# Same lookup via os.path.dirname.
os.path.dirname(video_files_paths[1]).split("/")[-1]

'ตื่น'

In [32]:
# extract keypoints from all videos and save as npy
save_path = "sign_language_translate/dataset/keypoints"
os.makedirs(save_path, exist_ok=True)

for video_path in video_files_paths:
    video_keypoints = extract_keypoints(video_path, holistic=holistic)
    # The directory that directly contains the video names the output folder;
    # basename(dirname(...)) works with either path separator.
    dir_name = os.path.basename(os.path.dirname(video_path))
    save_dir = os.path.join(save_path, dir_name)
    os.makedirs(save_dir, exist_ok=True)
    # Replace only the final extension, so ".mp4" elsewhere in the name is kept.
    file_stem = os.path.splitext(os.path.basename(video_path))[0]
    np.save(os.path.join(save_dir, file_stem + ".npy"), video_keypoints)



กิน.mp4 No left hand landmarks
กิน.mp4 No left hand landmarks
กิน.mp4 No left hand landmarks
กิน.mp4 No left hand landmarks
กิน.mp4 No left hand landmarks
กิน.mp4 No left hand landmarks
กิน.mp4 No left hand landmarks
กิน.mp4 No left hand landmarks
ทะเลาะ(1).mp4 No right hand landmarks
ทำอาหาร(1).mp4 No right hand landmarks
ทำอาหาร(1).mp4 No right hand landmarks
ทำอาหาร(1).mp4 No right hand landmarks
ทำอาหาร(1).mp4 No right hand landmarks
ทำอาหาร(1).mp4 No right hand landmarks
ทำอาหาร(1).mp4 No right hand landmarks
ทำอาหาร(1).mp4 No left hand landmarks
นอน(1).mp4 No left hand landmarks
นอน(1).mp4 No left hand landmarks
นอน(1).mp4 No left hand landmarks
นอน(1).mp4 No left hand landmarks
นอน(1).mp4 No left hand landmarks
นอน(1).mp4 No left hand landmarks
นอน(1).mp4 No left hand landmarks
นอน(1).mp4 No left hand landmarks
นอน(1).mp4 No left hand landmarks
นอน(1).mp4 No left hand landmarks
เห็น(1).mp4 No left hand landmarks
เห็น(1).mp4 No left hand landmarks
เห็น(1).mp4 No left hand landmar

In [59]:
def get_reference_scale(all_videos_npz):
    """
    Find the largest per-frame landmark bounding box across saved keypoint files.

    For every frame, the bounding box of its valid landmarks is measured (a
    landmark is valid when none of its x/y/z values is 0); the result is the
    per-axis maximum over all frames of all files. Frames without any valid
    landmark are skipped.

    :param all_videos_npz: iterable of paths to ``.npy`` files, each holding
        an array of shape (num_frames, num_landmarks * 3)
    :return: array of 3 values, the largest extent seen along x, y and z
    """
    max_scale = np.zeros(3)

    for video_npz in all_videos_npz:
        video = np.load(video_npz)

        # Reshape flattened frames to (num_frames, num_landmarks, 3)
        frames = video.reshape(-1, video.shape[-1] // 3, 3)

        for landmarks in frames:
            # Filter out landmarks with any zero value
            non_zero_landmarks = landmarks[~np.any(landmarks == 0, axis=1)]
            if non_zero_landmarks.size == 0:
                # Nothing detected in this frame: no bounding box to measure.
                continue

            size = non_zero_landmarks.max(axis=0) - non_zero_landmarks.min(axis=0)
            max_scale = np.maximum(max_scale, size)

    return max_scale

In [60]:
# Paths of all saved keypoint arrays (one folder per sign word).
videos_npz = glob.glob("sign_language_translate/dataset/keypoints/*/*.npy")

In [61]:
# Largest landmark bounding box (x, y, z extents) over the saved keypoints.
reference_scale = get_reference_scale(videos_npz)
reference_scale

array([0.2736964 , 0.68020546, 2.52361624])

In [62]:
def normalize_landmarks_scale(flattened_frames, reference_scale):
    """
    Rescale every frame so its landmark bounding box matches ``reference_scale``.

    Per frame, the bounding box of the valid landmarks (no x/y/z value equal
    to 0) is centred on (0.5, 0.5, 0.5) and stretched, independently per axis,
    to the size given by ``reference_scale``. Coordinates that are exactly 0
    (missing data) stay 0. A frame without any valid landmark is returned
    unchanged, and an axis along which the bounding box has zero extent is
    only re-centred instead of being divided by zero.

    :param flattened_frames: array of shape (num_frames, num_landmarks * 3)
    :param reference_scale: array of 3 target extents (x, y, z)
    :return: new array of shape (num_frames, num_landmarks * 3)
    """
    num_landmarks = flattened_frames.shape[-1] // 3

    # Reshape flattened frames to (num_frames, num_landmarks, 3)
    frames = flattened_frames.reshape(-1, num_landmarks, 3)
    image_center = np.array([0.5, 0.5, 0.5])

    normalized_frames = []

    for landmarks in frames:
        # Landmarks containing a zero are treated as missing.
        valid = ~np.any(landmarks == 0, axis=1)
        if not valid.any():
            normalized_frames.append(landmarks.copy())
            continue

        non_zero_landmarks = landmarks[valid]

        # Step 1: Find the centre of the bounding box of the valid landmarks
        min_coords = np.min(non_zero_landmarks, axis=0)
        max_coords = np.max(non_zero_landmarks, axis=0)
        center = (min_coords + max_coords) / 2

        # Step 2: Translate landmarks to origin
        translated_landmarks = landmarks - center

        # Step 3: Stretch the bounding box to the reference scale
        size = max_coords - min_coords
        safe_size = np.where(size == 0, 1.0, size)  # avoid 0/0 on degenerate axes
        normalized_landmarks = translated_landmarks / safe_size * reference_scale

        # Step 4: Translate to the center of the image (0.5, 0.5, 0.5)
        final_landmarks = normalized_landmarks + image_center

        # Keep missing coordinates marked as 0
        final_landmarks[landmarks == 0] = 0

        normalized_frames.append(final_landmarks)

    # Reshape back to (num_frames, num_landmarks * 3)
    return np.array(normalized_frames).reshape(-1, num_landmarks * 3)

In [311]:
# # test normalize_landmarks_scale
# video_keypoints = np.load(glob.glob("sign_language_translate/dataset/keypoints/สวัสดี/*.npy")[0])
# normalized_keypoints = normalize_landmarks_scale(video_keypoints, reference_scale)
# normalized_keypoints.shape

(19, 483)

In [312]:
# xyz = convert_to_xyz(normalized_keypoints)

In [313]:
# # draw points in 2d space
# def update(i):
#     ax.clear()
#     ax.set_xlim(0, 854)
#     ax.set_ylim(0, 480)
#     ax.scatter(854 - xyz[i, :, 0] * 854, 480 - xyz[i, :, 1] * 480, s=5)

# fig = plt.figure()
# ax = fig.add_subplot(111)
# ani = animation.FuncAnimation(fig, update, frames=len(xyz), interval=1000)

# writervideo = animation.FFMpegWriter(fps=14)
# ani.save("sign_language_translate/ani2d2.mp4", writer=writervideo)
# plt.close()


In [37]:
# normalize all videos
# NOTE: each .npy file is overwritten in place with its normalized version,
# so the un-normalized keypoints are no longer on disk after this cell ran.
all_videos_npz = glob.glob("sign_language_translate/dataset/keypoints/*/*.npy")

for video_npz in all_videos_npz:
    video = np.load(video_npz)
    normalized_video = normalize_landmarks_scale(video, reference_scale)
    
    np.save(video_npz, normalized_video)

# load dataset

In [27]:
# Sentence table: `sign` holds sign words, `thai` the target Thai sentence.
csv_dataset = pd.read_csv("sign_language_translate/TSLCall.csv")
csv_dataset

Unnamed: 0,sign,thai
0,โกรธ,ฉันโกรธ
1,ร้องไห้,ฉันร้องไห้
2,ดีใจ,ฉันดีใจ
3,สบายดี,ฉันสบายดี
4,เสียใจ,ฉันเสียใจ
...,...,...
92,เรียนหนังสือ,เรียนหนังสือ
93,กิน,กิน
94,ทะเลาะ,ทะเลาะ
95,ทหาร,ทหาร


# check videos

In [28]:
# Every whitespace-separated sign word in the table, then de-duplicated.
words = [word for text in csv_dataset["sign"].to_list() for word in text.split()]
all_words = list(set(words))

In [29]:
# check if video exists for all words
# Prints every sign word that has no keypoints folder; no output means all words are covered.
for word in all_words:
    if not os.path.exists(f"sign_language_translate/dataset/keypoints/{word}"):
        print(word)

# train tokenizer

In [318]:
# Unigram trainer with the three special tokens used by this notebook
# (presumably they receive ids 0, 1, 2 in this order - TODO confirm).
tokenizer_trainer = UnigramTrainer(special_tokens=["<pad>", "</s>", "<unk>"])

In [319]:
# Empty Unigram tokenizer; its vocabulary is learned in the next step.
tokenizer = Tokenizer(Unigram())

In [320]:
# train tokenizer
# The vocabulary is learned from the Thai target sentences only.
tokenizer.train_from_iterator(csv_dataset["thai"].to_list(), trainer=tokenizer_trainer)





In [321]:
# Pad every encoding with <pad> to MAX_TOKENS tokens.
tokenizer.enable_padding(pad_id=tokenizer.token_to_id("<pad>"), pad_token="<pad>", length=MAX_TOKENS)
# Append the </s> end-of-sequence token to each encoded sentence.
tokenizer.post_processor = TemplateProcessing(
    single="$A </s>",
    special_tokens=[
        ("</s>", tokenizer.token_to_id("</s>")),
    ],
)

In [322]:
# save tokenizer
# Written as a single JSON file that the loading cell reads back.
tokenizer.save("sign_language_translate/tokenizer-unigram.json")

# load tokenizer

In [30]:
# Wrap the saved tokenizer file in the transformers fast-tokenizer interface.
tokenizer = PreTrainedTokenizerFast(tokenizer_file="sign_language_translate/tokenizer-unigram.json")

In [31]:
# Vocabulary size; used as the default `vocab_size` of Sign2ThaiT5.
vocab_size = tokenizer.vocab_size
vocab_size

119

In [32]:
# Encode one sample word, padded to MAX_TOKENS, as a PyTorch tensor.
a = tokenizer.encode("สวัสดี", padding="max_length", max_length=MAX_TOKENS, return_tensors="pt")

In [33]:
# Shape of the encoded sample: (1, MAX_TOKENS).
a.shape

torch.Size([1, 512])

In [34]:
# Decode the ids again (special tokens included) to inspect the tokenization.
tokenizer.decode(a[0])

'ส วั ส ดี </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

# dataset loader

In [35]:
def pad_landmarks(landmarks, max_frames=512, pad_value=0):
    """
    Pad or truncate a landmark sequence to exactly ``max_frames`` frames.

    :param landmarks: tensor of shape (num_frames, num_features)
    :param max_frames: length of the returned sequence
    :param pad_value: value used for the appended frames
    :return: tuple ``(padded_seq, attention_mask)``; ``padded_seq`` has shape
        (max_frames, num_features) and ``attention_mask`` has shape
        (max_frames,) with 1 for real frames and 0 for padding
    """
    num_frames = landmarks.shape[0]
    # Create the helper tensors on the input's device so torch.cat also works
    # for tensors that do not live on the CPU.
    target_device = landmarks.device

    if num_frames < max_frames:
        missing = max_frames - num_frames
        padding = torch.full((missing, landmarks.shape[1]), pad_value, device=target_device)
        padded_seq = torch.cat([landmarks, padding], dim=0)
        attention_mask = torch.cat([
            torch.ones(num_frames, device=target_device),
            torch.zeros(missing, device=target_device),
        ])
    else:
        # Sequences longer than max_frames are cut off at the end.
        padded_seq = landmarks[:max_frames]
        attention_mask = torch.ones(max_frames, device=target_device)

    return padded_seq, attention_mask

In [36]:
class CustomDataset(Dataset):
    """
    Dataset that builds a landmark sequence and token labels for each sentence.

    For one row, the keypoint clips of all sign words are loaded, chained with
    interpolated transitions (preceded by the "นิ่ง" clip), forward filled,
    optionally augmented, and padded to ``max_frames``.

    :param dataset: DataFrame whose rows unpack to (sign, thai)
    :param tokenizer: tokenizer used to encode the Thai sentence
    :param augmentation: apply ``augment_landmarks`` when True
    :param max_frames: number of frames every sample is padded/truncated to
    :param max_tokens: number of tokens every label sequence is padded to
    :param keypoints_path: root folder with one sub-folder of ``.npy`` files per sign word
    :param num_concat_range: inclusive (min, max) range from which the number
        of transition frames is drawn per sample
    :param label_ignore_index: when set (e.g. -100, the value Hugging Face
        seq2seq models skip in the loss), padded label positions are replaced
        by it. With the default ``None`` the labels keep the pad id, so padded
        positions are scored like real tokens. Code that decodes the labels
        must map this value back to the pad id first.
    """

    def __init__(self, dataset, tokenizer, augmentation=True, max_frames=MAX_FRAMES, max_tokens=MAX_TOKENS, keypoints_path="sign_language_translate/dataset/keypoints", num_concat_range=(7, 13), label_ignore_index=None):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.augmentation = augmentation
        self.max_frames = max_frames
        self.max_tokens = max_tokens
        self.keypoints_path = keypoints_path
        self.num_concat_range = num_concat_range
        self.label_ignore_index = label_ignore_index

    def __len__(self):
        return len(self.dataset)

    def _first_keypoints_file(self, sign_word):
        """Return the first (sorted, hence deterministic) keypoint file of a sign word."""
        matches = sorted(glob.glob(os.path.join(self.keypoints_path, sign_word, "*.npy")))
        if not matches:
            raise FileNotFoundError(f"No keypoint file found for sign word {sign_word!r} in {self.keypoints_path}")
        return matches[0]

    def __getitem__(self, idx):
        sign, thai = self.dataset.iloc[idx]

        # split() without an argument ignores repeated or surrounding spaces.
        sign_words = sign.split()

        # Every sequence starts with the "นิ่ง" clip, followed by one clip per sign word.
        keypoints_files = [self._first_keypoints_file("นิ่ง")]
        keypoints_files.extend(self._first_keypoints_file(sign_word) for sign_word in sign_words)

        keypoints = [np.load(file) for file in keypoints_files]

        # concat all keypoints
        keypoints = concat_keypoints(keypoints, num_concat_frames=random.randint(*self.num_concat_range))

        # fill missing landmarks
        keypoints = forward_fill_landmarks(keypoints)

        # augment keypoints
        if self.augmentation:
            keypoints = augment_landmarks(keypoints)

        # pad keypoints
        keypoints, attention_mask = pad_landmarks(torch.tensor(keypoints, dtype=torch.float32), max_frames=self.max_frames)

        labels = self.tokenizer.encode(thai, padding="max_length", max_length=self.max_tokens, return_tensors="pt")[0]

        pad_id = self.tokenizer.pad_token_id
        if self.label_ignore_index is not None and pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, self.label_ignore_index)

        return {
            "landmarks": keypoints,
            "attention_mask": attention_mask,
            "labels": labels,
        }


In [37]:
# Training set: augmentation enabled (default), random transition length.
dataset = CustomDataset(csv_dataset, tokenizer)

In [38]:
# Evaluation set: same rows as the training set, but without augmentation
# and with a fixed transition length of 7 frames.
val_dataset = CustomDataset(csv_dataset, tokenizer, augmentation=False, num_concat_range=(7, 7))

In [39]:
# Smoke test: build every sample once so that loading errors show up before training.
for i in range(len(dataset)):
    dataset[i]


# define model

In [40]:
class Sign2ThaiT5(nn.Module):
    """
    T5 model that consumes landmark frames instead of token ids.

    Each frame vector is mapped by a bias-free linear layer to the T5 hidden
    size and handed to T5 as ``inputs_embeds``; the token embeddings are
    resized to ``vocab_size``.
    """

    def __init__(self, t5_model="t5-small", vocab_size=vocab_size, input_dim=483):
        super().__init__()
        backbone = T5ForConditionalGeneration.from_pretrained(t5_model)
        backbone.resize_token_embeddings(vocab_size)
        self.t5 = backbone

        # Frame vector (input_dim values) -> T5 hidden size.
        self.projection = nn.Linear(input_dim, backbone.config.d_model, bias=False)

    def forward(self, landmarks, attention_mask=None, decoder_input_ids=None, labels=None):
        """
        Project the landmarks and run the wrapped T5 model.

        :param landmarks: tensor of shape (batch_size, frames_len, landmarks_xyz)
        :param attention_mask: forwarded to T5 unchanged
        :param decoder_input_ids: forwarded to T5 unchanged
        :param labels: forwarded to T5 unchanged
        :return: the output object of the wrapped T5 model
        """
        frame_embeddings = self.projection(landmarks)
        return self.t5(
            inputs_embeds=frame_embeddings,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            labels=labels,
        )

    def generate(self, landmarks, attention_mask, **generate_kwargs):
        """Project the landmarks and delegate to the wrapped model's ``generate``."""
        frame_embeddings = self.projection(landmarks)
        return self.t5.generate(
            inputs_embeds=frame_embeddings,
            attention_mask=attention_mask,
            **generate_kwargs,
        )


In [41]:
# Instantiate with the defaults: "t5-small" weights, tokenizer vocab size, 483 input features.
model = Sign2ThaiT5()



In [42]:
# Move the model to the selected device.
model.to(device)

Sign2ThaiT5(
  (t5): T5ForConditionalGeneration(
    (shared): Embedding(119, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(119, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_features=2048, 

In [43]:
# Training configuration: 1000 epochs with batch size 8; a checkpoint is
# written and an evaluation is run every 100 steps, keeping at most 5
# checkpoints. Loss is logged once per epoch.
# save_safetensors=False: the checkpoint-loading cell reads a pytorch_model.bin file.
training_args = TrainingArguments(
    output_dir="sign_language_translate/results/sign2thai",
    num_train_epochs=1000,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy="epoch",
    logging_dir="./logs",
    save_strategy="steps",
    save_total_limit=5,
    save_steps=100,
    save_safetensors=False,
    evaluation_strategy="steps",
    eval_steps=100,
)

In [44]:
_METRIC_CACHE = {}


def _get_metric(name):
    """Load an ``evaluate`` metric on first use and reuse it on later evaluation rounds."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(pred):
    """
    Compute BLEU and CER between greedy (argmax) predictions and the labels.

    :param pred: prediction object with ``predictions`` (a tuple whose first
        element holds the logits) and ``label_ids``
    :return: dict with the keys "bleu" and "cer"
    """
    logits = pred.predictions[0]
    labels = np.asarray(pred.label_ids)

    predictions = logits.argmax(-1)

    # Negative label ids (such as the -100 ignore index) cannot be decoded.
    # Replace them by the pad id and blank the predictions at the same
    # positions so that ignored positions do not count as errors.
    ignored = labels < 0
    if ignored.any():
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            pad_id = tokenizer.convert_tokens_to_ids("<pad>")
        labels = np.where(ignored, pad_id, labels)
        predictions = np.where(ignored, pad_id, predictions)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    bleu_score = _get_metric("bleu").compute(predictions=decoded_preds, references=decoded_labels)["bleu"]
    cer_scores = _get_metric("cer").compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "bleu": bleu_score,
        "cer": cer_scores,
    }

In [45]:
# Trainer wiring: augmented data for training, the un-augmented copy for
# evaluation, BLEU/CER via compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [46]:
# Start training with the configuration above.
trainer.train()

Step,Training Loss,Validation Loss,Bleu,Cer
100,0.0842,0.063596,0.0,0.971516
200,0.0611,0.052096,0.0,0.995931
300,0.0432,0.037754,0.0,0.986775
400,0.0349,0.028257,0.0,0.949135
500,0.0312,0.025975,0.0,0.872838
600,0.0269,0.023359,0.0,0.754832
700,0.0252,0.021204,0.093781,0.701933
800,0.0227,0.01947,0.108527,0.694812
900,0.0216,0.017866,0.157637,0.62767
1000,0.0191,0.016047,0.224327,0.636826


TrainOutput(global_step=13000, training_loss=0.022626830858106798, metrics={'train_runtime': 4895.5357, 'train_samples_per_second': 19.814, 'train_steps_per_second': 2.655, 'total_flos': 0.0, 'train_loss': 0.022626830858106798, 'epoch': 1000.0})

In [57]:
# Reload the weights stored in checkpoint-13000. NOTE: the checkpoint is picked
# by its step number, not by an evaluation metric.
# map_location lets the file load on machines without the device it was saved from.
model.load_state_dict(torch.load("sign_language_translate/results/sign2thai/checkpoint-13000/pytorch_model.bin", map_location=device))

<All keys matched successfully>

In [48]:
# Run the model over the evaluation set and show the raw prediction output.
trainer.predict(val_dataset)

PredictionOutput(predictions=(array([[[-10.192749 ,  -5.18979  ,  -6.994374 , ...,  -5.0817714,
          -7.0375032,  -6.000517 ],
        [-27.106426 ,  -1.7173628,  -8.824453 , ...,  -9.862619 ,
          -8.8777275, -10.221213 ],
        [-14.948016 ,  -1.5237603,  -7.911169 , ...,  -6.5466714,
          -4.371349 ,  -7.2288237],
        ...,
        [ 41.849007 ,   1.6778526,   3.787861 , ...,  -2.6688786,
           4.199767 ,  -3.3840299],
        [ 41.84768  ,   1.6777966,   3.7878034, ...,  -2.6689324,
           4.199689 ,  -3.3840656],
        [ 41.846375 ,   1.6777432,   3.7877479, ...,  -2.668987 ,
           4.199614 ,  -3.3840966]],

       [[-10.802139 ,  -5.5183573,  -7.227139 , ...,  -5.096293 ,
          -7.3213387,  -6.1159916],
        [-25.905344 ,  -6.1369066, -10.088974 , ...,  -6.0714126,
         -10.764919 ,  -9.126593 ],
        [-24.488842 ,  -6.044885 ,  -9.21102  , ..., -12.1679535,
         -15.132885 ,  -8.559867 ],
        ...,
        [ 42.61711  ,   

In [69]:
# Extract keypoints from a separately recorded test video.
e = extract_keypoints("sign_language_translate/ฉัน เรียนหนังสือ.MOV")





























ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No left hand landmarks
ฉัน เรียนหนังสือ.MOV No left hand landmarks
ฉัน เรียนหนังสือ.MOV No left hand landmarks
ฉัน เรียนหนังสือ.MOV No left hand landmarks
ฉัน เรียนหนังสือ.MOV No left hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No left hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No left hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.MOV No left hand landmarks
ฉัน เรียนหนังสือ.MOV No right hand landmarks
ฉัน เรียนหนังสือ.M

In [70]:
# Apply the same bounding-box normalization that was applied to the saved keypoints.
norm = normalize_landmarks_scale(e, reference_scale)

In [71]:
# Fill zero (missing) values with the last known value of the same coordinate.
fill = forward_fill_landmarks(norm)

In [72]:
# (num_frames, num_landmarks, 3) view for the preview animation below.
xyz = convert_to_xyz(fill)

In [73]:
# draw points in 2d space
def update(i):
    """Redraw frame ``i`` of the global ``xyz`` array on the global axes ``ax``."""
    ax.clear()
    ax.set_xlim(0, 854)
    ax.set_ylim(0, 480)
    # x and y are scaled to an 854x480 canvas and flipped on both axes.
    ax.scatter(854 - xyz[i, :, 0] * 854, 480 - xyz[i, :, 1] * 480, s=1)

fig = plt.figure()
ax = fig.add_subplot(111)
ani = animation.FuncAnimation(fig, update, frames=len(xyz), interval=1000)

# Render the animation to an mp4 file with the FFmpeg writer at 14 fps.
writervideo = animation.FFMpegWriter(fps=14)
ani.save("sign_language_translate/ani2d2.mp4", writer=writervideo)
plt.close()


In [65]:
# Tuple of (padded landmarks, attention mask), both 512 frames long.
e_pad = pad_landmarks(torch.tensor(fill, dtype=torch.float32), max_frames=512)

In [66]:
# Inference: eval mode switches dropout off, and no_grad avoids building an
# autograd graph for the projection layer that runs before T5's generate.
model.eval()
with torch.no_grad():
    outp = model.generate(e_pad[0].unsqueeze(0).to(device), e_pad[1].unsqueeze(0).to(device), max_length=512)

In [67]:
# Generated token ids.
outp

tensor([[ 0, 69, 52, 21,  1]], device='cuda:0')

In [68]:
# Turn the generated ids into text, dropping special tokens.
tokenizer.decode(outp[0], skip_special_tokens=True)

'วันนี้อากาศ ร้อ น'

torch.Size([512, 483])