In [9]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from transformers import T5ForConditionalGeneration
from transformers import PreTrainedTokenizerFast
from transformers import Trainer, TrainingArguments

import numpy as np
import pandas as pd


In [11]:
MAX_INPUT = 512
MAX_TARGET = 512

In [30]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# load dataset

In [53]:
csv_dataset = pd.read_csv("sign_language_translate/test.csv")
csv_dataset

Unnamed: 0,sign,thai
0,สวัสดี โชคดี,สวัสดีครับ
1,โชคดี สวัสดี,โชคดีครับ


# make sign tokenizer

In [6]:
tokenizer_trainer = WordLevelTrainer(special_tokens=["<pad>", "</s>", "<unk>"])

In [7]:
sign_tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))

In [10]:
sign_tokenizer.pre_tokenizer = Whitespace()

In [13]:
# train tokenizer
sign_tokenizer.train_from_iterator(csv_dataset["sign"].to_list(), trainer=tokenizer_trainer)

In [16]:
sign_tokenizer.enable_padding(pad_id=sign_tokenizer.token_to_id("<pad>"), pad_token="<pad>", length=MAX_TARGET)
sign_tokenizer.post_processor = TemplateProcessing(
    single="$A </s>",
    special_tokens=[
        ("</s>", sign_tokenizer.token_to_id("</s>")),
    ],
)

In [18]:
sign_tokenizer.save("sign_language_translate/sign_tokenizer.json")

# load sign tokenizer

In [19]:
sign_tokenizer = PreTrainedTokenizerFast(tokenizer_file="sign_language_translate/sign_tokenizer.json")

In [20]:
sign_vocab_size = sign_tokenizer.vocab_size
sign_vocab_size

5

In [24]:
sign_tokenizer.pad_token_id, sign_tokenizer.eos_token_id

(0, None)

In [21]:
a = sign_tokenizer.encode("สวัสดี", padding="max_length", max_length=MAX_TARGET, return_tensors="pt")

# load tokenizer (from sign2thai)

In [27]:
tokenizer = PreTrainedTokenizerFast(tokenizer_file="sign_language_translate/tokenizer-unigram.json")

In [46]:
tokenizer("สวัสดี", padding="max_length", max_length=MAX_INPUT, return_tensors="pt")

{'input_ids': tensor([[ 9, 12,  4,  9,  5,  6,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  

In [28]:
vocab_size = tokenizer.vocab_size
vocab_size

13

# dataset loader

In [57]:
class CustomDataset(Dataset):
    def __init__(self, csv_dataset):
        self.data = csv_dataset
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sign, thai = self.data.iloc[idx]

        tokenized_thai = tokenizer(thai, padding="max_length", max_length=MAX_INPUT, return_tensors="pt")

        thai_input, thai_attention_mask = tokenized_thai["input_ids"].squeeze(), tokenized_thai["attention_mask"].squeeze()

        sign = sign_tokenizer.encode(sign, padding="max_length", max_length=MAX_TARGET, return_tensors="pt")

        return {
            "input_ids": thai_input,
            "attention_mask": thai_attention_mask,
            "labels": sign,
        }

In [55]:
dataset = CustomDataset(csv_dataset)

In [56]:
dataset[0]

สวัสดี โชคดี สวัสดีครับ


{'input_ids': tensor([ 9, 12,  4,  9,  5,  6,  3,  7,  4,  8,  1,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0

# define model

In [29]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.resize_token_embeddings(vocab_size)
model.decoder.embed_tokens = nn.Embedding(sign_vocab_size, model.config.d_model)
model.lm_head = nn.Linear(model.config.d_model, sign_vocab_size, bias=False)

In [31]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(13, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(13, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dropout(p=

In [5]:
model.config.decoder_start_token_id

0

In [None]:
training_args = TrainingArguments(
    output_dir="sign_language_translate/results/thai2sign",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()