In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import string
import h5py
import torch.nn.functional as F
import string
import re
import sys
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from dataclasses import dataclass

from datetime import datetime

## Load dataset

## Dataset and Data Loader

## Model

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [3]:
df=pd.read_csv("how2sign.csv",sep="\t") # Load how2Sign dataset
df.tail()
asl_df=pd.read_csv("ASLens-landmarks.csv") # Load ASLens Landmarks dataset
asl_df.drop(121,inplace=True)
asl_df.index = np.arange(0,len(asl_df))
#asl_df.drop("Unnamed: 0", axis=1, inplace=True)
asl_df.head()

Unnamed: 0,file_name,landmarks,frames,sentences
0,FzmL8SL6Bow,FzmL8SL6Bow.h5,1196,4
1,FZrU_mEryAs,FZrU_mEryAs.h5,1213,7
2,-g45vqccdzI,-g45vqccdzI.h5,1332,10
3,FzUdcaxw_vs,FzUdcaxw_vs.h5,1826,19
4,-g0iPSnQt6w,-g0iPSnQt6w.h5,1657,17


In [4]:
import json
# Load character and word vocabulary
with open('char_vocab.json') as json_file:
    vocabulary = json.load(json_file)

with open('word_vocab.json') as json_file:
    word_vocabulary = json.load(json_file)

In [5]:
import string
accepted_text= string.ascii_lowercase+ string.digits + '!?.,\"()&+-/@%–' + ":" + " " + "\n"
chars = [x for x in accepted_text]
print(chars)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '?', '.', ',', '"', '(', ')', '&', '+', '-', '/', '@', '%', '–', ':', ' ', '\n']


In [6]:
class Tokenizer:
  def __init__(self,char_vocab,word_vocab,chars):
    self.char_vocabulary = char_vocab
    self.word_vocabulary = word_vocab

    self.chars = chars

  def __call__(self,text,label=False):
    char_tokens = []
    word_tokens = []
    replacements = str.maketrans({
    'İ': 'I',
    'ç': 'c',
    'ş': 's',
    'ü': 'u',
    'ö': 'o',
    'ğ': 'g',
    'ı': 'i'
    })
    text = text.translate(replacements) # Replace special characters
    for char in text.lower():
      if char not in "#*":
        if char not in self.char_vocabulary.keys():
          char_tokens.append(self.char_vocabulary["<UNK>"]) # Add unknown token if character is not in vocabullary
        else:
          char_tokens.append(self.char_vocabulary[char])
      else:
        if char == "#":
          char_tokens.append(self.char_vocabulary["<SOS>"]) # Add Start of the sentence token 
        else:
          char_tokens.append(self.char_vocabulary["<EOS>"])  # Add End of the sentence token 

    if label:
      return torch.tensor(char_tokens)

    # Word tokenization
    for word in text.lower().split(" "): #
      k=word.strip().lower().replace('"','').replace("\n",' ') # Replace special characters
      k=re.sub(r'[.,()]', '', k)
      if k not in self.word_vocabulary.keys():
        word_tokens.append(self.char_vocabulary["<UNK>"]) # Add unknown token if word is not in vocabullary
        continue
      word_tokens.append(self.word_vocabulary[k]) # add word token to tokens
    if word_tokens[0]==1: 
      word_tokens = word_tokens[1:]
    word_tokens = torch.tensor(word_tokens) # Convert to tensor
    padded_sequences =F.pad(word_tokens, (90-len(word_tokens),0), "constant", self.word_vocabulary["<PAD>"]) # Pad word_tokens to match char_tokens size
    return torch.tensor(char_tokens),padded_sequences




In [7]:
tokenizer = Tokenizer(vocabulary,word_vocabulary,chars) # Get TOkenizer


In [8]:
class ASLensDataset(Dataset):
  def __init__(self, df, asl_df, tokenizer=None, seq_len=90):
    self.tokenizer=tokenizer
    self.df=df
    self.asl_df=asl_df
    self.seq_len=seq_len
  def __len__(self):
    return self.asl_df['sentences'].sum()

  def extract_number(self,sentence_id):
    # Extract the numeric part after the last underscore
    match = re.search(r'_(\d+)$', sentence_id)
    return int(match.group(1)) if match else 0

  def extractFrames(self,ex,index):
    fName=self.asl_df["landmarks"][index] # Get file name
    file=h5py.File(f"landmarks/{fName}") # Load h5 file
    # Determine start and end frame
    start_frame=int(ex["START_REALIGNED"]*15) 
    end_frame=int(ex["END_REALIGNED"]*15)
    
    # Extract each component of the landmarks and concatenate 
    hand_left=file["handLeft"][start_frame:end_frame]
    hand_right=file["handRight"][start_frame:end_frame]
    face_lips=file["faceLips"][start_frame:end_frame]
    face_oval=file["faceOval"][start_frame:end_frame]
    file.close()
    x=np.concatenate([hand_left,hand_right,face_lips,face_oval],axis=1)
    
    return torch.tensor(x)

  def __getitem__ (self, idx):
    # Determine valid dataframe index from idx
    index=0
    while idx>self.asl_df['sentences'][index]-1: 
      idx-=self.asl_df['sentences'][index]
      index+=1
    file_name=self.asl_df["file_name"][index]
    
    # Remove sufficient - from file_name
    if file_name[-1]=="-":
      file_name=file_name[:-1]
    
    # Get values dataframe
    sent=df[df['VIDEO_ID']==file_name]
    sent = sent.copy()  # Explicit copy
    sent['SENTENCE_NUM'] = sent['SENTENCE_ID'].apply(self.extract_number)
    sent = sent.sort_values(["VIDEO_NAME","SENTENCE_NUM"])
    
    # Remove duplicates if there are any
    if len(sent[sent.duplicated("SENTENCE_ID")])>0:
      first = sent.drop_duplicates(subset="SENTENCE_NUM",keep="first")
      last = sent.drop_duplicates(subset="SENTENCE_NUM",keep="last")
      numOfFrames=self.asl_df["frames"][index]
      if numOfFrames>first["END_REALIGNED"].max()*15:
        sent = first
      else:
        sent = last

    
    ex=sent.iloc[idx] # Get data from asl_df dataframe
    
    frames=self.extractFrames(ex,index) # Extract franes
    frames=frames.type(torch.float32) # Convert to tensor
    text=ex["SENTENCE"] # Get text
    x2=[]
    ys=[]
    for idx in range(1,len(text)+1): # For each character in text
        x = "#"+text[:idx]# <SOS> + text to index
        y = text[:idx+1] # Text with one step ahead
        
        if idx==len(text): # If we are at the end of the sentence add <EOS> token
            y+="*"
        if self.tokenizer:
            x = self.tokenizer(x) # Tokenize x 
            y = self.tokenizer(y,label=True) # Tokenize y
        x2.append(x)
        ys.append(y)
            
    return (frames,x2),ys

In [9]:
# Train test split
test_size = 0.15
train_df = asl_df.iloc[:int(len(asl_df)*(1-test_size))]
test_df = asl_df.iloc[int(len(asl_df)*(1-test_size)):]
train_df.index = np.arange(0,len(train_df))
test_df.index = np.arange(0,len(test_df))

In [10]:
# Initialize datasets

train_asl_dataset = ASLensDataset(df,train_df,tokenizer=tokenizer)
test_asl_dataset = ASLensDataset(df,test_df,tokenizer=tokenizer)

In [11]:
for x,y in train_asl_dataset:
    #print(x[1])
    print(y)
    break

[tensor([20, 16]), tensor([20, 16, 53]), tensor([20, 16, 53, 15]), tensor([20, 16, 53, 15, 16]), tensor([20, 16, 53, 15, 16, 24]), tensor([20, 16, 53, 15, 16, 24, 53]), tensor([20, 16, 53, 15, 16, 24, 53, 24]), tensor([20, 16, 53, 15, 16, 24, 53, 24,  6]), tensor([20, 16, 53, 15, 16, 24, 53, 24,  6, 53]), tensor([20, 16, 53, 15, 16, 24, 53, 24,  6, 53,  9]), tensor([20, 16, 53, 15, 16, 24, 53, 24,  6, 53,  9,  2]), tensor([20, 16, 53, 15, 16, 24, 53, 24,  6, 53,  9,  2, 23]), tensor([20, 16, 53, 15, 16, 24, 53, 24,  6, 53,  9,  2, 23,  6]), tensor([20, 16, 53, 15, 16, 24, 53, 24,  6, 53,  9,  2, 23,  6, 53]), tensor([20, 16, 53, 15, 16, 24, 53, 24,  6, 53,  9,  2, 23,  6, 53,  6]), tensor([20, 16, 53, 15, 16, 24, 53, 24,  6, 53,  9,  2, 23,  6, 53,  6, 20]), tensor([20, 16, 53, 15, 16, 24, 53, 24,  6, 53,  9,  2, 23,  6, 53,  6, 20, 21]), tensor([20, 16, 53, 15, 16, 24, 53, 24,  6, 53,  9,  2, 23,  6, 53,  6, 20, 21,
         2]), tensor([20, 16, 53, 15, 16, 24, 53, 24,  6, 53,  9,  2,

In [12]:
# Initialize data loaders

train_loader = DataLoader(train_asl_dataset, batch_size=1, shuffle=False, drop_last=False)
test_loader = DataLoader(test_asl_dataset, batch_size=1, shuffle=False, drop_last=False)


In [13]:
@dataclass
class ModelConfig:
    hidden_size: int
    num_layers: int
    dropout_rate:  float =0.1
    learning_rate:float= 0.001

In [14]:

class ASLensEncoder(nn.Module):
  def __init__(self,config):
    super(ASLensEncoder,self).__init__()
    self.config=config
    self.conv1 = nn.Sequential(
      nn.Conv1d(3, 16, kernel_size=3, padding=1),  # preserves (90, 3)
      nn.ReLU(),
      nn.Conv1d(16, 32, kernel_size=2,padding=1),                 # reduces width
      nn.ReLU(),
      nn.Conv1d(32, 64, kernel_size=2,padding=1),                 # reduces width
      nn.ReLU(),
    self.lstm= nn.LSTM(input_size=6400,
                       hidden_size=self.config.hidden_size,
                       num_layers=config.num_layers,
                       dropout=config.dropout_rate,
                       batch_first=True)


  def forward(self,x):
    time = x.shape[1] # Get time size

    x=x.view(-1, 98,3) # Reshape data tensor
    x=x.permute(0,2, 1) # Set 98 as first dim
    out = self.conv1(x) # Go through conv1D

    out=out.reshape(1, time, -1) # Reshape to match time 
    out,hidden = self.lstm(out) # LSTM
    return out,hidden

In [15]:
config = ModelConfig(hidden_size=384,num_layers=3,dropout_rate=0.2)
encoder = ASLensEncoder(config)
encoder.to(device)

ASLensEncoder(
  (conv1): Sequential(
    (0): Conv1d(3, 16, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): Conv1d(16, 32, kernel_size=(2,), stride=(1,), padding=(1,))
    (3): ReLU()
    (4): Conv1d(32, 64, kernel_size=(2,), stride=(1,), padding=(1,))
    (5): ReLU()
  )
  (lstm): LSTM(6400, 384, num_layers=3, batch_first=True, dropout=0.2)
)

In [16]:
class ASLenseRNN(nn.Module):
  def __init__(self,vocab_size,embed_size,n_hidden,n_layers,embedding_matrix):
    super(ASLenseRNN,self).__init__()
    self.V = vocab_size
    self.D = embed_size
    self.M = n_hidden
    self.L = n_layers

    self.embedding = nn.Embedding(self.V,self.D)
    self.word2vec = nn.Embedding.from_pretrained(embedding_matrix, freeze=True,padding_idx=word_vocabulary["<PAD>"])  # Set freeze=False to fine-tune


    # Initalize rnn and fc layers
    self.rnn = nn.LSTM(input_size=self.D*2,
                      hidden_size=self.M,
                      num_layers=self.L,
                      dropout=0.2,
                      batch_first=True)

    self.fc = nn.Sequential(
          nn.Linear(self.M, 1024),
          nn.ReLU(),
          nn.Linear(1024, self.V),

        )

  def forward(self, X, hidden):
    # Embedding layer:

    char_embed = self.embedding(X[0])
    word_embed = self.word2vec(X[1])
    out = torch.cat([char_embed, word_embed], dim=-1)
    
    # pass through rnn
    out,hidden_state= self.rnn(out,hidden)
    out = self.fc(out)
    return out,(hidden_state[0].detach(), hidden_state[1].detach())


In [17]:
decoder = ASLenseRNN(vocab_size=57,
            embed_size=300,
            n_hidden=128*3,
            n_layers=3,
            embedding_matrix=None)
decoder.to(device)
decoder.load_state_dict(torch.load('model_checkpoint_ft.pt',map_location=device,weights_only=False)['model_state_dict'])

<All keys matched successfully>

In [18]:
for x,y in train_loader:
    print(x[0].shape)
    data=x[0].to(device)
    encoder(data)
    break

torch.Size([1, 581, 98, 3])


# Model

In [19]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()),
    lr=0.001,
    weight_decay=1e-4
)


In [20]:
vocabulary["<EOS>"]

55

In [21]:
from helper_functions import progress_bar, plot_loss_curves,SaveModelCheckpoint

In [22]:
save_model_checkpoint = SaveModelCheckpoint(path="asl_lens_model_checkpoint.pt")
save_decoder_model_checkpoint = SaveModelCheckpoint(path="asl_lens_model_decoder_checkpoint.pt")

best_val_loss=float('inf')

In [23]:
epoches=20
train_losses = np.zeros(epoches)
val_losses = np.zeros(epoches)
for it in range(epoches):
  t0 = datetime.now()
  current_batch = 0
  total_batches = len(train_loader)
  encoder.train();decoder.train() # set model to train mode
  train_loss=[]
  val_loss=[]
  hidden_state = None
  # train
  for inputs,targets in train_loader:
    #break
    # move data to gpu
    inputs,targets = (inputs[0].to(device),inputs[1]),targets
    #inputs = inputs.permute(0,2,1)
    # zero gradients
    optimizer.zero_grad()
    # forward pass
    encoder_outputs, encoder_hidden = encoder(inputs[0].to(device))  # video_features: [T, batch, feature_dim]
    loss=torch.tensor(0.).to(device)
    #decoder_input = torch.tensor([vocabulary['<SOS>']])
    decoder_hidden = encoder_hidden

    for t in range(1000):
       # print(torch.tensor(np.array(inputs[1][t])))
        decoder_input=torch.tensor(np.array(inputs[1][t])).long().to(device)
        decoder_output, decoder_hidden = decoder(decoder_input.to(device), decoder_hidden)
        decoder_output=decoder_output.view(-1, decoder_output.size(-1))
        loss += criterion(decoder_output, targets[t].view(-1).to(device))
        decoder_input = targets[t]  # Teacher forcing
        if targets[t][0][-1]==vocabulary['<EOS>']:
            break


    # backward
    loss.backward()
   # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
    optimizer.step()

    train_loss.append(loss.item())
    current_batch = progress_bar(current_batch,total_batches)

  encoder.eval();decoder.eval() # set model to eval mode
  current_batch = 0
  total_batches = len(test_loader)
  for inputs,targets in test_loader:
    # move data to gpu
    inputs,targets = (inputs[0].to(device),inputs[1]),targets
    encoder_outputs, encoder_hidden = encoder(inputs[0].to(device))  # video_features: [T, batch, feature_dim]
    loss=torch.tensor(0.).to(device)
    #decoder_input = torch.tensor([vocabulary['<SOS>']])
    decoder_hidden = encoder_hidden

    for t in range(1000):
       # print(torch.tensor(np.array(inputs[1][t])))
        decoder_input=torch.tensor(np.array(inputs[1][t])).long().to(device)
        decoder_output, decoder_hidden = decoder(decoder_input.to(device), decoder_hidden)
        decoder_output=decoder_output.view(-1, decoder_output.size(-1))
        loss += criterion(decoder_output, targets[t].view(-1).to(device))
        decoder_input = targets[t]  # Teacher forcing
        if targets[t][0][-1]==vocabulary['<EOS>']:
            break

    val_loss.append(loss.item())
    current_batch = progress_bar(current_batch,total_batches,validation=True)


  # calculate loss
  print('\r')
  train_loss = np.mean(train_loss)
  val_loss = np.mean(val_loss)
  best_val_loss=float('inf')
  best_val_loss=  save_model_checkpoint(val_loss,best_val_loss,train_loss,it, model=encoder, optimizer=optimizer)
  best_val_loss=float('inf')
  best_val_loss= save_decoder_model_checkpoint(val_loss,best_val_loss,train_loss,it, model=decoder, optimizer=optimizer)
  # append loss
  train_losses[it]=train_loss
  val_losses[it]=val_loss
  dt = datetime.now() - t0
  print(f"Epoch {it+1}/{epoches}, Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}, Duration: {dt}")
  print('-------------------------------------------------------------')

[92m[1mModel saved at epoch: 1, val_loss improved from: inf to: 115.3471[0m
[92m[1mModel saved at epoch: 1, val_loss improved from: inf to: 115.3471[0m
Epoch 1/20, Train loss: 116.5364, Val loss: 115.3471, Duration: 0:30:38.766572
-------------------------------------------------------------
[92m[1mModel saved at epoch: 2, val_loss improved from: inf to: 115.1349[0m
[92m[1mModel saved at epoch: 2, val_loss improved from: inf to: 115.1349[0m
Epoch 2/20, Train loss: 102.3661, Val loss: 115.1349, Duration: 0:30:24.644113
-------------------------------------------------------------
[92m[1mModel saved at epoch: 3, val_loss improved from: inf to: 116.3224[0m
[92m[1mModel saved at epoch: 3, val_loss improved from: inf to: 116.3224[0m
Epoch 3/20, Train loss: 92.6411, Val loss: 116.3224, Duration: 0:30:20.230718
-------------------------------------------------------------
[92m[1mModel saved at epoch: 4, val_loss improved from: inf to: 119.2806[0m
[92m[1mModel saved at e