In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import string
import h5py
import torch.nn.functional as F
import string
import re
import sys
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from dataclasses import dataclass

from datetime import datetime

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [3]:
df=pd.read_csv("../how2sign.csv",sep="\t")
df.tail()
asl_df=pd.read_csv("../ASLens-landmarks.csv")
asl_df.drop(121,inplace=True)
asl_df.index = np.arange(0,len(asl_df))
#asl_df.drop("Unnamed: 0", axis=1, inplace=True)
asl_df.head()

Unnamed: 0,file_name,landmarks,frames,sentences
0,FzmL8SL6Bow,FzmL8SL6Bow.h5,1196,4
1,FZrU_mEryAs,FZrU_mEryAs.h5,1213,7
2,-g45vqccdzI,-g45vqccdzI.h5,1332,10
3,FzUdcaxw_vs,FzUdcaxw_vs.h5,1826,19
4,-g0iPSnQt6w,-g0iPSnQt6w.h5,1657,17


In [4]:
import json
with open('../char_vocab.json') as json_file:
    vocabulary = json.load(json_file)

with open('../word_vocab.json') as json_file:
    word_vocabulary = json.load(json_file)

In [5]:
import string
accepted_text= string.ascii_lowercase+ string.digits + '!?.,\"()&+-/@%–' + ":" + " " + "\n"
chars = [x for x in accepted_text]
print(chars)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '?', '.', ',', '"', '(', ')', '&', '+', '-', '/', '@', '%', '–', ':', ' ', '\n']


In [6]:
class Tokenizer:
  def __init__(self,char_vocab,word_vocab,chars):
    self.char_vocabulary = char_vocab
    self.word_vocabulary = word_vocab

    self.chars = chars

  def __call__(self,text,label=False):
    char_tokens = []
    word_tokens = []
    replacements = str.maketrans({
    'İ': 'I',
    'ç': 'c',
    'ş': 's',
    'ü': 'u',
    'ö': 'o',
    'ğ': 'g',
    'ı': 'i'
    })
    text = text.translate(replacements)
    for char in text.lower():
      if char not in "#*":
        if char not in self.char_vocabulary.keys():
          char_tokens.append(self.char_vocabulary["<UNK>"])
        else:
          char_tokens.append(self.char_vocabulary[char])
      else:
        if char == "#":
          char_tokens.append(self.char_vocabulary["<SOS>"])
        else:
          char_tokens.append(self.char_vocabulary["<EOS>"])

    if label:
      return torch.tensor(char_tokens)
      token = F.one_hot(torch.tensor(char_tokens),num_classes=len(self.char_vocabulary))
      return token.type(torch.float32).view(1,-1)

    for word in text.lower().split(" "):
      k=word.strip().lower().replace('"','').replace("\n",' ')
      k=re.sub(r'[.,()]', '', k)
      if k not in self.word_vocabulary.keys():
        word_tokens.append(self.char_vocabulary["<UNK>"])
        continue
      word_tokens.append(self.word_vocabulary[k])
    if word_tokens[0]==1:
      word_tokens = word_tokens[1:]
    word_tokens = torch.tensor(word_tokens)
    padded_sequences =F.pad(word_tokens, (len(text)-len(word_tokens),0), "constant", self.word_vocabulary["<PAD>"])
    return torch.tensor(char_tokens),padded_sequences




In [7]:
tokenizer = Tokenizer(vocabulary,word_vocabulary,chars)


In [8]:
tokenizer("How are you today")

(tensor([ 9, 16, 24, 53,  2, 19,  6, 53, 26, 16, 22, 53, 21, 16,  5,  2, 26]),
 tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0, 11437,  8920,  5816, 37680]))

In [9]:
class ASLensDataset(Dataset):
  def __init__(self, df, asl_df, tokenizer=None, seq_len=90,landmarkspath="../landmarks"):
    self.tokenizer=tokenizer
    self.df=df
    self.asl_df=asl_df
    self.seq_len=seq_len
    self.landmarkspath =landmarkspath
  def __len__(self):
    return self.asl_df['sentences'].sum()

  def extract_number(self,sentence_id):
    # Extract the numeric part after the last underscore
    match = re.search(r'_(\d+)$', sentence_id)
    return int(match.group(1)) if match else 0

  def extractFrames(self,ex,index):
    fName=self.asl_df["landmarks"][index]
    file=h5py.File(f"{self.landmarkspath}/{fName}")
    start_frame=int(ex["START_REALIGNED"]*15)
    end_frame=int(ex["END_REALIGNED"]*15)
    hand_left=file["handLeft"][start_frame:end_frame]
    # print(self.asl_df["frames"][index])
    # print(start_frame,end_frame)
    hand_right=file["handRight"][start_frame:end_frame]
    face_lips=file["faceLips"][start_frame:end_frame]
    face_oval=file["faceOval"][start_frame:end_frame]
    file.close()
    #x = self.text[idx:idx + self.seq_len]
    x=np.concatenate([hand_left,hand_right,face_lips,face_oval],axis=1)
    return torch.tensor(x)
  def __getitem__ (self, idx):
    index=0
    while idx>self.asl_df['sentences'][index]-1:
      idx-=self.asl_df['sentences'][index]
      index+=1
    file_name=self.asl_df["file_name"][index]
    
    if file_name[-1]=="-":

      file_name=file_name[:-1]
    sent=df[df['VIDEO_ID']==file_name]
    sent = sent.copy()  # Explicit copy
    sent['SENTENCE_NUM'] = sent['SENTENCE_ID'].apply(self.extract_number)
    sent = sent.sort_values(["VIDEO_NAME","SENTENCE_NUM"])

    if len(sent[sent.duplicated("SENTENCE_ID")])>0:
      first = sent.drop_duplicates(subset="SENTENCE_NUM",keep="first")
      last = sent.drop_duplicates(subset="SENTENCE_NUM",keep="last")
      numOfFrames=self.asl_df["frames"][index]
      if numOfFrames>first["END_REALIGNED"].max()*15:
        sent = first
      else:
        sent = last
    #print(sent)
    ex=sent.iloc[idx]
    
    frames=self.extractFrames(ex,index)
    frames=frames.type(torch.float32)
    text=ex["SENTENCE"]
    x2=[]
    ys=[]
    for idx in range(1,len(text)+1):
        x = "#"+text[:idx]
        y = text[:idx+1]
        
        if idx==len(text):
            y+="*"
        if self.tokenizer:
            x = self.tokenizer(x)
            y = self.tokenizer(y,label=True)
        #print(x[1].shape)
        x2.append(x)
        ys.append(y)
            
    #x2 = np.array(x2)
    #ys= np.array(ys)
    #if self.tokenizer:
    #    x2 = torch.tensor(x2)
        #ys = torch.tensor(ys)
    return (frames,x2),ys

In [10]:
test_size = 0.15
train_df = asl_df.iloc[:int(len(asl_df)*(1-test_size))]
test_df = asl_df.iloc[int(len(asl_df)*(1-test_size)):]
train_df.index = np.arange(0,len(train_df))
test_df.index = np.arange(0,len(test_df))

In [11]:
train_asl_dataset = ASLensDataset(df,train_df,tokenizer=None)
test_asl_dataset = ASLensDataset(df,test_df,tokenizer=None)

In [12]:
@dataclass
class ModelConfig:
    hidden_size: int
    num_layers: int
    dropout_rate:  float =0.1
    learning_rate:float= 0.001


class ASLensEncoder(nn.Module):
  def __init__(self,config):
    super(ASLensEncoder,self).__init__()
    self.config=config
    self.conv1 = nn.Sequential(
        #nn.Conv2d(1,16,kernel_size=(3,1),padding=(1,0)),
        #nn.ReLU(),

       # nn.MaxPool2d(kernel_size=(2,2)),
        #nn.Conv2d(16,32,kernel_size=(2,1),padding=(1,0)),
       # nn.ReLU(),
        #nn.Conv2d(64,128,kernel_size=(3,3)),
        #nn.ReLU(),
        #nn.Flatten()
      nn.Conv1d(3, 16, kernel_size=3, padding=1),  # preserves (90, 3)
      nn.ReLU(),
      nn.Conv1d(16, 32, kernel_size=2,padding=1),                 # reduces width
      nn.ReLU(),
      nn.Conv1d(32, 64, kernel_size=2,padding=1),                 # reduces width
      nn.ReLU(),
      #nn.AdaptiveAvgPool2d((45, 3))
        #nn.Conv3d(128,64,kernel_size=(3,3,1)),
        #nn.ReLU(),
        )
    self.lstm= nn.LSTM(input_size=6400,
                       hidden_size=self.config.hidden_size,
                       num_layers=config.num_layers,
                       dropout=config.dropout_rate,
                       batch_first=True)


  def forward(self,x):
    time = x.shape[0]
    #print(x.shape)

    x=x.view(-1, 98,3)
    x=x.permute(0,2, 1)
    out = self.conv1(x)

    out=out.reshape(1, time, -1)
    out,hidden = self.lstm(out)
    return out,hidden

config = ModelConfig(hidden_size=384,num_layers=3,dropout_rate=0.2)
encoder = ASLensEncoder(config)
encoder.to(device)
encoder.load_state_dict(torch.load('../asl_lens_model_checkpoint.pt',map_location=device,weights_only=False)['model_state_dict'])
class ASLenseRNN(nn.Module):
  def __init__(self,vocab_size,embed_size,n_hidden,n_layers,embedding_matrix):
    super(ASLenseRNN,self).__init__()
    self.V = vocab_size
    self.D = embed_size
    self.M = n_hidden
    self.L = n_layers

    self.embedding = nn.Embedding(self.V,self.D)
    self.word2vec = nn.Embedding(42605,self.D,padding_idx=0)#.from_pretrained(embedding_matrix, freeze=True,padding_idx=word_vocabulary["<PAD>"])  # Set freeze=False to fine-tune


    # Initalize rnn and fc layers
    self.rnn = nn.LSTM(input_size=self.D*2,
                      hidden_size=self.M,
                      num_layers=self.L,
                      dropout=0.2,
                      batch_first=True)

    self.fc = nn.Sequential(
          nn.Linear(self.M, 1024),
          nn.ReLU(),
          nn.Linear(1024, self.V),
         # nn.ReLU(),
         # nn.Linear(512,self.K)
        )

  def forward(self, X, hidden):
    # Embedding layer:

    char_embed = self.embedding(X[0])
    word_embed = self.word2vec(X[1])
    # pass through rnn
  #  print(char_embed.shape,word_embed.shape)
    out = torch.cat([char_embed, word_embed], dim=-1)
    out=torch.unsqueeze(out,0)

    out,hidden_state= self.rnn(out,hidden)
   # out = F.relu(out)
    out = self.fc(out)
    return out,(hidden_state[0].detach(), hidden_state[1].detach())


decoder = ASLenseRNN(vocab_size=57,
            embed_size=300,
            n_hidden=128*3,
            n_layers=3,
            embedding_matrix=None)
decoder.to(device)
decoder.load_state_dict(torch.load('../asl_lens_model_decoder_checkpoint.pt',map_location=device,weights_only=False)['model_state_dict'])

<All keys matched successfully>

In [13]:
flipped_dict = {v: k for k, v in vocabulary.items()}


In [14]:
train_loader = DataLoader(train_asl_dataset, batch_size=1, shuffle=False, drop_last=False)
#test_loader = DataLoader(test_asl_dataset, batch_size=1, shuffle=False, drop_last=False)


In [32]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from mosestokenizer import MosesTokenizer
from mosestokenizer import MosesDetokenizer
import subprocess
import tempfile
import os
from rouge_score import rouge_scorer
from jiwer import wer  # Install with: pip install jiwer

# Download NLTK data (run once)
nltk.download('wordnet')
nltk.download('omw-1.4')

def compute_metrics(reference, hypothesis):
    # Tokenize sentences
    tokenize = MosesTokenizer('en')
    detokenize = MosesDetokenizer('en')
    
    ref_tokens = tokenize(reference)
    hyp_tokens = tokenize(hypothesis)
    
    # Compute BLEU (with smoothing)
    smoothing = SmoothingFunction().method1
    bleu_score = sentence_bleu(
        [ref_tokens],
        hyp_tokens,
        smoothing_function=smoothing
    )
    
    # Compute METEOR
    meteor_score_val = meteor_score(
        [ref_tokens],
        hyp_tokens
    )
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference, hypothesis)
    
    return {
        "BLEU": bleu_score,
        "METEOR": meteor_score_val,
        "ROUGE-1": rouge_scores['rouge1'].fmeasure,
        "ROUGE-2": rouge_scores['rouge2'].fmeasure,
        "ROUGE-L": rouge_scores['rougeL'].fmeasure,
        "WER": wer(reference, hypothesis)

    }

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [33]:
metrics=[]

In [34]:
for i in range(len(test_asl_dataset)):
    decoder_hidden=None
    x,y = test_asl_dataset.__getitem__(i)
    inputs = "#"
    encoder_outputs, encoder_hidden = encoder(x[0].to(device))
    decoder_hidden=encoder_hidden
    for t in range(len(y[-1])):
           # print(torch.tensor(np.array(inputs[1][t])))
        decoder_input=tokenizer(inputs)
        decoder_output, decoder_hidden = decoder((decoder_input[0].to(device).long(),decoder_input[1].to(device).long()), decoder_hidden)
        decoder_output = decoder_output / 1
        decoder_output[..., 1] = -float('inf')
        out = F.softmax(decoder_output[:,-1,:], dim=-1).squeeze(0)
        #out = out /0.1

       # print(out.shape)
        char = torch.multinomial(out, num_samples=1).detach().cpu().item()
        inputs += flipped_dict[char] 
        #print(inputs)
        if inputs[-1]==vocabulary['<EOS>']:
            break
            
            
    metrics.append(compute_metrics(y[-1],inputs.replace("\n", "")))
print(inputs)

#we have por


In [35]:
metrics

[{'BLEU': 0.020144990145560468,
  'METEOR': 0.11210762331838564,
  'ROUGE-1': 0.23255813953488372,
  'ROUGE-2': 0.04878048780487805,
  'ROUGE-L': 0.13953488372093023,
  'WER': 1.1578947368421053},
 {'BLEU': 0.01553712569276035,
  'METEOR': 0.044642857142857144,
  'ROUGE-1': 0.09523809523809525,
  'ROUGE-2': 0.0,
  'ROUGE-L': 0.09523809523809525,
  'WER': 1.2222222222222223},
 {'BLEU': 0.009629943614188135,
  'METEOR': 0.025000000000000005,
  'ROUGE-1': 0.05555555555555555,
  'ROUGE-2': 0.0,
  'ROUGE-L': 0.05555555555555555,
  'WER': 1.1875},
 {'BLEU': 0.008186841244220632,
  'METEOR': 0.12448132780082988,
  'ROUGE-1': 0.125,
  'ROUGE-2': 0.0,
  'ROUGE-L': 0.08333333333333333,
  'WER': 1.3},
 {'BLEU': 0.007918430003499762,
  'METEOR': 0.02109704641350211,
  'ROUGE-1': 0.04761904761904762,
  'ROUGE-2': 0.0,
  'ROUGE-L': 0.04761904761904762,
  'WER': 1.0},
 {'BLEU': 0.008429869592787348,
  'METEOR': 0.0900900900900901,
  'ROUGE-1': 0.1818181818181818,
  'ROUGE-2': 0.0,
  'ROUGE-L': 0.1090

In [38]:
b={'BLEU': 0.,
  'METEOR': 0.,
  'ROUGE-1': 0.,
  'ROUGE-2': 0.0,
  'ROUGE-L': 0.,
  'WER': 0.}
for k in metrics:
    b["BLEU"]+=k["BLEU"]
    b["METEOR"]+=k["METEOR"]
    b["ROUGE-1"]+=k["ROUGE-1"]
    b["ROUGE-2"]+=k["ROUGE-2"]
    b["ROUGE-L"]+=k["ROUGE-L"]
    b["WER"]+=k["WER"]


In [39]:
b["BLEU"]/=len(test_asl_dataset)
b["METEOR"]/=len(test_asl_dataset)
b["ROUGE-1"]/=len(test_asl_dataset)
b["ROUGE-2"]/=len(test_asl_dataset)
b["ROUGE-L"]/=len(test_asl_dataset)
b["WER"]/=len(test_asl_dataset)

In [40]:
b

{'BLEU': 0.01168252550239433,
 'METEOR': 0.08298357212769919,
 'ROUGE-1': 0.1231039253602089,
 'ROUGE-2': 0.01155829765351695,
 'ROUGE-L': 0.09495269000624276,
 'WER': 1.1116735209615756}