In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import string
import h5py
import torch.nn.functional as F
import string
import re
import sys
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from dataclasses import dataclass

from datetime import datetime

## Load dataset

In [2]:
# Get gpt2
from transformers import GPT2Tokenizer, GPT2LMHeadModel,GPT2Config
gptconfig = GPT2Config.from_pretrained("gpt2",add_cross_attention=True)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token
tokenizer.add_special_tokens({"bos_token":"<s>","eos_token":"</s>"})
gpt2 = GPT2LMHeadModel.from_pretrained("gpt2",config=gptconfig)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'transformer.h.10.cros

## Dataset and Data Loader

## Model

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [4]:
df=pd.read_csv("how2sign.csv",sep="\t") # Load how2Sign dataset
df.tail()
asl_df=pd.read_csv("ASLens-landmarks.csv") # Load ASLlens-landmarks dataset
asl_df.drop(121,inplace=True)
asl_df.index = np.arange(0,len(asl_df))
asl_df.head()

file_name       g0yUlOaqL6k
landmarks    g0yUlOaqL6k.h5
frames                 2026
sentences                24
Name: 121, dtype: object


Unnamed: 0,file_name,landmarks,frames,sentences
0,FzmL8SL6Bow,FzmL8SL6Bow.h5,1196,4
1,FZrU_mEryAs,FZrU_mEryAs.h5,1213,7
2,-g45vqccdzI,-g45vqccdzI.h5,1332,10
3,FzUdcaxw_vs,FzUdcaxw_vs.h5,1826,19
4,-g0iPSnQt6w,-g0iPSnQt6w.h5,1657,17


In [52]:
class ASLensDataset(Dataset):
  def __init__(self, df, asl_df, tokenizer=None, seq_len=90):
    self.tokenizer=tokenizer
    self.df=df
    self.asl_df=asl_df
    self.seq_len=seq_len
  def __len__(self):
    return self.asl_df['sentences'].sum()

  def extract_number(self,sentence_id):
    # Extract the numeric part after the last underscore
    match = re.search(r'_(\d+)$', sentence_id)
    return int(match.group(1)) if match else 0

  def extractFrames(self,ex,index):
    fName=self.asl_df["landmarks"][index] # Get file name
    file=h5py.File(f"landmarks/{fName}") # Load h5 file
    # Determine start and end frame
    start_frame=int(ex["START_REALIGNED"]*15) 
    end_frame=int(ex["END_REALIGNED"]*15)
    
    # Extract each component of the landmarks and concatenate 
    hand_left=file["handLeft"][start_frame:end_frame]
    hand_right=file["handRight"][start_frame:end_frame]
    face_lips=file["faceLips"][start_frame:end_frame]
    face_oval=file["faceOval"][start_frame:end_frame]
    file.close()
    x=np.concatenate([hand_left,hand_right,face_lips,face_oval],axis=1)
    
    return torch.tensor(x)

  def __getitem__ (self, idx):
    # Determine valid dataframe index from idx
    index=0
    while idx>self.asl_df['sentences'][index]-1: 
      idx-=self.asl_df['sentences'][index]
      index+=1
    file_name=self.asl_df["file_name"][index]
    
    # Remove sufficient - from file_name
    if file_name[-1]=="-":
      file_name=file_name[:-1]
    
    # Get values dataframe
    sent=df[df['VIDEO_ID']==file_name]
    sent = sent.copy()  # Explicit copy
    sent['SENTENCE_NUM'] = sent['SENTENCE_ID'].apply(self.extract_number)
    sent = sent.sort_values(["VIDEO_NAME","SENTENCE_NUM"])
    
    # Remove duplicates if there are any
    if len(sent[sent.duplicated("SENTENCE_ID")])>0:
      first = sent.drop_duplicates(subset="SENTENCE_NUM",keep="first")
      last = sent.drop_duplicates(subset="SENTENCE_NUM",keep="last")
      numOfFrames=self.asl_df["frames"][index]
      if numOfFrames>first["END_REALIGNED"].max()*15:
        sent = first
      else:
        sent = last

    
    ex=sent.iloc[idx] # Get data from asl_df dataframe
    
    frames=self.extractFrames(ex,index) # Extract franes
    frames=frames.type(torch.float32) # Convert to tensor
    text=ex["SENTENCE"] # Get text
    if len(text)<4: # If length of text is less than 4 add ' ' to text
        text+=' '
    if self.tokenizer:
        # Tokenize text using gpt2 tokenizer
        inputs = self.tokenizer(text[:int(len(text)*0.33)], return_tensors="pt", padding="max_length", max_length=25, truncation=True)
        labels = self.tokenizer(text[int(len(text)*0.33):], return_tensors="pt", padding="max_length", max_length=25, truncation=True).input_ids
    return frames, inputs.input_ids.squeeze(0),inputs.attention_mask.squeeze(0), labels.squeeze(0)



In [53]:
# Train test split

test_size = 0.15
train_df = asl_df.iloc[:int(len(asl_df)*(1-test_size))]
test_df = asl_df.iloc[int(len(asl_df)*(1-test_size)):]
train_df.index = np.arange(0,len(train_df))
test_df.index = np.arange(0,len(test_df))

In [54]:
# Initialize datasets and data loaders

train_asl_dataset = ASLensDataset(df,train_df,tokenizer=tokenizer)
test_asl_dataset = ASLensDataset(df,test_df,tokenizer=tokenizer)

train_loader = DataLoader(train_asl_dataset, batch_size=1, shuffle=False)
test_loader = DataLoader(test_asl_dataset, batch_size=1, shuffle=False)

In [55]:
train_asl_dataset.__getitem__(211)[2].shape

torch.Size([25])

In [56]:
for data,x,m,y in train_loader:
  print(x.shape)
  break
  pass

torch.Size([1, 25])


In [63]:
@dataclass
class ModelConfig:
    hidden_size: int
    num_layers: int
    dropout_rate:  float =0.1
    learning_rate:float= 0.001

In [64]:

class ASLensEncoder(nn.Module):
  def __init__(self,config):
    super(ASLensEncoder,self).__init__()
    self.config=config
    self.conv1 = nn.Sequential(
      nn.Conv1d(3, 16, kernel_size=3, padding=1),  # preserves (90, 3)
      nn.ReLU(),
      nn.Conv1d(16, 32, kernel_size=2,padding=1),                 # reduces width
      nn.ReLU(),
      nn.Conv1d(32, 64, kernel_size=2,padding=1),                 # reduces width
      nn.ReLU(),
        )
    self.lstm= nn.LSTM(input_size=98,
                       hidden_size=self.config.hidden_size,
                       num_layers=config.num_layers,
                       dropout=config.dropout_rate,
                       batch_first=True)


  def forward(self,x):
    time = x.shape[2] # Get time size
    x=x.view(-1, 98,3) # Reshape data tensor
    x=x.permute(1,2, 0)   # Set 98 as first 
    out = self.conv1(x)  # Go through conv1D
    out=out.reshape(1, -1, 98).contiguous() # Reshape and use 98 as input_dim to LSTM
    out,hidden = self.lstm(out)
    return out,hidden

In [65]:
config = ModelConfig(hidden_size=128,num_layers=2,dropout_rate=0.2)
encoder = ASLensEncoder(config)
encoder.to(device)

ASLensEncoder(
  (conv1): Sequential(
    (0): Conv1d(3, 16, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): Conv1d(16, 32, kernel_size=(2,), stride=(1,), padding=(1,))
    (3): ReLU()
    (4): Conv1d(32, 64, kernel_size=(2,), stride=(1,), padding=(1,))
    (5): ReLU()
  )
  (lstm): LSTM(98, 128, num_layers=2, batch_first=True, dropout=0.2)
)

In [66]:
for x,y,m,z in train_loader:
  x=x.type(torch.float32)
  encoder(x.to(device))
  break

In [67]:
class GPT2ConditionedOnEncoder(nn.Module):
    def __init__(self, encoder, hidden_size=768):
        super().__init__()
        self.encoder = encoder
        self.gpt2 = GPT2LMHeadModel.from_pretrained("gpt2",config=gptconfig)
        self.gpt2.resize_token_embeddings(self.gpt2.config.vocab_size + 2)  # in case of custom tokenizer
        self.projection = nn.Linear(hidden_size, self.gpt2.config.n_embd)

    def forward(self, landmarks, input_ids, attention_mask,labels=None,proj=False):

        encoder_output, (h_n, _) = self.encoder(landmarks)  # h_n: [num_layers, 1, hidden_size]
        h_n = h_n[-1]  # use top layer [1, hidden_size]
        projected = self.projection(h_n)  # [1, emb_dim]
        projected = projected.unsqueeze(1)  # [1, 1, emb_dim]

        outputs = self.gpt2(input_ids=input_ids, encoder_hidden_state=encoder_output, labels=labels) # Pass encoder _hiddent to 
        return outputs.logits,outputs.loss,projected

In [68]:
model = GPT2ConditionedOnEncoder(encoder,hidden_size=config.hidden_size)
model.to(device)
print()


Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'transformer.h.10.cros




In [70]:
from helper_functions import progress_bar, plot_loss_curves,SaveModelCheckpoint

In [71]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-4
)

save_model_checkpoint = SaveModelCheckpoint(path="asl_lens_model_gpt2_checkpoint_hidden1.pt")

best_val_loss=float('inf')

In [None]:
epoches=20
train_losses = np.zeros(epoches)
val_losses = np.zeros(epoches)
for it in range(epoches):
  t0 = datetime.now()
  current_batch = 0
  total_batches = len(train_loader)
  model.train()
  train_loss=[]
  val_loss=[]
  hidden_state = None
  # train
  for data,x,mask,targets in train_loader:
    #break
    # move data to gpu
    #inputs,targets = (inputs[0].to(device),inputs[1].to(device),inputs[2].to(device)),targets.to(device)
    #inputs = inputs.permute(0,2,1)
    # zero gradients
    optimizer.zero_grad()
    # forward pass
    _,loss,_=model(data.to(device),x.to(device),mask.to(device),labels=targets.to(device))
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
    optimizer.step()

    train_loss.append(loss.item())
    current_batch = progress_bar(current_batch,total_batches)

  model.eval()
  current_batch = 0
  total_batches = len(test_loader)
  for data,x,mask,targets in test_loader:
    # move data to gpu
    _,loss,_=model(data.to(device),x.to(device),mask.to(device),labels=targets.to(device))
    val_loss.append(loss.item())
    current_batch = progress_bar(current_batch,total_batches,validation=True)


  # calculate loss
  print('\r')
  train_loss = np.mean(train_loss)
  val_loss = np.mean(val_loss)
  best_val_loss=  save_model_checkpoint(val_loss,best_val_loss,train_loss,it, model=model, optimizer=optimizer)
  # append loss
  train_losses[it]=train_loss
  val_losses[it]=val_loss
  dt = datetime.now() - t0
  print(f"Epoch {it+1}/{epoches}, Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}, Duration: {dt}")
  print('-------------------------------------------------------------')

[92m[1mModel saved at epoch: 1, val_loss improved from: inf to: 3.5965[0m
Epoch 1/20, Train loss: 3.6605, Val loss: 3.5965, Duration: 0:07:07.233264
-------------------------------------------------------------
Epoch 2/20, Train loss: 3.3403, Val loss: 3.6906, Duration: 0:04:28.839368
-------------------------------------------------------------
Epoch 3/20, Train loss: nan, Val loss: nan, Duration: 0:06:06.913859
-------------------------------------------------------------
Batch 60/1820 - [=.................................................]

In [None]:

file=h5py.File(f"landmarks/g0yUlOaqL6k.h5")
start_frame=0
end_frame=100
hand_left=file["handLeft"][start_frame:end_frame]
# print(self.asl_df["frames"][index])
# print(start_frame,end_frame)
hand_right=file["handRight"][start_frame:end_frame]
face_lips=file["faceLips"][start_frame:end_frame]
face_oval=file["faceOval"][start_frame:end_frame]
file.close()
#x = self.text[idx:idx + self.seq_len]
x=np.concatenate([hand_left,hand_right,face_lips,face_oval],axis=1)
test= torch.tensor(x).long()
test=test.type(torch.float32)


In [None]:
data,l,m,b = train_asl_dataset.__getitem__(3)
#data,y = (data[0].to(device),data[1].to(device),data[2].to(device)),y.to(device)
# Suppose <SOS> is a special token you've added, or just a regular token
#data,k,l = data[0].unsqueeze(0),(l[0].unsqueeze(0),l[1].unsqueeze(0)),b.unsqueeze(0)
decoded_text=tokenizer.eos_token
for i in range(1):
  #inputs = tokenizer(tokenizer.decode(k[0][0], skip_special_tokens=True), return_tensors="pt", padding="max_length", max_length=40, truncation=True).to(device)#data[1]#torch.tensor([[tokenizer.bos_token_id]]).to(device)  # shape: [1, 1]
  #input_ids = x2[0].to(device).long()
  #mask = x2[1].to(device).long()
  #y=ys.to(device).long()
  print(data[0][0])
  data,x,m,y = train_asl_dataset.__getitem__(3)
  outputs,loss,_=model(data.to(device),x.to(device), m.to(device),labels=y.to(device))
  predicted_token_ids = torch.argmax(outputs, dim=-1)  # shape: [batch_size, seq_len]
  print(predicted_token_ids)
  print(torch.isnan(m).any())
  # Example: remove the first token from each sequence (e.g., your conditioning token)
  #predicted_token_ids = predicted_token_ids[:, -1]
  #print(predicted_token_ids)
  #input_ids = torch.cat((x.to(device),predicted_token_ids),dim=1)
  #decoded_text += tokenizer.decode(predicted_token_ids, skip_special_tokens=True)
  #break
print(decoded_text+tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True))
print(loss)#print(predicted_token_ids)
#print(b)
#print(input_ids.shape)

In [None]:
tokenizer.decode(k[0][0], skip_special_tokens=True)

In [None]:
df[df['VIDEO_ID']=="g0yUlOaqL6k"]