<a href="https://colab.research.google.com/github/ADA-SITE-JML/sign-lang/blob/main/jamal/CNN_LSTM_Attention_with_feats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, the features of all video frames are extracted and saved separately beforehand. The model in this notebook uses these precomputed features instead of processing the videos in real time.

In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import (CosineAnnealingLR,
                                      CosineAnnealingWarmRestarts,
                                      StepLR,
                                      ExponentialLR)
import sklearn.utils
from sklearn.model_selection import train_test_split

With the CeDAR (Center for Data Analytics Research, ADA University) option, the notebook connects to the local machine (where the whole dataset is supposed to be). To connect to CeDAR's environment, run the following to start Jupyter with access from Colab:


```
jupyter notebook \
>   --NotebookApp.allow_origin='https://colab.research.google.com' \
>   --port=8888 \
>   --NotebookApp.port_retries=0
```
Then select "Connect to a local runtime" in Colab and paste the link to the notebook environment (printed in the console).



In [2]:
class Config:
    """Notebook-wide settings, kept as plain class attributes."""

    # Switches the verbose shape/debug prints used across the notebook.
    debug = False
    # Storage environment: Dev (Jamal's GoogleDrive), Prod (SLR GDrive) or CeDAR (local)
    env = 'Prod'
    csv_path = ''
    seed = 44

    # Prefer the first CUDA device when one is visible, otherwise run on CPU.
    if torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'
    #device = 'tpu' # uncomment to switch to TPU usage

    video_processing_tool = 'TorchVision' # OpenCV, VidGear or TorchVision
    max_frames = 5 # was 64 for the frames
    max_words_in_sentence = 10

    # Dataset root per environment; any other value of `env` falls back to the
    # local (CeDAR) path.
    drive_folder = {
        'Dev': 'drive/MyDrive/SLR_test',
        'Prod': 'drive/MyDrive/SLR/Data',
    }.get(env, '/home/sign-lang/jamal/Dataset')

    # Folder holding the saved per-video feature files.
    feat_folder = drive_folder + '/jamal/Video_features'

    # CSV with the sentences.
    train_csv_path = drive_folder+'/sentences_all.csv'
    BATCH_SIZE = 1 #updated before the training
    rnn_type = 'GRU' # GRU or LSTM

def seed_everything(seed: int):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU, plus all CUDA devices when CUDA is available), so that
    weight initialisation, dropout and DataLoader shuffling are repeatable.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Previously commented out, which left all torch randomness unseeded.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Instantiate the settings object used by every later cell and seed the RNGs.
config = Config()
seed_everything(config.seed)
# Report the selected device and, when CUDA is available, how many GPUs are visible.
print('Running on',config.device)
if torch.cuda.is_available():
    print('GPU number:',torch.cuda.device_count())

Running on cuda:0
GPU number: 1


In [3]:
# Optional TPU setup: runs only when config.device was manually set to 'tpu'.
if (config.device == 'tpu'):
  # IPython shell escape: installs pinned torch / torchvision / torch_xla builds.
  !pip install cloud-tpu-client==0.10 torch==2.0.0 torchvision==0.15.1 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-2.0-cp39-cp39-linux_x86_64.whl

  import torch_xla
  import torch_xla.core.xla_model as xm


  # Acquire the XLA device and print a small tensor created on it as a smoke test.
  dev = xm.xla_device()
  t1 = torch.ones(3, 3, device = dev)
  print(t1)
  # From here on config.device holds the XLA device object instead of a string.
  config.device = dev

In [4]:
# Every environment except the local CeDAR one reads its data from Google
# Drive, so mount it under /content/drive.
if config.env != 'CeDAR':
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import sys
import subprocess

def pip_install(package):
    """Install ``package`` with pip, using the interpreter that runs this kernel.

    Raises ``subprocess.CalledProcessError`` when pip exits with a non-zero status.
    """
    command = [sys.executable, '-m', 'pip', 'install']
    command.append(package)
    subprocess.check_call(command)

In [None]:
import re

# Upper bound (exclusive) of the sentence ids processed further below.
train_set_size = 65

# Load the sentence table from the CSV file.
sentences = pd.read_csv(config.train_csv_path)
# NOTE(review): this slices along the column axis (keeps at most the first
# `train_set_size` columns), not the rows - confirm that this is intended.
sentences = sentences.iloc[:, :train_set_size]

# Vocabulary: the two sentence markers plus every lower-cased,
# whitespace-separated token found in the third column.
word_set = {'SOS', 'EOS'}
for tokens in sentences.iloc[:, 2].str.lower().str.split():
    word_set.update(tokens)
sorted_word_set = sorted(word_set)
print('Unique words',sorted_word_set)

# Two lookup tables over the sorted vocabulary: word -> index and index -> word.
encodings = dict(zip(sorted_word_set, range(len(sorted_word_set))))
word_idx = dict(enumerate(sorted_word_set))
print('Word encodings',encodings)
print('Words by index',word_idx)

# Persist both tables next to the dataset.
torch.save(encodings,config.drive_folder+'/jamal/encodings.dict')
torch.save(word_idx,config.drive_folder+'/jamal/word_idx.dict')

# converts a sentence with zero padded encoding list
def get_sentence_encoded(sentence):
    """Encode ``sentence`` as a list of word indices, framed by SOS/EOS.

    The sentence is split on whitespace and every token is looked up in the
    module-level ``encodings`` table (a missing token raises ``KeyError``).
    The result is right-padded with zeros up to
    ``config.max_words_in_sentence``; longer encodings are returned unpadded
    and untruncated.
    """
    tokens = ('SOS '+sentence+' EOS').split()
    encoded = [encodings[token] for token in tokens]
    missing = config.max_words_in_sentence - len(encoded)
    # A non-positive repeat count yields an empty padding list.
    return encoded + [0] * missing

# Debug-only smoke check: print the encodings of two sample sentences.
if config.debug:
    for sample in ('mən hansı sənəd vermək', 'mən bakı yaşamaq'):
        print(get_sentence_encoded(sample))

# Build the (sentence id, feature file, encoded sentence) table.
# One sentence maps to many feature files, so the rows are collected as plain
# records and turned into a DataFrame once at the end, instead of growing the
# frame with pd.concat on every file (which is quadratic in the number of rows).
records = []

for sentence_id in range(2, train_set_size):
    phrase = sentences.iloc[sentence_id, 2].lower()
    encoded = get_sentence_encoded(phrase)

    # Each sentence id has its own folder of saved feature files.
    sentence_dir = config.feat_folder + '/' + str(sentence_id)
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the seeded train/validation split) reproducible.
    for filename in sorted(os.listdir(sentence_dir)):
        feat_path = os.path.join(sentence_dir, filename)
        # Skip sub-directories and anything else that is not a regular file.
        if os.path.isfile(feat_path):
            records.append({"id": sentence_id, "feat_file": feat_path, "encoding": encoded})

# The column order matters: the dataset class reads columns by position.
df = pd.DataFrame(records, columns=["id", "feat_file", "encoding"])

if config.debug:
    print(df)

In [7]:
# Preview the first rows of the (id, feat_file, encoding) table.
df.head()

Unnamed: 0,id,feat_file,encoding
0,2,drive/MyDrive/SLR/Data/jamal/Video_features/2/...,"[35, 561, 368, 841, 923, 33, 34, 0, 0, 0]"
1,2,drive/MyDrive/SLR/Data/jamal/Video_features/2/...,"[35, 561, 368, 841, 923, 33, 34, 0, 0, 0]"
2,2,drive/MyDrive/SLR/Data/jamal/Video_features/2/...,"[35, 561, 368, 841, 923, 33, 34, 0, 0, 0]"
3,2,drive/MyDrive/SLR/Data/jamal/Video_features/2/...,"[35, 561, 368, 841, 923, 33, 34, 0, 0, 0]"
4,2,drive/MyDrive/SLR/Data/jamal/Video_features/2/...,"[35, 561, 368, 841, 923, 33, 34, 0, 0, 0]"


In [8]:
import torchvision

class SLDataset(Dataset):
    """Dataset of saved video features paired with encoded sentences.

    Rows of the wrapped frame are read by position: column 1 holds the path
    of a saved feature tensor and column 2 the integer-encoded sentence.
    """

    def __init__(self, df):
        """Store a shuffled copy of ``df`` as the backing table."""
        # shuffle and save
        self.df = sklearn.utils.shuffle(df)

    def __getitem__(self, idx):
        """Return ``(features, encoding, feature_path)`` for row ``idx``.

        ``features`` is the loaded tensor cast to float; ``encoding`` is the
        sentence encoding as a column tensor of shape ``(len, 1)`` placed on
        ``config.device``.
        """
        if config.debug:
            print(f"Got item at index: {idx}")

        # Read from this dataset's own frame. Reading the notebook-level `df`
        # here ignored both the shuffle and the train/validation split.
        feat_path = self.df.iloc[idx, 1]
        # loading to CPU is to avoid problems with TPU - it's impossible to convert from GPU to TPU
        # feats = torch.load(feat_path, map_location=torch.device('cpu')).to(config.device)
        feats = torch.load(feat_path)

        encoding = torch.tensor(self.df.iloc[idx, 2]).to(config.device)
        enc_shape = encoding.shape[0]

        return feats.float(), torch.reshape(encoding, (enc_shape, 1)), feat_path

    def __len__(self):
        """Number of rows in the backing table."""
        return len(self.df)

def get_dataloader(df, phase: str, batch_size: int = 96) -> DataLoader:
    """Build a shuffling ``DataLoader`` over the train or validation part of ``df``.

    ``df`` is split 90/10 (stratified on its ``id`` column, seeded with
    ``config.seed``). ``phase == 'train'`` selects the larger part; any other
    value selects the held-out part.
    """
    splits = train_test_split(df, test_size=0.1, random_state=config.seed, stratify=df['id'])
    train_part, val_part = (part.reset_index(drop=True) for part in splits)
    selected = train_part if phase == 'train' else val_part
    return DataLoader(dataset=SLDataset(selected), batch_size=batch_size, num_workers=0, shuffle=True)

# Training-split loader with batch size 1, used for the fetch timing check in the next cell.
dl = get_dataloader(df,'train',1)


In [9]:
import time

# Time how long it takes to pull a single batch out of the loader.
measure = time.time()
dl_next = next(iter(dl))
elapsed = time.time() - measure
print('Data fetching:', elapsed, 'sec')

# A batch is a (features, encoded sentence, feature file path) triple.
a, b, fname = dl_next

if config.debug:
    print(a.shape, b.shape, fname)

Data fetching: 5.968267440795898 sec


In [10]:
import torchvision
from torchvision.models import squeezenet1_1
from torchvision.models.feature_extraction import create_feature_extractor

class EncoderRNN(nn.Module):
    """Recurrent encoder over a sequence of per-frame feature vectors.

    The recurrent layer is a GRU or an LSTM, selected by ``config.rnn_type``.
    Inputs are batch-first: ``(batch, steps, input_size)``.
    """

    def __init__(self, input_size, hidden_size, device, biDirectional = False):
        """
        Args:
            input_size: Size of each input feature vector.
            hidden_size: Base hidden size; the layer uses ``hidden_size * D``
                units, where ``D`` is 2 when bidirectional and 1 otherwise.
            device: Device on which the initial hidden state is created.
            biDirectional: Build a bidirectional recurrent layer.

        Raises:
            ValueError: If ``config.rnn_type`` is neither 'GRU' nor 'LSTM'.
        """
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.device = device
        # Number of directions; also scales the hidden size.
        self.D = 2 if biDirectional else 1

        if config.rnn_type == 'GRU':
            # `bidirectional` was missing here, so the GRU stayed unidirectional
            # while initHidden() sized the state for D directions.
            self.rnn = nn.GRU(
                    input_size = self.input_size,
                    hidden_size = self.hidden_size*self.D,
                    bidirectional = biDirectional,
                    batch_first = True).to(config.device)
        elif config.rnn_type == 'LSTM':
            self.rnn = nn.LSTM(
                    input_size = self.input_size,
                    hidden_size = self.hidden_size*self.D,
                    num_layers = 1,
                    dropout = 0,
                    bidirectional = biDirectional,
                    batch_first = True).to(config.device)
        else:
            # Fail early instead of leaving `self.rnn` undefined.
            raise ValueError("config.rnn_type must be 'GRU' or 'LSTM', got %r" % (config.rnn_type,))

    def forward(self, input, hidden):
        """Run the recurrent layer; returns ``(output, hidden)`` as produced by it."""
        output, hidden = self.rnn(input, hidden)
        return output, hidden

    def initHidden(self, batch_size=None):
        """Return an all-zero initial state of shape ``(D, batch, hidden_size * D)``.

        Args:
            batch_size: Batch dimension of the state; defaults to
                ``config.BATCH_SIZE`` when omitted.

        Returns:
            A single tensor for a GRU, or an ``(h_0, c_0)`` tuple for an LSTM.
        """
        if batch_size is None:
            batch_size = config.BATCH_SIZE
        shape = (self.D, batch_size, self.hidden_size*self.D)
        # Decide from the layer that was actually built, so the state format
        # always matches it.
        if isinstance(self.rnn, nn.LSTM):
            return (torch.zeros(*shape, device=self.device),
                    torch.zeros(*shape, device=self.device))
        return torch.zeros(*shape, device=self.device)

In [11]:
class AttnDecoderRNN(nn.Module):
    """Attention decoder that produces one output token distribution per call.

    Each forward call embeds the input token, derives attention weights over
    ``max_length`` positions from the embedding and the hidden state, mixes
    the attended encoder outputs with the embedding, runs one GRU/LSTM step
    (the type is chosen by ``config.rnn_type``) and returns log-probabilities
    over ``output_size`` classes.
    """
    def __init__(self, hidden_size, output_size, device, dropout_p=0.1, max_length=config.max_frames, biDirectional = False, debug=False): #max_length=config.max_words_in_sentence
        """
        Args:
            hidden_size: Base hidden size of the decoder.
            output_size: Number of output classes (also the embedding vocabulary size).
            device: Device the intermediate results are moved to.
            dropout_p: Dropout probability applied to the embedding.
            max_length: Number of positions the attention layer scores.
            biDirectional: Doubles the recurrent hidden size; only the LSTM
                branch passes it on as ``bidirectional``.
            debug: Print shapes while building/running.
        """
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        self.debug = debug
        self.device = device

        # Number of directions; scales the recurrent hidden size below.
        self.D = 2 if biDirectional else 1

        if self.debug:
          print('Attn.init() hidden_size',hidden_size)
          print('Attn.init() output_size',output_size)
          print('Attn.init() max_length',max_length)

        # The embedding is 2*hidden_size wide; concatenated with a hidden_size-wide
        # vector it matches the 3*hidden_size inputs of the two linear layers below.
        self.embedding = nn.Embedding(self.output_size, self.hidden_size*2)
        self.attn = nn.Linear(self.hidden_size * 3, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 3, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)

        if config.rnn_type == 'GRU':
          # NOTE(review): the GRU is built without the `bidirectional` argument,
          # unlike the LSTM branch - confirm this is intended.
          self.rnn = nn.GRU(
                  input_size = self.hidden_size,
                  hidden_size = self.hidden_size*self.D,
                  batch_first = True).to(config.device)
        elif config.rnn_type == 'LSTM':
          self.rnn = nn.LSTM(
                  input_size = self.hidden_size,
                  hidden_size = self.hidden_size*self.D,
                  num_layers = 1,
                  dropout = 0,
                  bidirectional = biDirectional,
                  batch_first = True)
        self.out = nn.Linear(self.hidden_size*self.D, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: Token indices for this step (viewed below as
                ``(input.shape[0], input.shape[1], 2*hidden_size)`` after embedding).
            hidden: Previous hidden state; for a GRU a leading axis is added first.
            encoder_outputs: Encoder outputs that the attention weights are applied to.

        Returns:
            ``(output, hidden, attn_weights)`` where ``output`` holds
            log-softmax scores over the output classes.
        """
        if  config.rnn_type == 'GRU':
          hidden = hidden.unsqueeze(0)

        embedded = self.embedding(input).view(input.shape[0],input.shape[1], self.hidden_size*2)
        embedded = self.dropout(embedded)

        if self.debug:
          print('Attn.forward() input',input.shape)
          print('Attn.forward() hidden',type(hidden),len(hidden),hidden[0].shape)
          print('Attn.forward() encoder_outputs',encoder_outputs.shape)
          print('embedded: ',embedded.shape)

        # Only index 0 of `embedded` and `hidden` is used from here on.
        # NOTE(review): this looks like it assumes a batch of one - confirm.
        tcat = torch.cat((embedded[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(tcat), dim=1).to(device=self.device)
        # The weights span `max_length` positions, so the batched matmul presumably
        # needs encoder_outputs to have `max_length` steps along dim 1 - TODO confirm.
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),encoder_outputs).to(device=self.device)

        # Merge the embedding with the attended context and project to hidden_size.
        output = torch.cat((embedded[0], attn_applied[0]), 1).to(device=self.device)
        output = self.attn_combine(output).unsqueeze(0).to(device=self.device)

        output = F.relu(output)
        if  config.rnn_type == 'GRU':
          output, hidden = self.rnn(output, hidden[0].unsqueeze(0))
        elif config.rnn_type == 'LSTM':
          # NOTE(review): the same tensor is passed for both elements of the LSTM
          # state tuple - confirm that this is intended.
          output, hidden = self.rnn(output, (hidden[0].unsqueeze(0),hidden[0].unsqueeze(0)))

        output = F.log_softmax(self.out(output[0]), dim=1).to(device=self.device)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero state of shape ``(D, 1, hidden_size*D)`` (a pair of them for LSTM)."""
        if config.rnn_type == 'GRU':
          return torch.zeros(self.D, 1, self.hidden_size*self.D, device=self.device)
        elif config.rnn_type == 'LSTM':
          return (torch.zeros(self.D, 1, self.hidden_size*self.D, device=self.device),
                  torch.zeros(self.D, 1, self.hidden_size*self.D, device=self.device))

In [12]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=config.max_words_in_sentence):
    """Run one optimisation step on a single (features, encoded sentence) pair.

    The encoder consumes ``input_tensor`` from a zero state. The decoder is
    then run step by step on the ground-truth previous word, and the loss
    against the next word is accumulated until the target EOS is reached or
    the target is exhausted.

    Returns:
        The accumulated loss divided by ``config.BATCH_SIZE * target_length``.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)
    for optimizer in optimizers:
        optimizer.zero_grad()

    if config.debug:
        print('Input len',input_tensor.shape,'Target len',target_tensor.shape)

    # Encode the whole input sequence.
    encoder_output, encoder_hidden = encoder(input_tensor, encoder.initHidden())

    # Decoder inputs are words 1..n-1; the target of each input word is the word after it.
    input_end, target_end = max_length - 2, max_length - 1
    decoder_input = target_tensor[:, :input_end, :]
    decoder_target = target_tensor[:, 1:target_end, :]
    tar_1hot = torch.nn.functional.one_hot(decoder_target, num_classes = len(encodings))

    if config.debug:
        print('Encoder hidden_0',len(encoder_hidden),'shape',encoder_hidden[0].shape)
        print('enc_out',encoder_output.shape)
        print('dec_in',decoder_input.shape)
        print('dec_target',decoder_target.shape)

    target_length = decoder_target.size(1)
    eos_token = torch.tensor(encodings['EOS'], device=config.device)

    # The decoder starts from the encoder's final state.
    decoder_hidden = encoder_hidden
    loss = 0
    for di in range(target_length):
        if config.debug:
            print('dec hidden', decoder_hidden.shape)

        step_input = decoder_input[:, di, :]
        decoder_output, decoder_hidden, decoder_attention = decoder(step_input, decoder_hidden[0], encoder_output)

        if config.debug:
            print('decoder_output & attn',decoder_output.shape, decoder_attention.shape)

        # Only element 0 of the batch contributes to the loss.
        step_target = tar_1hot[0, di, :].squeeze(0).double()
        loss += criterion(decoder_output.squeeze(0), step_target)

        # Stop once the target word for this step is EOS.
        if (decoder_target[:, di, :] == eos_token):
            break

    loss.backward()

    for optimizer in optimizers:
        optimizer.step()

    if str(config.device).startswith('xla'):
        xm.mark_step()

    return loss.item() / (config.BATCH_SIZE*target_length)

In [13]:
import time
from torch import optim
import torch.nn.functional as F
import gc

def trainIters(encoder, decoder, print_every=1000, plot_every=100, learning_rate=0.01, max_epochs=10):
    """Train the encoder/decoder pair on the training split.

    Args:
        encoder: Encoder module.
        decoder: Decoder module.
        print_every: Print the average loss, report elapsed time and save
            both state dicts every this many iterations.
        plot_every: Record an averaged loss value every this many iterations.
        learning_rate: Learning rate of both Adam optimizers.
        max_epochs: Number of passes over the training loader (previously
            hard-coded to 10).
    """
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    criterion = nn.CrossEntropyLoss()

    trainloader = get_dataloader(df,'train',config.BATCH_SIZE)

    # Global iteration counter across epochs (named `step` so the builtin
    # `iter` is not shadowed).
    step = 1
    start = time.time()
    for epoch in range(max_epochs):
        print('\nStarting epoch', epoch)
        for inputs, labels, fname in trainloader:
            # Progress trace: a '|' on every 10th iteration, '.' otherwise.
            if (step%10 == 0):
                print('|', end = '')
            else:
                print('.', end = '')
            input_tensor = inputs.to(config.device)
            target_tensor = labels.to(config.device)

            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)

            print_loss_total += loss
            plot_loss_total += loss

            if step % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%.4f' % (print_loss_avg))

                print('Time spent in seconds:',time.time() - start)
                start = time.time()

                # Checkpoint both models; the file name carries the device string.
                torch.save(encoder.state_dict(),config.drive_folder+'/jamal/encoder_' + str(config.device) + '.model')
                torch.save(decoder.state_dict(),config.drive_folder+'/jamal/decoder_' + str(config.device) + '.model')

                gc.collect()

                if str(config.device).startswith('cuda'):
                    torch.cuda.empty_cache()

            if step % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

            step += 1

    #showPlot(plot_losses)

In [14]:
input_size = 2048 #86528
hidden_size = 64
config.debug = False

# str() is required because config.device may hold a device object rather than
# a string (the TPU setup cell stores the XLA device there).
print('Running on '+str(config.device)+ ' with '+config.rnn_type)

encoder = EncoderRNN(input_size, hidden_size, device=config.device, biDirectional = False).to(config.device)
attn_decoder = AttnDecoderRNN(hidden_size, len(encodings), device=config.device, dropout_p=0.1, biDirectional = False, debug=config.debug).to(config.device)

# use the previous weights
#encoder.load_state_dict(torch.load(config.drive_folder+'/jamal/encoder_' + str(config.device) + '.model', map_location=torch.device(config.device)))
#attn_decoder.load_state_dict(torch.load(config.drive_folder+'/jamal/decoder_' + str(config.device) + '.model', map_location=torch.device(config.device)))

# This line used to assign 64 to a misspelt attribute (`BATCH_SZIE`), which had
# no effect: training always ran with BATCH_SIZE == 1. The effective value is
# now set explicitly. The decoder and the loss only use element 0 of a batch
# (`embedded[0]`, `tar_1hot[0, ...]`), so a larger batch needs changes there first.
config.BATCH_SIZE = 1
trainIters(encoder, attn_decoder, print_every=50)

Running on cuda:0 with GRU

Starting epoch 0
.........|.........|.........|.........|.........|3.4946
Time spent in seconds: 44.0973961353302
.........|.........|.........|.........|.........|2.7036
Time spent in seconds: 42.784175872802734
.........|.........|.........|.........|.........|2.0555
Time spent in seconds: 42.92511200904846
.........|.........|.........|.........|.........|1.5849
Time spent in seconds: 43.479326248168945
.........|.........|.........|.........|.........|1.5479
Time spent in seconds: 42.04795837402344
.........|.........|.........|.........|.........|1.5093
Time spent in seconds: 41.05401611328125
.........|.........|.........|.........|.........|1.4816
Time spent in seconds: 45.567248821258545
.........|.........|.........|.........|.........|1.4143
Time spent in seconds: 41.81135582923889
.........|.........|.........|.........|.........|1.1327
Time spent in seconds: 42.25762701034546
.........|.........|.........|.........|.........|1.0427
Time spent in 