In this notebook, I took out the feature extractor (a pre-trained model) from the Data Loader and added it into the Encoder. The name of the notebook also holds this: *_TRansferLEarningInENCoder

In [1]:
import os
import random
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import (CosineAnnealingLR,
                                      CosineAnnealingWarmRestarts,
                                      StepLR,
                                      ExponentialLR)
import sklearn.utils
from sklearn.model_selection import train_test_split

In CeDAR (Center for Data Analytics Research, ADA University) option, the notebook shall connevt to the local machine (where the whole dataset is supposed to be). To connect to the CeDAR's environment run the following to start Jupyter with access:


```
jupyter notebook \
>   --NotebookApp.allow_origin='https://colab.research.google.com' \
>   --port=8888 \
>   --NotebookApp.port_retries=0
```
Then select "Connect to a local runtime" and put the link of notebook environment (from the console)



In [29]:
class Config:
    debug = False
    env = 'CeDAR' # Dev (Jamal's GoogleDrive), Prod (SLR GDrive) or CeDAR (local)
    csv_path = ''
    seed = 44
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    video_processing_tool = 'OpenCV' # OpenCV, VidGear or TorchVision
    max_frames = 25
    max_words_in_sentence = 25


    drive_folder = '/home/toghrul/SLR/data/dataset_jamal/' # path for the local (CeDAR)
    if (env == 'Dev'):
      drive_folder = 'drive/MyDrive/SLR_test'
    elif (env == 'Prod'):
      drive_folder = 'drive/MyDrive/SLR/Data'
    
    video_folder = drive_folder+'/Video'

    train_csv_path = drive_folder+'/sentences.csv'
    camera_source = 'Cam2' # Cam1 - side-top, Cam2 - front
    BATCH_SIZE = 1 #updated before the training

def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
#    torch.manual_seed(seed)
    # if torch.cuda.is_available():
    #     torch.cuda.manual_seed(seed)

config = Config()
seed_everything(config.seed)
print('Running on',config.device)

Running on cuda:0


In [3]:
if (config.env != 'CeDAR'):
  from google.colab import drive
  drive.mount('/content/drive')

In [4]:
import sys
import subprocess

def pip_install(package):
  subprocess.check_call([sys.executable, '-m', 'pip', 'install',package])

In [5]:
pip_install('mediapipe')

# https://github.com/jbohnslav/opencv_transforms
pip_install('opencv_transforms')

if config.video_processing_tool == 'VidGear':
  pip_install('vidgear[core]')



In [6]:
# read cvs file
sentences = pd.read_csv(config.train_csv_path)

# unique words
word_set = set(['SOS','EOS'])
sentences.iloc[:,2].str.lower().str.split().apply(word_set.update)
sorted_word_set = sorted(word_set)
print('Unique words',sorted_word_set)

# create word encoding
encodings = { k:v for v,k in enumerate(sorted_word_set)}
word_idx  = { v:k for k,v in encodings.items()}
print('Word encodings',encodings)
print('Words by index',word_idx)
torch.save(encodings,config.drive_folder+'/jamal/encodings.dict')
torch.save(word_idx,config.drive_folder+'/jamal/word_idx.dict')

# converts a sentence with zero padded encoding list
def get_sentence_encoded(sentence):
    encoded = [encodings[key] for key in ('SOS '+sentence+' EOS').split()]
    return  encoded + list([0]) * (config.max_words_in_sentence - len(encoded))

if config.debug:  
  print(get_sentence_encoded('mən hansı sənəd vermək'))
  print(get_sentence_encoded('mən bakı yaşamaq'))

# generate (video file name, encoding list)
# Good recommendation on not to iterate over DFs like this:
# https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
# but it's not my case - I have fewer rows and one to many with videos.
df = pd.DataFrame(columns=["id", "video_file","encoding"])

for index, row in sentences.iterrows():
    id = row[0]
    phrase = row[2].lower()
    encoded = get_sentence_encoded(phrase)
    
    # there is a grouping of videos in production.
    pre_folder = '/1-250/' if (config.env == 'Prod' and id < 251) else '/'
    
    dir = config.video_folder+'/'+config.camera_source +pre_folder+str(id)
    # iterate over video folders
    for filename in os.listdir(dir):
        f = os.path.join(dir, filename)
        # checking if it is a file
        if os.path.isfile(f):
            entry = pd.DataFrame.from_dict({"id": id, "video_file": f, "encoding": [encoded]})
            df = pd.concat([df, entry], ignore_index = True)

if config.debug:
    print(df)

Unique words ['1', '14', '1979', '2', '23', '28', '3', '43', '6', '65', 'EOS', 'SOS', 'ada', 'adam', 'adası', 'ana', 'asif', 'ata', 'avtobus', 'avtomobil', 'axtarmaq', 'axşam', 'ayaq', 'ayaqyolu', 'az', 'azərbaycan', 'ağrımaq', 'aşağı', 'bakı', 'balıq', 'bax', 'bağlı', 'baş', 'başa', 'bel', 'biz', 'bizim', 'boğaz', 'bu', 'bulvar', 'burda', 'burdan', 'dammaq', 'dayanmaq', 'demək', 'doğulmaq', 'dünən', 'düşmək', 'dəfinələr', 'dəli', 'dəniz', 'etmək', 'ev', 'eynək', 'getmək', 'görmək', 'gözləmək', 'gün', 'günorta', 'gəlmək', 'gəzmək', 'hansı', 'harda', 'heç', 'həftə', 'hər', 'idman', 'iki', 'indi', 'it', 'iş', 'işləmək', 'i̇çəri', 'keçmək', 'kim', 'kitab', 'kovid', 'kür', 'kənar', 'lazım', 'may', 'metro', 'mın', 'məhərrəmov', 'mən', 'mənim', 'mərkəz', 'məşğul', 'necə', 'olmaq', 'ora', 'orda', 'oxumaq', 'oğul', 'paytaxt', 'pis', 'pişik', 'qapı', 'qaz', 'qaçmaq', 'qeydiyyat', 'qonşu', 'qoxu', 'qız', 'qələm', 'rayon', 'saat', 'sabah', 'sahil', 'saylı', 'siz', 'son', 'sonra', 'su', 'subay', '

This small piece of code is to test the outputs of the pre-trained model.

In [7]:
# from torchvision.models.feature_extraction import create_feature_extractor

# x = torch.rand(1, 3, 224, 224).to(config.device)

# return_nodes = {
#     'features.12.cat': 'layer12'
# }
# model_new = create_feature_extractor(model, return_nodes=return_nodes).to(config.device)

# result = model_new(x)
# n,out_filters,out_width,out_height = result['layer12'].shape
# print(n,out_filters,out_width,out_height)

In [8]:
rows = 5
cols = 5

def visualize_frames(frames):
  fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(15,15))
  
  idx = 0
  for i in range(rows):
      for j in range(cols):
        axes[i, j].imshow(frames[0,idx,:,:,:].permute(1,2,0).cpu()*255.0)
        idx += 1

TODO: Implement augmentation to the data

In [8]:
from torchvision import transforms

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    UniformTemporalSubsample,
    Permute,   
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomAdjustSharpness,
    Resize,
    ColorJitter,
    RandomHorizontalFlip
)

from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo
)



In [9]:
import cv2
import mediapipe as mp
from opencv_transforms import transforms

# New mediapipe modules
# from mediapipe.tasks import python
# from mediapipe.tasks.python import vision

# Old Mediapipe code
mpHands = mp.solutions.hands
hands = mpHands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.2, min_tracking_confidence=0.2)

# New Mediapipe code
# base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
# options = vision.HandLandmarkerOptions(base_options = base_options,num_hands=2)
# detector = vision.HandLandmarker.create_from_options(options)

# keeps only informative frames

def keep_frames_with_hands(video_data, crop_size: int = 224,
                           video_transform = None):
  """

  Args:
      video_data (_type_): _description_
      crop_size (int, optional): crop_size for image transform. Defaults to 224.
      video_transfrom (torchvision.transforms.Compose, optional): transformations to be applied to the video. Defaults to None.

  Returns:
      torch.Tensor(): processed video tensor
  """
  
  video_arr = torch.zeros((0, 3, crop_size, crop_size)).to(config.device)
  
    # Note: keeping this old code to know how to get frame count from different libs
    #  
    # frame_count = 0
    # if config.video_processing_tool == 'OpenCV':
    #   frame_count = int(video_data.get(cv2.CAP_PROP_FRAME_COUNT))
    # elif config.video_processing_tool == 'VidGear':
    #   frame_count = int(video_data.stream.get(cv2.CAP_PROP_FRAME_COUNT))
    # elif config.video_processing_tool == 'TorchVision':
    #   metadata = video_data.get_metadata()
    #   frame_count = metadata['video']['duration'][0] * metadata['video']['fps'][0]
    #   video_data.set_current_stream("video")
    
  # This uses torchvision's Compose since it has been imported
  
  # TO-DO: check if the nd.array returned by openCV has np.unit8 as dtype
  transform = Compose([
      transforms.Resize(size=(crop_size,crop_size)),
      transforms.ToTensor(), # Converts to tensor from nd.array (numpy) and shoul've normalized to [0,1].
      ])
  
  ret = True
  frame = None
  while(True):
    if config.video_processing_tool == 'OpenCV':
      ret, frame = video_data.read()
    elif config.video_processing_tool == 'VidGear':
      frame = video_data.read()
    elif config.video_processing_tool == 'TorchVision':
      frame = next(video_data)['data']

    if (ret is False) or (frame is None):
        break

    # Old Mediapipe code
    hand_results = hands.process(frame)

    # New Mediapipe code
    # hand_results = detector.detect(frame)

    if hand_results.multi_hand_landmarks != None:
      frame_ext = torch.unsqueeze(transform(frame), dim=0).to(config.device)
      video_arr = torch.cat((video_arr, frame_ext/255.0),0)
      
    # if config.debug:
    #   print(f"Video compiled with shape: {video_arr.shape}")
      
    # if video_transform:
    #   video_arr = video_transform(video_arr)
      
    # video_arr = def_video_transform(video_arr)
    
  return video_arr

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


TODO: It seems the data is not read randomly. It's read sequentially.

In [10]:
def get_video_transforms(img_size: int = 224):
    video_transform=Compose([
        # Resize(size=(img_size, img_size)),
        # UniformTemporalSubsample(25),
        ColorJitter(hue=0.5, contrast=0.5),
        # Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
        # RandomShortSideScale(min_size=256, max_size=512),
        # CenterCropVideo(224),
        RandomHorizontalFlip(p=0.5),
    ])
    
    return video_transform

In [23]:
if config.video_processing_tool == 'VidGear':
  from vidgear.gears import CamGear
import torchvision
from torchvision.models.feature_extraction import create_feature_extractor

class SLDataset(Dataset):

    def __init__(self, df, video_transform):
        # shuffle and save
        self.df = sklearn.utils.shuffle(df)
        self.video_transform = video_transform

    def __getitem__(self, idx):
        print(f"Got item at index: {idx}")
        video_path = df.iloc[idx,1]

        encoding = torch.tensor(df.iloc[idx,2]).to(config.device)
        enc_shape = encoding.shape[0]

        reader = None
        #--- keep frames with hands
        if config.video_processing_tool == 'OpenCV':
          reader = cv2.VideoCapture(video_path)
        elif config.video_processing_tool == 'VidGear':
          reader = CamGear(video_path).start()
        elif config.video_processing_tool == 'TorchVision':
          reader = torchvision.io.VideoReader(video_path, "video")

        hands_only = keep_frames_with_hands(reader, video_transform=self.video_transform).to(config.device)
        hands_only = video_transform(hands_only) # applies video transformations
        
        if config.debug:
          print(f"Hands only shape: {hands_only.shape}")
        if config.video_processing_tool == 'OpenCV':
          reader.release()
        elif config.video_processing_tool == 'VidGear':
          reader.stop()
        elif config.video_processing_tool == 'TorchVision':
          pass

        if config.debug:
          print(idx,video_path,encoding)

        n,l,w,h = hands_only.shape

        # When frames are more than we need but not that much (just trim it from the start and end)
        if (n > config.max_frames) and (n < 1.5*config.max_frames):
          left = (n-config.max_frames)//2
          right = n-left-1
          hands_only = hands_only[left:right,:,:,:]
          n = hands_only.shape[0]
        # If we have much more frames than we need
        elif (n > config.max_frames):
          slice_step = max(2,(n//config.max_frames)+1)
          hands_only = hands_only[::slice_step,:,:,:]
          n = hands_only.shape[0]

        # If we have less frames than we need
        # This should be a separate IF: the result of the previous IFs may also deliver fewer frame count
        if (n < config.max_frames):
          compliment_arr = torch.zeros(config.max_frames-n,l,w,h).to(config.device)
          hands_only = torch.cat((hands_only,compliment_arr),0)

        return hands_only, torch.reshape(encoding,(enc_shape,1))

    def __len__(self):
        return len(self.df)

def get_dataloader(df, phase: str, batch_size: int = 96, video_transform = None) -> DataLoader:
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=config.seed, stratify=df['id'])
    train_df, val_df = train_df.reset_index(drop=True), val_df.reset_index(drop=True)
    df = train_df if phase == 'train' else val_df
    dataset = SLDataset(df, video_transform)
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=0, shuffle=True)
    return dataloader


video_transform = get_video_transforms()

# dl = get_dataloader(df,'train',1, video_transform=None)
# dl = get_dataloader(df,'train',1, video_transform=video_transform)
# dl_next = next(iter(dl))
# a,b = dl_next

# if config.debug:
#   print(a.shape,b.shape)
#   visualize_frames(a)

In [24]:
import torchvision
from torchvision.models import squeezenet1_1
from torchvision.models.feature_extraction import create_feature_extractor

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        
        model = squeezenet1_1(pretrained=True).to(config.device)
        return_nodes = {
            'features.12.cat': 'layer12'
        }
        self.pretrained_model = create_feature_extractor(model, return_nodes=return_nodes).to(config.device)
        self.pretrained_model.eval()

        self.input_size = input_size
        self.hidden_size = hidden_size

        self.rnn = nn.LSTM(
                input_size = self.input_size, #  multiply by 2 if bidirectional
                hidden_size = self.hidden_size,
                num_layers = 1,
                dropout = 0,
                bidirectional = False,
                batch_first = True)

    def forward(self, input, hidden):
        features = self.pretrained_model(input.squeeze())['layer12'].to(device=config.device)
        feat_shape = features.shape

        feat_flat =  torch.reshape(features,(1,feat_shape[0],feat_shape[1]*feat_shape[2]*feat_shape[3])).to(device=config.device)

        output, hidden = self.rnn(feat_flat, hidden)
        return output, hidden

    def initHidden(self):
        return (torch.zeros(1, config.BATCH_SIZE, self.hidden_size, device=config.device),torch.zeros(1, config.BATCH_SIZE, self.hidden_size, device=config.device))

In [25]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=config.max_words_in_sentence):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        if config.debug:
          print('Attn.init() hidden_size',hidden_size)
          print('Attn.init() output_size',output_size)
          print('Attn.init() max_length',max_length)

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.rnn = nn.LSTM(
                input_size = self.hidden_size, #hid_size * 2 if bidirectional else hid_size,
                hidden_size = self.hidden_size,
                num_layers = 1,
                dropout = 0,
                bidirectional = False,
                batch_first = True)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(input.shape[0],input.shape[1],self.hidden_size)
        embedded = self.dropout(embedded)
        if config.debug:
          print('Attn.forward() input',input.shape)
          print('Attn.forward() hidden',type(hidden),len(hidden),hidden[0].shape)
          print('Attn.forward() encoder_outputs',encoder_outputs.shape)
          print('embedded: ',embedded.shape)
          print('embedded: ',embedded.shape)

        attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1).to(device=config.device)

        attn_applied = torch.bmm(attn_weights.unsqueeze(0),encoder_outputs).to(device=config.device)

        output = torch.cat((embedded[0], attn_applied[0]), 1).to(device=config.device)
        output = self.attn_combine(output).unsqueeze(0).to(device=config.device)

        output = F.relu(output)
        output, hidden = self.rnn(output, (hidden[0].unsqueeze(0),hidden[0].unsqueeze(0)))

        output = F.log_softmax(self.out(output[0]), dim=1).to(device=config.device)
        return output, hidden, attn_weights

    def initHidden(self):
        return (torch.zeros(1, 1, self.hidden_size, device=config.device),torch.zeros(1, 1, self.hidden_size, device=config.device))

In [26]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=config.max_words_in_sentence):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    if config.debug:
      print('Input len',input_tensor.shape,'Target len',target_tensor.shape)

    loss = 0

    encoder_hidden = encoder.initHidden()
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)

    decoder_hidden = encoder_hidden

    decoder_input  = target_tensor[:,:(max_length-2),:]   # words from 1 to n-1
    decoder_target = target_tensor[:,1:(max_length-1),:]  # words from 2 to n (the target to the input word is the next word)
    tar_1hot = torch.nn.functional.one_hot(decoder_target, num_classes = len(encodings))

    if config.debug:
      print('Encoder hidden_0',len(encoder_hidden),'shape',encoder_hidden[0].shape)
      print('enc_out',encoder_output.shape)
      print('dec_in',decoder_input.shape)
      print('dec_target',decoder_target.shape)

    target_length = decoder_target.size(1)

    for di in range(target_length):
        if config.debug:
          print('dec hidden', decoder_hidden[0].shape,decoder_hidden[1].shape)

        decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input[:,di,:], decoder_hidden[0], encoder_output)

        if config.debug:
          print('decoder_output',decoder_output.shape)

        loss += criterion(decoder_output.squeeze(0), tar_1hot[0,di,:].squeeze(0).double())

        if (decoder_target[:,di,:] == torch.tensor(encodings['EOS'], device=config.device)):
          break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()
  
    return loss.item() / (config.BATCH_SIZE*target_length)

In [27]:
from torch import optim
import torch.nn.functional as F
import gc

def trainIters(encoder, decoder, print_every=1000, plot_every=100, learning_rate=0.01):
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    criterion = nn.CrossEntropyLoss()
    video_transform = get_video_transforms()

    trainloader = get_dataloader(df,'train',config.BATCH_SIZE, video_transform=video_transform)

    max_epochs = 1

    iter = 1
    for epoch in range(max_epochs):
      print('Starting epoch', epoch)
      for inputs, labels in trainloader:
          input_tensor = inputs.to(config.device)
          target_tensor = labels.to(config.device)

          loss = train(input_tensor, target_tensor, encoder,
                      decoder, encoder_optimizer, decoder_optimizer, criterion)
          print_loss_total += loss
          plot_loss_total += loss

          if iter % print_every == 0:
              print_loss_avg = print_loss_total / print_every
              print_loss_total = 0
              print('%.4f' % (print_loss_avg))
              torch.save(encoder,config.drive_folder+'/jamal/encoder.model')
              torch.save(decoder,config.drive_folder+'/jamal/decoder.model')
              gc.collect()
              torch.cuda.empty_cache()

          if iter % plot_every == 0:
              plot_loss_avg = plot_loss_total / plot_every
              plot_losses.append(plot_loss_avg)
              plot_loss_total = 0

          iter += 1
                 
    #showPlot(plot_losses)

In [30]:
input_size = 86528
hidden_size = 256
encoder = EncoderRNN(input_size, hidden_size).to(config.device)
attn_decoder = AttnDecoderRNN(hidden_size, len(encodings), dropout_p=0.1).to(config.device)

config.BATCH_SZIE=256
trainIters(encoder, attn_decoder, print_every=50)



Starting epoch 0
Got item at index: 629
Got item at index: 1474
Got item at index: 564
Got item at index: 102
Got item at index: 1202
Got item at index: 761
Got item at index: 174
Got item at index: 148


KeyboardInterrupt: 

In [None]:
def evaluate(encoder, decoder, frames, max_length = config.max_words_in_sentence):
    with torch.no_grad():
        encoder_hidden = encoder.initHidden()

        encoder_output, encoder_hidden = encoder(frames, encoder_hidden)

        decoder_input = torch.tensor([[encodings['SOS']]], device=config.device)  # Start of sentence

        decoder_hidden = encoder_hidden

        decoded_words = ''
        decoder_attentions = torch.zeros(max_length, max_length, device=config.device)

        for di in range(max_length):
            #print('Input:',decoder_input)
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden[0], encoder_output)

            decoder_attentions[di] = decoder_attention.data

            topv, topi = decoder_output.data.topk(1)

            if topi.item() == encodings['EOS']:
                decoded_words += '.'
                break
            else:
                decoded_words += word_idx[topi.item()] + ' '

            decoder_input = topi.detach()

        return decoded_words, decoder_attentions[:di + 1]

In [None]:
#config.debug = True
config.env = 'Prod'

dl = get_dataloader(df,'test',1)

encodings = torch.load(config.drive_folder+'/jamal/encodings.dict')
word_idx = torch.load(config.drive_folder+'/jamal/word_idx.dict')

encoder = torch.load(config.drive_folder+'/jamal/encoder.model',map_location=torch.device(config.device))
decoder = torch.load(config.drive_folder+'/jamal/decoder.model',map_location=torch.device(config.device))

def print_words(tensor_encoding):
  sentence = ''
  idx = 1
  while tensor_encoding[0,idx] != encodings['EOS']:
    sentence += word_idx[tensor_encoding[0,idx,0].item()] + ' '
    idx += 1

t = 1
for a,b in iter(dl):
  # print('Testing for: ',b)
  output_words, attentions = evaluate(encoder, decoder, a)
  print(output_words)
  print(torch.max(a[0,0,:,:]))
  visualize_frames(a)
  if t>0:
    break
  t += 1

#config.debug = False
