In [None]:
import os
import random
import numpy as np 
import pandas as pd 

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import (CosineAnnealingLR,
                                      CosineAnnealingWarmRestarts,
                                      StepLR,
                                      ExponentialLR)
import sklearn.utils
from sklearn.model_selection import train_test_split

In [None]:
!pip install mediapipe
# https://github.com/jbohnslav/opencv_transforms
!pip install opencv_transforms

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): the relative 'drive/MyDrive/...' paths used in this notebook
# presumably resolve through this mount because the working directory is
# /content - confirm when running outside Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls 'drive/MyDrive/SLR_test'

decoder.model  encoder.model  sentences.csv  video


In [None]:
class Config:
    """Notebook-wide settings, exposed as plain class attributes."""

    # --- general ---------------------------------------------------------
    debug = False   # when True, the model/training code prints tensor shapes
    csv_path = ''
    seed = 33
    # first CUDA device when one is available, CPU otherwise
    device = 'cpu' if not torch.cuda.is_available() else 'cuda:0'

    # --- fixed sequence lengths --------------------------------------------
    max_frames = 25              # frames kept per video (shorter videos are zero padded)
    max_words_in_sentence = 25   # length of an encoded, zero padded sentence

    # --- data locations ----------------------------------------------------
    drive_folder = 'drive/MyDrive/SLR_test'
    video_folder = f'{drive_folder}/video'
    train_csv_path = f'{drive_folder}/sentences.csv'

    BATCH_SIZE = 1

def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the CPU generator, plus every CUDA device when CUDA is available).
    Without the PyTorch seeds, weight initialisation and dropout differ on
    every run even though ``random`` and NumPy are seeded.

    Parameters:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Shared settings object; the cells below read their options from `config`.
config = Config()
# Seed the random number generators once, before any data or model is created.
seed_everything(config.seed)

In [None]:
# read csv file
sentences = pd.read_csv(config.train_csv_path)

# unique words: the two sentence markers plus every lower-cased,
# whitespace-separated word found in the third CSV column
word_set = set(['SOS','EOS'])
for words in sentences.iloc[:,2].str.lower().str.split():
    word_set.update(words)
sorted_word_set = sorted(word_set)
print('Unique words',sorted_word_set)

# create word encoding (word -> id) and its inverse (id -> word)
# NOTE(review): get_sentence_encoded pads with 0, which is also the id of
# whichever word sorts first here, so padding and that word are
# indistinguishable to the model - consider a dedicated padding token.
encodings = { word:idx for idx,word in enumerate(sorted_word_set)}
word_idx  = { idx:word for word,idx in encodings.items()}
print('Word encodings',encodings)
print('Words by index',word_idx)

# Persist both lookup tables. The target folder is created first because
# torch.save does not create missing directories.
dict_folder = config.drive_folder+'/jamal'
os.makedirs(dict_folder, exist_ok=True)
torch.save(encodings,dict_folder+'/encodings.dict')
# fixed: this file previously received `encodings` instead of `word_idx`
torch.save(word_idx,dict_folder+'/word_idx.dict')

# Turns a sentence into a list of word ids: SOS + words + EOS, zero padded.
def get_sentence_encoded(sentence):
    """Encode ``sentence`` as word ids wrapped in SOS/EOS and right-padded with 0.

    Padding brings the list up to ``config.max_words_in_sentence`` entries.
    An encoding that is already that long or longer is returned as is
    (no padding and no truncation). A word missing from ``encodings``
    raises ``KeyError``.
    """
    tokens = ('SOS '+sentence+' EOS').split()
    ids = []
    for token in tokens:
        ids.append(encodings[token])
    missing = config.max_words_in_sentence - len(ids)
    if missing > 0:
        ids.extend([0] * missing)
    return ids

# print(get_sentence_encoded('mən hansı sənəd vermək'))
# print(get_sentence_encoded('mən bakı yaşamaq'))

# generate (video file name, encoding list): one row per video file found in
# the folder named after the sentence id (one sentence -> many videos).
# Rows are collected in a plain list and converted to a DataFrame once at the
# end; growing a DataFrame with pd.concat inside the loop copies it every time.
records = []

for index, row in sentences.iterrows():
    # positional access: first column is the sentence id, third is the text
    sentence_id = row.iloc[0]
    phrase = row.iloc[2].lower()
    encoded = get_sentence_encoded(phrase)
    # iterate over video folders
    sentence_dir = config.video_folder+'/'+str(sentence_id)
    # sorted(): os.listdir returns entries in arbitrary order, which would make
    # the seeded train/validation split differ between runs
    for filename in sorted(os.listdir(sentence_dir)):
        f = os.path.join(sentence_dir, filename)
        # checking if it is a file
        if os.path.isfile(f):
            records.append({"id": sentence_id, "video_file": f, "encoding": encoded})

df = pd.DataFrame(records, columns=["id", "video_file","encoding"])

if config.debug:
    print(df)

Unique words ['1', '2', 'EOS', 'SOS', 'ana', 'ata', 'azərbaycan', 'bakı', 'bu', 'dünən', 'getmək', 'hansı', 'iş', 'mən', 'necə', 'olmaq', 'ora', 'orda', 'oğul', 'paytaxt', 'qız', 'subay', 'sənəd', 'var', 'vermək', 'yaşamaq', 'yox']
Word encodings {'1': 0, '2': 1, 'EOS': 2, 'SOS': 3, 'ana': 4, 'ata': 5, 'azərbaycan': 6, 'bakı': 7, 'bu': 8, 'dünən': 9, 'getmək': 10, 'hansı': 11, 'iş': 12, 'mən': 13, 'necə': 14, 'olmaq': 15, 'ora': 16, 'orda': 17, 'oğul': 18, 'paytaxt': 19, 'qız': 20, 'subay': 21, 'sənəd': 22, 'var': 23, 'vermək': 24, 'yaşamaq': 25, 'yox': 26}
Words by index {0: '1', 1: '2', 2: 'EOS', 3: 'SOS', 4: 'ana', 5: 'ata', 6: 'azərbaycan', 7: 'bakı', 8: 'bu', 9: 'dünən', 10: 'getmək', 11: 'hansı', 12: 'iş', 13: 'mən', 14: 'necə', 15: 'olmaq', 16: 'ora', 17: 'orda', 18: 'oğul', 19: 'paytaxt', 20: 'qız', 21: 'subay', 22: 'sənəd', 23: 'var', 24: 'vermək', 25: 'yaşamaq', 26: 'yox'}


In [None]:
import torchvision
from torchvision.models import efficientnet_b0,squeezenet1_1,resnet50
from torchvision.models.feature_extraction import get_graph_node_names
from torchvision.models.feature_extraction import create_feature_extractor

# Alternative backbones that were tried:
# model = resnet50(pretrained=True)
# model = efficientnet_b0(pretrained=True)
# Backbone CNN with pretrained weights; the next cell wraps it in a feature
# extractor that exposes the 'features.12.cat' node.
# NOTE(review): newer torchvision releases replace `pretrained=True` with a
# `weights=` argument - confirm against the installed version.
model = squeezenet1_1(pretrained=True)
#model.eval().to(config.device);
#train_nodes, _ = get_graph_node_names(model)
#print(train_nodes)



In [None]:
from torchvision.models.feature_extraction import create_feature_extractor

# Dummy input used only to probe the output shape of the feature extractor.
x = torch.rand(1, 3, 224, 224).to(config.device)

# Graph node to expose -> key under which it appears in the result dict.
return_nodes = {
    'features.12.cat': 'layer12'
}
model_new = create_feature_extractor(model, return_nodes=return_nodes).to(config.device)

# The backbone is a fixed feature extractor: it is never handed to an
# optimizer. Put it in eval mode and freeze its parameters so the features
# it produces do not carry an autograd graph through the whole CNN.
model_new.eval()
for backbone_param in model_new.parameters():
    backbone_param.requires_grad_(False)

with torch.no_grad():
    result = model_new(x)
# NOTE(review): PyTorch feature maps are laid out (N, C, H, W), so the two
# spatial names below are swapped; harmless here because both are equal.
n,out_filters,out_width,out_height = result['layer12'].shape
print(n,out_filters,out_width,out_height)

1 512 13 13


In [None]:
import cv2
import mediapipe as mp
from opencv_transforms import transforms

# keeps only informative frames
def keep_frames_with_hands(video_data):
    """Sample frames from an opened video and keep those in which a hand is detected.

    Roughly ``config.max_frames`` frames, evenly spread over the video, are
    passed to MediaPipe Hands. Each frame with at least one detected hand is
    resized to 224x224 and converted to a tensor.

    Parameters:
        video_data: a ``cv2.VideoCapture``-like object; ``get``, ``isOpened``
            and ``read`` are used. It is not released here - that is left to
            the caller.

    Returns:
        Tensor of shape (k, 3, 224, 224) on ``config.device`` with
        0 <= k <= config.max_frames, in the order the frames appear.
    """
    frame_count = int(video_data.get(cv2.CAP_PROP_FRAME_COUNT))
    # Sampling stride in frames: analyse one frame every `skip_frames` frames
    # (every frame when the video has no more than max_frames frames).
    skip_frames = max(1, frame_count / config.max_frames)

    transform = transforms.Compose([
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
    ])

    kept = []            # per-frame tensors, concatenated once at the end
    frame_idx = 0        # index of the frame about to be read
    next_sample = 0.0    # (fractional) index of the next frame to analyse

    mpHands = mp.solutions.hands
    # Context manager so the MediaPipe graph is closed when we are done.
    with mpHands.Hands(static_image_mode=False, max_num_hands=2,
                       min_detection_confidence=0.5,
                       min_tracking_confidence=0.5) as hands:
        # The length check is a hard cap: the reported frame count can be
        # wrong, and the caller pads to exactly max_frames frames.
        while video_data.isOpened() and len(kept) < config.max_frames:
            ret, frame = video_data.read()
            if not ret:
                break

            current_idx = frame_idx
            frame_idx += 1
            if current_idx < next_sample:
                continue
            next_sample += skip_frames

            # OpenCV decodes frames as BGR, MediaPipe expects RGB input.
            hand_results = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            if hand_results.multi_hand_landmarks:
                # NOTE(review): the CNN still receives the BGR frame, as
                # before; an ImageNet-pretrained backbone normally expects
                # normalised RGB - changing that requires retraining.
                kept.append(transform(frame))

    if not kept:
        return torch.zeros((0, 3, 224, 224)).to(config.device)
    return torch.stack(kept, dim=0).to(config.device)

In [None]:
from torchvision.models.feature_extraction import create_feature_extractor

class SLDataset(Dataset):
    """Dataset yielding (per-frame CNN features, encoded sentence) for one video.

    Each item is computed on demand: the video is opened, frames with hands
    are kept, the feature extractor ``model_new`` is run on them and the
    result is zero padded to ``config.max_frames`` frames.
    """

    def __init__(self, df):
        """Store a shuffled copy of ``df`` (columns: id, video_file, encoding)."""
        # shuffle and save
        self.df = sklearn.utils.shuffle(df)

    def __getitem__(self, idx):
        """Return ``(features, encoding)`` for the ``idx``-th row of this dataset.

        ``features`` has shape (max_frames, C*H*W) and ``encoding`` has shape
        (number of encoded words, 1); both live on ``config.device``.
        """
        max_frames  = config.max_frames # to have the same amount of frames for everyone

        # Read from this dataset's own frame. Using the global `df` here would
        # ignore both the train/validation split and the shuffle.
        video = self.df.iloc[idx,1]
        encoding = torch.tensor(self.df.iloc[idx,2]).to(config.device)
        enc_shape = encoding.shape[0]

        #--- keep frames with hands
        sample = cv2.VideoCapture(video)
        try:
            if (sample.isOpened() == False):
                # best effort: an unreadable video yields all-zero features
                print("Error opening video stream or file", video)
            hands_only = keep_frames_with_hands(sample)
        finally:
            # always free the decoder handle, one is opened per item
            sample.release()

        #--- get last convolutional layer for each
        # The backbone is not trained, so no autograd graph is needed.
        with torch.no_grad():
            video_features = model_new(hands_only.to(config.device))['layer12']
        # never keep more than max_frames frames, the padding below assumes it
        video_features = video_features[:max_frames]
        if config.debug:
            print(idx,video,encoding)
        n,l,w,h = video_features.shape
        compliment_arr = torch.zeros(max_frames-n,l,w,h).to(config.device)
        video_features = torch.cat((video_features,compliment_arr),0)

        return torch.reshape(video_features,(max_frames,l*w*h)), torch.reshape(encoding,(enc_shape,1))

    def __len__(self):
        """Number of videos in this dataset."""
        return len(self.df)

def get_dataloader(df, phase: str, batch_size: int = 96) -> DataLoader:
    '''
    Build a DataLoader over the training or validation part of ``df``.

    ``df`` is split 90/10, stratified on its ``id`` column and seeded with
    ``config.seed``; both parts get a fresh 0..n-1 index.
    Parameters:
        df: frame with at least an ``id`` column, in the layout SLDataset expects.
        phase: 'train' selects the training part, any other value the validation part.
        batch_size: data per iteration.
    Returns:
        data generator
    '''
    splits = train_test_split(df, test_size=0.1, random_state=config.seed, stratify=df['id'])
    train_part, val_part = (part.reset_index(drop=True) for part in splits)
    chosen = train_part if phase == 'train' else val_part
    return DataLoader(dataset=SLDataset(chosen), batch_size=batch_size, num_workers=0)

# Smoke test: pull one batch of two samples from the training loader and
# record the shape of a single sample.
dl = get_dataloader(df,'train',2)
dl_next = next(iter(dl))
# a: batch of per-frame features, b: batch of encoded sentences
a,b = dl_next
print(a.shape,b.shape)
# shape of one sample, i.e. without the leading batch dimension
INPUT_SHAPE = (a.shape[1:])
print(INPUT_SHAPE)

0 drive/MyDrive/SLR_test/video/2/2022-04-19 15-17-13.mp4 tensor([ 3, 13,  7, 25,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0], device='cuda:0')
1 drive/MyDrive/SLR_test/video/2/2022-04-19 15-45-12.mp4 tensor([ 3, 13,  7, 25,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0], device='cuda:0')
torch.Size([2, 25, 86528]) torch.Size([2, 25, 1])
torch.Size([25, 86528])


In [None]:
class EncoderRNN(nn.Module):
    """Single-layer, unidirectional, batch-first LSTM over the frame features."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        lstm_options = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            dropout=0,
            bidirectional=False,
            batch_first=True,
        )
        self.rnn = nn.LSTM(**lstm_options)

    def forward(self, input, hidden):
        """Run the LSTM and return ``(output, hidden)`` exactly as it yields them."""
        return self.rnn(input, hidden)

    def initHidden(self):
        """Return a zero ``(h0, c0)`` pair, each (1, config.BATCH_SIZE, hidden_size)."""
        state_shape = (1, config.BATCH_SIZE, self.hidden_size)
        h0 = torch.zeros(*state_shape, device=config.device)
        c0 = torch.zeros(*state_shape, device=config.device)
        return (h0, c0)

In [None]:
class AttnDecoderRNN(nn.Module):
    """LSTM decoder that attends over the encoder outputs, one word per call.

    NOTE(review): forward() indexes position 0 of several tensors, so it
    appears to support a batch of exactly one sample - confirm before
    raising the batch size.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=config.max_words_in_sentence):
        """Build the layers.

        Parameters:
            hidden_size: width of the embedding and of the LSTM state.
            output_size: vocabulary size (embedding rows and output logits).
            dropout_p: dropout probability applied to the embedded input.
            max_length: number of attention weights produced per step. The
                default is read from ``config`` once, when the class is defined.
        """
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        if config.debug:
          print('Attn.init() hidden_size',hidden_size)
          print('Attn.init() output_size',output_size)
          print('Attn.init() max_length',max_length)

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # NOTE(review): the attention weights are multiplied with the encoder
        # outputs in forward(), so max_length must equal the encoder sequence
        # length; that only holds while config.max_frames equals
        # config.max_words_in_sentence - confirm.
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.rnn = nn.LSTM(
                input_size = self.hidden_size, #hid_size * 2 if bidirectional else hid_size,
                hidden_size = self.hidden_size,
                num_layers = 1,
                dropout = 0,
                bidirectional = False,
                batch_first = True)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Parameters:
            input: word ids for this step.
                # assumes shape (1, 1), as sliced by train()/evaluate() - TODO confirm
            hidden: the h-state tensor only (callers pass ``decoder_hidden[0]``).
                # assumes shape (1, 1, hidden_size) - TODO confirm
            encoder_outputs: outputs of the encoder for every frame.

        Returns:
            (log-probabilities over the vocabulary, LSTM ``(h, c)`` state,
            attention weights).
        """
        embedded = self.embedding(input).view(input.shape[0],input.shape[1],self.hidden_size)
        embedded = self.dropout(embedded)
        if config.debug:
          print('Attn.forward() input',input.shape)
          print('Attn.forward() hidden',type(hidden),len(hidden),hidden[0].shape)
          print('Attn.forward() encoder_outputs',encoder_outputs.shape)
          print('embedded: ',embedded.shape)
          print('embedded: ',embedded.shape)

        # attention scores from the current word embedding and the previous h-state
        attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1).to(device=config.device)

        # weighted sum of the encoder outputs
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),encoder_outputs).to(device=config.device)

        # merge embedding and attention context back to hidden_size
        output = torch.cat((embedded[0], attn_applied[0]), 1).to(device=config.device)
        output = self.attn_combine(output).unsqueeze(0).to(device=config.device)

        output = F.relu(output)
        # NOTE(review): the same tensor is passed as both h and c, so the LSTM
        # cell state from the previous step is not carried over - confirm this
        # is intended.
        output, hidden = self.rnn(output, (hidden[0].unsqueeze(0),hidden[0].unsqueeze(0)))

        output = F.log_softmax(self.out(output[0]), dim=1).to(device=config.device)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return a zero ``(h0, c0)`` pair, each of shape (1, 1, hidden_size)."""
        return (torch.zeros(1, 1, self.hidden_size, device=config.device),torch.zeros(1, 1, self.hidden_size, device=config.device))

In [None]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=config.max_words_in_sentence):
    """Run one teacher-forced optimisation step and return the mean loss per word.

    The frames are encoded, then the decoder is fed the reference sentence
    one word at a time and its prediction is compared with the next
    reference word. Decoding stops after the EOS target has been scored.

    Parameters:
        input_tensor: frame features for the encoder.
        target_tensor: encoded sentence, indexed as (batch, word, 1).
        encoder, decoder: the two models; the decoder is called with the
            h-state only (``decoder_hidden[0]``).
        encoder_optimizer, decoder_optimizer: optimizers stepped once each.
        criterion: loss taking ``(scores, class_indices)``.
        max_length: padded sentence length used to slice inputs and targets.

    Returns:
        Accumulated loss divided by ``config.BATCH_SIZE`` times the number of
        decoding steps actually taken (not the padded length, which would
        scale the reported loss by the amount of padding).
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    if config.debug:
        print('Input len',input_tensor.shape,'Target len',target_tensor.shape)

    loss = 0

    encoder_hidden = encoder.initHidden()
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)

    decoder_hidden = encoder_hidden

    decoder_input  = target_tensor[:,:(max_length-2),:]   # words from 1 to n-1
    decoder_target = target_tensor[:,1:(max_length-1),:]  # words from 2 to n (the target to the input word is the next word)

    if config.debug:
        print('Encoder hidden_0',len(encoder_hidden),'shape',encoder_hidden[0].shape)
        print('enc_out',encoder_output.shape)
        print('dec_in',decoder_input.shape)
        print('dec_target',decoder_target.shape)

    target_length = decoder_target.size(1)
    eos_id = encodings['EOS']
    steps = 0  # decoding steps that contributed to the loss

    for di in range(target_length):
        if config.debug:
            print('dec hidden', decoder_hidden[0].shape,decoder_hidden[1].shape)

        decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input[:,di,:], decoder_hidden[0], encoder_output)

        if config.debug:
            print('decoder_output',decoder_output.shape)

        # Class-index targets: same loss value as a one-hot probability
        # target, without building a one-hot tensor or mixing float/double.
        loss += criterion(decoder_output, decoder_target[:,di,0])
        steps += 1

        if bool((decoder_target[:,di,0] == eos_id).all()):
            break

    if steps == 0:
        # nothing to decode (empty target): no gradient, no update
        return 0.0

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / (config.BATCH_SIZE*steps)

In [None]:
from torch import optim
import torch.nn.functional as F
import gc

def trainIters(encoder, decoder, print_every=1000, plot_every=100, learning_rate=0.01, max_epochs=10):
    """Train encoder and decoder on the training split, checkpointing every epoch.

    Parameters:
        encoder, decoder: models to train; each gets its own Adam optimizer.
        print_every: print the running mean loss every this many iterations.
        plot_every: record the running mean loss every this many iterations.
        learning_rate: learning rate for both optimizers.
        max_epochs: number of passes over the training data.

    Side effects:
        Writes ``encoder.model`` and ``decoder.model`` (whole pickled modules)
        to ``<config.drive_folder>/jamal`` after every epoch.
    """
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    criterion = nn.CrossEntropyLoss()

    trainloader = get_dataloader(df,'train',config.BATCH_SIZE)

    # torch.save does not create missing folders
    model_folder = config.drive_folder+'/jamal'
    os.makedirs(model_folder, exist_ok=True)

    # make sure dropout is active, e.g. after an earlier evaluation run
    encoder.train()
    decoder.train()

    step = 1  # global iteration counter (was `iter`, which shadowed the builtin)
    for epoch in range(max_epochs):
        print('Starting epoch', epoch)
        for inputs, labels in trainloader:
            input_tensor = inputs.to(config.device)
            target_tensor = labels.to(config.device)

            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss

            if step % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%.4f' % (print_loss_avg))

            if step % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

            step += 1

        torch.save(encoder,model_folder+'/encoder.model')
        torch.save(decoder,model_folder+'/decoder.model')

    #showPlot(plot_losses)

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot ``points`` as a line with a y-axis tick every 0.2.

    Parameters:
        points: sequence of values to plot against their index.
    """
    # plt.subplots() already creates the figure; an extra plt.figure() before
    # it would leave an empty figure open on every call.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
# Length of one flattened frame feature: 512 x 13 x 13 = 86528
# (the shape printed by the feature extractor cell).
input_size = 86528
hidden_size = 128
encoder = EncoderRNN(input_size, hidden_size).to(config.device)
attn_decoder = AttnDecoderRNN(hidden_size, len(encodings), dropout_p=0.1).to(config.device)

# This line used to read `config.BATCH_SZIE=128`: a typo that created an unused
# attribute and left the batch size at 1. Batch size 1 is also the only value
# the pipeline supports, because train() and AttnDecoderRNN.forward() process
# a single sample per step, so it is now set explicitly.
config.BATCH_SIZE = 1
trainIters(encoder, attn_decoder, print_every=50)

Starting epoch 0
0 drive/MyDrive/SLR_test/video/2/2022-04-19 15-17-13.mp4 tensor([ 3, 13,  7, 25,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0], device='cuda:0')
1 drive/MyDrive/SLR_test/video/2/2022-04-19 15-45-12.mp4 tensor([ 3, 13,  7, 25,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0], device='cuda:0')
2 drive/MyDrive/SLR_test/video/2/2022-04-21 17-25-09.mp4 tensor([ 3, 13,  7, 25,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0], device='cuda:0')
3 drive/MyDrive/SLR_test/video/2/2022-04-22 12-11-21.mp4 tensor([ 3, 13,  7, 25,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0], device='cuda:0')
4 drive/MyDrive/SLR_test/video/2/2022-04-23 11-56-14.mp4 tensor([ 3, 13,  7, 25,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0], device='cuda:0')
5 drive/MyDrive/SLR_test/vid

In [None]:
def evaluate(encoder, decoder, frames, max_length = config.max_words_in_sentence):
    """Greedy-decode the sentence for one video.

    Starting from SOS, the most likely word is fed back into the decoder
    until EOS is predicted or ``max_length`` words have been produced.

    Parameters:
        encoder, decoder: trained models. They are switched to eval mode for
            the call (so dropout is disabled) and restored afterwards.
        frames: frame features for a single video, as yielded by the loader.
        max_length: maximum number of decoding steps.

    Returns:
        (decoded text, attention weights of the steps taken). In the text
        every word is followed by a space and a predicted EOS appears as '.'.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            encoder_hidden = encoder.initHidden()

            encoder_output, encoder_hidden = encoder(frames, encoder_hidden)

            decoder_input = torch.tensor([[encodings['SOS']]], device=config.device)  # Start of sentence

            decoder_hidden = encoder_hidden

            decoded_words = ""
            decoder_attentions = torch.zeros(max_length, max_length, device=config.device)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden[0], encoder_output)

                decoder_attentions[di] = decoder_attention.data

                topv, topi = decoder_output.data.topk(1)

                if topi.item() == encodings['EOS']:
                    decoded_words += '.'
                    break

                # fixed: this used to call an undefined `append(...)`
                decoded_words += word_idx[topi.item()] + ' '

                decoder_input = topi.detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
#config.debug = True

# Any phase other than 'train' makes get_dataloader return the validation part.
dl = get_dataloader(df,'test',1)
# Reload the whole-module checkpoints written by trainIters.
# NOTE(review): torch.load unpickles arbitrary objects - only load files you
# trust. Newer PyTorch releases may need weights_only=False to load a whole
# pickled module - confirm against the installed version.
encoder = torch.load(config.drive_folder+'/jamal/encoder.model')
decoder = torch.load(config.drive_folder+'/jamal/decoder.model')

# a: frame features of one video, b: its reference encoding
for a,b in iter(dl):
  print('Testing for: ',b)
  output_words, attentions = evaluate(encoder, decoder, a)
  print(output_words)

#config.debug = False


0 drive/MyDrive/SLR_test/video/2/2022-04-19 15-17-13.mp4 tensor([ 3, 13,  7, 25,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0], device='cuda:0')
Testing for:  tensor([[[ 3],
         [13],
         [ 7],
         [25],
         [ 2],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0],
         [ 0]]], device='cuda:0')
Input: tensor([[3]], device='cuda:0')
Input: tensor([[13]], device='cuda:0')
Input: tensor([[16]], device='cuda:0')
Input: tensor([[14]], device='cuda:0')
Input: tensor([[10]], device='cuda:0')
['mən', 'ora', 'necə', 'getmək', '.']
1 drive/MyDrive/SLR_test/video/2/2022-04-19 15-45-12.mp4 tensor([ 3, 13,  7, 25,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0, 