# Load model

In [1]:
import importlib
import numpy as np
import torch
from collections import OrderedDict
import utils

In [2]:
import sys
sys.path.insert(1, 'transformer-slt/')
from onmt.translate.translator import build_translator
import onmt
import onmt.inputters as inputters
from onmt.translate.beam_search import BeamSearch

from argparse import Namespace

In [3]:
def import_class(name):
    """Resolve a dotted ``"module.attribute"`` path to the object it names.

    Everything before the last dot is imported as a module and everything
    after it is looked up on that module. The split path and the resolved
    object are both printed before the object is returned.
    """
    parts = name.rsplit('.', 1)
    print(parts)
    # The module is imported first, then the attribute is fetched from it.
    target = getattr(importlib.import_module(parts[0]), parts[1])
    print(target)
    return target

In [4]:
# Keyword arguments for the recognition network; num_classes (1296) equals the
# `len(gloss_dict) + 1` value printed below.
model_args = {"num_classes": 1296, "c2d_type": "resnet18", "conv_type": 2, "use_bn": 1}
# Passed to the model constructor as `loss_weights`.
loss_weights = {"ConvCTC": 1.0, "SeqCTC": 1.0, "Dist": 10.0}
# NOTE(review): absolute, machine-specific path. allow_pickle=True unpickles the
# file, so only load gloss dictionaries from a trusted source.
gloss_dict = np.load('/home/aayush/Thesis/VAC_CSLR/preprocess/phoenix2014/gloss_dict.npy', allow_pickle=True)\
                .item()
print(len(gloss_dict) + 1)

# Resolve the model class from its dotted name, then instantiate it.
model_class = import_class("slr_network.SLRModel")
model = model_class(**model_args,
                    gloss_dict=gloss_dict,
                    loss_weights=loss_weights)

1296
['slr_network', 'SLRModel']
<class 'slr_network.SLRModel'>


In [5]:
# Print the module tree of the freshly constructed model.
print(model)

SLRModel(
  (conv2d): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_run

In [6]:
def modified_weights(state_dict, modified=False):
    """Return the weights with every ``'.module'`` substring removed from the keys.

    When ``modified`` is falsy the renamed ``OrderedDict`` is returned. When it
    is truthy an empty ``dict`` is returned instead (no further remapping is
    implemented for that case).
    """
    renamed = OrderedDict()
    for key, value in state_dict.items():
        renamed[key.replace('.module', '')] = value
    return dict() if modified else renamed

In [7]:
# Load the checkpoint onto the CPU; the weights live under 'model_state_dict'.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
state_dict = torch.load('model_new_V2.1_32.90.pt', map_location=torch.device('cpu'))
# modified=False: only the '.module' substrings are stripped from the key names.
weights = modified_weights(state_dict['model_state_dict'], False)
model.load_state_dict(weights)

<All keys matched successfully>

In [8]:
from prettytable import PrettyTable

def count_parameters(model):
    """Print a per-parameter size table and return the trainable parameter count.

    Only parameters with ``requires_grad`` set are listed and counted. The
    table and the total are printed; the total is also returned.
    """
    table = PrettyTable(["Modules", "Parameters"])
    sizes = [
        (name, tensor.numel())
        for name, tensor in model.named_parameters()
        if tensor.requires_grad
    ]
    for name, size in sizes:
        table.add_row([name, size])
    total_params = sum(size for _, size in sizes)
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
    
count_parameters(model)

+-----------------------------------------+------------+
|                 Modules                 | Parameters |
+-----------------------------------------+------------+
|           conv2d.conv1.weight           |    9408    |
|            conv2d.bn1.weight            |     64     |
|             conv2d.bn1.bias             |     64     |
|       conv2d.layer1.0.conv1.weight      |   36864    |
|        conv2d.layer1.0.bn1.weight       |     64     |
|         conv2d.layer1.0.bn1.bias        |     64     |
|       conv2d.layer1.0.conv2.weight      |   36864    |
|        conv2d.layer1.0.bn2.weight       |     64     |
|         conv2d.layer1.0.bn2.bias        |     64     |
|       conv2d.layer1.1.conv1.weight      |   36864    |
|        conv2d.layer1.1.bn1.weight       |     64     |
|         conv2d.layer1.1.bn1.bias        |     64     |
|       conv2d.layer1.1.conv2.weight      |   36864    |
|        conv2d.layer1.1.bn2.weight       |     64     |
|         conv2d.layer1.1.bn2.b

34303072

In [9]:
# Put the model into evaluation mode before running predictions.
model.eval()

SLRModel(
  (conv2d): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_run

In [10]:
# sample try
# FIXME: `prediction_after_transformer` is only defined in a later cell, so on a
# fresh kernel (Restart & Run All) this cell raises NameError. Run it after that
# definition, or move this cell below it.
temp = prediction_after_transformer('WEST UND loc-NORDOST TROCKEN __OFF__ __ON__')
temp[0]

NameError: name 'prediction_after_transformer' is not defined

# Input format modification

In [10]:
import torch.utils.data as data
from utils import video_augmentation
import os
import glob
import cv2
import time
import re

In [11]:
def resize_img(img_path, dsize='256x256px'):
    """Read an image from disk and return it resized to ``dsize``.

    Args:
        img_path: Path of the image file, read with ``cv2.imread``.
        dsize: Target size written as text containing exactly two integers,
            e.g. ``'256x256px'`` or ``'852x480px'``. The two integers are
            passed, in order, as the ``dsize`` argument of ``cv2.resize``.

    Returns:
        The resized image as returned by ``cv2.resize`` (Lanczos interpolation).

    Raises:
        ValueError: if ``dsize`` does not contain exactly two integers.
        OSError: if ``cv2.imread`` could not read ``img_path``.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    target_size = tuple(int(res) for res in re.findall(r"\d+", dsize))
    if len(target_size) != 2:
        raise ValueError(f"dsize must contain exactly two integers, got {dsize!r}")
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        raise OSError(f"could not read image: {img_path}")
    return cv2.resize(img, target_size, interpolation=cv2.INTER_LANCZOS4)


def resize_img_folder(frames_folder_path):
    """Resize every image matched by a glob pattern, overwriting it in place.

    ``frames_folder_path`` is handed to ``glob.glob``; each matching file is
    resized with ``resize_img`` (default size) and written back to the same path.
    """
    for frame_file in glob.glob(frames_folder_path):
        cv2.imwrite(frame_file, resize_img(frame_file))

In [12]:
class SampleBaseFeeder(data.Dataset):
    """Single-video dataset that feeds the frames of one folder to the model.

    The dataset always has length 1: the one item is the whole frame sequence
    matched by ``prefix``, returned with an empty label.
    """

    def __init__(self, prefix, gloss_dict):
        """Store the frame glob pattern and build the test-time transform.

        Args:
            prefix: Glob pattern of the frame images,
                e.g. ``./01April_2010_Thursday_heute_default-5/1/*.png``.
            gloss_dict: Gloss dictionary; stored as ``self.dict`` (not read by
                any method of this class).
        """
        self.prefix = prefix
        self.dict = gloss_dict
        self.data_aug = self.transform()

    def __getitem__(self, idx):
        """Return ``(video, label, info)`` for the single video.

        ``label`` is an empty ``LongTensor`` and ``info`` an empty string.
        """
        input_data, label, _ = self.read_video(idx)
        input_data, label = self.normalize(input_data, label)

        return input_data, torch.LongTensor(label), ''  # self.inputs_list[idx]['original_info']

    def read_video(self, index, num_glosses=-1):
        """Load all frames matched by ``self.prefix`` in sorted path order.

        ``index`` and ``num_glosses`` are accepted but not used.

        Returns:
            ``(frames, label_list, info)``: the frames converted from BGR to
            RGB, an empty label list and an empty dict.
        """
        img_folder = os.path.join(self.prefix)
        img_list = sorted(glob.glob(img_folder))
        label_list = []

        frames = [cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB) for img_path in img_list]
        return frames, label_list, {}

    def normalize(self, video, label, file_id=None):
        """Apply the transform pipeline, then rescale the video with ``x / 127.5 - 1``."""
        video, label = self.data_aug(video, label, file_id)
        video = video.float() / 127.5 - 1
        return video, label

    def transform(self):
        """Build the test-time pipeline: centre crop to 224, then tensor conversion."""
        print("Apply testing transform.")
        return video_augmentation.Compose([
            #video_augmentation.Resize(0.5, 'bicubic'),
            video_augmentation.CenterCrop(224),
            video_augmentation.ToTensor(),
        ])

    @staticmethod
    def collate_fn(batch):
        """Pad a batch of ``(video, label, info)`` items to a common length.

        Items are sorted by video length, longest first.

        * Videos with more than 3 dimensions are padded with 6 copies of their
          first frame on the left and copies of their last frame on the right,
          so every video has ``ceil(max_len / 4) * 4 + 12`` frames. The
          reported length of each video is ``ceil(len / 4) * 4 + 12``.
        * Other inputs are right-padded with their last row to the longest
          length, stacked and permuted to ``(0, 2, 1)``.

        Returns:
            ``(padded_video, video_length, padded_label, label_length, info)``.
            Labels are concatenated into one ``LongTensor``; if every label is
            empty, ``padded_label`` and ``label_length`` are both ``[]``.
        """
        batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
        video, label, info = list(zip(*batch))
        if len(video[0].shape) > 3:
            max_len = len(video[0])
            # Built-in int: the `np.int` alias was removed in NumPy 1.24.
            video_length = torch.LongTensor([int(np.ceil(len(vid) / 4.0)) * 4 + 12 for vid in video])
            left_pad = 6
            right_pad = int(np.ceil(max_len / 4.0)) * 4 - max_len + 6
            max_len = max_len + left_pad + right_pad
            padded_video = [torch.cat(
                (
                    vid[0][None].expand(left_pad, -1, -1, -1),
                    vid,
                    vid[-1][None].expand(max_len - len(vid) - left_pad, -1, -1, -1),
                )
                , dim=0)
                for vid in video]
            padded_video = torch.stack(padded_video)
        else:
            max_len = len(video[0])
            video_length = torch.LongTensor([len(vid) for vid in video])
            padded_video = [torch.cat(
                (
                    vid,
                    vid[-1][None].expand(max_len - len(vid), -1),
                )
                , dim=0)
                for vid in video]
            padded_video = torch.stack(padded_video).permute(0, 2, 1)
        label_length = torch.LongTensor([len(lab) for lab in label])
        if max(label_length) == 0:
            return padded_video, video_length, [], [], info
        else:
            padded_label = []
            for lab in label:
                padded_label.extend(lab)
            padded_label = torch.LongTensor(padded_label)
            return padded_video, video_length, padded_label, label_length, info

    def __len__(self):
        """Always 1: prediction runs on a single folder/video."""
        return 1

In [13]:
# Number of entries in the gloss dictionary.
len(gloss_dict)

1295

In [14]:
def get_data_loader(frames_folder_path, gloss_dict):
    """Wrap the frames matched by ``frames_folder_path`` in a one-item DataLoader.

    The loader yields batches of size 1, unshuffled, collated with
    ``SampleBaseFeeder.collate_fn`` and loaded by 4 worker processes.
    """
    feeder = SampleBaseFeeder(frames_folder_path, gloss_dict)
    loader_options = dict(
        batch_size=1,
        shuffle=False,
        drop_last=False,
        num_workers=4,
        collate_fn=feeder.collate_fn,
    )
    return torch.utils.data.DataLoader(feeder, **loader_options)

In [15]:
def prediction_after_transformer(glosses):
    """Translate a gloss string with the OpenNMT model in ``transformer-slt/current_model.pt``.

    Each prediction list is printed as it is produced.

    Args:
        glosses: Source text; it is handed to the translator as the single
            source example.

    Returns:
        list[str]: the ``n_best`` (configured as 1) predictions of the last
        translation, each joined into one space-separated string. An empty
        list is returned when no translation was produced.

    NOTE(review): the options are rebuilt and the checkpoint is loaded on every
    call (both ``build_translator`` and ``load_test_model`` receive ``opt``) —
    consider caching if this is called repeatedly.
    """
    opt = Namespace(align_debug=False, alpha=0.0, attn_debug=False, avg_raw_probs=False,
                    batch_size=30, batch_type='sents', beam_size=4, beta=-0.0,
                    block_ngram_repeat=0, config=None, coverage_penalty='none', data_type='text',
                    dump_beam='', dynamic_dict=False, fp32=False, gpu=-1, ignore_when_blocking=[],
                    image_channel_size=3, length_penalty='none', log_file='', log_file_level='0',
                    max_length=100, max_sent_length=None, min_length=0, models=['transformer-slt/current_model.pt'],
                    n_best=1, output=None, phrase_table='', random_sampling_temp=1.0,
                    random_sampling_topk=1, ratio=-0.0, replace_unk=True, report_align=False,
                    report_time=False, sample_rate=16000, save_config=None, seed=829,
                    shard_size=10000, share_vocab=False, src=glosses,
                    src_dir='', stepwise_penalty=False, tgt=None, verbose=False, window='hamming',
                    window_size=0.02, window_stride=0.01)

    translator = build_translator(opt, report_score=True, out_file='')

    # Only the vocabulary fields are needed from this second load.
    fields, _model, _model_opt = onmt.model_builder.load_test_model(opt)
    scorer = onmt.translate.GNMTGlobalScorer.from_opt(opt)

    src_reader = inputters.str2reader[opt.data_type].from_opt(opt)
    tgt_reader = inputters.str2reader['text'].from_opt(opt)

    src_data = {"reader": src_reader, "data": [opt.src], "dir": ''}
    tgt_data = {"reader": tgt_reader, "data": None, "dir": None}

    _readers, _data, _dir = inputters.Dataset.config([('src', src_data), ('tgt', tgt_data)])
    # Named `dataset` so the notebook-level `data` alias (torch.utils.data) is not shadowed.
    dataset = inputters.Dataset(
        fields, readers=_readers, data=_data, dirs=_dir,
        sort_key=inputters.str2sortkey[opt.data_type]
    )

    data_iter = inputters.OrderedIterator(
        dataset=dataset,
        device=torch.device("cpu"),
        batch_size=opt.batch_size,
        batch_size_fn=None,
        train=False,
        sort=False,
        sort_within_batch=True,
        shuffle=False
    )

    xlation_builder = onmt.translate.TranslationBuilder(
        dataset, fields, opt.n_best, opt.replace_unk, opt.tgt,
        opt.phrase_table
    )

    # Special-token indices of the target vocabulary, needed by the beam search.
    tgt_field = dict(fields)["tgt"].base_field
    _tgt_vocab = tgt_field.vocab
    _tgt_eos_idx = _tgt_vocab.stoi[tgt_field.eos_token]
    _tgt_pad_idx = _tgt_vocab.stoi[tgt_field.pad_token]
    _tgt_bos_idx = _tgt_vocab.stoi[tgt_field.init_token]
    _exclusion_idxs = {_tgt_vocab.stoi[t] for t in opt.ignore_when_blocking}

    # Initialised up front so an empty iterator yields [] instead of UnboundLocalError.
    n_best_preds = []

    for batch in data_iter:

        with torch.no_grad():
            decode_strategy = BeamSearch(
                opt.beam_size,
                batch_size=batch.batch_size,
                pad=_tgt_pad_idx,
                bos=_tgt_bos_idx,
                eos=_tgt_eos_idx,
                n_best=opt.n_best,
                global_scorer=scorer,
                min_length=opt.min_length, max_length=opt.max_length,
                return_attention=opt.attn_debug or opt.replace_unk,
                block_ngram_repeat=opt.block_ngram_repeat,
                exclusion_tokens=_exclusion_idxs,
                stepwise_penalty=opt.stepwise_penalty,
                ratio=opt.ratio)

            batch_data = translator._translate_batch_with_strategy(batch, dataset.src_vocabs, decode_strategy)

        translations = xlation_builder.from_batch(batch_data)
        for trans in translations:
            n_best_preds = [" ".join(pred)
                            for pred in trans.pred_sents[:opt.n_best]]
            print(n_best_preds)
    return n_best_preds

In [16]:
def get_prediction_from_frames(frames_folder_path, gloss_dict):
    """Recognise glosses from a folder of frames and translate them to a sentence.

    Uses the notebook-level ``model`` for recognition and
    ``prediction_after_transformer`` for translation. The input shape, the
    recognised gloss string and the elapsed time are printed.

    Args:
        frames_folder_path: Glob pattern matching the frame images.
        gloss_dict: Gloss dictionary forwarded to the data loader.

    Returns:
        ``(sentence, ret_dict)``: ``sentence`` is the first translation of the
        recognised glosses (or ``''`` if nothing was recognised) and
        ``ret_dict`` is the raw output of the recognition model.
    """
    start = time.time()
    loader = get_data_loader(frames_folder_path, gloss_dict)
    # Built once; it does not depend on the batch.
    device = utils.GpuDataParallel()
    # The loader holds a single item, so the values of that one batch are used below.
    for l_data in loader:
        print(l_data[0].shape)
        vid = device.data_to_device(l_data[0])
        vid_lgt = device.data_to_device(l_data[1])
        label = device.data_to_device(l_data[2])
        label_lgt = device.data_to_device(l_data[3])

    with torch.no_grad():
        ret_dict = model(vid, vid_lgt, label=label, label_lgt=label_lgt)

    # Every gloss is followed by a space (including the last one). The second
    # tuple element is unused and deliberately not named `id` (a builtin).
    sentence = ''.join(word + " " for word, _gloss_id in ret_dict['recognized_sents'][0])

    print(sentence)
    if len(sentence) > 0:
        sentence = prediction_after_transformer(sentence)[0]

    end = time.time()
    print("Time taken to predict (total): ", round(end - start))
    return sentence, ret_dict

In [17]:
# Sample folder whose frames are used for the prediction below.
frames_folder_name = '07February_2011_Monday_heute-4659'
# Glob pattern matching every PNG frame in that folder.
frames_folder_path = './dataset/dataset/' + frames_folder_name + '/*.png'
#frames_folder_path = './frames_that_worked/recording_3_1/*.png'
# NOTE: resize_img_folder overwrites each matched PNG on disk with its resized version.
resize_img_folder(frames_folder_path)

In [18]:
# ns: translated sentence; ret_dict: raw output dict of the recognition model.
ns, ret_dict = get_prediction_from_frames(frames_folder_path, gloss_dict)

Apply testing transform.
torch.Size([1, 148, 3, 224, 224])
In slr_network.py forward:
torch.Size([1, 148, 3, 224, 224])
tensor([148])
<class 'torch.Tensor'>


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


SONNE poss-SEIN DANN KOMMEN REGEN MORGEN MONTAG AUCH MEHR WOLKE WIE HEUTE FREUNDLICH VIEL SONNE 
['wir haben noch ein paar sonnenstrahlen kommen dann morgen auch noch ein paar schauer und auch im süden freundliches wetter .']
Time taken to predict (total):  8


  torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs)


In [19]:
# Rebuild the recognised gloss string; every gloss is followed by a space.
# A generator is used so no loop variable leaks into the kernel namespace
# (the previous loop variable `id` overwrote the builtin `id`).
sentence = ''.join(word + " " for word, _gloss_id in ret_dict['recognized_sents'][0])
print(sentence)

SONNE poss-SEIN DANN KOMMEN REGEN MORGEN MONTAG AUCH MEHR WOLKE WIE HEUTE FREUNDLICH VIEL SONNE 


In [20]:
# Display the raw model output dictionary.
ret_dict

{'framewise_features': tensor([[[0.0199, 0.0199, 0.0199,  ..., 0.2079, 0.2079, 0.2079],
          [0.0550, 0.0550, 0.0550,  ..., 0.1790, 0.1790, 0.1790],
          [0.0045, 0.0045, 0.0045,  ..., 1.1819, 1.1819, 1.1819],
          ...,
          [0.8031, 0.8031, 0.8031,  ..., 0.6039, 0.6039, 0.6039],
          [1.2464, 1.2464, 1.2464,  ..., 0.2624, 0.2624, 0.2624],
          [1.7732, 1.7732, 1.7732,  ..., 0.7847, 0.7847, 0.7847]]]),
 'visual_features': tensor([[[0.0000, 0.0000, 0.7698,  ..., 0.0000, 1.6244, 1.1049]],
 
         [[0.0000, 0.0000, 0.6106,  ..., 0.0000, 3.4520, 1.5677]],
 
         [[1.5233, 0.0000, 0.0591,  ..., 0.0000, 2.5981, 0.5526]],
 
         ...,
 
         [[0.0000, 0.0000, 0.6513,  ..., 0.1158, 0.5487, 0.0000]],
 
         [[0.0000, 0.0000, 0.2852,  ..., 0.9933, 1.8173, 1.1906]],
 
         [[0.0000, 0.0000, 0.6689,  ..., 0.0000, 2.1551, 1.8359]]]),
 'feat_len': tensor([34]),
 'conv_logits': tensor([[[-12.8916, -30.9507, -27.1357,  ..., -26.5104, -25.9697, -27.18

In [20]:
#del model

In [21]:
# Show what the stored metadata of one training folder looks like.

inputs_list = np.load(f"./preprocess/phoenix14t/train_info.npy", allow_pickle=True).item()
for x in inputs_list:
    # The 'prefix' key is skipped; every other key indexes a per-sample record.
    if x != 'prefix' and inputs_list[x]['fileid'] == frames_folder_name:
        print(inputs_list[x])

        

{'fileid': '07February_2011_Monday_heute-4659', 'folder': 'train/07February_2011_Monday_heute-4659/*.png', 'signer': 'Signer07', 'label': 'SONNE VORBEI DANN KOMMEN MEHR REGEN MORGEN IX FLUSS AUCH MEHR WOLKE WIE HEUTE FREUNDLICH SONNE', 'num_frames': 134, 'original_info': '07February_2011_Monday_heute-4659|07February_2011_Monday_heute-4659/1/*.png|-1|-1|Signer07|SONNE VORBEI DANN KOMMEN MEHR REGEN MORGEN IX FLUSS AUCH MEHR WOLKE WIE HEUTE FREUNDLICH SONNE|nach sonne kommt regen so wird es auch morgen an der mosel sein eher das trübe wetter und nicht so einen herrlichen sonnenschein wie wir heute hatten'}


In [22]:
# List the training samples with fewer than 40 frames, preceded by their gloss count.
inputs_list = np.load(f"./preprocess/phoenix14t/train_info.npy", allow_pickle=True).item()
for x in inputs_list:
    # Optional extra filter kept from the original: and 'TSCHUESS' in inputs_list[x]['label']
    if x != 'prefix' and inputs_list[x]['num_frames'] < 40:
        print(len(inputs_list[x]['label'].split(' ')))
        print(inputs_list[x])

2
{'fileid': '14August_2009_Friday_tagesschau-66', 'folder': 'train/14August_2009_Friday_tagesschau-66/*.png', 'signer': 'Signer05', 'label': 'MAL NEBEL', 'num_frames': 37, 'original_info': '14August_2009_Friday_tagesschau-66|14August_2009_Friday_tagesschau-66/1/*.png|-1|-1|Signer05|MAL NEBEL|hier und da bilden sich einzelne nebelfelder'}
2
{'fileid': '01November_2010_Monday_tagesschau-146', 'folder': 'train/01November_2010_Monday_tagesschau-146/*.png', 'signer': 'Signer05', 'label': 'IX MILD', 'num_frames': 37, 'original_info': '01November_2010_Monday_tagesschau-146|01November_2010_Monday_tagesschau-146/1/*.png|-1|-1|Signer05|IX MILD|dabei bleibt es sehr mild'}
2
{'fileid': '08May_2010_Saturday_tagesschau-180', 'folder': 'train/08May_2010_Saturday_tagesschau-180/*.png', 'signer': 'Signer08', 'label': 'ORT NEBEL', 'num_frames': 29, 'original_info': '08May_2010_Saturday_tagesschau-180|08May_2010_Saturday_tagesschau-180/1/*.png|-1|-1|Signer08|ORT NEBEL|örtlich bildet sich nebel'}
2
{'fil

# For live prediction

In [23]:
#!pip install tensorflow==2.4.1 tensorflow-gpu==2.4.1 opencv-python mediapipe sklearn matplotlib

In [23]:
from matplotlib import pyplot as plt
import mediapipe as mp
import shutil

In [24]:
# Short aliases for the MediaPipe solution modules used below.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
mp_selfie_segmentation = mp.solutions.selfie_segmentation

In [25]:
def mediapipe_detection(image, model):
    """Run a MediaPipe model on a BGR frame.

    The frame is converted to RGB and marked read-only while ``model.process``
    runs, then made writeable again and converted back to BGR.

    Returns:
        ``(bgr_image, results)``: the round-tripped frame and the model output.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [26]:
def draw_styled_landmarks(image, results):
    """Draw pose, left-hand and right-hand landmarks onto ``image``.

    Each landmark group gets its own pair of colours. Face-mesh drawing is
    disabled (it was commented out).
    """
    # (attribute on `results`, connection set, colour of 1st spec, colour of 2nd spec)
    layers = (
        ('pose_landmarks', mp_holistic.POSE_CONNECTIONS, (80,22,10), (80,44,121)),
        ('left_hand_landmarks', mp_holistic.HAND_CONNECTIONS, (121,22,76), (121,44,250)),
        ('right_hand_landmarks', mp_holistic.HAND_CONNECTIONS, (245,117,66), (245,66,230)),
    )
    for attr_name, connections, first_color, second_color in layers:
        mp_drawing.draw_landmarks(
            image,
            getattr(results, attr_name),
            connections,
            mp_drawing.DrawingSpec(color=first_color, thickness=2, circle_radius=4),
            mp_drawing.DrawingSpec(color=second_color, thickness=2, circle_radius=2),
        )

In [27]:
def extract_keypoints(results):
    """Flatten the landmark groups of a holistic result into 1-D arrays.

    Pose landmarks contribute ``x, y, z, visibility``; face and hand landmarks
    contribute ``x, y, z``. A missing group is replaced by zeros of size
    33*4 (pose), 468*3 (face) or 21*3 (each hand).

    Returns:
        ``(all_keypoints, left_hand, right_hand)`` where ``all_keypoints`` is
        the concatenation pose + face + left hand + right hand.
    """
    def _flatten(group, fields, zero_size):
        # A falsy group means the landmarks are absent.
        if not group:
            return np.zeros(zero_size)
        rows = [[getattr(point, field) for field in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')
    pose = _flatten(results.pose_landmarks, xyz + ('visibility',), 33*4)
    face = _flatten(results.face_landmarks, xyz, 468*3)
    lh = _flatten(results.left_hand_landmarks, xyz, 21*3)
    rh = _flatten(results.right_hand_landmarks, xyz, 21*3)
    return np.concatenate([pose, face, lh, rh]), lh, rh

In [29]:
# Live prediction loop.
#
# Frames are read from the capture source and passed through MediaPipe Holistic.
# While the recent hand keypoints are non-zero, frames are written to
# frames_collection/recording_<vcount>/frame<fcount>.png. Once that stops (or
# the source runs out of frames) after a recording, the saved frames are
# resized and sent through get_prediction_from_frames; the translated sentence
# is drawn on the preview window. Press 'q' to quit.

# Two independent lists: `lhsequence = rhsequence = []` would bind both names
# to one shared list, so left- and right-hand keypoints would be mixed.
lhsequence = []
rhsequence = []
sentence = ''
trans_sentence = ''
vcount = 0
fcount = 0
recordingDone = False
frames_base_path = 'frames_collection'
frames_path = ''
BG_COLOR = (192, 192, 192)
image = None  # last processed frame; stays None until a frame has been read

cap = cv2.VideoCapture('output2.mp4')
#cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280) #1280
#cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720) #720
#cap.set(cv2.CAP_PROP_FPS, 25)

# empties the frames_base_path location to store new set of frames
if os.path.exists(frames_base_path):
    shutil.rmtree(frames_base_path)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.1,
                              min_tracking_confidence=0.1,
                              model_complexity=0,
                              enable_segmentation=True) as holistic:
    #with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.1, min_detection_confidence=0.1,
        #min_tracking_confidence=0.1) as hands
        bg_image = None
        while cap.isOpened():

            # READ CAMERA FEED
            ret, frame = cap.read()

            if frame is not None:
                image, results = mediapipe_detection(frame, holistic)

                # DRAW LANDMARKS
                #draw_styled_landmarks(image, results)

                # EXTRACT HANDS KEYPOINTS
                keypoints, lh_keypoints, rh_keypoints = extract_keypoints(results)
                lhsequence.append(lh_keypoints)
                rhsequence.append(rh_keypoints)

            if frame is None and not recordingDone:
                # Nothing to read and nothing recorded: just wait for 'q'.
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break

            elif not (np.sum(lhsequence) == 0 and np.sum(rhsequence) == 0) and frame is not None:
                # Recent hand keypoints are non-zero: record this frame.

                # with image segmentation, background as grey color
                if False: # if True change "frame" to "output_image" in the cv2.imwrite call below.
                    with mp_selfie_segmentation.SelfieSegmentation(
                                                    model_selection=1) as selfie_segmentation:
                        seg_img, seg_results = mediapipe_detection(frame, selfie_segmentation)
                        condition = np.stack((seg_results.segmentation_mask,) * 3, axis=-1) > 0.1
                        if bg_image is None:
                            bg_image = np.zeros(image.shape, dtype=np.uint8)
                            bg_image[:] = BG_COLOR
                        output_image = np.where(condition, seg_img, bg_image)

                frames_path = os.path.join(frames_base_path, 'recording_' + str(vcount))
                img_path = frames_path + '/frame' + str(fcount) + '.png'
                os.makedirs(frames_path, exist_ok=True)
                cv2.imwrite(img_path, frame)

                fcount += 1
                recordingDone = True

                # Keep only a short window of recent keypoints.
                if len(lhsequence) > 15:
                    lhsequence = []
                if len(rhsequence) > 15:
                    rhsequence = []

            elif recordingDone:
                # MAKE PREDICTIONS
                print("Predicting on ",frames_path)
                resize_img_folder(frames_path+'/*.png')
                trans_sentence, prediction_dict = get_prediction_from_frames(frames_path+'/*.png', gloss_dict)

                vcount += 1
                fcount = 0
                recordingDone = False

            # Guard: `image` is undefined if the very first read returned no frame.
            if image is not None:
                cv2.rectangle(image, (0,0), (1280, 30), (0, 0, 0), -1)
                cv2.putText(image, str(trans_sentence), (3,20),cv2.FONT_HERSHEY_DUPLEX,
                            0.5, (255, 255, 255), 1, cv2.LINE_AA)

                cv2.imshow('Live SLR Detection', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the capture and close the window, even if an exception was raised.
    cap.release()
    cv2.destroyAllWindows()

Predicting on  frames_collection/recording_0
Apply testing transform.
torch.Size([1, 148, 3, 224, 224])
In slr_network.py forward:
torch.Size([1, 148, 3, 224, 224])
tensor([148])
<class 'torch.Tensor'>
HEUTE FREUNDLICH SONNE DANN KOMMEN REGEN MORGEN FLUSS AUCH MEHR WOLKE WIE 
['heute herrlicher sonnenschein ohne wolken mit etwas regen im nordosten auch morgen wird es wolkiger .']
Time taken to predict (total):  9


In [38]:
# Manual cleanup: release the capture and close any OpenCV windows
# (e.g. after the live loop cell was interrupted).
cap.release()
cv2.destroyAllWindows()

In [None]:
# Scratch cell (never executed: it has no execution count). It holds a gloss
# sequence and a translated sentence as bare literals.
'SONNE poss-SEIN DANN KOMMEN REGEN MORGEN MONTAG IX AUCH MEHR WOLKE WIE HEUTE FREUNDLICH VIEL SONNE'
['und das wird zieht ein bisschen regen auch morgen am montag da wird es wieder freundlicher und nicht \
 mehr so viel sonnenschein wie heute .']


In [35]:
# Display the raw model output of the last live prediction.
prediction_dict

{'framewise_features': tensor([[[1.1967, 1.1967, 1.1967,  ..., 0.6765, 0.6765, 0.6765],
          [1.0768, 1.0768, 1.0768,  ..., 1.0086, 1.0086, 1.0086],
          [1.8241, 1.8241, 1.8241,  ..., 1.6931, 1.6931, 1.6931],
          ...,
          [0.5934, 0.5934, 0.5934,  ..., 0.9612, 0.9612, 0.9612],
          [0.2724, 0.2724, 0.2724,  ..., 0.3968, 0.3968, 0.3968],
          [0.0137, 0.0137, 0.0137,  ..., 0.0396, 0.0396, 0.0396]]]),
 'visual_features': tensor([[[0.4233, 0.2283, 0.0000,  ..., 0.0000, 0.0000, 0.0815]],
 
         [[0.0895, 0.1519, 0.0000,  ..., 0.0000, 0.0000, 0.2341]],
 
         [[0.2696, 0.1529, 0.0000,  ..., 0.0000, 0.0000, 0.7374]],
 
         ...,
 
         [[0.0614, 0.2695, 0.0000,  ..., 0.0000, 0.0000, 0.1809]],
 
         [[0.0000, 0.2335, 0.0000,  ..., 0.0000, 0.0000, 0.3364]],
 
         [[0.0000, 0.0744, 0.0000,  ..., 0.0000, 0.0000, 0.3708]]]),
 'feat_len': tensor([9]),
 'conv_logits': tensor([[[-19.1158, -32.7540, -32.6560,  ..., -31.2387, -31.5947, -32.163

In [30]:
# Build the gloss string from the 'conv_sents' output; every gloss is followed
# by a space. A generator is used so no loop variable leaks into the kernel
# namespace (the previous loop variable `id` overwrote the builtin `id`).
sentence = ''.join(word + " " for word, _gloss_id in prediction_dict['conv_sents'][0])
print(sentence)

SONNE HEUTE BESONDERS SONNE BIS WOCHE DANN KOMMEN REGEN MORGEN AUCH MEHR WOLKE WIE 


In [31]:
# Translate the gloss string built in the previous cell and show the first prediction.
temp = prediction_after_transformer(sentence)
temp[0]

['die sonne zeigt sich vor allem in der neuen woche wird es auch morgen in der neuen woche wieder häufiger mal mehr mal weniger wolken .']


'die sonne zeigt sich vor allem in der neuen woche wird es auch morgen in der neuen woche wieder häufiger mal mehr mal weniger wolken .'

# Extras