# Load model

In [1]:
import importlib
import numpy as np
import torch
from collections import OrderedDict
import utils

In [2]:
import sys
sys.path.insert(1, 'transformer-slt/')
from onmt.translate.translator import build_translator
import onmt
import onmt.inputters as inputters
from onmt.translate.beam_search import BeamSearch

from argparse import Namespace

In [3]:
def import_class(name):
    components = name.rsplit('.', 1)
    print(components)
    mod = importlib.import_module(components[0])
    mod = getattr(mod, components[1])
    print(mod)
    return mod

In [4]:
model_args = {"num_classes": 1296, "c2d_type": "resnet18", "conv_type": 2, "use_bn": 1}
loss_weights = {"ConvCTC": 1.0, "SeqCTC": 1.0, "Dist": 10.0}
gloss_dict = np.load('/home/aayush/Thesis/VAC_CSLR/preprocess/phoenix2014/gloss_dict.npy', allow_pickle=True)\
                .item()


model_class = import_class("slr_network.SLRModel")
model = model_class(**model_args,
                    gloss_dict=gloss_dict,
                    loss_weights=loss_weights)

['slr_network', 'SLRModel']
<class 'slr_network.SLRModel'>


In [5]:
def modified_weights(state_dict, modified=False):
    state_dict = OrderedDict([(k.replace('.module', ''), v) for k, v in state_dict.items()])
    if not modified:
        return state_dict
    modified_dict = dict()
    return modified_dict

In [6]:
state_dict = torch.load('resnet18_slr_pretrained_distill25.pt', map_location=torch.device('cpu'))
weights = modified_weights(state_dict['model_state_dict'], False)
model.load_state_dict(weights, strict=True)

<All keys matched successfully>

In [7]:
model.eval()

SLRModel(
  (conv2d): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_run

In [8]:
def prediction_after_transformer(glosses = 'SCHOEN ABEND TSCHUESS'):
    opt = Namespace(align_debug=False, alpha=0.0, attn_debug=False, avg_raw_probs=False,
                    batch_size=30, batch_type='sents', beam_size=4, beta=-0.0,
                    block_ngram_repeat=0, config=None, coverage_penalty='none', data_type='text',
                    dump_beam='', dynamic_dict=False, fp32=False, gpu=-1, ignore_when_blocking=[],
                    image_channel_size=3, length_penalty='none', log_file='', log_file_level='0',
                    max_length=100, max_sent_length=None, min_length=0, models=['transformer-slt/model_step_1600.pt'],
                    n_best=1, output=None, phrase_table='', random_sampling_temp=1.0,
                    random_sampling_topk=1, ratio=-0.0, replace_unk=True, report_align=False,
                    report_time=False, sample_rate=16000, save_config=None, seed=829,
                    shard_size=10000, share_vocab=False, src=glosses,
                    src_dir='', stepwise_penalty=False, tgt=None, verbose=False, window='hamming',
                    window_size=0.02, window_stride=0.01)

    translator = build_translator(opt, report_score=True, out_file='')

    load_test_model = onmt.model_builder.load_test_model
    fields, model, model_opt = load_test_model(opt)
    scorer = onmt.translate.GNMTGlobalScorer.from_opt(opt)

    src_reader = inputters.str2reader[opt.data_type].from_opt(opt)
    tgt_reader = inputters.str2reader['text'].from_opt(opt)

    src_data = {"reader": src_reader, "data": [opt.src], "dir": ''}
    tgt_data = {"reader": tgt_reader, "data": None, "dir": None}

    _readers, _data, _dir = inputters.Dataset.config([('src', src_data), ('tgt', tgt_data)])
    data = inputters.Dataset(
            fields, readers=_readers, data=_data, dirs=_dir,
            sort_key=inputters.str2sortkey[opt.data_type]
            )

    data_iter = inputters.OrderedIterator(
                    dataset=data,
                    device=torch.device("cpu"),
                    batch_size=opt.batch_size,
                    batch_size_fn=None,
                    train=False,
                    sort=False,
                    sort_within_batch=True,
                    shuffle=False
                )

    xlation_builder = onmt.translate.TranslationBuilder(
                            data, fields, opt.n_best, opt.replace_unk, opt.tgt,
                            opt.phrase_table
                        )


    all_predictions = []
    tgt_field = dict(fields)["tgt"].base_field
    _tgt_vocab = tgt_field.vocab
    _tgt_eos_idx = _tgt_vocab.stoi[tgt_field.eos_token]
    _tgt_pad_idx = _tgt_vocab.stoi[tgt_field.pad_token]
    _tgt_bos_idx = _tgt_vocab.stoi[tgt_field.init_token]
    _tgt_unk_idx = _tgt_vocab.stoi[tgt_field.unk_token]
    _tgt_vocab_len = len(_tgt_vocab)
    _exclusion_idxs = {_tgt_vocab.stoi[t] for t in opt.ignore_when_blocking}

    copy_attn = model_opt.copy_attn

    for batch in data_iter:

        with torch.no_grad():
            decode_strategy = BeamSearch(
            opt.beam_size,
            batch_size=batch.batch_size,
            pad=_tgt_pad_idx,
            bos=_tgt_bos_idx,
            eos=_tgt_eos_idx,
            n_best=opt.n_best,
            global_scorer=scorer,
            min_length=opt.min_length, max_length=opt.max_length,
            return_attention=opt.attn_debug or opt.replace_unk,
            block_ngram_repeat=opt.block_ngram_repeat,
            exclusion_tokens=_exclusion_idxs,
            stepwise_penalty=opt.stepwise_penalty,
            ratio=opt.ratio)

            batch_data = translator._translate_batch_with_strategy(batch, data.src_vocabs, decode_strategy)

        translations = xlation_builder.from_batch(batch_data)
        for trans in translations:
            n_best_preds = [" ".join(pred)
                                    for pred in trans.pred_sents[:opt.n_best]]
            print(n_best_preds)
    return n_best_preds

In [9]:
# sample try
temp = prediction_after_transformer('__ON__ SCHOEN ABEND TSCHUESS __OFF__')
temp[0]

['jetzt wünsche ich ihnen noch einen schönen abend .']


  torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs)


'jetzt wünsche ich ihnen noch einen schönen abend .'

# Input format modification

In [10]:
import torch.utils.data as data
from utils import video_augmentation
import os
import glob
import cv2
import time
import re

In [11]:
def resize_img(img_path, dsize='256x256px'):
    dsize = tuple(int(res) for res in re.findall("\d+", dsize))
    img = cv2.imread(img_path)
    img = cv2.resize(img, dsize, interpolation=cv2.INTER_LANCZOS4)
    return img


def resize_img_folder(frames_folder_path):
    img_list = glob.glob(frames_folder_path)
    for img_path in img_list:
        rs_img = resize_img(img_path)
        cv2.imwrite(img_path, rs_img)

In [21]:
frames_folder_name = '01April_2010_Thursday_heute_default-5/1'
frames_folder_path = './dataset/dataset/' + frames_folder_name + '/*.png'
#frames_folder_path = './frames_collection/recording_0/*.png'
resize_img_folder(frames_folder_path)

In [14]:
class SampleBaseFeeder(data.Dataset):
    
    def __init__(self, prefix, gloss_dict):
        self.prefix = prefix # image frames path : ./01April_2010_Thursday_heute_default-5/1/*.png
        self.dict = gloss_dict
        self.data_aug = self.transform()
        
    def __getitem__(self, idx):
        
        input_data, label, fi = self.read_video(idx)
        input_data, label = self.normalize(input_data, label)
        
        return input_data, torch.LongTensor(label), ''#self.inputs_list[idx]['original_info']

    
    def read_video(self, index, num_glosses=-1):
        # load file info        
        img_folder = os.path.join(self.prefix)
        img_list = sorted(glob.glob(img_folder))
        label_list = []
                
        return [cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB) for img_path in img_list],\
            label_list, {}

    
    def normalize(self, video, label, file_id=None):
        video, label = self.data_aug(video, label, file_id)
        video = video.float() / 127.5 - 1
        return video, label

    
    def transform(self):
        print("Apply testing transform.")
        return video_augmentation.Compose([
            video_augmentation.CenterCrop(224),
            video_augmentation.ToTensor(),
        ]) 
    
    
    @staticmethod
    def collate_fn(batch):
        batch = [item for item in sorted(batch, key=lambda x: len(x[0]), reverse=True)]
        video, label, info = list(zip(*batch))
        if len(video[0].shape) > 3:
            max_len = len(video[0])
            video_length = torch.LongTensor([np.ceil(len(vid) / 4.0) * 4 + 12 for vid in video])
            left_pad = 6
            right_pad = int(np.ceil(max_len / 4.0)) * 4 - max_len + 6
            max_len = max_len + left_pad + right_pad
            padded_video = [torch.cat(
                (
                    vid[0][None].expand(left_pad, -1, -1, -1),
                    vid,
                    vid[-1][None].expand(max_len - len(vid) - left_pad, -1, -1, -1),
                )
                , dim=0)
                for vid in video]
            padded_video = torch.stack(padded_video)
        else:
            max_len = len(video[0])
            video_length = torch.LongTensor([len(vid) for vid in video])
            padded_video = [torch.cat(
                (
                    vid,
                    vid[-1][None].expand(max_len - len(vid), -1),
                )
                , dim=0)
                for vid in video]
            padded_video = torch.stack(padded_video).permute(0, 2, 1)
        label_length = torch.LongTensor([len(lab) for lab in label])
        if max(label_length) == 0:
            return padded_video, video_length, [], [], info
        else:
            padded_label = []
            for lab in label:
                padded_label.extend(lab)
            padded_label = torch.LongTensor(padded_label)
            return padded_video, video_length, padded_label, label_length, info
        
    def __len__(self):
        return 1 # as for prediction we just have one folder/video

In [15]:
gloss_dict['TSCHUESS']

[993, 44]

In [16]:
def get_data_loader(frames_folder_path, gloss_dict):
    
    pred_dataset = SampleBaseFeeder(frames_folder_path, gloss_dict)
    data_loader = torch.utils.data.DataLoader(
                pred_dataset,
                batch_size=8,
                shuffle=False,
                drop_last=False,
                num_workers=4, 
                collate_fn=pred_dataset.collate_fn,
            )
    
    return data_loader

In [17]:
def get_prediction_from_frames(frames_folder_path, gloss_dict):
    start = time.time()
    loader = get_data_loader(frames_folder_path, gloss_dict)
    for batch_idx, l_data in enumerate(loader):
        print(l_data[0].shape)
        device = utils.GpuDataParallel()
        vid = device.data_to_device(l_data[0])
        vid_lgt = device.data_to_device(l_data[1])
        label = device.data_to_device(l_data[2])
        label_lgt = device.data_to_device(l_data[3])

    with torch.no_grad():
        ret_dict = model(vid, vid_lgt, label=label, label_lgt=label_lgt)
    
    sentence = ''
    for word, id in ret_dict['recognized_sents'][0]:
        sentence += word + " "
    
    print(sentence)
    new_sentence = prediction_after_transformer(sentence)[0]
    
    end = time.time()
    print("Time taken to predict (total): ", round(end - start))
    return new_sentence, ret_dict

In [36]:
temploader = get_data_loader(frames_folder_path, gloss_dict)
len(temploader)

Apply testing transform.


1

In [22]:
ns, ret_dict = get_prediction_from_frames(frames_folder_path, gloss_dict)

Apply testing transform.


  video_length = torch.LongTensor([np.ceil(len(vid) / 4.0) * 4 + 12 for vid in video])


torch.Size([1, 144, 3, 224, 224])
In slr_network.py forward:
torch.Size([1, 144, 3, 224, 224])
tensor([144])
<class 'torch.Tensor'>
ABER WOCHE MORGEN SONNE REGEN 
['aber in der neuen woche scheint morgen verbreitet die sonne .']
Time taken to predict (total):  6


In [19]:
sentence = ''
for word, id in ret_dict['recognized_sents'][0]:
    sentence += word + " "
print(sentence)

__ON__ MILD WEHEN ICH 


In [23]:
ret_dict

{'framewise_features': tensor([[[0.0794, 0.0794, 0.0794,  ..., 0.1176, 0.1176, 0.1176],
          [0.1410, 0.1410, 0.1410,  ..., 0.8128, 0.8128, 0.8128],
          [1.5184, 1.5184, 1.5184,  ..., 0.9420, 0.9420, 0.9420],
          ...,
          [0.2038, 0.2038, 0.2038,  ..., 0.5262, 0.5262, 0.5262],
          [0.4511, 0.4511, 0.4511,  ..., 0.5688, 0.5688, 0.5688],
          [0.0109, 0.0109, 0.0109,  ..., 0.9165, 0.9165, 0.9165]]]),
 'visual_features': tensor([[[1.0786, 0.0000, 0.1509,  ..., 0.6620, 0.0000, 0.0000]],
 
         [[1.4162, 0.0000, 0.3352,  ..., 1.0442, 0.0000, 0.0000]],
 
         [[0.5913, 0.6186, 0.0000,  ..., 0.7431, 0.0000, 0.0000]],
 
         ...,
 
         [[1.7912, 0.0000, 1.1050,  ..., 5.2073, 0.0000, 0.0000]],
 
         [[3.1430, 0.3526, 0.5447,  ..., 3.7436, 0.0000, 0.6845]],
 
         [[3.0035, 0.8708, 0.0000,  ..., 2.4030, 0.0000, 1.2275]]]),
 'feat_len': tensor([33]),
 'conv_logits': tensor([[[ -7.0832, -23.6861, -23.3820,  ..., -21.5323, -20.8931, -22.28

In [None]:
del model

In [49]:
# to know a folder info looks like.

inputs_list = np.load(f"./preprocess/phoenix14t/train_info.npy", allow_pickle=True).item()
for x in inputs_list:
    if x == 'prefix':
        continue
    if inputs_list[x]['fileid'] == frames_folder_name:
        print(inputs_list[x])


{'fileid': '07February_2011_Monday_heute-4659', 'folder': 'train/07February_2011_Monday_heute-4659/*.png', 'signer': 'Signer07', 'label': 'SONNE VORBEI DANN KOMMEN MEHR REGEN MORGEN IX FLUSS AUCH MEHR WOLKE WIE HEUTE FREUNDLICH SONNE', 'num_frames': 134, 'original_info': '07February_2011_Monday_heute-4659|07February_2011_Monday_heute-4659/1/*.png|-1|-1|Signer07|SONNE VORBEI DANN KOMMEN MEHR REGEN MORGEN IX FLUSS AUCH MEHR WOLKE WIE HEUTE FREUNDLICH SONNE|nach sonne kommt regen so wird es auch morgen an der mosel sein eher das trübe wetter und nicht so einen herrlichen sonnenschein wie wir heute hatten'}


In [128]:
inputs_list = np.load(f"./preprocess/phoenix2014/train_info.npy", allow_pickle=True).item()
for x in inputs_list:
    if x == 'prefix':
        continue
    
    if inputs_list[x]['num_frames'] < 40 and 'TSCHUESS' in inputs_list[x]['label']:
        print(len(inputs_list[x]['label'].split(' ')))
        print(inputs_list[x])

7
{'fileid': '22November_2010_Monday_heute_default-14', 'folder': 'train/22November_2010_Monday_heute_default-14/1/*.png', 'signer': 'Signer01', 'label': '__ON__ SCHOEN ABEND MACHEN GUT TSCHUESS __OFF__', 'num_frames': 38, 'original_info': '22November_2010_Monday_heute_default-14|22November_2010_Monday_heute_default-14/1/*.png|Signer01|__ON__ SCHOEN ABEND MACHEN GUT TSCHUESS __OFF__'}
5
{'fileid': '30May_2011_Monday_heute_default-18', 'folder': 'train/30May_2011_Monday_heute_default-18/1/*.png', 'signer': 'Signer01', 'label': '__ON__ SCHOEN ABEND TSCHUESS __OFF__', 'num_frames': 39, 'original_info': '30May_2011_Monday_heute_default-18|30May_2011_Monday_heute_default-18/1/*.png|Signer01|__ON__ SCHOEN ABEND TSCHUESS __OFF__'}


# For live prediction

In [None]:
#!pip install tensorflow==2.4.1 tensorflow-gpu==2.4.1 opencv-python mediapipe sklearn matplotlib

In [22]:
from matplotlib import pyplot as plt
import mediapipe as mp

In [23]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
mp_selfie_segmentation = mp.solutions.selfie_segmentation

In [24]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

In [25]:
def draw_styled_landmarks(image, results):
    #mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
                              #mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             #mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             #)
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             )
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

In [26]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh]), lh, rh

In [27]:
lhsequence = rhsequence = []
sentence = ''
trans_sentence = ''
vcount = 0
fcount = 0
recordingDone = False
frames_base_path = 'frames_collection'
frames_path = ''
BG_COLOR = (192, 192, 192)

cap = cv2.VideoCapture('output.mp4')
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 852)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
#cap.set(cv2.CAP_PROP_FPS, 25)

with mp_holistic.Holistic(min_detection_confidence=0.1,
                          min_tracking_confidence=0.1,
                          model_complexity=0, 
                          enable_segmentation=True) as holistic:
#with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.1, min_detection_confidence=0.1, min_tracking_confidence=0.1) as hands
    bg_image = None
    while cap.isOpened():

        # READ CAMERA FEED
        ret, frame = cap.read()

        if frame is not None:
            image, results = mediapipe_detection(frame, holistic)


            # DRAW LANDMARKS
            draw_styled_landmarks(image, results)

            # EXTRACT HANDS KEYPOINTS
            keypoints, lh_keypoints, rh_keypoints = extract_keypoints(results)        
            lhsequence.append(lh_keypoints)
            rhsequence.append(rh_keypoints)
            

        if frame is None and not recordingDone: 
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
                
        elif not (np.sum(lhsequence) == 0 and np.sum(rhsequence) == 0) and frame is not None:
            
            # with image segmentation, background as grey color
            if False: # if True change frame to output_image in cv2.imwrite(img_path,frame)
                with mp_selfie_segmentation.SelfieSegmentation( 
                                                model_selection=1) as selfie_segmentation:
                    seg_img, seg_results = mediapipe_detection(frame, selfie_segmentation)
                    condition = np.stack((seg_results.segmentation_mask,) * 3, axis=-1) > 0.1
                    if bg_image is None:
                        bg_image = np.zeros(image.shape, dtype=np.uint8)
                        bg_image[:] = BG_COLOR
                    output_image = np.where(condition, seg_img, bg_image)
            
            
            frames_path = os.path.join(frames_base_path, 'recording_' + str(vcount))
            img_path = frames_path + '/frame' + str(fcount) + '.png'
            if not os.path.exists(frames_path):
                os.makedirs(frames_path)
                cv2.imwrite(img_path, frame)
            else:
                cv2.imwrite(img_path, frame) 

            fcount += 1
            recordingDone = True
            
            if len(lhsequence) > 15:
                lhsequence = []
            if len(rhsequence) > 15:
                rhsequence = []

        elif recordingDone:
            # MAKE PREDICTIONS
            print("Predicting on ",frames_path)
            resize_img_folder(frames_path+'/*.png')
            trans_sentence, prediction_dict = get_prediction_from_frames(frames_path+'/*.png', gloss_dict)
            
            vcount += 1
            fcount = 0
            recordingDone = False


        cv2.rectangle(image, (0,0), (1280, 30), (0, 0, 0), -1)        
        cv2.putText(image, str(trans_sentence), (3,20),cv2.FONT_HERSHEY_DUPLEX,
                    0.5, (255, 255, 255), 1, cv2.LINE_AA)

        
        cv2.imshow('Live SLR Detection', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
                
    cap.release()
    cv2.destroyAllWindows()

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


Predicting on  frames_collection/recording_0
Apply testing transform.


  video_length = torch.LongTensor([np.ceil(len(vid) / 4.0) * 4 + 12 for vid in video])


torch.Size([1, 148, 3, 224, 224])
In slr_network.py forward:
torch.Size([1, 148, 3, 224, 224])
tensor([148])
<class 'torch.Tensor'>
HEUTE FREUNDLICH SONNE DANN KOMMEN REGEN JA MORGEN MONTAG __LEFTHAND__ AUCH MEHR WOLKE WIE 
['heute nacht wird es wieder freundlicher und auch morgen montag den ganzen tag über .']
Time taken to predict (total):  10


In [21]:
cap.release()
cv2.destroyAllWindows()

In [None]:
'SONNE poss-SEIN DANN KOMMEN REGEN MORGEN MONTAG IX AUCH MEHR WOLKE WIE HEUTE FREUNDLICH VIEL SONNE '

In [90]:
prediction_dict

{'framewise_features': tensor([[[1.3008, 1.3008, 1.3008,  ..., 1.6407, 1.6407, 1.6407],
          [1.5033, 1.5033, 1.5033,  ..., 1.6175, 1.6175, 1.6175],
          [0.3130, 0.3130, 0.3130,  ..., 0.7975, 0.7975, 0.7975],
          ...,
          [0.8511, 0.8511, 0.8511,  ..., 1.4915, 1.4915, 1.4915],
          [1.0935, 1.0935, 1.0935,  ..., 1.2195, 1.2195, 1.2195],
          [0.8213, 0.8213, 0.8213,  ..., 0.3413, 0.3413, 0.3413]]]),
 'visual_features': tensor([[[0.5461, 0.2666, 0.0229,  ..., 0.0000, 1.6038, 0.0000]],
 
         [[0.6112, 0.2004, 0.1416,  ..., 0.0000, 1.7595, 0.0000]],
 
         [[0.7808, 0.3113, 0.1024,  ..., 0.0000, 2.0837, 0.0000]],
 
         ...,
 
         [[0.2793, 1.2042, 0.0000,  ..., 0.0000, 1.3556, 0.0000]],
 
         [[0.3082, 1.2186, 0.0000,  ..., 0.0184, 1.5509, 0.0000]],
 
         [[0.2335, 1.0669, 0.0461,  ..., 0.3000, 1.8297, 0.0000]]]),
 'feat_len': tensor([9]),
 'conv_logits': tensor([[[-17.4694, -31.9186, -28.4846,  ..., -29.9022, -29.7535, -30.046

# Extras