In [1]:
# !pip install -q pytorchvideo evaluate

In [2]:
# pip install -q pyarrow==14.0.1

In [3]:
# pip install -q transformers --upgrade

In [4]:
# pip install -q torch==2.0.1 torchvision==0.15.2 --extra-index-url https://download.pytorch.org/whl/cu118 xformers==0.0.21

# **Data Collection and Aggregation**

In [5]:
import os
import pandas as pd

root_path = "/kaggle/input/nsl-videos"
folder_list = os.listdir(root_path)
label_list = [path for path in folder_list if not path.endswith((".csv"))]

total_df = pd.read_csv('/kaggle/input/trainfilepath/train.csv')

total_df.reset_index(drop = True, inplace = True)
total_df['label'].value_counts()

label
म संग अण्डा छैन ।                  36
म तिम्रो पैसा खान्छु ।             36
तिमी म लाई मनपर्छ ।                34
तिमी हरु मेरो साथी हो ।            34
तिम्रो काम हरु म लाई छैन ।         34
म लाई भक्तपुर मनपर्छ ।             34
म संग मेरो साथी छ ।                34
भक्तपुर मा धेरै काम छ ।            34
म घर मा धेरै काम गर्छु ।           33
तिमी संग अण्डा छैन ।               32
म अण्डा खान्छु ।                   32
मेरो साथी लाई अण्डा मनपर्छ ।       32
तिम्रो काम छैन पैसा छैन ।          32
तिम्रो काम धेरै छ ।                32
मेरो धेरै साथी हरु छन् ।           30
म लाई अण्डा मनपर्छ ।               28
म संग धेरै पैसा छैन ।              26
मेरो घर भक्तपुर मा छ ।             18
म भक्तपुर मा काम गर्छु ।           18
मेरो साथी धेरै भक्तपुर मा छन् ।    16
Name: count, dtype: int64

# **Data Splitting**

In [6]:
from sklearn.model_selection import train_test_split

def correct_file_path(file_name: str, root_path: str):
    return os.path.join(root_path, file_name)

def preprocess_meta_df(df, root_path, label2id):
    df.rename(columns={"video_name": "video_path"}, inplace=True)
    df['video_path'] = df['video_path'].apply(lambda x: correct_file_path(x, root_path))
#     df['label'] = df['label'].apply(lambda x: label2id[x])
    df['label'] = df['label']
    
    return df

train_meta_df, test_meta_df = train_test_split(total_df, test_size=0.2, stratify=total_df['label'], random_state=42)

label_list = list(set(train_meta_df['label']))
class_labels = sorted(label_list)
label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Unique classes: {list(label2id.keys())}.")

train_meta_df = preprocess_meta_df(train_meta_df, root_path, label2id)
test_meta_df = preprocess_meta_df(test_meta_df, root_path, label2id)

print("Splitted data:", len(train_meta_df), len(test_meta_df))

Unique classes: ['तिमी म लाई मनपर्छ ।', 'तिमी संग अण्डा छैन ।', 'तिमी हरु मेरो साथी हो ।', 'तिम्रो काम छैन पैसा छैन ।', 'तिम्रो काम धेरै छ ।', 'तिम्रो काम हरु म लाई छैन ।', 'भक्तपुर मा धेरै काम छ ।', 'म अण्डा खान्छु ।', 'म घर मा धेरै काम गर्छु ।', 'म तिम्रो पैसा खान्छु ।', 'म भक्तपुर मा काम गर्छु ।', 'म लाई अण्डा मनपर्छ ।', 'म लाई भक्तपुर मनपर्छ ।', 'म संग अण्डा छैन ।', 'म संग धेरै पैसा छैन ।', 'म संग मेरो साथी छ ।', 'मेरो घर भक्तपुर मा छ ।', 'मेरो धेरै साथी हरु छन् ।', 'मेरो साथी धेरै भक्तपुर मा छन् ।', 'मेरो साथी लाई अण्डा मनपर्छ ।'].
Splitted data: 484 121


#  **Model Selection and Design**

## Preparing Configuration for the final Model

In [7]:
import torch
def get_config():
    return {
        "batch_size": 1,
        "num_epochs": 2,
        "lr": 10**-4,
        "seq_len": 1470,
        "d_model": 768,
        "lang_tgt": "ne",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": "latest",
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "/content/drive/MyDrive/English2Nepali/runs"
        # "experiment_name": "/content/drive/MyDrive/translation/runs"
    }

config=get_config()
device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

## Pre-trained Encoder Model (Video Vision Transformer)

In [8]:
import pytorchvideo.data
from torch.utils.data import Dataset
from transformers import VivitImageProcessor, VivitModel, VivitForVideoClassification

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)

model_checkpoint = "google/vivit-b-16x2-kinetics400"
image_processor = VivitImageProcessor.from_pretrained(model_checkpoint)

# model = VivitForVideoClassification.from_pretrained(model_checkpoint)
vivit_model=VivitModel.from_pretrained(model_checkpoint)

  return self.fget.__get__(instance, owner)()
Some weights of VivitModel were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized: ['vivit.pooler.dense.bias', 'vivit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
vivit_model

VivitModel(
  (embeddings): VivitEmbeddings(
    (patch_embeddings): VivitTubeletEmbeddings(
      (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): VivitEncoder(
    (layer): ModuleList(
      (0-11): 12 x VivitLayer(
        (attention): VivitAttention(
          (attention): VivitSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): VivitSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): VivitIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (dropout): D

## Don't Need the Final Pooler Layer

In [10]:
#Replace pooler layer with the identity function, it just returns what it gets

class Identity(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, x):
        return x

# model.classifier=Identity()
# model.classifier=torch.nn.Linear(768,5)

# patch size of 32*32
vivit_model.config.tubelet_size=[2,32,32]

# 6 encoder block stacks
vivit_model.config.num_hidden_layers=12

# dropout set to 0.1
vivit_model.config.hidden_dropout_prob=0.3

# number of frames extracting from each video
vivit_model.config.num_frames=60

vivit_model=VivitModel(vivit_model.config)

vivit_model.pooler=Identity()

In [11]:
vivit_model

VivitModel(
  (embeddings): VivitEmbeddings(
    (patch_embeddings): VivitTubeletEmbeddings(
      (projection): Conv3d(3, 768, kernel_size=(2, 32, 32), stride=(2, 32, 32))
    )
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (encoder): VivitEncoder(
    (layer): ModuleList(
      (0-11): 12 x VivitLayer(
        (attention): VivitAttention(
          (attention): VivitSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): VivitSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.3, inplace=False)
          )
        )
        (intermediate): VivitIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (dropout): D

## Custom Decoder Model

In [12]:
import math

class InputEmbeddings(torch.nn.Module):

    def __init__(self, d_model: int=768, vocab_size: int=27) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = torch.nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # (batch, seq_len) --> (batch, seq_len, d_model)
        # Multiply by sqrt(d_model) to scale the embeddings according to the paper
        return self.embedding(x) * math.sqrt(self.d_model)


class PositionalEncoding(torch.nn.Module):

    def __init__(self, d_model: int=768, seq_len: int=1225, dropout: float=0.1) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = torch.nn.Dropout(dropout)
        # Create a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Create a vector of shape (seq_len)
        position = torch.arange(
            0, seq_len, dtype=torch.float).unsqueeze(1)  # (seq_len, 1)
        # Create a vector of shape (d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float(
        ) * (-math.log(10000.0) / d_model))  # (d_model / 2)
        # Apply sine to even indices
        # sin(position * (10000 ** (2i / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # Apply cosine to odd indices
        # cos(position * (10000 ** (2i / d_model))
        pe[:, 1::2] = torch.cos(position * div_term)
        # Add a batch dimension to the positional encoding
        pe = pe.unsqueeze(0)  # (1, seq_len, d_model)
        # Register the positional encoding as a buffer
        self.register_buffer('pe', pe)

    def forward(self, x):
        # (batch, seq_len, d_model)
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
        return self.dropout(x)
    
    
    
class MultiHeadAttentionBlock(torch.nn.Module):

    def __init__(self, d_model: int=768, h: int=8, dropout: float=0.1) -> None:
        super().__init__()
        self.d_model = d_model  # Embedding vector size
        self.h = h  # Number of heads
        # Make sure d_model is divisible by h
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h  # Dimension of vector seen by each head
        self.w_q = torch.nn.Linear(d_model, d_model, bias=False)  # Wq
        self.w_k = torch.nn.Linear(d_model, d_model, bias=False)  # Wk
        self.w_v = torch.nn.Linear(d_model, d_model, bias=False)  # Wv
        self.w_o = torch.nn.Linear(d_model, d_model, bias=False)  # Wo
        self.dropout = torch.nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: torch.nn.Dropout):
        d_k = query.shape[-1]
        # Just apply the formula from the paper
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Write a very low value (indicating -inf) to the positions where mask == 0
            attention_scores.masked_fill_(mask == 0, -1e9)
        # (batch, h, seq_len, seq_len) # Apply softmax
        attention_scores = attention_scores.softmax(dim=-1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (batch, h, seq_len, seq_len) --> (batch, h, seq_len, d_k)
        # return attention scores which can be used for visualization
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        query = self.w_q(q)
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k)
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(
            query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1],
                       self.h, self.d_k).transpose(1, 2)
        value = value.view(
            value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        # Calculate attention
        x, self.attention_scores = MultiHeadAttentionBlock.attention(
            query, key, value, mask, self.dropout)

        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(
            x.shape[0], -1, self.h * self.d_k)

        # Multiply by Wo
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        return self.w_o(x)
    

class LayerNormalization(torch.nn.Module):

    def __init__(self, features: int, eps: float = 10**-6) -> None:
        super().__init__()
        self.eps = eps
        # alpha is a learnable parameter
        self.alpha = torch.nn.Parameter(torch.ones(features))
        # bias is a learnable parameter
        self.bias = torch.nn.Parameter(torch.zeros(features))

    def forward(self, x):
        # x: (batch, seq_len, hidden_size)
        # Keep the dimension for broadcasting
        mean = x.mean(dim=-1, keepdim=True)  # (batch, seq_len, 1)
        # Keep the dimension for broadcasting
        std = x.std(dim=-1, keepdim=True)  # (batch, seq_len, 1)
        # eps is to prevent dividing by zero or when std is very small
        return self.alpha * (x - mean) / (std + self.eps) + self.bias


class FeedForwardBlock(torch.nn.Module):

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = torch.nn.Linear(d_model, d_ff)  # w1 and b1
        self.dropout = torch.nn.Dropout(dropout)
        self.linear_2 = torch.nn.Linear(d_ff, d_model)  # w2 and b2

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, d_ff) --> (batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))


class ResidualConnection(torch.nn.Module):

    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    # many transformer implmentation also do like this--> normalize the input + positional embedding, then apply mhsa and add skip connection.
    # here sublayer is MHSA
    def forward(self, x, sublayer):
      return x + self.dropout(self.norm(sublayer(x)))



class DecoderBlock(torch.nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = torch.nn.ModuleList(
            [ResidualConnection(features, dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](
            x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(
            x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x


class Decoder(torch.nn.Module):

    def __init__(self, features: int, layers: torch.nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)


class ProjectionLayer(torch.nn.Module):

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = torch.nn.Linear(d_model, vocab_size)

    def forward(self, x) -> None:
        # (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        return self.proj(x)

## Building Video2Text Transformer Architecture

In [13]:
class Video2Text(torch.nn.Module):

    def __init__(self, encoder, decoder: Decoder, tgt_embed: InputEmbeddings, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.video_encoder = encoder
        self.decoder = decoder
        self.tgt_embed = tgt_embed
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self,src_video):
    # (batch,num_frames, num_channels, height, width)
        if src_video != None:
            perumuted_sample_test_video = src_video.permute(0,2, 1, 3, 4)

            inputs = {
                "pixel_values": perumuted_sample_test_video,
            }
            # forward pass
            outputs = self.video_encoder(**inputs)
            
#           first token in the sequence is the class token. so, we dont need that. (batchsize, seq_len, embedding)
            return outputs.last_hidden_state[:,1:,:]
        else:
            return None

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        # (batch, seq_len, d_model)
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        # (batch, seq_len, vocab_size)
        return self.projection_layer(x)

In [14]:
def build_transformer(encoder_model,tgt_vocab_size: int, tgt_seq_len: int, d_model: int = 768, N: int = 6, h: int = 8, dropout: float = 0.1, d_ff: int = 2048) -> Video2Text:
    
    # Create the embedding layers
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # Create the positional encoding layers
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # Create the decoder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(
            d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(
            d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_block = DecoderBlock(d_model, decoder_self_attention_block,
                                     decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block)

    # Create the encoder and decoder
    video_encoder = encoder_model
    decoder = Decoder(d_model, torch.nn.ModuleList(decoder_blocks))

    # Create the projection layer
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    # Create the transformer
    transformer = Video2Text(
        encoder=video_encoder,decoder=decoder,tgt_embed=tgt_embed, tgt_pos=tgt_pos, projection_layer=projection_layer)

    # Initialize the parameters
    for p in transformer.parameters():
        if p.dim() > 1:
            torch.nn.init.xavier_uniform_(p)


    return transformer

In [15]:
def get_model(config,enc_model,vocab_tgt_len):
    v2t_model = build_transformer(encoder_model=enc_model,tgt_vocab_size=vocab_tgt_len,
                              tgt_seq_len=config['seq_len'], d_model=config['d_model'])
    return v2t_model

> # **Apply Necessary Transform and Prepare Dataset**

In [16]:
class CustomVideoDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        video_path = row['video_path']
        label = row['label']
        return video_path, label

mean = image_processor.image_mean
std = image_processor.image_std

if "shortest_edge" in image_processor.size:
    height = width = image_processor.size["shortest_edge"]
else:
    height = image_processor.size["height"]
    width = image_processor.size["width"]

resize_to = (vivit_model.config.image_size, vivit_model.config.image_size)

# num_frames_to_sample = model.config.num_frames
num_frames_to_sample = 60
clip_duration = 10

train_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
#                     RandomShortSideScale(min_size=256, max_size=320),
                    Resize(resize_to),
#                     RandomHorizontalFlip(p=0.5),
                ]
            ),
        ),
    ]
)

val_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    Resize(resize_to),
                ]
            ),
        ),
    ]
)

train_custom_dataset = CustomVideoDataset(train_meta_df)
train_labeled_video_paths = [(video_path, {'label': label}) for video_path, label in train_custom_dataset]

test_custom_dataset = CustomVideoDataset(test_meta_df)
test_labeled_video_paths = [(video_path, {'label': label}) for video_path, label in test_custom_dataset]

In [17]:
import imageio
import numpy as np
from IPython.display import Image

train_dataset = pytorchvideo.data.LabeledVideoDataset(
    labeled_video_paths =train_labeled_video_paths,
    clip_sampler=pytorchvideo.data.make_clip_sampler("random", clip_duration),
    decode_audio=False,
    transform=train_transform,
)

test_dataset = pytorchvideo.data.LabeledVideoDataset(
    labeled_video_paths =test_labeled_video_paths,
    clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration),
    decode_audio=False,
    transform=val_transform,
)

In [18]:
class CustomVideoDataset2(Dataset):

    def __init__(self, vdataset, tokenizer_tgt, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len

        self.vdataset = vdataset
        self.tokenizer_tgt = tokenizer_tgt
        self.tgt_lang = tgt_lang

        self.sos_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
#         return len(self.dataframe)
        return self.vdataset.num_videos

    def __getitem__(self, idx):
        #new code here
        
        video=next(iter(self.vdataset))['video']
#         label=next(iter(self.vdataset))['label']
        target_txt=next(iter(self.vdataset))['label']
        

        # Transform the output text into tokens
        dec_input_tokens = self.tokenizer_tgt.encode(target_txt).ids


         # We will only add <s> here, and </s> only on the label
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

#         # Make sure the number of padding tokens is not negative. If it is, the sentence is too long
        if dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")


#          Add only <s> token
        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                torch.tensor([self.pad_token] *
                             dec_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )

#          Add only </eos> token
        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] *
                             dec_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
         )

#         # Double check the size of the tensors to make sure they are all seq_len long
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "video":video,
            "label":label,
            "decoder_input":decoder_input,
            "decoder_mask":(decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),
            "tgt_text":target_txt
            
        }

def causal_mask(size):
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return mask == 0

## Initializing the model

In [19]:
from tokenizers import Tokenizer

target_tokenizer=Tokenizer.from_file(str('/kaggle/input/nepalitokenizer/tokenizer_sign_lang_ne.json'))

# initialize the model
v2t_model=get_model(config=config,enc_model=vivit_model,vocab_tgt_len=target_tokenizer.get_vocab_size())
    
v2t_model.to(device)
    

Video2Text(
  (video_encoder): VivitModel(
    (embeddings): VivitEmbeddings(
      (patch_embeddings): VivitTubeletEmbeddings(
        (projection): Conv3d(3, 768, kernel_size=(2, 32, 32), stride=(2, 32, 32))
      )
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (encoder): VivitEncoder(
      (layer): ModuleList(
        (0-11): 12 x VivitLayer(
          (attention): VivitAttention(
            (attention): VivitSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): VivitSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.3, inplace=False)
            )
          )
          (intermediate): VivitIntermediate(
            (dense):

# **Training Without Huggingface Trainer**

## Prepare and Test Dataloader

In [20]:
from torch.utils.data import Dataset, DataLoader, random_split

new_train_dataset=CustomVideoDataset2(train_dataset,target_tokenizer,config['lang_tgt'],config['seq_len'])
new_val_dataset=CustomVideoDataset2(test_dataset,target_tokenizer,config['lang_tgt'],config['seq_len'])

train_dataloader = DataLoader(new_train_dataset, batch_size=1,shuffle=True)
val_dataloader = DataLoader(new_val_dataset, batch_size=1,shuffle=True)

In [21]:
# i=0
# for data in train_dataloader:
#     i+=1
#     if i>3:
#         break
#     print(data['tgt_text'],data['label'])


## Training Loop

In [22]:
def greedy_decode(model, src_video, source_mask, tokenizer_tgt, max_len, device):
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # Precompute the encoder output and reuse it for every step
    encoder_output = model.encode(src_video=src_video)
#     encoder_output = (torch.randint(2,7,(1,784,768))).type_as(encoder_output).to(device)
    
#     print(f'encoder_output: {encoder_output[:,392:400,:20]}')
    # Initialize the decoder input with the sos token
    decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(src_video.type(torch.LongTensor)).to(device)
    
#     print(f"decoder input: {decoder_input,decoder_input.shape}")
    while True:
        if decoder_input.size(1) == max_len:
            break

        # build mask for target
        decoder_mask = causal_mask(decoder_input.size(
            1)).type_as(src_video.type(torch.LongTensor)).to(device)
        
        
#         print(f'decoder mask: {decoder_mask,decoder_mask.shape}')

        # calculate output
        out = model.decode(encoder_output=encoder_output, src_mask=None,tgt=decoder_input, tgt_mask=decoder_mask)

        # get next token
        prob = model.project(out[:, -1])
        
        _, next_word = torch.max(prob, dim=1)
        decoder_input = torch.cat(
            [decoder_input, torch.empty(1, 1).type_as(src_video.type(torch.LongTensor)).fill_(next_word.item()).to(device)], dim=1
        )
        
#         print(f'next_word: {next_word}')
        if next_word == eos_idx:
            break

    return decoder_input.squeeze(0)


# model, src_video, source_mask, tokenizer_tgt, max_len, device
def beam_search_decode(model, beam_size, src_video, source_mask, tokenizer_tgt, max_len, device):
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # Precompute the encoder output and reuse it for every step
    encoder_output = model.encode(src_video=src_video)
    # Initialize the decoder input with the sos token
    decoder_initial_input = torch.empty(1, 1).fill_(sos_idx).type_as(src_video.type(torch.LongTensor)).to(device)

    # Create a candidate list
    candidates = [(decoder_initial_input, 1)]

    while True:

        # If a candidate has reached the maximum length, it means we have run the decoding for at least max_len iterations, so stop the search
        if any([cand.size(1) == max_len for cand, _ in candidates]):
            break

        # Create a new list of candidates
        new_candidates = []

        for candidate, score in candidates:

            # Do not expand candidates that have reached the eos token
            if candidate[0][-1].item() == eos_idx:
                continue

            # Build the candidate's mask
            candidate_mask = causal_mask(candidate.size(1)).type_as(src_video.type(torch.LongTensor)).to(device)
            
            # calculate output
            out = model.decode(encoder_output=encoder_output, src_mask=None, tgt=candidate, tgt_mask=candidate_mask)
            
            # get next token probabilities
            prob = model.project(out[:, -1])
            
            # get the top k candidates
            topk_prob, topk_idx = torch.topk(prob, beam_size, dim=1)
            
            for i in range(beam_size):
                # for each of the top k candidates, get the token and its probability
                token = topk_idx[0][i].unsqueeze(0).unsqueeze(0)
                token_prob = topk_prob[0][i].item()
                # create a new candidate by appending the token to the current candidate
                new_candidate = torch.cat([candidate, token], dim=1)
                # We sum the log probabilities because the probabilities are in log space
                new_candidates.append((new_candidate, score + token_prob))

        # Sort the new candidates by their score
        candidates = sorted(new_candidates, key=lambda x: x[1], reverse=True)
        # Keep only the top k candidates
        candidates = candidates[:beam_size]

        # If all the candidates have reached the eos token, stop
        if all([cand[0][-1].item() == eos_idx for cand, _ in candidates]):
            break

    # Return the best candidate
    return candidates[0][0].squeeze()


In [23]:
import torchmetrics

def run_validation(model, validation_ds, tokenizer_tgt, max_len, device, print_msg, num_examples=2):
    model.eval()
    count = 0

#     expected = []
#     predicted = []

    try:
        # get the console window width
        with os.popen('stty size', 'r') as console:
            _, console_width = console.read().split()
            console_width = int(console_width)
    except:
        # If we can't get the console width, use 80 as default
        console_width = 80

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch["video"].to(device)  # (b, seq_len)

            # check that the batch size is 1
            assert encoder_input.size(
                0) == 1, "Batch size must be 1 for validation"

            model_out_greedy = greedy_decode(
                model, encoder_input, None, tokenizer_tgt, max_len, device)
            model_out_beam = beam_search_decode(model, 3, encoder_input, None, tokenizer_tgt, max_len, device)

            target_text = batch["tgt_text"][0]
            model_out_text_beam = tokenizer_tgt.decode(model_out_beam.detach().cpu().numpy())
            model_out_text_greedy = tokenizer_tgt.decode(model_out_greedy.detach().cpu().numpy())

#             source_texts.append(source_text)
#             expected.append(target_text)
#             predicted.append(model_out_text_greedy)

            # Print the source, target and model output
            if count <4:
                print_msg(f"{f'TARGET: ':>20}{target_text}")
                print_msg(f"{f'PREDICTED GREEDY: ':>20}{model_out_text_greedy}")
                print_msg(f"{f'PREDICTED BEAM: ':>20}{model_out_text_beam}")

            # print(count)
            if count == num_examples:
                print_msg('-'*console_width)
                break
        # Compute the word error rate   
#         metric = torchmetrics.WordErrorRate()
#         v_wer = metric(predicted, expected)
#         print(f"Word Error Rate:{v_wer}")
#         return v_wer

        # Compute the BLEU metric
#         metric = torchmetrics.BLEUScore()
#         bleu = metric(predicted, expected)
#         writer.add_scalar('validation BLEU', bleu, global_step)
#         writer.flush()

## Load the saved model if notebook restarted

In [24]:
# saved_model=torch.load("/kaggle/working/99_mtrain.pt")

In [25]:
# v2t_model=saved_model['model'].to(device)

In [26]:
# saved_model['epoch']

In [27]:
# import matplotlib.pyplot as plt
# plt.plot(saved_model['train_loss'])
# plt.show()

In [29]:
from tqdm import tqdm
import numpy as np

optimizer = torch.optim.Adam(v2t_model.parameters(), lr=10**-5, eps=1e-9)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=target_tokenizer.token_to_id(
        '[PAD]'),label_smoothing=0.1).to(device)


# loss_list_train=saved_model['train_loss']
loss_list_train=[]
loss_list_val=[]
wer_list_train=[]
wer_list_val=[]


# for epoch in range(saved_model['epoch']+1,200):
for epoch in range(100):
    torch._C._cuda_emptyCache()
    v2t_model.train()
    batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
    
    #accumulate LOSS and WER
    acc_loss=0
    acc_wer=0
    for batch in batch_iterator:
        
                src_video=batch['video'].to(device)
                decoder_input=batch['decoder_input'].to(device)
                decoder_mask=batch['decoder_mask'].to(device)
                
                 # Run the tensors through the encoder, decoder and the projection layer
                encoder_output = v2t_model.encode(src_video=src_video)  # (B, seq_len, d_model)
                decoder_output = v2t_model.decode(encoder_output=encoder_output, src_mask=None,tgt=decoder_input, tgt_mask=decoder_mask) # (B, seq_len, d_model)
                 # (B, seq_len, vocab_size)
                proj_output = v2t_model.project(decoder_output)
                
                 # Compare the output with the label
                label = batch['label'].to(device)  # (B, seq_len)

                 # Compute the loss using a simple cross entropy
                loss = loss_fn(proj_output.view(-1, target_tokenizer.get_vocab_size()), label.view(-1))
                
                 # accumulated loss for every batch in a single epoch
                acc_loss+=loss.item()
                
        
                # Backpropagate the loss
                loss.backward()

                # Update the weights
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                
                #calculating training WER
                pred_tokens = torch.argmax(proj_output, dim=-1)  # Get the predicted token indices
                pred_sentences = target_tokenizer.decode(pred_tokens.detach().cpu().numpy()[0][:8], skip_special_tokens=True)
                metric = torchmetrics.text.WordErrorRate()
                t_WER=metric(pred_sentences, batch['tgt_text'])
                acc_wer+=t_WER
                
    
    
    loss_list_train.append(np.round(acc_loss/len(train_dataloader),3))
    wer_list_train.append(np.round(acc_wer.numpy()/len(train_dataloader),3))
    
    # predict sentences
    run_validation(v2t_model,val_dataloader, target_tokenizer, 15 ,device,lambda msg: batch_iterator.write(msg))
    

    v2t_model.eval()
    acc_loss=0
    acc_wer=0
    
    with torch.no_grad():
        for batchv in val_dataloader:

            src_video=batchv['video'].to(device)
            decoder_input=batchv['decoder_input'].to(device)
            decoder_mask=batchv['decoder_mask'].to(device)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = v2t_model.encode(src_video=src_video)  # (B, seq_len, d_model)
            decoder_output = v2t_model.decode(encoder_output=encoder_output, src_mask=None,tgt=decoder_input, tgt_mask=decoder_mask) # (B, seq_len, d_model)
            # (B, seq_len, vocab_size)
            proj_output = v2t_model.project(decoder_output)

            # Compare the output with the label
            label = batchv['label'].to(device)  # (B, seq_len)

            # Compute the loss using a simple cross entropy
            val_loss = loss_fn(proj_output.view(-1, target_tokenizer.get_vocab_size()), label.view(-1))

            acc_loss+=val_loss.item()

            #calculating validation WER
            pred_tokens = torch.argmax(proj_output, dim=-1)  # Get the predicted token indices
            pred_sentences = target_tokenizer.decode(pred_tokens.detach().cpu().numpy()[0][:8], skip_special_tokens=True)
            metric = torchmetrics.text.WordErrorRate()
            v_WER=metric(pred_sentences, batchv['tgt_text'])
            acc_wer+=v_WER
        
    with torch.no_grad():   
        loss_list_val.append(np.round(acc_loss/len(val_dataloader),3))
        wer_list_val.append(np.round(acc_wer.numpy()/len(val_dataloader),3))
    
    if (epoch+1)%5==0:
        
        torch.save({"model_state_dict":v2t_model.state_dict(),
                    "optimizer_state_dict":optimizer.state_dict(),
                    "train_loss":loss_list_train,
                    "val_loss":loss_list_val,
                   "t_wer":wer_list_train,
                    "v_wer":wer_list_val,
                    "epoch":epoch+1,
                    },
                    f"{epoch}_mtrain.pt")
        
    
    print(f" Epoch: {epoch} | Training Loss: {loss_list_train[-1]}      Validation Loss: {loss_list_val[-1]}\
             Train WER: {wer_list_train[-1]}      Validation WER: {wer_list_val[-1]}")
    
                

Processing Epoch 00: 100%|██████████| 484/484 [07:57<00:00,  1.01it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिमी म लाई मनपर्छ ।
  PREDICTED GREEDY: म संग धेरै पैसा छैन ।
    PREDICTED BEAM: म संग मेरो साथी छ । अण्डा छैन । अण्डा छैन ।
            TARGET: तिम्रो काम हरु म लाई छैन ।
  PREDICTED GREEDY: म संग धेरै पैसा छैन ।
    PREDICTED BEAM: म संग मेरो साथी छ । अण्डा छैन । अण्डा छैन ।
--------------------------------------------------------------------------------
 Epoch: 0 | Training Loss: 1.34      Validation Loss: 1.269             Train WER: 0.576      Validation WER: 0.468


Processing Epoch 01: 100%|██████████| 484/484 [08:13<00:00,  1.02s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म लाई भक्तपुर मनपर्छ ।
  PREDICTED GREEDY: म संग धेरै पैसा छैन ।
    PREDICTED BEAM: तिमी हरु मेरो साथी हो ।
            TARGET: तिमी संग अण्डा छैन ।
  PREDICTED GREEDY: म संग धेरै पैसा छैन ।
    PREDICTED BEAM: तिमी हरु मेरो साथी हो ।
--------------------------------------------------------------------------------
 Epoch: 1 | Training Loss: 1.226      Validation Loss: 1.177             Train WER: 0.523      Validation WER: 0.519


Processing Epoch 02: 100%|██████████| 484/484 [08:15<00:00,  1.02s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म लाई भक्तपुर मनपर्छ ।
  PREDICTED GREEDY: म तिम्रो पैसा खान्छु ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । म लाई छैन पैसा छैन ।
            TARGET: तिम्रो काम धेरै छ ।
  PREDICTED GREEDY: म तिम्रो पैसा खान्छु ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । म लाई छैन पैसा छैन ।
--------------------------------------------------------------------------------
 Epoch: 2 | Training Loss: 1.181      Validation Loss: 1.16             Train WER: 0.507      Validation WER: 0.506


Processing Epoch 03: 100%|██████████| 484/484 [08:14<00:00,  1.02s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: मेरो धेरै साथी हरु छन् ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: तिमी हरु मेरो साथी हो ।
            TARGET: म तिम्रो पैसा खान्छु ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: तिमी हरु मेरो साथी हो ।
--------------------------------------------------------------------------------
 Epoch: 3 | Training Loss: 1.15      Validation Loss: 1.123             Train WER: 0.488      Validation WER: 0.45


Processing Epoch 04: 100%|██████████| 484/484 [08:10<00:00,  1.01s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म भक्तपुर मा काम गर्छु ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: तिमी हरु मेरो साथी हो ।
            TARGET: तिमी संग अण्डा छैन ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: तिमी हरु मेरो साथी हो ।
--------------------------------------------------------------------------------
 Epoch: 4 | Training Loss: 1.14      Validation Loss: 1.121             Train WER: 0.5      Validation WER: 0.46


Processing Epoch 05: 100%|██████████| 484/484 [07:34<00:00,  1.06it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म संग धेरै पैसा छैन ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: म संग मेरो साथी छ । भक्तपुर मा छन् । अण्डा छैन ।
            TARGET: तिमी संग अण्डा छैन ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: म संग मेरो साथी छ । भक्तपुर मा छन् । अण्डा छैन ।
--------------------------------------------------------------------------------
 Epoch: 5 | Training Loss: 1.132      Validation Loss: 1.137             Train WER: 0.494      Validation WER: 0.481


Processing Epoch 06: 100%|██████████| 484/484 [07:59<00:00,  1.01it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिमी संग अण्डा छैन ।
  PREDICTED GREEDY: म लाई अण्डा मनपर्छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । लाई छैन । हरु म लाई छैन
            TARGET: म भक्तपुर मा काम गर्छु ।
  PREDICTED GREEDY: म लाई अण्डा मनपर्छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । लाई छैन । हरु म लाई छैन
--------------------------------------------------------------------------------
 Epoch: 6 | Training Loss: 1.113      Validation Loss: 1.112             Train WER: 0.501      Validation WER: 0.449


Processing Epoch 07: 100%|██████████| 484/484 [08:07<00:00,  1.01s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: भक्तपुर मा धेरै काम छ ।
  PREDICTED GREEDY: म लाई भक्तपुर मनपर्छ ।
    PREDICTED BEAM: मेरो साथी लाई अण्डा मनपर्छ । भक्तपुर मा छन् । तिमी हरु छन् ।
            TARGET: तिमी हरु मेरो साथी हो ।
  PREDICTED GREEDY: म लाई भक्तपुर मनपर्छ ।
    PREDICTED BEAM: मेरो साथी लाई अण्डा मनपर्छ । भक्तपुर मा छन् । तिमी हरु छन् ।
--------------------------------------------------------------------------------
 Epoch: 7 | Training Loss: 1.103      Validation Loss: 1.1             Train WER: 0.493      Validation WER: 0.472


Processing Epoch 08: 100%|██████████| 484/484 [08:04<00:00,  1.00s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म संग अण्डा छैन ।
  PREDICTED GREEDY: म संग धेरै पैसा छैन ।
    PREDICTED BEAM: मेरो साथी धेरै भक्तपुर मा छन् । हरु छन् । म लाई अण्डा
            TARGET: म भक्तपुर मा काम गर्छु ।
  PREDICTED GREEDY: म संग धेरै पैसा छैन ।
    PREDICTED BEAM: मेरो साथी धेरै भक्तपुर मा छन् । हरु छन् । म लाई अण्डा
--------------------------------------------------------------------------------
 Epoch: 8 | Training Loss: 1.104      Validation Loss: 1.093             Train WER: 0.493      Validation WER: 0.481


Processing Epoch 09: 100%|██████████| 484/484 [08:06<00:00,  1.00s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिम्रो काम धेरै छ ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: मेरो साथी धेरै भक्तपुर मा छन् । धेरै भक्तपुर मा छन् मा छन् ।
            TARGET: मेरो साथी लाई अण्डा मनपर्छ ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: मेरो साथी धेरै भक्तपुर मा छन् । धेरै भक्तपुर मा छन् मा छन् ।
--------------------------------------------------------------------------------
 Epoch: 9 | Training Loss: 1.095      Validation Loss: 1.098             Train WER: 0.483      Validation WER: 0.494


Processing Epoch 10: 100%|██████████| 484/484 [08:19<00:00,  1.03s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: मेरो साथी लाई अण्डा मनपर्छ ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । संग मेरो साथी छ । छैन ।
            TARGET: म तिम्रो पैसा खान्छु ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । संग मेरो साथी छ । छैन ।
--------------------------------------------------------------------------------
 Epoch: 10 | Training Loss: 1.086      Validation Loss: 1.101             Train WER: 0.476      Validation WER: 0.483


Processing Epoch 11: 100%|██████████| 484/484 [08:21<00:00,  1.04s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: मेरो साथी धेरै भक्तपुर मा छन् ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: मेरो धेरै साथी हरु छन् । घर भक्तपुर मा छन् । मेरो साथी हरु
            TARGET: म संग मेरो साथी छ ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: मेरो धेरै साथी हरु छन् । घर भक्तपुर मा छन् । मेरो साथी हरु
--------------------------------------------------------------------------------
 Epoch: 11 | Training Loss: 1.086      Validation Loss: 1.095             Train WER: 0.483      Validation WER: 0.456


Processing Epoch 12: 100%|██████████| 484/484 [08:31<00:00,  1.06s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: भक्तपुर मा धेरै काम छ ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: म लाई अण्डा मनपर्छ । अण्डा मनपर्छ । म लाई अण्डा मनपर्छ ।
            TARGET: तिमी हरु मेरो साथी हो ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: म लाई अण्डा मनपर्छ । अण्डा मनपर्छ । म लाई अण्डा मनपर्छ ।
--------------------------------------------------------------------------------
 Epoch: 12 | Training Loss: 1.084      Validation Loss: 1.137             Train WER: 0.473      Validation WER: 0.472


Processing Epoch 13: 100%|██████████| 484/484 [08:23<00:00,  1.04s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिम्रो काम धेरै छ ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: म संग मेरो साथी छ । मेरो साथी छ । मेरो साथी छ ।
            TARGET: तिमी म लाई मनपर्छ ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: म संग मेरो साथी छ । मेरो साथी छ । मेरो साथी छ ।
--------------------------------------------------------------------------------
 Epoch: 13 | Training Loss: 1.079      Validation Loss: 1.108             Train WER: 0.474      Validation WER: 0.496


Processing Epoch 14: 100%|██████████| 484/484 [08:28<00:00,  1.05s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिम्रो काम धेरै छ ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । म लाई छैन । पैसा छैन ।
            TARGET: म भक्तपुर मा काम गर्छु ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । म लाई छैन । पैसा छैन ।
--------------------------------------------------------------------------------
 Epoch: 14 | Training Loss: 1.079      Validation Loss: 1.08             Train WER: 0.486      Validation WER: 0.519


Processing Epoch 15: 100%|██████████| 484/484 [08:00<00:00,  1.01it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म भक्तपुर मा काम गर्छु ।
  PREDICTED GREEDY: म लाई भक्तपुर मनपर्छ ।
    PREDICTED BEAM: तिम्रो काम धेरै छ ।
            TARGET: तिमी म लाई मनपर्छ ।
  PREDICTED GREEDY: म लाई भक्तपुर मनपर्छ ।
    PREDICTED BEAM: तिम्रो काम धेरै छ ।
--------------------------------------------------------------------------------
 Epoch: 15 | Training Loss: 1.079      Validation Loss: 1.111             Train WER: 0.486      Validation WER: 0.472


Processing Epoch 16: 100%|██████████| 484/484 [07:53<00:00,  1.02it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिमी संग अण्डा छैन ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: मेरो साथी धेरै भक्तपुर मा छन् । लाई अण्डा मनपर्छ । छन् ।
            TARGET: म संग अण्डा छैन ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: मेरो साथी धेरै भक्तपुर मा छन् । मा छन् मा छन् मा छन् ।
--------------------------------------------------------------------------------
 Epoch: 16 | Training Loss: 1.072      Validation Loss: 1.077             Train WER: 0.47      Validation WER: 0.457


Processing Epoch 17: 100%|██████████| 484/484 [08:22<00:00,  1.04s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिम्रो काम छैन पैसा छैन ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: मेरो धेरै साथी हरु छन् ।
            TARGET: तिमी हरु मेरो साथी हो ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: मेरो धेरै साथी हरु छन् ।
--------------------------------------------------------------------------------
 Epoch: 17 | Training Loss: 1.065      Validation Loss: 1.066             Train WER: 0.466      Validation WER: 0.475


Processing Epoch 18: 100%|██████████| 484/484 [08:09<00:00,  1.01s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म अण्डा खान्छु ।
  PREDICTED GREEDY: म संग धेरै पैसा छैन ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । तिम्रो पैसा छैन । भक्तपुर मनपर्छ ।
            TARGET: म तिम्रो पैसा खान्छु ।
  PREDICTED GREEDY: म संग धेरै पैसा छैन ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । तिम्रो पैसा छैन । भक्तपुर मनपर्छ ।
--------------------------------------------------------------------------------
 Epoch: 18 | Training Loss: 1.063      Validation Loss: 1.095             Train WER: 0.469      Validation WER: 0.485


Processing Epoch 19: 100%|██████████| 484/484 [08:06<00:00,  1.01s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: मेरो साथी लाई अण्डा मनपर्छ ।
  PREDICTED GREEDY: म संग धेरै पैसा छैन ।
    PREDICTED BEAM: तिम्रो काम छैन पैसा छैन । लाई छैन । तिम्रो पैसा छैन ।
            TARGET: मेरो साथी लाई अण्डा मनपर्छ ।
  PREDICTED GREEDY: म संग धेरै पैसा छैन ।
    PREDICTED BEAM: तिम्रो काम छैन पैसा छैन । लाई छैन । तिम्रो पैसा छैन ।
--------------------------------------------------------------------------------
 Epoch: 19 | Training Loss: 1.065      Validation Loss: 1.072             Train WER: 0.488      Validation WER: 0.474


Processing Epoch 20: 100%|██████████| 484/484 [08:06<00:00,  1.00s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिम्रो काम छैन पैसा छैन ।
  PREDICTED GREEDY: म लाई भक्तपुर मनपर्छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । अण्डा मनपर्छ ।
            TARGET: मेरो साथी लाई अण्डा मनपर्छ ।
  PREDICTED GREEDY: म लाई भक्तपुर मनपर्छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । अण्डा मनपर्छ ।
--------------------------------------------------------------------------------
 Epoch: 20 | Training Loss: 1.07      Validation Loss: 1.07             Train WER: 0.482      Validation WER: 0.499


Processing Epoch 21: 100%|██████████| 484/484 [08:17<00:00,  1.03s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म अण्डा खान्छु ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । लाई छैन ।
            TARGET: तिमी म लाई मनपर्छ ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । लाई छैन ।
--------------------------------------------------------------------------------
 Epoch: 21 | Training Loss: 1.064      Validation Loss: 1.061             Train WER: 0.493      Validation WER: 0.503


Processing Epoch 22: 100%|██████████| 484/484 [08:10<00:00,  1.01s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: मेरो धेरै साथी हरु छन् ।
  PREDICTED GREEDY: म अण्डा खान्छु ।
    PREDICTED BEAM: तिम्रो काम छैन पैसा छैन । छैन । धेरै छ । तिम्रो ।
            TARGET: म संग धेरै पैसा छैन ।
  PREDICTED GREEDY: म अण्डा खान्छु ।
    PREDICTED BEAM: तिम्रो काम छैन पैसा छैन । छैन । धेरै छ । तिम्रो ।
--------------------------------------------------------------------------------
 Epoch: 22 | Training Loss: 1.072      Validation Loss: 1.073             Train WER: 0.49      Validation WER: 0.44


Processing Epoch 23: 100%|██████████| 484/484 [08:28<00:00,  1.05s/it]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिम्रो काम छैन पैसा छैन ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: तिम्रो काम छैन पैसा छैन । हरु म लाई छैन । संग अण्डा छैन
            TARGET: मेरो साथी लाई अण्डा मनपर्छ ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: तिम्रो काम छैन पैसा छैन । छैन । हरु म लाई छैन ।
--------------------------------------------------------------------------------
 Epoch: 23 | Training Loss: 1.064      Validation Loss: 1.074             Train WER: 0.485      Validation WER: 0.527


Processing Epoch 24: 100%|██████████| 484/484 [07:57<00:00,  1.01it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिम्रो काम हरु म लाई छैन ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: मेरो साथी धेरै भक्तपुर मा छन् । साथी हरु छन् । साथी हरु छन्
            TARGET: मेरो धेरै साथी हरु छन् ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: मेरो साथी धेरै भक्तपुर मा छन् । साथी हरु छन् । साथी हरु छन्
--------------------------------------------------------------------------------
 Epoch: 24 | Training Loss: 1.059      Validation Loss: 1.068             Train WER: 0.458      Validation WER: 0.498


Processing Epoch 25: 100%|██████████| 484/484 [07:45<00:00,  1.04it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म संग अण्डा छैन ।
  PREDICTED GREEDY: म लाई अण्डा मनपर्छ ।
    PREDICTED BEAM: तिमी म लाई मनपर्छ ।
            TARGET: भक्तपुर मा धेरै काम छ ।
  PREDICTED GREEDY: म लाई अण्डा मनपर्छ ।
    PREDICTED BEAM: तिमी म लाई मनपर्छ ।
--------------------------------------------------------------------------------
 Epoch: 25 | Training Loss: 1.055      Validation Loss: 1.08             Train WER: 0.459      Validation WER: 0.466


Processing Epoch 26: 100%|██████████| 484/484 [07:44<00:00,  1.04it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म तिम्रो पैसा खान्छु ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । तिमी संग मेरो साथी हो ।
            TARGET: मेरो धेरै साथी हरु छन् ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । तिमी संग मेरो साथी हो ।
--------------------------------------------------------------------------------
 Epoch: 26 | Training Loss: 1.059      Validation Loss: 1.051             Train WER: 0.47      Validation WER: 0.529


Processing Epoch 27: 100%|██████████| 484/484 [07:23<00:00,  1.09it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिम्रो काम हरु म लाई छैन ।
  PREDICTED GREEDY: म लाई भक्तपुर मनपर्छ ।
    PREDICTED BEAM: तिमी हरु मेरो साथी हो । लाई मनपर्छ ।
            TARGET: म संग अण्डा छैन ।
  PREDICTED GREEDY: म लाई भक्तपुर मनपर्छ ।
    PREDICTED BEAM: तिमी हरु मेरो साथी हो । लाई मनपर्छ ।
--------------------------------------------------------------------------------
 Epoch: 27 | Training Loss: 1.058      Validation Loss: 1.07             Train WER: 0.475      Validation WER: 0.501


Processing Epoch 28: 100%|██████████| 484/484 [07:36<00:00,  1.06it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म तिम्रो पैसा खान्छु ।
  PREDICTED GREEDY: म लाई अण्डा मनपर्छ ।
    PREDICTED BEAM: मेरो साथी लाई अण्डा मनपर्छ । म लाई अण्डा मनपर्छ ।
            TARGET: मेरो घर भक्तपुर मा छ ।
  PREDICTED GREEDY: म लाई अण्डा मनपर्छ ।
    PREDICTED BEAM: मेरो साथी लाई अण्डा मनपर्छ । म लाई अण्डा मनपर्छ ।
--------------------------------------------------------------------------------
 Epoch: 28 | Training Loss: 1.06      Validation Loss: 1.104             Train WER: 0.482      Validation WER: 0.49


Processing Epoch 29: 100%|██████████| 484/484 [07:18<00:00,  1.10it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: मेरो घर भक्तपुर मा छ ।
  PREDICTED GREEDY: म अण्डा खान्छु ।
    PREDICTED BEAM: मेरो धेरै साथी हरु छन् । अण्डा मनपर्छ ।
            TARGET: तिम्रो काम छैन पैसा छैन ।
  PREDICTED GREEDY: म अण्डा खान्छु ।
    PREDICTED BEAM: मेरो धेरै साथी हरु छन् । अण्डा मनपर्छ ।
--------------------------------------------------------------------------------
 Epoch: 29 | Training Loss: 1.059      Validation Loss: 1.066             Train WER: 0.478      Validation WER: 0.479


Processing Epoch 30: 100%|██████████| 484/484 [07:24<00:00,  1.09it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म संग धेरै पैसा छैन ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । हरु म लाई छैन । हरु म
            TARGET: भक्तपुर मा धेरै काम छ ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । हरु म लाई छैन । हरु म
--------------------------------------------------------------------------------
 Epoch: 30 | Training Loss: 1.051      Validation Loss: 1.074             Train WER: 0.457      Validation WER: 0.477


Processing Epoch 31: 100%|██████████| 484/484 [07:24<00:00,  1.09it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिमी संग अण्डा छैन ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: म संग मेरो साथी छ । म लाई अण्डा छैन ।
            TARGET: म लाई भक्तपुर मनपर्छ ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: म संग मेरो साथी छ । म लाई अण्डा छैन ।
--------------------------------------------------------------------------------
 Epoch: 31 | Training Loss: 1.056      Validation Loss: 1.099             Train WER: 0.46      Validation WER: 0.493


Processing Epoch 32: 100%|██████████| 484/484 [07:22<00:00,  1.09it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिम्रो काम छैन पैसा छैन ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । अण्डा मनपर्छ छैन । छैन ।
            TARGET: तिम्रो काम छैन पैसा छैन ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । अण्डा मनपर्छ छैन । छैन ।
--------------------------------------------------------------------------------
 Epoch: 32 | Training Loss: 1.051      Validation Loss: 1.056             Train WER: 0.474      Validation WER: 0.49


Processing Epoch 33: 100%|██████████| 484/484 [07:19<00:00,  1.10it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: मेरो धेरै साथी हरु छन् ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: मेरो साथी धेरै भक्तपुर मा छन् । हरु छन् । हरु छन् ।
            TARGET: तिम्रो काम हरु म लाई छैन ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: मेरो साथी धेरै भक्तपुर मा छन् । हरु छन् । हरु छन् ।
--------------------------------------------------------------------------------
 Epoch: 33 | Training Loss: 1.05      Validation Loss: 1.062             Train WER: 0.471      Validation WER: 0.49


Processing Epoch 34: 100%|██████████| 484/484 [07:17<00:00,  1.11it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिमी म लाई मनपर्छ ।
  PREDICTED GREEDY: म लाई भक्तपुर मनपर्छ ।
    PREDICTED BEAM: तिम्रो काम छैन पैसा छैन । छैन । धेरै छ । धेरै छ ।
            TARGET: म लाई अण्डा मनपर्छ ।
  PREDICTED GREEDY: म लाई भक्तपुर मनपर्छ ।
    PREDICTED BEAM: तिम्रो काम छैन पैसा छैन । छैन । धेरै छ । धेरै छ ।
--------------------------------------------------------------------------------
 Epoch: 34 | Training Loss: 1.049      Validation Loss: 1.077             Train WER: 0.472      Validation WER: 0.495


Processing Epoch 35: 100%|██████████| 484/484 [07:45<00:00,  1.04it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म अण्डा खान्छु ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: तिम्रो काम छैन पैसा छैन । अण्डा छैन । छैन । छैन ।
            TARGET: तिमी म लाई मनपर्छ ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: तिम्रो काम छैन पैसा छैन । धेरै छ । अण्डा छैन ।
--------------------------------------------------------------------------------
 Epoch: 35 | Training Loss: 1.054      Validation Loss: 1.052             Train WER: 0.477      Validation WER: 0.46


Processing Epoch 36: 100%|██████████| 484/484 [07:18<00:00,  1.10it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: मेरो साथी लाई अण्डा मनपर्छ ।
  PREDICTED GREEDY: म संग धेरै पैसा छैन ।
    PREDICTED BEAM: म संग धेरै पैसा छैन । संग धेरै पैसा छैन । धेरै पैसा छैन
            TARGET: म तिम्रो पैसा खान्छु ।
  PREDICTED GREEDY: म संग धेरै पैसा छैन ।
    PREDICTED BEAM: म संग धेरै पैसा छैन । संग धेरै पैसा छैन । धेरै पैसा छैन
--------------------------------------------------------------------------------
 Epoch: 36 | Training Loss: 1.051      Validation Loss: 1.069             Train WER: 0.469      Validation WER: 0.485


Processing Epoch 37: 100%|██████████| 484/484 [07:23<00:00,  1.09it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म संग धेरै पैसा छैन ।
  PREDICTED GREEDY: म लाई भक्तपुर मनपर्छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । लाई छैन लाई छैन लाई छैन ।
            TARGET: मेरो धेरै साथी हरु छन् ।
  PREDICTED GREEDY: म लाई भक्तपुर मनपर्छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । लाई छैन लाई छैन लाई छैन ।
--------------------------------------------------------------------------------
 Epoch: 37 | Training Loss: 1.052      Validation Loss: 1.061             Train WER: 0.479      Validation WER: 0.485


Processing Epoch 38: 100%|██████████| 484/484 [07:28<00:00,  1.08it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिम्रो काम धेरै छ ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: मेरो धेरै साथी हरु छन् । अण्डा मनपर्छ ।
            TARGET: म तिम्रो पैसा खान्छु ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: मेरो धेरै साथी हरु छन् । अण्डा मनपर्छ ।
--------------------------------------------------------------------------------
 Epoch: 38 | Training Loss: 1.051      Validation Loss: 1.046             Train WER: 0.479      Validation WER: 0.465


Processing Epoch 39: 100%|██████████| 484/484 [07:50<00:00,  1.03it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: म संग धेरै पैसा छैन ।
  PREDICTED GREEDY: म तिम्रो पैसा खान्छु ।
    PREDICTED BEAM: म तिम्रो पैसा खान्छु ।
            TARGET: मेरो साथी लाई अण्डा मनपर्छ ।
  PREDICTED GREEDY: म तिम्रो पैसा खान्छु ।
    PREDICTED BEAM: म तिम्रो पैसा खान्छु ।
--------------------------------------------------------------------------------
 Epoch: 39 | Training Loss: 1.055      Validation Loss: 1.062             Train WER: 0.477      Validation WER: 0.443


Processing Epoch 40: 100%|██████████| 484/484 [07:38<00:00,  1.05it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: मेरो साथी धेरै भक्तपुर मा छन् ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: मेरो धेरै साथी हरु छन् । अण्डा छैन ।
            TARGET: म घर मा धेरै काम गर्छु ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: मेरो धेरै साथी हरु छन् । अण्डा छैन ।
--------------------------------------------------------------------------------
 Epoch: 40 | Training Loss: 1.048      Validation Loss: 1.047             Train WER: 0.457      Validation WER: 0.481


Processing Epoch 41: 100%|██████████| 484/484 [07:56<00:00,  1.01it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: मेरो साथी लाई अण्डा मनपर्छ ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: तिमी संग अण्डा छैन ।
            TARGET: मेरो साथी धेरै भक्तपुर मा छन् ।
  PREDICTED GREEDY: म संग अण्डा छैन ।
    PREDICTED BEAM: तिमी संग अण्डा छैन ।
--------------------------------------------------------------------------------
 Epoch: 41 | Training Loss: 1.045      Validation Loss: 1.045             Train WER: 0.464      Validation WER: 0.468


Processing Epoch 42: 100%|██████████| 484/484 [07:57<00:00,  1.01it/s]
stty: 'standard input': Inappropriate ioctl for device


            TARGET: तिम्रो काम हरु म लाई छैन ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । तिम्रो पैसा छैन ।
            TARGET: तिमी म लाई मनपर्छ ।
  PREDICTED GREEDY: म संग मेरो साथी छ ।
    PREDICTED BEAM: तिम्रो काम हरु म लाई छैन । तिम्रो पैसा छैन ।
--------------------------------------------------------------------------------
 Epoch: 42 | Training Loss: 1.043      Validation Loss: 1.058             Train WER: 0.455      Validation WER: 0.513


Processing Epoch 43:  44%|████▍     | 212/484 [03:27<04:26,  1.02it/s]


KeyboardInterrupt: 

In [None]:
# v2t_model=torch.load('/kaggle/input/save-model/1_e_train.pt',map_location=torch.device('cpu'))

In [None]:
len(train_dataloader)

# **Running Tests**

## Plot Each Frames Extracted from the Video

In [None]:
import matplotlib.pyplot as plt

video=next(iter(val_dataloader))

# Create subplots
fig, axs = plt.subplots(8, 8, figsize=(10, 10))

# Plot images
f=0
for i in range(8):
    for j in range(8):
        f+=1
        if f<60:
          im=video['video'].permute(0,2,3,4,1)[0,f,:,:,:]
          axs[i, j].imshow(im)
        axs[i, j].set_title(f'frame: {f+1}')
        axs[i, j].axis('off')  # Hide axis
plt.tight_layout()
plt.show()

In [None]:
video['tgt_text']

# **Inference on Single Video**

In [None]:
saved_model=torch.load('/kaggle/working/199_mtrain.pt')
v2t_model=saved_model['model'].to(device)

In [None]:
def run_inference(model, video, tokenizer_tgt, max_len, device):
    model.eval()

    source_texts = []
    expected = []
    predicted = []

    with torch.no_grad():

        encoder_input = video['video'][0].unsqueeze(0).to(device)  # (b, seq_len)

        # check that the batch size is 1
        assert encoder_input.size(
            0) == 1, "Batch size must be 1 for validation"

        model_out = greedy_decode(
            model, encoder_input, None, tokenizer_tgt, max_len, device)

        target_text = video["tgt_text"][0]
        model_out_text = tokenizer_tgt.decode(
            model_out.detach().cpu().numpy())

#             source_texts.append(source_text)
        expected.append(target_text)
        predicted.append(model_out_text)

        # Print the source, target and model output
        print('-----------------------------')
        print(f"TARGET: {target_text}")
        print(f"PREDICTED: {model_out_text}")

        # Compute the word error rate   
        metric = torchmetrics.WordErrorRate()
        wer = metric(predicted, expected)
        print(f"Word Error Rate:{wer}")

In [None]:
video=next(iter(train_dataloader))
run_inference(model=v2t_model, video=video, tokenizer_tgt=target_tokenizer, max_len=15, device=device)