In [9]:
import os
import random
from torch.utils.data import Dataset
import torch
import numpy as np
import json
from transformers import AutoTokenizer
import copy
import os
from config import config

IGNORE_INDEX=-100

class ACGDataset(Dataset):
    
    def __init__(self,root_dir, window = 15, fps = 1,
                 tokenizer_name = config.model.language_model.tokenizer_name, max_token_length=128):
        
        self.root_path = root_dir  
        self.vid_embs_dir = os.path.join(self.root_path, "vid_embs")
        self.commentary_dir = os.path.join(self.root_path, "standardized_transcripts_filtered")
        self.tokenizer_name = tokenizer_name
        self.window = window 
        
        # the video embeddings are sampled at two embedding per second. 
        #need to hcange the below parameter as req
        self.fps = fps
        
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,use_auth_token=True)
        self.tokenizer.pad_token_id = 128001
        self.tokenizer.add_tokens(["[BATSMAN]","[BOWLER]", "[FIELDER]", 
                                   "[UMPIRE]", "[VENUE]" ], special_tokens=True)
        self.max_token_length = max_token_length
        
        self.vid_ids = self._get_valid_vidids()
        
        
    def __getitem__(self,idx):
        
        vid_id = self.vid_ids[idx]
        
        #load npy file
        features_path = os.path.join(self.vid_embs_dir, vid_id+"_embeddings.npy")
        features = np.load(features_path) # Time, Patches(256+cls), Dimension
        features = self._resample_features(features,self.fps)
        
        
        #load commentary for hte vid file
        commentary = self._get_vid_commentary(vid_id)
        
        tokens = self.tokenizer(commentary,return_tensors="pt", 
                                max_length=self.max_token_length,truncation=True).input_ids[0]
        
    
        return {'vid_features': features, "tokens": tokens, "commentary": commentary}
  
        
    def __len__(self):
        return len(self.vid_ids)
    
    def _resample_features(self, features, fps, time_step=0.5):
        # Calculate the new time interval between samples
        new_interval = time_step / fps  # How often to sample (in seconds)
        
        # Calculate the index step (how many original time steps to skip)
        step = int(1 / new_interval)
        
        # Create an array of indices for resampling based on fps
        new_time_indices = np.arange(0, features.shape[0], step)
        
        # Return the features sampled at the new time indices
        return features[new_time_indices]
    
    def _get_vid_commentary(self, vid_id):
        """
        Reads and returns the text from the commentary file associated with the given vid_id.

        Args:
            vid_id (str): The video ID whose commentary is to be fetched.

        Returns:
            str: The content of the commentary file.

        Raises:
            FileNotFoundError: If the .txt file for the given vid_id does not exist.
        """
        
        file_path = os.path.join(self.commentary_dir, f"{vid_id}.txt")
        
        
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Commentary file not found for vid_id: {vid_id}")

        
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    


    def _get_valid_vidids(self):
        """
        Finds video IDs (vidids) in the given directory for which both .txt and _embeddings.npy files are present.

        Args:
            directory (str): Path to the directory containing the files.

        Returns:
            list: List of vidids with both .txt and _embeddings.npy files present.
        """
        
        
        
        # Get the list of files in both directories
        vid_files = os.listdir(self.vid_embs_dir)
        com_files = os.listdir(self.commentary_dir)
        
        # Get the base filenames without extensions for .npy files
        emb_files = {os.path.splitext(f)[0].split('_embeddings')[0] for f in vid_files if f.endswith('_embeddings.npy')}
        
        # Get the base filenames without extensions for .txt files
        com_files = {os.path.splitext(f)[0] for f in com_files if f.endswith('.txt')}
        
      
        # Find the intersection of the two sets
        valid_vidids = emb_files.intersection(com_files)
        
        
        return list(valid_vidids)
    
    
    def collator(self,batch):
        
        out_batch= {}

        input_ids = [
            torch.cat((torch.tensor([self.tokenizer.convert_tokens_to_ids("<|begin_of_text|>")]),
                        instance["tokens"],
                        torch.tensor([self.tokenizer.convert_tokens_to_ids("<|end_of_text|>")]))) for instance in batch] 

        labels = copy.deepcopy(input_ids)
        
        labels = torch.nn.utils.rnn.pad_sequence(
            labels,
            batch_first=True,
            padding_value=IGNORE_INDEX)

        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.tokenizer.convert_tokens_to_ids("<|end_of_text|>"))

        attention_mask=input_ids.ne(self.tokenizer.convert_tokens_to_ids("<|end_of_text|>"))
        
        vid_features = torch.nn.utils.rnn.pad_sequence(
            [torch.from_numpy(instance['vid_features']) for instance in batch], batch_first=True )
        
        commentaries = [instance['commentary'] for instance in batch]
        
       


        # if 'vid_features' in batch[0]:
        #     features = [torch.from_numpy(instance['vid_features']) for instance in batch]
        # if all(x is not None and x.shape == features[0].shape for x in features):
        #     out_batch['vid_features'] = torch.stack(features)
        # else:
        #     out_batch['vid_features'] = features
        
        
            
        out_batch['vid_features'] = vid_features
        out_batch['input_ids']=input_ids
        out_batch['attention_mask'] =attention_mask
        out_batch['labels']=labels
        out_batch['commentary'] = commentaries
        return out_batch
     

In [10]:
ds = ACGDataset("/home/hoffman/Documents/UT/Stuffs/Applied ML/project/data/pre_3")



In [11]:
batch = [ds[4], ds[6], ds[43]]

collated_batch =  ds.collator(batch)

In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch import nn
import einops
import contextlib
from Qformer import BertConfig, BertLMHeadModel
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList
from typing import List
import pickle as pkl
import sys
import io

from config import config 


def process_output_tokens(predict_model, tokens):
    output_texts = []
    for output_token in tokens:
        output_text = predict_model.tokenizer.decode(output_token)
        end_token_index = output_text.find('<|end_of_text|>')
        if end_token_index != -1:
            output_text = output_text[:end_token_index]
        output_texts.append(output_text)
    return output_texts
    
class LayerNorm(nn.LayerNorm):
    def forward(self, x: torch.Tensor):
        orig_type = x.dtype
        ret = super().forward(x.type(torch.float32))
        return ret.type(orig_type)
        

class OPTModel(nn.Module):
    def __init__(self, max_frame_pos=128, 
                 window=30, num_query_tokens=32, 
                 num_video_query_token=32, num_features=1024, 
                 device = "cuda", inference=False):
        
        super().__init__()
        
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(config.model.language_model.tokenizer_name)
        #self.tokenizer.add_tokens(["[PLAYER]","[TEAM]","([TEAM])"], special_tokens=True)
        self.opt_model = AutoModelForCausalLM.from_pretrained(config.model.language_model.llm_name, torch_dtype=torch.bfloat16)
        #self.opt_model.resize_token_embeddings(len(self.tokenizer))
        
        self.eos_token_id = self.tokenizer(
            "\n", add_special_tokens=False
        ).input_ids[0]
        
        #self.llama_model.parallelize()
        
        self.ln_vision = LayerNorm(num_features)
        self.num_query_tokens = num_query_tokens,
        self.num_video_query_token = num_video_query_token
        self.inference = inference
        
        
         # Initialize video Q-former
        self.video_Qformer,self.video_query_tokens = self.init_video_Qformer(num_query_token = num_video_query_token,
                                                                             vision_width=num_features,
                                                                             num_hidden_layers =2)
        self.video_Qformer.cls = None
        self.video_Qformer.bert.embeddings.word_embeddings = None
        self.video_Qformer.bert.embeddings.position_embeddings = None
        for layer in self.video_Qformer.bert.encoder.layer:
            layer.output = None
            layer.intermediate = None

        # llama projection
        self.opt_proj = nn.Linear(
            self.video_Qformer.config.hidden_size, 512
        )
        # video frame positional embedding
        self.video_frame_position_embedding = nn.Embedding(max_frame_pos, num_features)
        self.window = window

        # move to device
        self.opt_model = self.opt_model.to(self.device)
        for name, param in self.opt_model.named_parameters():
            param.requires_grad = False
        self.video_Qformer = self.video_Qformer.to(self.device)
        self.opt_proj = self.opt_proj.to(self.device)
        self.ln_vision = self.ln_vision.to(self.device)
        for name, param in self.ln_vision.named_parameters():
            param.requires_grad = False
        self.ln_vision = self.ln_vision.eval()
        self.video_frame_position_embedding = self.video_frame_position_embedding.to(self.device)

        
        
    
    @classmethod
    def init_video_Qformer(cls, num_query_token, vision_width, num_hidden_layers =2):
        encoder_config = BertConfig.from_pretrained("bert-base-uncased")
        encoder_config.num_hidden_layers = num_hidden_layers
        encoder_config.encoder_width = vision_width
        # insert cross-attention layer every other block
        encoder_config.add_cross_attention = True
        encoder_config.cross_attention_freq = 1
        encoder_config.query_length = num_query_token
        Qformer = BertLMHeadModel(config=encoder_config)
        query_tokens = nn.Parameter(
            torch.zeros(1, num_query_token, encoder_config.hidden_size)
        )
        query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range)
        return Qformer, query_tokens
    
    def forward(self, batch, validating=False):
        
        video_features = batch['vid_features'].to(self.device) #B,T,P,D or [T,P,D]
        input_ids= batch['input_ids']#Bxmax(T)
        atts_opt = batch['attention_mask']  #Bxmax(T)
        targets= batch['labels'] #Bxmax(T)
        

            
        batch_size, time_length, _, _ = video_features.size()
        
        video_features = self.ln_vision(video_features)
        
        video_features = einops.rearrange(video_features, 'b t n f -> (b t) n f', b=batch_size, t=time_length)
        position_ids = torch.arange(time_length, dtype=torch.long, device=video_features.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
        
        frame_position_embeddings = self.video_frame_position_embedding(position_ids)
        frame_position_embeddings = frame_position_embeddings.unsqueeze(-2)
        
        frame_hidden_state = einops.rearrange(video_features, '(b t) n f -> b t n f',b=batch_size,t=time_length)
        frame_hidden_state = frame_position_embeddings + frame_hidden_state
        
        frame_hidden_state =  einops.rearrange(frame_hidden_state, 'b t q h -> b (t q) h',b=batch_size,t=time_length)
        
        
        frame_atts = torch.ones(frame_hidden_state.size()[:-1], dtype=torch.long).to(frame_hidden_state)
        
        video_query_tokens = self.video_query_tokens.expand(frame_hidden_state.shape[0], -1, -1).to(frame_hidden_state.device)
        
        video_query_output = self.video_Qformer.bert(
            query_embeds=video_query_tokens,
            encoder_hidden_states=frame_hidden_state,
            encoder_attention_mask=frame_atts,
            return_dict=True,
        )
        
        video_hidden = video_query_output.last_hidden_state
        
        inputs_opt = self.opt_proj(video_hidden)
        #inputs_embeds = self.opt_model.model.decoder.embed_tokens(input_ids)
        
        if self.inference:
            return self.generate_text(inputs_opt)
        
        if validating:
            response = self.generate_text(inputs_opt)
            return response, batch['commentary']
        
        #atts_llama
        
        visual_label = torch.full((batch_size, self.num_video_query_token), -100, dtype=targets.dtype)
        concat_targets = torch.cat((visual_label, targets), dim=1).to(self.device)
        temp_input_ids = input_ids.clone().to(self.device)
        targets_embeds = self.opt_model.model.decoder.embed_tokens(temp_input_ids)
        
        embedding_cat = torch.cat((inputs_opt, targets_embeds), dim=1)
        mask_prefix = torch.ones(batch_size, self.num_video_query_token, dtype=atts_opt.dtype)
        mask = torch.concat((mask_prefix, atts_opt), dim=1).to(self.device)
        
        
        original_stdout = sys.stdout
        sys.stdout = io.StringIO()
        
        with self.maybe_autocast():
            outputs = self.opt_model(
                inputs_embeds=embedding_cat,
                attention_mask=mask,
                return_dict=True,
                labels=concat_targets,
            )
            
        sys.stdout = original_stdout
        loss = outputs.loss
        return loss
    

        
    # def generate_text(self, inputs_opt):
    #     start_embeds = self.opt_model.model.decoder.embed_tokens(torch.tensor([128000]).to(self.device))
    #     inputs_llama_with_s = torch.cat([inputs_opt, start_embeds.expand(inputs_opt.size(0), -1, -1)], dim=1).to(dtype=torch.bfloat16)
    #     temp_res_tokens = self.opt_model.generate(
    #         renormalize_logits=True,
    #         inputs_embeds=inputs_llama_with_s,
    #         max_new_tokens=128,
    #         num_beams=5,
    #         do_sample=True,
    #         min_length=5,
    #         top_p=0.9,
    #         repetition_penalty=1.0,
    #         length_penalty=1,
    #         temperature=1.0,
    #     )
    #     res_text = process_output_tokens(self, temp_res_tokens)
    #     return res_text
        
        
    
    def maybe_autocast(self, dtype=torch.float16):
        enable_autocast = self.device != torch.device("cpu")
        if enable_autocast:
            return torch.cuda.amp.autocast(dtype=dtype)
        else:
            return contextlib.nullcontext()
        
        
    def generate_text(self, inputs_opt):
        start_embeds = self.opt_model.model.embed_tokens(torch.tensor([128000]).to(self.device))
        # inputs_opt_with_s = torch.cat([inputs_opt, start_embeds.expand(inputs_opt.size(0), -1, -1)], dim=1).to(dtype=torch.bfloat16)
        # temp_res_tokens = self.opt_model.generate(
        #     renormalize_logits=True,
        #     inputs_embeds=inputs_opt_with_s,
        #     max_new_tokens=128,
        #     num_beams=5,
        #     do_sample=True,
        #     min_length=5,
        #     top_p=0.9,
        #     repetition_penalty=1.0,
        #     length_penalty=1,
        #     temperature=1.0,
        # )
        # res_text = process_output_tokens(self, temp_res_tokens)
        # return res_text
        return None

    

        


In [44]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch import nn
import einops
import contextlib
from Qformer import BertConfig, BertLMHeadModel
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList
from typing import List
import pickle as pkl
import sys
import io

from config import config 
from transformers import BertTokenizer


def process_output_tokens(predict_model, tokens):
    output_texts = []
    for output_token in tokens:
        output_text = predict_model.tokenizer.decode(output_token)
        end_token_index = output_text.find('<|end_of_text|>')
        if end_token_index != -1:
            output_text = output_text[:end_token_index]
        output_texts.append(output_text)
    return output_texts
    
class LayerNorm(nn.LayerNorm):
    def forward(self, x: torch.Tensor):
        orig_type = x.dtype
        ret = super().forward(x.type(torch.float32))
        return ret.type(orig_type)
        

class OPTModel(nn.Module):
    def __init__(self, max_frame_pos=128, 
                 window=30, num_query_tokens=32, 
                 num_video_query_token=32, num_features=1024, 
                 device = "cuda", inference=False):
        
        super().__init__()
        
        self.device = device
        self.opt_tokenizer = AutoTokenizer.from_pretrained(config.model.language_model.tokenizer_name)
        #self.opt_tokenizer.add_tokens(["[PLAYER]","[TEAM]","([TEAM])"], special_tokens=True)
        self.opt_model = AutoModelForCausalLM.from_pretrained(config.model.language_model.llm_name, torch_dtype=torch.bfloat16)
        #self.opt_model.resize_token_embeddings(len(self.tokenizer))
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side=truncation_side)
        tokenizer.add_special_tokens({"bos_token": "[DEC]"})
        
        self.eos_token_id = self.opt_tokenizer(
            "\n", add_special_tokens=False
        ).input_ids[0]
        
        #self.llama_model.parallelize()
        
        self.ln_vision = LayerNorm(num_features)
        self.num_query_tokens = num_query_tokens,
        self.num_video_query_token = num_video_query_token
        self.inference = inference
        
        
         # Initialize video Q-former
        self.video_Qformer,self.video_query_tokens = self.init_video_Qformer(num_query_token = num_video_query_token,
                                                                             vision_width=num_features,
                                                                             num_hidden_layers =2)
        self.video_Qformer.cls = None
        self.video_Qformer.bert.embeddings.word_embeddings = None
        self.video_Qformer.bert.embeddings.position_embeddings = None
        for layer in self.video_Qformer.bert.encoder.layer:
            layer.output = None
            layer.intermediate = None

        # llama projection
        self.opt_proj = nn.Linear(
            self.video_Qformer.config.hidden_size, self.opt_model.config.hidden_size
        )
        # video frame positional embedding
        self.video_frame_position_embedding = nn.Embedding(max_frame_pos, num_features)
        self.window = window

        # move to device
        self.opt_model = self.opt_model.to(self.device)
        for name, param in self.opt_model.named_parameters():
            param.requires_grad = False
        self.video_Qformer = self.video_Qformer.to(self.device)
        self.opt_proj = self.opt_proj.to(self.device)
        self.ln_vision = self.ln_vision.to(self.device)
        for name, param in self.ln_vision.named_parameters():
            param.requires_grad = False
        self.ln_vision = self.ln_vision.eval()
        self.video_frame_position_embedding = self.video_frame_position_embedding.to(self.device)

        
        
    
    @classmethod
    def init_video_Qformer(cls, num_query_token, vision_width, num_hidden_layers =2):
        encoder_config = BertConfig.from_pretrained("bert-base-uncased")
        encoder_config.num_hidden_layers = num_hidden_layers
        encoder_config.encoder_width = vision_width
        # insert cross-attention layer every other block
        encoder_config.add_cross_attention = True
        encoder_config.cross_attention_freq = 1
        encoder_config.query_length = num_query_token
        Qformer = BertLMHeadModel(config=encoder_config)
        query_tokens = nn.Parameter(
            torch.zeros(1, num_query_token, encoder_config.hidden_size)
        )
        query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range)
        return Qformer, query_tokens
    
    def forward(self, batch, validating=False):
        
        video_features = batch['vid_features'].to(self.device) #B,T,P,D or [T,P,D]
        input_ids= batch['input_ids']#Bxmax(T)
        atts_opt = batch['attention_mask']  #Bxmax(T)
        targets= batch['labels'] #Bxmax(T)
        commentary = batch['commentary']

            
        batch_size, time_length, _, _ = video_features.size()
        
        video_features = self.ln_vision(video_features)
        
        video_features = einops.rearrange(video_features, 'b t n f -> (b t) n f', b=batch_size, t=time_length)
        position_ids = torch.arange(time_length, dtype=torch.long, device=video_features.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
        
        frame_position_embeddings = self.video_frame_position_embedding(position_ids)
        frame_position_embeddings = frame_position_embeddings.unsqueeze(-2)
        
        frame_hidden_state = einops.rearrange(video_features, '(b t) n f -> b t n f',b=batch_size,t=time_length)
        frame_hidden_state = frame_position_embeddings + frame_hidden_state
        
        frame_hidden_state =  einops.rearrange(frame_hidden_state, 'b t q h -> b (t q) h',b=batch_size,t=time_length)
        
        
        frame_atts = torch.ones(frame_hidden_state.size()[:-1], dtype=torch.long).to(frame_hidden_state)
        
        video_query_tokens = self.video_query_tokens.expand(frame_hidden_state.shape[0], -1, -1).to(frame_hidden_state.device)
        
        video_query_output = self.video_Qformer.bert(
            query_embeds=video_query_tokens,
            encoder_hidden_states=frame_hidden_state,
            encoder_attention_mask=frame_atts,
            return_dict=True,
        )
        
        video_hidden = video_query_output.last_hidden_state
        
        inputs_opt = self.opt_proj(video_hidden)
        #inputs_embeds = self.opt_model.model.decoder.embed_tokens(input_ids)
        
        # targets = opt_tokens.input_ids.masked_fill(
        #     input_ids == self.opt_tokenizer.pad_token_id, -100
        # ).to(self.device)
        
        # empty_targets = (
        #     torch.ones(atts_opt.size(), dtype=torch.long).to(self.device).fill_(-100)
        # )
        
        # targets = torch.cat([empty_targets, targets], dim=1)
        
        
        
        
        
        if self.inference:
            return self.generate_text(inputs_opt)
        
        if validating:
            response = self.generate_text(inputs_opt)
            return response, batch['commentary']
        
        #atts_llama
        
        visual_label = torch.full((batch_size, self.num_video_query_token), -100, dtype=targets.dtype)
        concat_targets = torch.cat((visual_label, targets), dim=1).to(self.device)
        
        temp_input_ids = input_ids.clone().to(self.device)
        targets_embeds = self.opt_model.model.decoder.embed_tokens(temp_input_ids)
        
        embedding_cat = torch.cat((inputs_opt, targets_embeds), dim=1)
        mask_prefix = torch.ones(batch_size, self.num_video_query_token, dtype=atts_opt.dtype)
        mask = torch.concat((mask_prefix, atts_opt), dim=1).to(self.device)
        
        
        original_stdout = sys.stdout
        sys.stdout = io.StringIO()
        
        with self.maybe_autocast():
            outputs = self.opt_model(
                inputs_embeds=embedding_cat,
                attention_mask=mask,
                return_dict=True,
                labels=concat_targets,
            )
            
        sys.stdout = original_stdout
        loss = outputs.loss
        return loss
    

        
    def generate_text(self, inputs_opt,
        use_nucleus_sampling=False,
        num_beams=5,
        max_length=30,
        min_length=1,
        top_p=0.9,
        repetition_penalty=1.0,
        length_penalty=1.0,
        num_captions=1,
        temperature=1,):
        
        batch_size = inputs_opt.shape[0]
        mask_prefix = torch.ones(batch_size, self.num_video_query_token, dtype=atts_opt.dtype).to(self.device)
        
        outputs = self.opt_model.generate(
                inputs_embeds=inputs_opt, 
                attention_mask=mask_prefix,
                do_sample=use_nucleus_sampling,
                top_p=top_p,
                temperature=temperature,
                num_beams=num_beams,
                max_length=max_length,
                min_length=min_length,
                eos_token_id=self.eos_token_id,
                repetition_penalty=repetition_penalty,
                length_penalty=length_penalty,
                num_return_sequences=num_captions,
            )
        
         output_text = self.opt_tokenizer.batch_decode(
                outputs, skip_special_tokens=True
            )
         
         output_text = [text.strip() for text in output_text]
         return output_text
        
       
        
        
    
    def maybe_autocast(self, dtype=torch.float16):
        enable_autocast = self.device != torch.device("cpu")
        if enable_autocast:
            return torch.cuda.amp.autocast(dtype=dtype)
        else:
            return contextlib.nullcontext()
        
        
    # def generate_text(self, inputs_opt):
    #     start_embeds = self.opt_model.model.embed_tokens(torch.tensor([128000]).to(self.device))
    #     # inputs_opt_with_s = torch.cat([inputs_opt, start_embeds.expand(inputs_opt.size(0), -1, -1)], dim=1).to(dtype=torch.bfloat16)
    #     # temp_res_tokens = self.opt_model.generate(
    #     #     renormalize_logits=True,
    #     #     inputs_embeds=inputs_opt_with_s,
    #     #     max_new_tokens=128,
    #     #     num_beams=5,
    #     #     do_sample=True,
    #     #     min_length=5,
    #     #     top_p=0.9,
    #     #     repetition_penalty=1.0,
    #     #     length_penalty=1,
    #     #     temperature=1.0,
    #     # )
    #     # res_text = process_output_tokens(self, temp_res_tokens)
    #     # return res_text
    #     return None

    

        


IndentationError: unexpected indent (2705187739.py, line 234)

In [41]:
video_features = collated_batch['vid_features']

In [42]:
video_features.shape

torch.Size([3, 22, 257, 1024])

In [37]:
batch_size, time_length, _, _ = video_features.size()

In [38]:
video_features = einops.rearrange(video_features, 'b t n f -> (b t) n f', b=batch_size, t=time_length)



frame_hidden_state = einops.rearrange(video_features, '(b t) n f -> b t n f',b=batch_size,t=time_length)

frame_hidden_state =  einops.rearrange(frame_hidden_state, 'b t q h -> b (t q) h',b=batch_size,t=time_length)


frame_atts = torch.ones(frame_hidden_state.size()[:-1], dtype=torch.long).to(frame_hidden_state)

TypeError: ne() received an invalid combination of arguments - got (torch.Size, dtype=torch.dtype), but expected one of:
 * (Tensor input, Tensor other, *, Tensor out = None)
 * (Tensor input, Number other, *, Tensor out = None)


In [39]:
frame_hidden_state.shape

torch.Size([3, 5654, 1024])

In [33]:
frame_atts.shape

torch.Size([3, 5654])

In [45]:
model  = OPTModel()

out = model(collated_batch, validating=True)

  return torch.load(checkpoint_file, map_location=map_location)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [9]:
        




out = model(collated_batch)

print(out)

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50268. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


torch.Size([3, 3341, 1024])


  return torch.cuda.amp.autocast(dtype=dtype)


tensor(10.4282, device='cuda:0', grad_fn=<NllLossBackward0>)


  return torch.load(checkpoint_file, map_location=map_location)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [6]:



from dataset import ACGDataset

ds = ACGDataset(config.data.train_path)

batch = [ds[0], ds[1], ds[3], ds[4]]

collated_batch = ds.collator(batch)


# Initialize the model
model = MyModel(
    max_frame_pos=128,
    window=30,
    num_query_tokens=32,
    num_video_query_token=32,
    num_features=1024,
    device="cuda",  # Change to "cpu" if no GPU is available
    inference=False
)

# Perform a forward pass
out = model(collated_batch)

# Print the output
print(out)

  return torch.load(checkpoint_file, map_location=map_location)
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50268. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


TypeError: cannot assign 'torch.cuda.FloatTensor' as parameter 'video_query_tokens' (torch.nn.Parameter or None expected)