In [1]:
import os
import sys
import json
import av
import cv2
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_frames(video_dir, model):
    """Decode evenly spaced RGB frames from a single video file.

    Args:
        video_dir: Path handed to ``av.open``. Despite the name, this is the
            path of one video file, not of a directory.
        model: Object exposing ``config.encoder.num_frames``; that value is
            the number of frames returned.

    Returns:
        A list of ``model.config.encoder.num_frames`` arrays, each produced by
        ``frame.to_ndarray(format="rgb24")``. When fewer frames can be decoded
        than requested, the last decoded frame is repeated to fill the list.

    Raises:
        IndexError: If frames were requested but none could be decoded.
    """
    clip_len = model.config.encoder.num_frames

    # The context manager closes the container (and its file handle) even if
    # decoding raises; the notebook calls this once per video in a loop.
    with av.open(video_dir) as container:
        # extract evenly spaced frames from video
        seg_len = container.streams.video[0].frames
        # NOTE: if the stream reports 0 frames, every sampled index collapses
        # to 0 and the result is the first frame repeated clip_len times.
        indices = set(
            np.linspace(0, seg_len, num=clip_len, endpoint=False)
            .astype(np.int64)
            .tolist()
        )
        last_index = max(indices) if indices else -1

        frames = []
        for i, frame in enumerate(container.decode(video=0)):
            if i in indices:
                frames.append(frame.to_ndarray(format="rgb24"))
            # Nothing after the largest sampled index is needed, so stop
            # decoding instead of reading the rest of the file.
            if i >= last_index:
                break

    if clip_len > 0 and not frames:
        # Same exception type the bare frames[-1] lookup used to raise, but
        # with a message that identifies the offending file.
        raise IndexError(f"no video frames could be decoded from {video_dir!r}")

    # The stream metadata can over-report the frame count; pad with the last
    # decoded frame so the caller always receives clip_len frames.
    while len(frames) < clip_len:
        frames.append(frames[-1])

    return frames

In [None]:
# Run on the GPU when torch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fetch the pretrained frame preprocessor, text tokenizer, and captioning model
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = VisionEncoderDecoderModel.from_pretrained("Neleac/timesformer-gpt2-video-captioning")
# Move the model weights onto the selected device
model = model.to(device)

In [6]:
# Load video data
# Single definition of the data directory, used for both listing and joining
VIDEO_ROOT = '../../../data/fewshot/raw_video'

# keep only the .mp4 files, sorted so the processing order is reproducible
video_dirs = sorted(name for name in os.listdir(VIDEO_ROOT) if name.endswith(".mp4"))
if video_dirs:
    # guard: indexing an empty listing would raise IndexError
    print(video_dirs[0])
print(f"Evaluating {len(video_dirs)} videos ...")
# extract video ids, excluding the ".mp4" extension; splitext strips only the
# final extension, so a name containing ".mp4" earlier on is not truncated
video_ids = [os.path.splitext(video_dir)[0] for video_dir in video_dirs]

# keyword arguments forwarded unchanged to model.generate
gen_kwargs = {
        "min_length": 20, 
        "max_length": 50, 
        "num_beams": 12,
    }

# one generated caption per video, in the same order as video_dirs
captions = []

for video_dir, video_id in tqdm(zip(video_dirs, video_ids), total=len(video_dirs)):
    file_path = os.path.join(VIDEO_ROOT, video_dir)
    frames = extract_frames(file_path, model)
    # generate caption
    pixel_values = image_processor(frames, return_tensors="pt", padding=True).pixel_values.to(device)
    tokens = model.generate(pixel_values, **gen_kwargs)
    caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
    print("generated:", caption)
    # the value printed under this label is the video id (file name without extension)
    print("ground truth:", video_id)
    captions.append(caption)

6dLGxtbZ-T0_120.000_130.000.mp4
Evaluating 9 videos ...


  return torch.tensor(value)
 11%|█         | 1/9 [00:03<00:29,  3.69s/it]

A person is using a razor to carve a tattoo on the leg of a woman who is lying on a bed.


 22%|██▏       | 2/9 [00:04<00:15,  2.27s/it]

A person is driving a vehicle in a parking lot and then another person is driving a vehicle in a parking lot.


 33%|███▎      | 3/9 [00:06<00:12,  2.02s/it]

A man is sitting in a chair and is talking to another man who is sitting next to him.


 44%|████▍     | 4/9 [00:07<00:08,  1.65s/it]

A man is pushing a sled down a snow covered road while another man is talking in the background.


 56%|█████▌    | 5/9 [00:11<00:08,  2.24s/it]

A man is using a machine to cut a piece of wood into smaller pieces and then puts the pieces back together.


 67%|██████▋   | 6/9 [00:13<00:07,  2.42s/it]

A man is demonstrating how to use a tool to make a portrait of a man in the woods.


 78%|███████▊  | 7/9 [00:15<00:04,  2.02s/it]

A man is sitting at a table and talking about how to play a game of Scrabble.


 89%|████████▉ | 8/9 [00:16<00:01,  1.89s/it]

A group of people are sitting around a table and one of them is sitting on top of a piece of wood.


100%|██████████| 9/9 [00:17<00:00,  1.96s/it]

A person is showing how to use a remote control on a device that is attached to a phone.



