In [1]:
# Install decord and other video processing libraries
!pip install decord
!pip install av
!pip install pyav
!pip install opencv-python
!pip install moviepy

Collecting decord
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: decord
Successfully installed decord-0.6.0
Collecting av
  Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl (40.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av
Successfully installed av-16.0.1
[31mERROR: Could not find a version that satisfies the requirement pyav (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pyav[0m[31m


In [19]:
import bisect
import copy
import os
import re
import shutil
import time
import warnings

import numpy as np
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from decord import VideoReader, cpu
from huggingface_hub import snapshot_download, hf_hub_download
from PIL import Image

In [3]:
# Task registry: maps a task name to a 4-tuple that is unpacked further down as
# (annotation_fn, video_dir, video_type, has_clip):
#   annotation_fn - annotation JSON file name (downloaded from the "json/" folder of the repo)
#   video_dir     - directory of the task's media; its first path component names the zip archive
#   video_type    - "video" or "frame" (the "frame" entry is annotated below as "read frame")
#   has_clip      - True on the entries annotated below as having start & end timestamps
data_list = {
    "Action Sequence": ("action_sequence.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Action Prediction": ("action_prediction.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Action Antonym": ("action_antonym.json", "ssv2_video/", "video", False),
    "Fine-grained Action": ("fine_grained_action.json", "Moments_in_Time_Raw/videos/", "video", False),
    "Unexpected Action": ("unexpected_action.json", "FunQA_test/test/", "video", False),
    "Object Existence": ("object_existence.json", "clevrer/video_validation/", "video", False),
    "Object Interaction": ("object_interaction.json", "star/Charades_v1_480/", "video", True), # has start & end
    "Object Shuffle": ("object_shuffle.json", "perception/videos/", "video", False),
    "Moving Direction": ("moving_direction.json", "clevrer/video_validation/", "video", False),
    "Action Localization": ("action_localization.json", "sta/sta_video/", "video", True),  # has start & end
    "Scene Transition": ("scene_transition.json", "scene_qa/video/", "video", False),
    "Action Count": ("action_count.json", "perception/videos/", "video", False),
    "Moving Count": ("moving_count.json", "clevrer/video_validation/", "video", False),
    "Moving Attribute": ("moving_attribute.json", "clevrer/video_validation/", "video", False),
    "State Change": ("state_change.json", "perception/videos/", "video", False),
    "Fine-grained Pose": ("fine_grained_pose.json", "nturgbd/", "video", False),
    "Character Order": ("character_order.json", "perception/videos/", "video", False),
    "Egocentric Navigation": ("egocentric_navigation.json", "vlnqa/", "video", False),
    "Episodic Reasoning": ("episodic_reasoning.json", "tvqa/frames_fps3_hq/", "frame", True),  # has start & end, read frame
    "Counterfactual Inference": ("counterfactual_inference.json", "clevrer/video_validation/", "video", False),
}

# Local folder that receives the downloaded annotation files and video archives.
data_dir = "dataset"
# Create the folder from `data_dir` itself so the path is defined in one place;
# exist_ok=True makes re-running the cell a no-op when the folder is already there.
os.makedirs(data_dir, exist_ok=True)

def read_video_pyav(video_path, start, end, n_frames=8):
    """
    Reads a video for a given start-end timestamp interval
    and uniformly samples `n_frames` frames from it.

    Args:
        video_path: Path of the video file, passed to `av.open`.
        start: Start of the interval, compared against packet timestamps in whole seconds.
        end: End of the interval, in the same unit as `start`.
        n_frames: Number of frames to return.

    Returns:
        The sampled frames stacked along a new first axis (one rgb24 array per
        sampled index). An index that is sampled more than once contributes
        the same frame more than once.

    Raises:
        AssertionError: if a sampled index could not be decoded (for example
            when the video contains no frames).
    """
    # PyAV is not imported by the notebook's import cell, so bring it into scope here.
    import av

    # The context manager closes the container even if decoding fails.
    with av.open(video_path) as container:
        # Use the first video stream, the same stream `decode(video=0)` reads below.
        video = container.streams.video[0]

        # Packet timestamps truncated to whole seconds; packets without a pts are skipped.
        av_timestamps = sorted(
            int(packet.pts * video.time_base)
            for packet in container.demux(video)
            if packet.pts is not None
        )

        start_id = bisect.bisect_left(av_timestamps, start)
        end_id = bisect.bisect_left(av_timestamps, end)

        # in case it is a very short video, lets take a longer duration and sample
        if end_id - start_id < 10:
            end_id += 10
            start_id -= 10

        end_id = min(len(av_timestamps) - 1, end_id)
        # Never let the window start after its end (happens for very short videos).
        start_id = min(max(1, start_id), end_id)

        # We sample n_frames frames for tuning following the original paper
        # But we can increase the number of frames for longer videos and check out if it helps performance
        # Change "n_frames" to any number of frames you want, and note that more frames -> more computational resources needed
        indices = np.linspace(start_id, end_id, n_frames).astype(int).tolist()
        wanted = set(indices)

        # Decode once and keep each wanted frame by its index, so an index that
        # linspace repeats (window shorter than n_frames) can be emitted repeatedly.
        decoded = {}
        container.seek(0)
        for i, frame in enumerate(container.decode(video=0)):
            if i > end_id:
                break
            if i in wanted:
                decoded[i] = frame.to_ndarray(format="rgb24")

    frames = [decoded[i] for i in indices if i in decoded]
    assert len(frames) == n_frames, f"Got {len(frames)} frames but should be {n_frames}. Check the indices: {indices};, start_id: {start_id}, end_id: {end_id}. Len of video is {len(av_timestamps)} frames."
    return np.stack(frames)

def collate_read_video(example, path):
    """Decode the clip for one example and store it under ``example["clip"]``.

    The video is looked up at ``{path}/{example["video"]}``. When the example has
    no "start" / "end" entries, the interval defaults to start=1 and a very large
    end timestamp. The same (mutated) example is returned.
    """
    video_file = f'{path}/{example["video"]}'
    # Some datasets provide a start-end interval; otherwise use the fallbacks.
    interval_start = example.get("start", 1)
    interval_end = example.get("end", 1e+10)
    example["clip"] = read_video_pyav(video_file, interval_start, interval_end)
    return example

# Task to evaluate; must be one of the keys of data_list.
TASK_NAME = "Moving Count"
annotation_fn, video_dir, video_type, has_clip = data_list[TASK_NAME]

# Download the task's annotation file from the MVBench dataset repo.
annotation_fn_local = hf_hub_download(
    repo_id="OpenGVLab/MVBench",
    filename='json/' + annotation_fn,
    repo_type="dataset",
    local_dir=data_dir
)

# The archive is named after the first path component of video_dir
# (e.g. "clevrer/video_validation/" -> "clevrer.zip").
video_zip_name = video_dir.split("/")[0] + ".zip"
videos_zip = hf_hub_download(
    repo_id="OpenGVLab/MVBench",
    filename='video/' + video_zip_name,
    repo_type="dataset",
    local_dir=data_dir
)

# Unpack only the archive this task needs, using the path returned by the
# download, instead of re-extracting every zip file found in the folder.
shutil.unpack_archive(videos_zip, f"{data_dir}/video/videos_unzipped/")

ds = load_dataset("json", data_files=annotation_fn_local, split="train")


def _video_exists(sample):
    """Return True when the sample's video file is present in the unpacked archive."""
    return os.path.exists(f"{data_dir}/video/videos_unzipped/{video_dir}/{sample['video']}")


# Report every annotated video that is missing on disk.
has_missing = False
for sample in ds:
    if not _video_exists(sample):
        print(f"Video `{sample['video']}` does not exists!")
        has_missing = True

# Length before filtering ...
print(f"Dataset length = {len(ds)}")
if has_missing:
    ds = ds.filter(_video_exists)

# ... and after dropping samples whose video is missing.
print(f"Dataset length = {len(ds)}")

moving_count.json: 0.00B [00:00, ?B/s]

video/clevrer.zip:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset length = 200
Dataset length = 200


In [4]:
# Display the dataset summary (feature names and number of rows) as the cell output.
ds

Dataset({
    features: ['video', 'question', 'candidates', 'answer'],
    num_rows: 200
})

In [5]:
from pathlib import Path

# Root folder into which the downloaded video archive was unpacked.
unzipped_dir = f"{data_dir}/video/videos_unzipped"
# Last path component of the task's video directory
# (e.g. "video_validation" for "clevrer/video_validation/").
video_subdir = Path(video_dir).name
# Folder that holds this task's videos. It is built from every component of
# video_dir so it follows the selected task; a fixed "star" prefix is only
# correct for tasks whose videos live in the "star" archive.
video_path = os.path.join(unzipped_dir, *Path(video_dir).parts)

In [12]:
# Pick the sample at index 1; its path is later handed to generate_response
# for a single-example run before the full evaluation loop.
video_filename = ds[1]['video']
# NOTE(review): video_dir values in data_list end with "/", so this path contains
# a doubled slash — appears harmless for file lookup, confirm.
full_video_path = f"{unzipped_dir}/{video_dir}/{video_filename}"

In [13]:
# Question text of the same sample (index 1); used further down to build the model prompt.
prompt = ds[1]['question']

In [30]:
import torch
from transformers import AutoProcessor, Idefics3ForConditionalGeneration
from PIL import Image
import cv2
import numpy as np
from typing import List
import logging

class VideoFrameExtractor:
    """Samples about one frame per second from a video as square RGB PIL images.

    Args:
        max_frames: Upper bound on the number of frames ``extract_frames`` returns.
        target_size: Side length, in pixels, of the square frames that are produced.
    """

    def __init__(self, max_frames: int = 50, target_size: int = 384):
        self.max_frames = max_frames
        self.target_size = target_size

    def resize_and_center_crop(self, image: Image.Image, target_size: int) -> Image.Image:
        """Scale the shorter side of ``image`` to ``target_size`` and center-crop a square.

        Args:
            image: Source image.
            target_size: Side length of the returned square image.

        Returns:
            A ``target_size`` x ``target_size`` crop taken from the center of the resized image.
        """
        # Get current dimensions
        width, height = image.size

        # Calculate new dimensions keeping aspect ratio. The longer side is clamped
        # to at least target_size: float round-off in the scaling could otherwise
        # truncate it to target_size - 1 and make the crop reach outside the image.
        if width < height:
            new_width = target_size
            new_height = max(target_size, int(height * (target_size / width)))
        else:
            new_height = target_size
            new_width = max(target_size, int(width * (target_size / height)))

        # Resize
        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        # Center crop
        left = (new_width - target_size) // 2
        top = (new_height - target_size) // 2
        right = left + target_size
        bottom = top + target_size

        return image.crop((left, top, right, bottom))

    def extract_frames(self, video_path: str) -> List[Image.Image]:
        """Read frames spaced one second apart, capped at ``max_frames``.

        Args:
            video_path: Path of the video file to open with OpenCV.

        Returns:
            The frames that could be read, converted to RGB, resized and
            center-cropped to ``target_size``. Frames whose read fails are skipped,
            so the list may be shorter than requested (or empty).

        Raises:
            ValueError: if the video cannot be opened.
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError(f"Could not open video: {video_path}")

        # try/finally guarantees the capture is released even if decoding raises.
        try:
            # Get video properties
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = cap.get(cv2.CAP_PROP_FPS)

            # One frame per second. When the reported fps is below 1 (or missing),
            # fall back to a step of 1 because range() rejects a zero step.
            step = int(fps) if fps >= 1 else 1
            frame_indices = list(range(0, total_frames, step))

            # If we have more frames than max_frames, sample evenly
            if len(frame_indices) > self.max_frames:
                indices = np.linspace(0, len(frame_indices) - 1, self.max_frames, dtype=int)
                frame_indices = [frame_indices[i] for i in indices]

            frames = []
            for frame_idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
                ret, frame = cap.read()
                if ret:
                    # OpenCV delivers BGR; PIL expects RGB.
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    pil_image = Image.fromarray(frame)
                    pil_image = self.resize_and_center_crop(pil_image, self.target_size)
                    frames.append(pil_image)
        finally:
            cap.release()

        return frames

def load_model(checkpoint_path: str, base_model_id: str = "HuggingFaceTB/SmolVLM-Instruct", device: str = "cuda"):
    """Load the model and its processor, configured for pre-sized video frames.

    Args:
        checkpoint_path: Location of fine-tuned weights. Any falsy value
            (e.g. ``None`` or ``""``) loads the weights of ``base_model_id`` instead.
        base_model_id: Model id the processor is always loaded from.
        device: Value forwarded as ``device_map`` when loading the model.

    Returns:
        A ``(model, processor)`` tuple.
    """
    # The processor always comes from the base model, also for fine-tuned weights.
    processor = AutoProcessor.from_pretrained(base_model_id)

    # A truthy checkpoint path takes precedence over the base model id.
    weights_source = checkpoint_path if checkpoint_path else base_model_id
    model = Idefics3ForConditionalGeneration.from_pretrained(
        weights_source,
        torch_dtype=torch.bfloat16,
        device_map=device,
    )

    # Configure the image processor for video frames: fixed size, no resizing, no splitting.
    image_processor = processor.image_processor
    image_processor.size = (384, 384)
    image_processor.do_resize = False
    image_processor.do_image_splitting = False

    return model, processor

def generate_response(model, processor, video_path: str, question: str, max_frames: int = 50):
    """Answer ``question`` about the video at ``video_path`` and time the generation.

    Args:
        model: Model exposing ``generate`` and ``device``.
        processor: Processor used to build the chat prompt, encode the inputs
            and decode the output.
        video_path: Path of the video to sample frames from.
        question: Question text placed after the frames in the user turn.
        max_frames: Maximum number of frames sampled from the video.

    Returns:
        A ``(response, elapsed_time)`` tuple. ``response`` is the decoded text of
        the first generated sequence; the whole sequence is decoded, so it also
        contains the prompt turns (callers look for the "Assistant:" marker).
        ``elapsed_time`` is the number of seconds spent generating and decoding.

    Raises:
        ValueError: if no frame could be extracted from the video.
    """
    # Extract frames
    frames = VideoFrameExtractor(max_frames).extract_frames(video_path)
    if not frames:
        # Fail with a clear message instead of an obscure error inside the processor.
        raise ValueError(f"No frames could be extracted from video: {video_path}")

    # Create prompt with one image placeholder per frame
    image_tokens = [{"type": "image"} for _ in frames]
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Answer briefly."},
                *image_tokens,
                {"type": "text", "text": question}
            ]
        }
    ]

    inputs = processor(
        text=processor.apply_chat_template(messages, add_generation_prompt=True),
        images=frames,
        return_tensors="pt"
    ).to(model.device)

    # perf_counter is monotonic, so the measurement cannot be skewed by clock adjustments.
    start_time = time.perf_counter()
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        num_beams=5,
        temperature=0.7,
        do_sample=True,
        use_cache=True
    )

    # Decode response
    response = processor.decode(outputs[0], skip_special_tokens=True)
    elapsed_time = time.perf_counter() - start_time
    return response, elapsed_time

# No fine-tuned checkpoint: load_model falls back to the base model weights.
checkpoint_path = None
base_model_id = "HuggingFaceTB/SmolVLM-Instruct"
video_path = full_video_path
# Join however many candidates the sample has, rather than assuming exactly four.
candidates = ", ".join(str(candidate) for candidate in ds[1]['candidates'])
question = f"{prompt}\nYou have the following candidates for the right answer: {candidates}"

device = "cuda" if torch.cuda.is_available() else "cpu"
print('loasing model')
model, processor = load_model(checkpoint_path, base_model_id, device)
model.eval()
# Only the response text is needed here; the elapsed time is discarded.
response, _ = generate_response(model, processor, video_path, question)

print("Question:", question)
print("Response:", response)

loasing model
Question: How many red objects are moving?
You have the following candidates for the right answer: 3, 0, 2, 1
Response: User: Answer briefly.<global-img><global-img><global-img><global-img><global-img><global-img>How many red objects are moving?
You have the following candidates for the right answer: 3, 0, 2, 1
Assistant: 0


In [31]:
import time
from tqdm import tqdm

# Accumulators: total seconds spent generating and number of correct answers.
comp_time = 0
correct_guessed = 0

model.eval()
with torch.no_grad():

    for i in tqdm(range(len(ds)), desc="Processing samples"):
        sample = ds[i]
        video_filename = sample['video']
        full_video_path = f"{unzipped_dir}/{video_dir}/{video_filename}"
        prompt = sample['question']
        real_answer = sample['answer']
        # Join however many candidates the sample has, rather than assuming exactly four.
        candidates = ", ".join(str(candidate) for candidate in sample['candidates'])
        question = f"{prompt}\nYou have the following candidates for the right answer: {candidates}"
        response, elapsed_time = generate_response(model, processor, full_video_path, question)

        comp_time += elapsed_time

        # The decoded response contains the whole conversation; the model's answer
        # is the first number that follows the "Assistant:" marker.
        match = re.search(r"Assistant:\s*(\d+)", response)
        if match:
            number = int(match.group(1))
        else:
            # Reset the prediction so a miss is scored as wrong, instead of
            # re-using the previous sample's number (or raising NameError when
            # the very first sample has no match).
            number = None
            print("Number not found")

        if number is not None and str(number) == real_answer:
            correct_guessed +=1

# Guard against an empty dataset (e.g. every video missing after filtering).
accuracy = correct_guessed / len(ds) if len(ds) else 0.0
print(f'Computational Time: {comp_time}\nAccuracy: {accuracy}')

Processing samples: 100%|██████████| 200/200 [48:26<00:00, 14.53s/it]

Computational Time: 2798.0575230121613
Accuracy: 0.35



