In [None]:
! pip install av
! pip install bert-score



In [None]:
import cv2
import av
import numpy as np
import torch
from transformers import (
    LlavaNextVideoProcessor,
    LlavaNextVideoForConditionalGeneration
)
from PIL import Image
import os

def get_best_device():
    """Choose the torch device to run on, preferring CUDA, then Apple MPS, then CPU.

    A one-line notice describing the choice is printed as a side effect.

    Returns:
        torch.device: the selected device ("cuda", "mps" or "cpu").
    """
    # Guard-style cascade: the first available backend wins.
    if torch.cuda.is_available():
        chosen = torch.device("cuda")
        print(f"✅ Using CUDA GPU: {torch.cuda.get_device_name(0)}")
        return chosen

    if torch.backends.mps.is_available():
        chosen = torch.device("mps")
        print("✅ Using Apple Silicon GPU (MPS backend)")
        return chosen

    chosen = torch.device("cpu")
    print("⚠️ No GPU found — using CPU")
    return chosen


# Resolve the device once at notebook start.
device = get_best_device()



✅ Using CUDA GPU: NVIDIA A100-SXM4-40GB


In [None]:
from huggingface_hub import login
import os
# Never paste the token into the notebook: read it from the HF_TOKEN environment
# variable so it is not saved (or committed) together with the cells.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# An empty string is not a valid token; pass None instead so that login() falls
# back to its interactive prompt when no token has been configured.
login(token=HF_TOKEN or None)

In [None]:
# --- 1. Configuration ---
# Hugging Face model id of the video-language model used as the critic.
MODEL_ID = "llava-hf/LLaVA-NeXT-Video-7B-hf"
# Substitute VIDEO_PATH with the diffusion-generated video to evaluate.
VIDEO_PATH = "/content/drive/MyDrive/cse291a/project/llavaCritic/IMG_1813.mp4"
# Question asked about the video.
QUESTION_PROMPT = "what did this robot do"
# Hand-written conversation template for LLaVA-NeXT. A space separates the
# question from the "ASSISTANT:" tag so the tag is not glued onto the last word
# of the question.
PROMPT = f"USER: <video>\n{QUESTION_PROMPT} ASSISTANT:"

In [None]:
# Mount Google Drive at /content/drive (Colab only). VIDEO_PATH points under
# /content/drive/MyDrive, so this must run before the video is opened.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import av
import torch
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration

model_id = MODEL_ID

# Load the checkpoint in half precision and move it to the device selected by
# get_best_device() instead of hardcoding CUDA index 0, which fails on hosts
# without a CUDA GPU.
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(device)

# The processor handles the chat template, tokenization and video preprocessing.
processor = LlavaNextVideoProcessor.from_pretrained(model_id)

def read_video_pyav(container, indices):
    '''
    Pull the requested frames out of a video using the PyAV decoder.

    The stream is rewound, then decoded from the start; decoding stops as soon
    as the position passes the last requested index.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to keep. The first and last entries
            bound the scan (presumably sorted ascending; confirm with the caller).
    Returns:
        result (np.ndarray): decoded frames stacked into one array of shape
            (num_frames, height, width, 3).
    '''
    container.seek(0)
    first_wanted, last_wanted = indices[0], indices[-1]
    kept = []
    for position, frame in enumerate(container.decode(video=0)):
        if position > last_wanted:
            # Nothing beyond the last requested index is needed.
            break
        if position < first_wanted or position not in indices:
            continue
        kept.append(frame)
    # Convert every kept frame to RGB and stack along a new leading axis.
    return np.stack([picked.to_ndarray(format="rgb24") for picked in kept])


# Describe a single user turn and let `apply_chat_template` render it into the
# prompt string the model expects. Every "content" value is a list of dicts whose
# "type" is one of "text", "image" or "video".
user_content = [
    {"type": "text", "text": QUESTION_PROMPT},
    {"type": "video"},
]
conversation = [{"role": "user", "content": user_content}]

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
# Number of frames sampled uniformly from the video; raise it for longer videos.
NUM_SAMPLED_FRAMES = 8

# Open the container in a `with` block so the file handle is released once the
# frames have been decoded.
with av.open(VIDEO_PATH) as container:
    total_frames = container.streams.video[0].frames
    if total_frames <= 0:
        # Without a frame count the sampling step below would be zero.
        # Fail with a clear message instead of an obscure np.arange error.
        raise ValueError(f"Could not determine the number of frames in {VIDEO_PATH!r}")
    indices = np.arange(0, total_frames, total_frames / NUM_SAMPLED_FRAMES).astype(int)
    clip = read_video_pyav(container, indices)

inputs_video = processor(text=prompt, videos=clip, padding=True, return_tensors="pt").to(model.device)

output = model.generate(**inputs_video, max_new_tokens=100, do_sample=False)

# generate() returns the prompt tokens followed by the newly generated ones.
# Skip the whole prompt (not just the first two tokens) so the decoded answer
# does not contain the echoed question, which would contaminate the BERTScore
# candidate downstream.
prompt_length = inputs_video["input_ids"].shape[1]
llava_candidate_answer = processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Echo the question together with the answer produced by the model.
print(
    "\n--- LLaVA-NeXT-Video Result ---",
    f"Question: {QUESTION_PROMPT}",
    f"Answer: **{llava_candidate_answer}**",
    "-----------------------------\n",
    sep="\n",
)

# --- 5. BERTScore Part Configuration ---
# *** REPLACE WITH GOAL ***
# Reference text the model's answer is scored against.
GROUND_TRUTH_REFERENCE = "The robot arm used its gripper to successfully pick up the yellow cube from the left side of the table and place it on the red platform."

from bert_score import score

candidate, reference = llava_candidate_answer, GROUND_TRUTH_REFERENCE

# --- Compute BERTScore ---
# score() receives parallel lists of candidates and references; each returned
# value holds a single element here, read out below with .item().
P, R, F1 = score([candidate], [reference], lang="en", verbose=False)

print(
    "--- BERTScore Evaluation ---",
    f"Reference: {reference}",
    f"Candidate: {candidate}",
    f"BERTScore P: {P.item():.4f}",
    f"BERTScore R: {R.item():.4f}",
    f"BERTScore F1: **{F1.item():.4f}** (Main Metric)",
    "----------------------------",
    sep="\n",
)




--- LLaVA-NeXT-Video Result ---
Question: what did this robot do
Answer: **ER: 
what did this robot do ASSISTANT: The robot in the image appears to be an industrial robotic arm, likely used for tasks such as assembling, welding, or handling objects in a factory setting. It is equipped with a gripper at the end of its arm, which is used to pick up and manipulate objects. The robot is currently in a position where it seems to be either picking up or placing down an object, possibly a part or a component, as indicated by the gripper's position**
-----------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- BERTScore Evaluation ---
Reference: The robot arm used its gripper to successfully pick up the yellow cube from the left side of the table and place it on the red platform.
Candidate: ER: 
what did this robot do ASSISTANT: The robot in the image appears to be an industrial robotic arm, likely used for tasks such as assembling, welding, or handling objects in a factory setting. It is equipped with a gripper at the end of its arm, which is used to pick up and manipulate objects. The robot is currently in a position where it seems to be either picking up or placing down an object, possibly a part or a component, as indicated by the gripper's position
BERTScore P: 0.8419
BERTScore R: 0.8723
BERTScore F1: **0.8568** (Main Metric)
----------------------------
