In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

model_name = "DAMO-NLP-SG/VideoLLaMA3-7B"

question = "Continue describing what's happening now on livestream."

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.28it/s]


In [2]:
import os
folder_name = "clips"
files = sorted(os.listdir(folder_name))
print(files)

['output_000.mp4', 'output_001.mp4', 'output_002.mp4', 'output_003.mp4', 'output_004.mp4', 'output_005.mp4', 'output_006.mp4', 'output_007.mp4', 'output_008.mp4', 'output_009.mp4', 'output_010.mp4', 'output_011.mp4', 'output_012.mp4', 'output_013.mp4', 'output_014.mp4', 'output_015.mp4', 'output_016.mp4', 'output_017.mp4', 'output_018.mp4', 'output_019.mp4', 'output_020.mp4', 'output_021.mp4', 'output_022.mp4', 'output_023.mp4', 'output_024.mp4', 'output_025.mp4', 'output_026.mp4', 'output_027.mp4', 'output_028.mp4', 'output_029.mp4', 'output_030.mp4', 'output_031.mp4', 'output_032.mp4', 'output_033.mp4', 'output_034.mp4', 'output_035.mp4', 'output_036.mp4', 'output_037.mp4', 'output_038.mp4', 'output_039.mp4', 'output_040.mp4', 'output_041.mp4', 'output_042.mp4', 'output_043.mp4', 'output_044.mp4', 'output_045.mp4', 'output_046.mp4', 'output_047.mp4', 'output_048.mp4', 'output_049.mp4', 'output_050.mp4', 'output_051.mp4', 'output_052.mp4', 'output_053.mp4', 'output_054.mp4', 'output_0

In [3]:
from collections import deque
responses = []

for idx, file in enumerate(files):

    conversation = [
        {"role": "system", "content": (
            "Focus only on what is visible on screen: the player’s actions, the environment, objects, mobs, structures, and movements."
            "Use clear and neutral language, without adding interpretations, emotions, or intentions unless they are explicitly shown."
            "Describe events in chronological order, as they occur in the stream."
            "If something is uncertain, indicate it with phrases like “probably” or “seems to."
            "Do not invent or add content that is not present in the stream."
            "Include relevant details such as colors, directions, locations, and interactions between entities."
            "You are watching a livestream. Always relate your answer to what has happened earlier, using the memory summary provided. If new events contradict older ones, update the summary."
            "You are a real-time narrator for a livestream. "
            "If new events contradict previous ones, update the narrative accordingly."
            "Be concise (2–3 sentences), present tense"
        )},
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": {
                        "video_path": f"{folder_name}/{file}",
                        "fps": 60,
                        "max_frames": 1  # Keep this LOW for 6GB VRAM
                    },
                },
                {"type": "text", "text": question},
                
            ]
        },
    ]

    inputs = processor(conversation=conversation, return_tensors="pt")
    inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
    if "pixel_values" in inputs:
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
    output_ids = model.generate(**inputs, max_new_tokens=256)
    response = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()    
    responses.append({file : response})
    print(f"{idx} : {response}")
    torch.cuda.empty_cache()


0 : The player is currently navigating through the menu of a Minecraft Java Edition game. They are in a cave-like environment with stone walls and floors, and there are several ghost-like entities floating around. The player's avatar, represented by a blocky character head, is visible on the left side of the screen. The player appears to be deciding which mode to start the game in, as options such as Singleplayer, Multiplayer, Minecraft Realms, Options, and Quit Game are displayed on the screen.
1 : The player is currently in a dense forest environment, surrounded by tall trees with green leaves. They are holding a red and black tool, presumably for breaking blocks, as indicated by the text "Block breaking" at the bottom right of the screen. The player's health is displayed at the top left corner with red hearts.
2 : The player is mining stone blocks in a Minecraft cave, breaking them to collect resources.
3 : The player is holding a sword and standing in front of a wall. The player se

In [4]:
import json

with open("visual_prompts.json", "w") as fp:
    fp.write("{")
    for line in responses:
        fp.write(f"{line},\n")
    fp.write("}")
