In [4]:
import os
import random
from contextlib import contextmanager
from dataclasses import asdict
from typing import NamedTuple, Optional

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode
from vllm.utils import FlexibleArgumentParser

In [5]:
class ModelRequestData(NamedTuple):
    """Bundle returned by every model-specific ``run_*`` builder in this notebook.

    ``main`` reads all four fields: it turns ``engine_args`` into keyword
    arguments for ``LLM``, picks prompts from ``prompts``, forwards
    ``stop_token_ids`` to ``SamplingParams`` and, when set, replicates
    ``lora_requests`` once per generated prompt.
    """

    # Engine configuration for the selected model.
    engine_args: EngineArgs
    # Fully formatted prompt strings, one per input question.
    prompts: list[str]
    # Optional token ids passed to SamplingParams(stop_token_ids=...).
    stop_token_ids: Optional[list[int]] = None
    # Optional LoRA requests; None (or empty) means no LoRA is passed to generate().
    lora_requests: Optional[list[LoRARequest]] = None

### Aria

In [6]:
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for the ``rhymes-ai/Aria`` model.

    Args:
        questions: Plain-text questions; each one becomes one prompt.
        modality: Must be ``"image"``; anything else fails the assertion.

    Returns:
        A ``ModelRequestData`` with the engine configuration, one chat-style
        prompt per question and a fixed list of stop token ids.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="rhymes-ai/Aria",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    # Each question is placed after the image placeholder inside a single
    # user turn, followed by the opening of the assistant turn.
    prompts = []
    for question in questions:
        user_turn = f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
        prompts.append(user_turn + "<|im_end|>\n<|im_start|>assistant\n")

    # NOTE(review): hard-coded ids, presumably specific to the Aria tokenizer;
    # 93653 appears twice — confirm against the model card.
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
    )

### BLIP-2

In [7]:
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for ``Salesforce/blip2-opt-2.7b``.

    Args:
        questions: Plain-text questions; each one becomes one prompt.
        modality: Must be ``"image"``; anything else fails the assertion.

    Returns:
        A ``ModelRequestData`` with the engine configuration and one
        ``"Question: ... Answer:"`` prompt per question.
    """
    # The parameter was previously misspelled ("questons") while the body read
    # ``questions``, so the function either raised NameError or silently used a
    # global left over in the kernel.
    assert modality == "image"
    model_name = "Salesforce/blip2-opt-2.7b"

    engine_args = EngineArgs(
        model=model_name,
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [f"Question: {question} Answer:" for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )

### Ernie4.5-VL

In [8]:
def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for ``baidu/ERNIE-4.5-VL-28B-A3B-PT``.

    Args:
        questions: Plain-text questions; each one becomes one prompt.
        modality: ``"image"`` or ``"video"``; selects the media placeholder
            appended after the question.

    Returns:
        A ``ModelRequestData`` with the engine configuration and one prompt
        per question.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholders = {
        "image": "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>",
        "video": "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>",
    }
    # Fail with a clear message instead of an UnboundLocalError further down
    # when an unsupported modality is requested.
    if modality not in placeholders:
        raise ValueError(f"Modality {modality} is not supported.")
    placeholder = placeholders[modality]

    model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    # The assistant turn is opened with an empty <think></think> pair.
    prompts = [
        (
            f"<|begin_of_sentence|>User: {question}{placeholder}\n"
            "Assistant: <think></think>"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )

### Gemma3N

In [13]:
def run_gemma3n(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for ``google/gemma-3n-E2B-it``.

    Args:
        questions: Plain-text questions; each one becomes one prompt.
        modality: Must be ``"image"``; anything else fails the assertion.

    Returns:
        A ``ModelRequestData`` with the engine configuration (eager mode
        enforced) and one prompt per question.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="google/gemma-3n-E2B-it",
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    # One user turn holding the image token and the question, then the start
    # of the model turn so generation continues from there.
    prompts = []
    for question in questions:
        user_turn = "<start_of_turn>user\n" + f"<image_soft_token>{question}<end_of_turn>\n"
        prompts.append(user_turn + "<start_of_turn>model\n")

    return ModelRequestData(engine_args=engine_args, prompts=prompts)

### H2OVL-Mississippi

In [10]:
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for ``h2oai/h2ovl-mississippi-800m``.

    Prompts are produced by the model's own chat template, and the
    tokenizer's EOS token id is used as the only stop token.

    Args:
        questions: Plain-text questions; each one becomes one prompt.
        modality: Must be ``"image"``; anything else fails the assertion.

    Returns:
        A ``ModelRequestData`` with engine configuration, templated prompts
        and the stop token ids.
    """
    assert modality == "image"
    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # One single-message conversation per question, with the image
    # placeholder on its own line before the question text.
    conversations = []
    for question in questions:
        user_message = {"role": "user", "content": f"<image>\n{question}"}
        conversations.append([user_message])

    prompts = tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=[tokenizer.eos_token_id],
    )

### LLaVA-1.5

In [11]:
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for ``llava-hf/llava-1.5-7b-hf``.

    Args:
        questions: Plain-text questions; each one becomes one prompt.
        modality: Must be ``"image"``; anything else fails the assertion.

    Returns:
        A ``ModelRequestData`` with the engine configuration and one
        ``USER: ... ASSISTANT:`` prompt per question.
    """
    assert modality == "image"
    model_name = "llava-hf/llava-1.5-7b-hf"

    # The image placeholder sits on its own line ahead of the question.
    prompts = []
    for question in questions:
        prompts.append(f"USER: <image>\n{question}\nASSISTANT:")

    return ModelRequestData(
        engine_args=EngineArgs(
            model=model_name,
            max_model_len=4096,
            limit_mm_per_prompt={modality: 1},
        ),
        prompts=prompts,
    )

### LLaVA-OneVision

In [12]:
def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for LLaVA-OneVision.

    Uses ``llava-hf/llava-onevision-qwen2-7b-ov-hf``.

    Args:
        questions: Plain-text questions; each one becomes one prompt.
        modality: ``"image"`` or ``"video"``; selects the ``<image>`` or
            ``<video>`` placeholder in the prompt.

    Returns:
        A ``ModelRequestData`` with the engine configuration and one prompt
        per question.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Fail with a clear message instead of an UnboundLocalError on ``prompts``
    # when an unsupported modality is requested.
    if modality not in ("image", "video"):
        raise ValueError(f"Modality {modality} is not supported.")

    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"

    # The image and video prompts differ only in the placeholder tag, which
    # is exactly the (already validated) modality name.
    prompts = [
        f"<|im_start|>user <{modality}>\n{question}<|im_end|><|im_start|>assistant\n"
        for question in questions
    ]

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )

## Model map

In [14]:
# Registry of supported models: maps the ``model_type`` argument to the builder
# that produces its ModelRequestData. ``main`` rejects any ``model_type`` that
# is not a key here and calls the selected builder as
# ``builder(questions, modality)``.
model_example_map = {
    "aria": run_aria,
    "blip-2": run_blip2,
    "ernie45_vl": run_ernie45_vl,
    "gemma3n": run_gemma3n,
    "h2ovl_chat": run_h2ovl,
    "llava": run_llava,
    "llava-onevision": run_llava_onevision,
}

## Inference

In [26]:
def get_multi_modal_input(args):
    """Load the demo asset and the questions that go with the chosen modality.

    Args:
        args: Object exposing ``modality`` and, for video, ``num_frames``.

    Returns:
        A dict of the form::

            {
                "data": image or video,
                "questions": list of question strings,
            }

    Raises:
        ValueError: If ``args.modality`` is neither ``"image"`` nor ``"video"``.
    """
    modality = args.modality

    if modality == "image":
        # Demo image, converted to RGB, plus several questions about it.
        cherry_blossom = ImageAsset("cherry_blossom").pil_image
        return {
            "data": convert_image_mode(cherry_blossom, "RGB"),
            "questions": [
                "What is the content of this image?",
                "Describe the content of this image in detail.",
                "What's in the image?",
                "Where is this image taken?",
            ],
        }

    if modality == "video":
        # Demo video sampled to the requested number of frames.
        baby_reading = VideoAsset(name="baby_reading", num_frames=args.num_frames)
        # metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata
        return {
            # "data": [(video, metadata)] if args.model_type == "glm4_1v" else video,
            "data": baby_reading.np_ndarrays,
            "questions": ["Why is this video funny?"],
        }

    raise ValueError(f"Modality {modality} is not supported.")

In [16]:
def apply_image_repeat(
    image_repeat_prob, num_prompts, data, prompts: list[str], modality
):
    """Repeats images with provided probability of "image_repeat_prob".
    Used to simulate hit/miss for the MM preprocessor cache.

    Args:
        image_repeat_prob: Probability in ``[0, 1]`` that a request reuses the
            previous request's image unchanged.
        num_prompts: Number of request dicts to build.
        data: Starting image; must provide ``copy()`` and ``putpixel()``
            (e.g. a PIL image). It is never modified in place.
        prompts: Prompt strings, cycled through in order.
        modality: Key under which the image is stored in ``multi_modal_data``.

    Returns:
        A list of ``num_prompts`` dicts with ``"prompt"`` and
        ``"multi_modal_data"`` entries.
    """
    assert image_repeat_prob <= 1.0 and image_repeat_prob >= 0
    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    cur_image = data
    for i in range(num_prompts):
        repeat = random.choices(no_yes, probs)[0]
        if not repeat:
            # No repeat => work on a copy and modify one pixel so the image
            # content differs from the previous request's.
            cur_image = cur_image.copy()
            # Encode the request index as a 24-bit RGB value. Every channel is
            # masked to 0..255; previously the middle channel was ``i // 256``,
            # which leaves the valid range once ``i`` reaches 65536.
            new_val = ((i >> 16) & 0xFF, (i >> 8) & 0xFF, i & 0xFF)
            cur_image.putpixel((0, 0), new_val)

        inputs.append(
            {
                "prompt": prompts[i % len(prompts)],
                "multi_modal_data": {modality: cur_image},
            }
        )

    return inputs

In [17]:
@contextmanager
def time_counter(enable: bool):
    """Context manager that optionally reports how long its body took.

    Args:
        enable: When true, the elapsed time of the ``with`` body is printed
            between two separator lines after the body finishes normally.
            When false, the body simply runs and nothing is printed.
    """
    if not enable:
        yield
        return

    import time

    # perf_counter() is monotonic and high-resolution, so the measured
    # interval is not affected by system clock adjustments the way a
    # time.time() difference is.
    start_time = time.perf_counter()
    yield
    elapsed_time = time.perf_counter() - start_time
    print("-" * 50)
    print("-- generate time = {}".format(elapsed_time))
    print("-" * 50)

In [22]:
def main(args):
    """Run one offline multimodal generation pass and print the results.

    Steps: validate the arguments, load the demo asset and questions for the
    requested modality, let the selected ``run_*`` builder produce engine
    arguments and prompts, start the engine, build the request list and print
    every generated text between separator lines.

    Args:
        args: Object exposing ``model_type``, ``modality``, ``num_prompts``,
            ``num_frames`` (video only), ``seed``, ``image_repeat_prob``,
            ``time_generate`` and ``use_different_prompt_per_request``.

    Raises:
        ValueError: If ``model_type`` is not registered in
            ``model_example_map`` or the modality is not supported.
        AssertionError: If ``num_prompts`` is not positive.
    """
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    # Checked up front: engine start-up below is expensive, so a bad prompt
    # count should fail before the model is loaded rather than after.
    assert args.num_prompts > 0

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    questions = mm_input["questions"]
    req_data = model_example_map[model](questions, modality)

    # Modalities the builder did not mention are explicitly limited to zero
    # items per prompt; the builder's own limits take precedence.
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )
    engine_args = asdict(req_data.engine_args) | {
        "seed": args.seed,
        # "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
    }

    llm = LLM(**engine_args)

    # Either cycle through all prompts or reuse only the first one.
    prompts = (
        req_data.prompts
        if args.use_different_prompt_per_request
        else [req_data.prompts[0]]
    )

    sampling_params = SamplingParams(
        temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
    )

    if args.num_prompts == 1:
        # Single request: passed as one dict rather than a list.
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {modality: data},
        }
    else:
        if args.image_repeat_prob is not None:
            # Randomly vary the media between requests.
            inputs = apply_image_repeat(
                args.image_repeat_prob, args.num_prompts, data, prompts, modality
            )
        else:
            # Every request shares the same media object.
            inputs = [
                {
                    "prompt": prompts[i % len(prompts)],
                    "multi_modal_data": {modality: data},
                }
                for i in range(args.num_prompts)
            ]

    # Replicate the LoRA request list once per prompt when the builder set one.
    lora_request = (
        req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
    )

    with time_counter(args.time_generate):
        outputs = llm.generate(
            inputs,
            sampling_params=sampling_params,
            lora_request=lora_request,
        )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)

In [23]:
class Args:
    """Minimal attribute bag standing in for parsed command-line arguments.

    Every keyword argument passed to the constructor becomes an instance
    attribute with the same name.
    """

    def __init__(self, **kwargs):
        vars(self).update(kwargs)

#### Image example

In [24]:
# Image example: four requests against LLaVA-1.5, all sharing one prompt.
args = Args(
    model_type="llava",  # key into model_example_map
    num_prompts=4,
    modality="image",
    num_frames=16,  # only read by get_multi_modal_input for the video modality
    seed=None,  # forwarded to the engine arguments by main
    image_repeat_prob=None,  # None => main does not call apply_image_repeat
    disable_mm_processor_cache=False,  # not read by main (its only use there is commented out)
    time_generate=True,  # print the elapsed time of llm.generate
    use_different_prompt_per_request=False  # False => only the first prompt is used
)

main(args)

config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

INFO 09-12 20:00:38 [config.py:853] This model supports multiple tasks: {'embed', 'reward', 'score', 'classify', 'generate'}. Defaulting to 'generate'.


tokenizer_config.json: 0.00B [00:00, ?B/s]

INFO 09-12 20:00:38 [config.py:1467] Using max model len 4096
INFO 09-12 20:00:46 [config.py:2267] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 09-12 20:00:46 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

INFO 09-12 20:00:50 [__init__.py:244] Automatically detected platform rocm.
INFO 09-12 20:00:59 [core.py:459] Waiting for init message from front-end.
INFO 09-12 20:00:59 [core.py:69] Initializing a V1 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='llava-hf/llava-1.5-7b-hf', speculative_config=None, tokenizer='llava-hf/llava-1.5-7b-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoi

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


INFO 09-12 20:01:01 [gpu_model_runner.py:1751] Starting to load model llava-hf/llava-1.5-7b-hf...
INFO 09-12 20:01:01 [gpu_model_runner.py:1756] Loading model from scratch...
INFO 09-12 20:01:01 [rocm.py:224] Using Triton Attention backend on V1 engine.
INFO 09-12 20:01:01 [rocm.py:224] Using Triton Attention backend on V1 engine.
INFO 09-12 20:01:01 [weight_utils.py:292] Using model weights format ['*.safetensors']
INFO 09-12 20:01:12 [weight_utils.py:308] Time spent downloading weights for llava-hf/llava-1.5-7b-hf: 10.752270 seconds


Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  33% Completed | 1/3 [00:01<00:02,  1.40s/it]
Loading safetensors checkpoint shards:  67% Completed | 2/3 [00:03<00:01,  1.63s/it]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:05<00:00,  1.73s/it]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:05<00:00,  1.68s/it]



INFO 09-12 20:01:17 [default_loader.py:272] Loading weights took 5.16 seconds
INFO 09-12 20:01:17 [gpu_model_runner.py:1782] Model loading took 13.2012 GiB and 16.181506 seconds
INFO 09-12 20:01:17 [gpu_model_runner.py:2221] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 29 image items of the maximum feature size.
INFO 09-12 20:01:41 [backends.py:509] Using cache directory: /root/.cache/vllm/torch_compile_cache/0b9de6558d/rank_0_0/backbone for vLLM's torch.compile
INFO 09-12 20:01:41 [backends.py:520] Dynamo bytecode transform time: 4.72 s
INFO 09-12 20:01:58 [backends.py:181] Cache the graph of shape None for later use
INFO 09-12 20:01:58 [backends.py:193] Compiling a graph for general shape takes 14.12 s
INFO 09-12 20:02:00 [monitor.py:34] torch.compile takes 18.85 s in total
INFO 09-12 20:02:02 [gpu_worker.py:232] Available KV cache memory: 156.20 GiB
INFO 09-12 20:02:02 [kv_cache_utils.py:716] GPU KV cache size: 319,888 tokens
INFO 09-12 20:02:02

Capturing CUDA graphs: 100%|██████████| 67/67 [00:38<00:00,  1.73it/s]


INFO 09-12 20:02:41 [gpu_model_runner.py:2306] Graph capturing finished in 39 secs, took 0.56 GiB
INFO 09-12 20:02:41 [core.py:172] init engine (profile, create kv cache, warmup model) took 83.53 seconds


Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Processed prompts:   0% 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

--------------------------------------------------
-- generate time = 1.930650234222412
--------------------------------------------------
--------------------------------------------------
 The image features a tall tower with a spire, surrounded by a beautiful flowering tree. The tree is filled with pink flowers, creating a vibrant and lively atmosphere. The tower stands tall in the background, with the tree's branches extending towards it. The combination of the tower and the flow
--------------------------------------------------
 The image features a tall tower with a spire, surrounded by a beautiful cherry blossom tree. The tree is filled with pink flowers, creating a picturesque scene. The tower stands tall in the background, with the blossoming tree in the foreground. The combination of the tower and the v
--------------------------------------------------
 The image features a tall tower with a spire, surrounded by a beautiful cherry blossom tree. The tree is filled with pink 



#### Video example

In [27]:
# Video example: four requests against LLaVA-OneVision, all sharing one prompt.
args = Args(
    model_type="llava-onevision",  # key into model_example_map
    num_prompts=4,
    modality="video",
    num_frames=16,  # passed to VideoAsset by get_multi_modal_input
    seed=None,  # forwarded to the engine arguments by main
    image_repeat_prob=None,  # None => main does not call apply_image_repeat
    disable_mm_processor_cache=False,  # not read by main (its only use there is commented out)
    time_generate=True,  # print the elapsed time of llm.generate
    use_different_prompt_per_request=False  # False => only the first prompt is used
)

main(args)

config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

INFO 09-12 20:07:41 [config.py:853] This model supports multiple tasks: {'embed', 'reward', 'score', 'classify', 'generate'}. Defaulting to 'generate'.


tokenizer_config.json: 0.00B [00:00, ?B/s]

INFO 09-12 20:07:41 [config.py:1467] Using max model len 16384
INFO 09-12 20:07:41 [config.py:2267] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 09-12 20:07:41 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/126 [00:00<?, ?B/s]

INFO 09-12 20:07:45 [__init__.py:244] Automatically detected platform rocm.
INFO 09-12 20:07:54 [core.py:459] Waiting for init message from front-end.
INFO 09-12 20:07:54 [core.py:69] Initializing a V1 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='llava-hf/llava-onevision-qwen2-7b-ov-hf', speculative_config=None, tokenizer='llava-hf/llava-onevision-qwen2-7b-ov-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=16384, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_v

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


INFO 09-12 20:07:56 [gpu_model_runner.py:1751] Starting to load model llava-hf/llava-onevision-qwen2-7b-ov-hf...
INFO 09-12 20:07:56 [gpu_model_runner.py:1756] Loading model from scratch...
INFO 09-12 20:07:56 [rocm.py:224] Using Triton Attention backend on V1 engine.
INFO 09-12 20:07:56 [rocm.py:224] Using Triton Attention backend on V1 engine.
INFO 09-12 20:07:56 [weight_utils.py:292] Using model weights format ['*.safetensors']
INFO 09-12 20:08:10 [weight_utils.py:308] Time spent downloading weights for llava-hf/llava-onevision-qwen2-7b-ov-hf: 13.937792 seconds


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:01<00:05,  1.72s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:03<00:03,  1.77s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:05<00:01,  1.78s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00,  1.29s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00,  1.46s/it]



INFO 09-12 20:08:16 [default_loader.py:272] Loading weights took 5.88 seconds
INFO 09-12 20:08:16 [gpu_model_runner.py:1782] Model loading took 15.1484 GiB and 20.123120 seconds
INFO 09-12 20:08:17 [gpu_model_runner.py:2221] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 6 video items of the maximum feature size.
INFO 09-12 20:08:34 [backends.py:509] Using cache directory: /root/.cache/vllm/torch_compile_cache/8ec670dcc6/rank_0_0/backbone for vLLM's torch.compile
INFO 09-12 20:08:34 [backends.py:520] Dynamo bytecode transform time: 4.07 s
INFO 09-12 20:08:49 [backends.py:181] Cache the graph of shape None for later use
INFO 09-12 20:08:49 [backends.py:193] Compiling a graph for general shape takes 12.85 s
INFO 09-12 20:08:51 [monitor.py:34] torch.compile takes 16.92 s in total
INFO 09-12 20:08:53 [gpu_worker.py:232] Available KV cache memory: 150.55 GiB
INFO 09-12 20:08:53 [kv_cache_utils.py:716] GPU KV cache size: 2,819,040 tokens
INFO 09-12 20:08:5

Capturing CUDA graphs: 100%|██████████| 67/67 [00:13<00:00,  4.99it/s]


INFO 09-12 20:09:06 [gpu_model_runner.py:2306] Graph capturing finished in 13 secs, took 0.27 GiB
INFO 09-12 20:09:06 [core.py:172] init engine (profile, create kv cache, warmup model) took 49.89 seconds


Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0% 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

--------------------------------------------------
-- generate time = 1.961785078048706
--------------------------------------------------
--------------------------------------------------
The video is humorous because it shows a baby with a disproportionately large head compared to its body, wearing oversized glasses, and engaging in a pretend reading session.
--------------------------------------------------
The video is humorous because it shows a child with a blue face and glasses pretending to read a book, which is an amusing and unexpected behavior for a child.
--------------------------------------------------
The video is funny because the child's exaggerated actions and expressions, such as the large glasses and the dramatic reading, create a humorous and endearing scene.
--------------------------------------------------
The video is funny because the child's actions are exaggerated and humorous, such as waving their hands and adjusting the glasses.
------------------------

