In [2]:
import os
from dataclasses import asdict
from typing import NamedTuple, Optional

from huggingface_hub import snapshot_download
from PIL.Image import Image
from transformers import AutoProcessor, AutoTokenizer

from vllm import LLM, EngineArgs, SamplingParams
from vllm.lora.request import LoRARequest
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser

In [3]:
class ModelRequestData(NamedTuple):
    """Bundle returned by each ``load_*`` function for one multi-image request.

    ``run_generate`` and ``run_chat`` read these fields to construct the
    ``LLM`` engine, the sampling parameters and the request itself.
    """

    # Engine configuration; converted with ``asdict`` and expanded into ``LLM``.
    engine_args: EngineArgs
    # Fully formatted prompt text, sent as "prompt" by ``run_generate``.
    prompt: str
    # Images sent as the "image" entry of ``multi_modal_data``.
    image_data: list[Image]
    # Passed to ``SamplingParams(stop_token_ids=...)``; ``None`` when unset.
    stop_token_ids: Optional[list[int]] = None
    # Passed to ``llm.chat(chat_template=...)``; ``None`` when unset.
    chat_template: Optional[str] = None
    # Passed as ``lora_request`` to ``llm.generate`` / ``llm.chat``.
    lora_requests: Optional[list[LoRARequest]] = None

### Aria

In [4]:
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for the ``rhymes-ai/Aria`` model.

    Args:
        question: Text placed after the image placeholders in the user turn.
        image_urls: URLs of the images. Each URL contributes one placeholder
            line to the prompt and is fetched with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the hand-written
        chat prompt, the stop token ids and the fetched images.
    """
    num_images = len(image_urls)

    aria_engine_args = EngineArgs(
        model="rhymes-ai/Aria",
        tokenizer_mode="slow",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": num_images},
    )

    # One placeholder line per image, emitted ahead of the question.
    image_slots = "".join(
        "<fim_prefix><|img|><fim_suffix>\n" for _ in range(num_images)
    )
    chat_prompt = (
        f"<|im_start|>user\n{image_slots}{question}<|im_end|>\n<|im_start|>assistant\n"
    )

    images = [fetch_image(image_url) for image_url in image_urls]

    return ModelRequestData(
        engine_args=aria_engine_args,
        prompt=chat_prompt,
        image_data=images,
        # NOTE(review): 93653 is listed twice; kept exactly as written.
        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
    )

### Gemma3

In [5]:
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for the ``google/gemma-3-4b-it`` model.

    The prompt is rendered by the model's ``AutoProcessor`` chat template from
    a single user turn containing every image followed by the question.

    Args:
        question: Text appended after the image entries of the user turn.
        image_urls: URLs of the images to reference in the prompt and fetch.

    Returns:
        A ``ModelRequestData`` with engine arguments, prompt and fetched
        images; the optional fields keep their defaults.
    """
    model_name = "google/gemma-3-4b-it"
    gemma_engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries come first, the text question last.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    chat_prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=gemma_engine_args,
        prompt=chat_prompt,
        image_data=images,
    )

### H2OVL-Mississippi

In [6]:
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for the ``h2oai/h2ovl-mississippi-800m`` model.

    Args:
        question: Text placed after the numbered image placeholders.
        image_urls: URLs of the images. Each one gets an ``Image-<n>: <image>``
            placeholder (numbered from 1) and is fetched with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` with engine arguments, the templated prompt,
        the tokenizer's EOS id as the only stop token, and the fetched images.
    """
    model_name = "h2oai/h2ovl-mississippi-800m"
    h2ovl_engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Numbered placeholders, one per image, joined by a newline.
    image_slots = "\n".join(
        f"Image-{index}: <image>\n" for index in range(1, len(image_urls) + 1)
    )
    conversation = [{"role": "user", "content": f"{image_slots}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    chat_prompt = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    eos_only = [tokenizer.eos_token_id]

    images = [fetch_image(image_url) for image_url in image_urls]
    return ModelRequestData(
        engine_args=h2ovl_engine_args,
        prompt=chat_prompt,
        image_data=images,
        stop_token_ids=eos_only,
    )

### LLaVA-1.5

In [7]:
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for the ``llava-hf/llava-1.5-7b-hf`` model.

    Args:
        question: Text appended after the image entries of the user turn.
        image_urls: URLs of the images to reference in the prompt and fetch.

    Returns:
        A ``ModelRequestData`` with engine arguments, the templated prompt and
        the fetched images; the optional fields keep their defaults.
    """
    # NOTE: CAUTION! The original LLaVA models weren't really trained on
    # multi-image inputs, so they generate poor responses for such inputs!
    model_name = "llava-hf/llava-1.5-7b-hf"
    llava_engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    image_parts = [{"type": "image", "image": url} for url in image_urls]
    text_part = {"type": "text", "text": question}
    conversation = [{"role": "user", "content": [*image_parts, text_part]}]

    # Render the conversation to plain text with the model's chat template.
    chat_prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=llava_engine_args,
        prompt=chat_prompt,
        image_data=images,
    )

### LLaVA-OneVision

In [8]:
def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for ``llava-hf/llava-onevision-qwen2-7b-ov-hf``.

    Args:
        question: Text appended after the image entries of the user turn.
        image_urls: URLs of the images to reference in the prompt and fetch.

    Returns:
        A ``ModelRequestData`` with engine arguments, the templated prompt and
        the fetched images; the optional fields keep their defaults.
    """
    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
    onevision_engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # A single user message: all images first, then the question text.
    user_message = {
        "role": "user",
        "content": [{"type": "image", "image": url} for url in image_urls]
        + [{"type": "text", "text": question}],
    }

    processor = AutoProcessor.from_pretrained(model_name)
    chat_prompt = processor.apply_chat_template(
        [user_message], tokenize=False, add_generation_prompt=True
    )

    fetched_images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=onevision_engine_args,
        prompt=chat_prompt,
        image_data=fetched_images,
    )


## Model map

In [20]:
# Maps the model key accepted by ``run_generate`` / ``run_chat`` to the loader
# that builds that model's ``ModelRequestData``. Each loader is called as
# ``loader(question, image_urls)``.
model_example_map = {
    "aria": load_aria,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "llava": load_llava,
    "llava-onevision": load_llava_onevision,
}

## Inference

In [10]:
# Question asked about the whole set of images in the demo runs below.
QUESTION = "What is the content of each image?"
# Twelve Wikimedia Commons image URLs used together as one multi-image input.
IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
]

In [15]:
def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]):
    """Run one multi-image request through ``LLM.generate`` and print the text.

    Args:
        model: Key into ``model_example_map`` selecting the loader to use.
        question: Question forwarded to the loader.
        image_urls: Image URLs forwarded to the loader.
        seed: Value stored under the ``"seed"`` engine argument; may be ``None``.

    Raises:
        KeyError: If ``model`` is not a key of ``model_example_map``.
    """
    build_request = model_example_map[model]
    req_data = build_request(question, image_urls)

    # Start from the loader's engine arguments and override only the seed.
    llm_kwargs = asdict(req_data.engine_args)
    llm_kwargs["seed"] = seed
    llm = LLM(**llm_kwargs)

    greedy_params = SamplingParams(
        temperature=0.0,
        max_tokens=256,
        stop_token_ids=req_data.stop_token_ids,
    )

    # The loader already produced the final prompt text and the fetched images.
    request = {
        "prompt": req_data.prompt,
        "multi_modal_data": {"image": req_data.image_data},
    }
    outputs = llm.generate(
        request,
        sampling_params=greedy_params,
        lora_request=req_data.lora_requests,
    )

    separator = "-" * 50
    print(separator)
    for request_output in outputs:
        print(request_output.outputs[0].text)
        print(separator)

In [16]:
def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
    """Run one multi-image request through ``LLM.chat`` and print the text.

    Unlike ``run_generate``, the loader's prompt is not used: the question and
    the image URLs are sent as a single user message and the engine applies
    the chat template itself.

    Args:
        model: Key into ``model_example_map`` selecting the loader to use.
        question: Text part of the user message.
        image_urls: URLs sent as ``image_url`` parts of the user message.
        seed: Value stored under the ``"seed"`` engine argument; may be ``None``.

    Raises:
        KeyError: If ``model`` is not a key of ``model_example_map``.
    """
    build_request = model_example_map[model]
    req_data = build_request(question, image_urls)

    # Disable other modalities to save memory
    loader_limits = dict(req_data.engine_args.limit_mm_per_prompt or {})
    req_data.engine_args.limit_mm_per_prompt = {
        "image": 0,
        "video": 0,
        "audio": 0,
        **loader_limits,  # limits set by the loader win over the zero defaults
    }

    llm_kwargs = asdict(req_data.engine_args)
    llm_kwargs["seed"] = seed
    llm = LLM(**llm_kwargs)

    greedy_params = SamplingParams(
        temperature=0.0,
        max_tokens=256,
        stop_token_ids=req_data.stop_token_ids,
    )

    # Text part first, then one image_url part per image.
    text_part = {"type": "text", "text": question}
    image_parts = [
        {"type": "image_url", "image_url": {"url": image_url}}
        for image_url in image_urls
    ]
    conversation = [{"role": "user", "content": [text_part, *image_parts]}]

    outputs = llm.chat(
        conversation,
        sampling_params=greedy_params,
        chat_template=req_data.chat_template,
        lora_request=req_data.lora_requests,
    )

    separator = "-" * 50
    print(separator)
    for request_output in outputs:
        print(request_output.outputs[0].text)
        print(separator)

In [17]:
def main(
    method: str,
    model: str,
    seed: Optional[int],
    question: str,
    image_urls: list[str],
) -> None:
    """Dispatch a multi-image request to the chosen inference entry point.

    Args:
        method: ``"generate"`` to call ``run_generate`` or ``"chat"`` to call
            ``run_chat``.
        model: Model key forwarded to the selected runner.
        seed: Engine seed forwarded to the selected runner. ``None`` is
            accepted, matching the runners' ``Optional[int]`` parameter and
            the calls made in this notebook.
        question: Question forwarded to the selected runner.
        image_urls: Image URLs forwarded to the selected runner.

    Raises:
        ValueError: If ``method`` is neither ``"generate"`` nor ``"chat"``.
    """
    if method == "generate":
        run_generate(model, question, image_urls, seed)
    elif method == "chat":
        run_chat(model, question, image_urls, seed)
    else:
        raise ValueError(f"Invalid method: {method}")

In [29]:
# Run the Aria model on all images via the ``generate`` path, with no seed set.
main("generate", "aria", None, QUESTION, IMAGE_URLS)

INFO 09-19 19:46:14 [config.py:853] This model supports multiple tasks: {'score', 'embed', 'reward', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 09-19 19:46:14 [config.py:1467] Using max model len 65536
INFO 09-19 19:46:14 [config.py:2267] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 09-19 19:46:14 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.
INFO 09-19 19:46:18 [__init__.py:244] Automatically detected platform rocm.
INFO 09-19 19:46:26 [core.py:459] Waiting for init message from front-end.
INFO 09-19 19:46:26 [core.py:69] Initializing a V1 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='rhymes-ai/Aria', speculative_config=None, tokenizer='rhymes-ai/Aria', skip_tokenizer_init=False, tokenizer_mode=slow, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=65536, download_dir=None, load_format=LoadFormat.AUTO, tenso

Loading safetensors checkpoint shards:   0% Completed | 0/12 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   8% Completed | 1/12 [00:01<00:13,  1.19s/it]
Loading safetensors checkpoint shards:  17% Completed | 2/12 [00:02<00:10,  1.04s/it]
Loading safetensors checkpoint shards:  25% Completed | 3/12 [00:03<00:09,  1.03s/it]
Loading safetensors checkpoint shards:  33% Completed | 4/12 [00:04<00:07,  1.02it/s]
Loading safetensors checkpoint shards:  42% Completed | 5/12 [00:05<00:06,  1.00it/s]
Loading safetensors checkpoint shards:  50% Completed | 6/12 [00:05<00:05,  1.16it/s]
Loading safetensors checkpoint shards:  58% Completed | 7/12 [00:06<00:04,  1.16it/s]
Loading safetensors checkpoint shards:  67% Completed | 8/12 [00:07<00:04,  1.01s/it]
Loading safetensors checkpoint shards:  75% Completed | 9/12 [00:08<00:02,  1.02it/s]
Loading safetensors checkpoint shards:  83% Completed | 10/12 [00:09<00:01,  1.01it/s]
Loading safetensors checkpoint shards:  92% Completed | 11/12

INFO 09-19 19:46:41 [default_loader.py:272] Loading weights took 11.76 seconds
INFO 09-19 19:46:41 [gpu_model_runner.py:1782] Model loading took 49.8945 GiB and 12.099335 seconds
INFO 09-19 19:46:41 [gpu_model_runner.py:2221] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 64 image items of the maximum feature size.
INFO 09-19 19:47:06 [backends.py:509] Using cache directory: /root/.cache/vllm/torch_compile_cache/7382194048/rank_0_0/backbone for vLLM's torch.compile
INFO 09-19 19:47:06 [backends.py:520] Dynamo bytecode transform time: 4.13 s
INFO 09-19 19:47:08 [backends.py:155] Directly load the compiled graph(s) for shape None from the cache, took 0.367 s
INFO 09-19 19:47:08 [monitor.py:34] torch.compile takes 4.13 s in total
INFO 09-19 19:47:09 [gpu_worker.py:232] Available KV cache memory: 109.67 GiB
INFO 09-19 19:47:09 [kv_cache_utils.py:716] GPU KV cache size: 410,704 tokens
INFO 09-19 19:47:09 [kv_cache_utils.py:720] Maximum concurrency for 65,

Capturing CUDA graphs: 100%|██████████| 67/67 [00:10<00:00,  6.55it/s]


INFO 09-19 19:47:19 [gpu_model_runner.py:2306] Graph capturing finished in 10 secs, took 0.37 GiB
INFO 09-19 19:47:19 [core.py:172] init engine (profile, create kv cache, warmup model) took 38.32 seconds


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0% 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

--------------------------------------------------
  1 . A duck in water  2 . A lion in grass  3 . A bird on a tree branch  4 . A whale in water  5 . A starfish on the beach  6 . A snail on a white background  7 . A flower with a bee  8 . Two dogs in the snow  9 . A cat on the ground  1 0 . A guinea pig on a table  1 1 . A rabbit in the forest  1 2 . A horse and a pony in a field
--------------------------------------------------




In [34]:
# Run the Aria model on all images via the ``chat`` path, with no seed set.
main("chat", "aria", None, QUESTION, IMAGE_URLS)

INFO 09-19 19:50:51 [config.py:853] This model supports multiple tasks: {'score', 'embed', 'reward', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 09-19 19:50:51 [config.py:1467] Using max model len 65536
INFO 09-19 19:50:51 [config.py:2267] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 09-19 19:50:51 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.
INFO 09-19 19:50:54 [__init__.py:244] Automatically detected platform rocm.
INFO 09-19 19:51:03 [core.py:459] Waiting for init message from front-end.
INFO 09-19 19:51:03 [core.py:69] Initializing a V1 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='rhymes-ai/Aria', speculative_config=None, tokenizer='rhymes-ai/Aria', skip_tokenizer_init=False, tokenizer_mode=slow, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=65536, download_dir=None, load_format=LoadFormat.AUTO, tenso

Loading safetensors checkpoint shards:   0% Completed | 0/12 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   8% Completed | 1/12 [00:01<00:12,  1.17s/it]
Loading safetensors checkpoint shards:  17% Completed | 2/12 [00:02<00:09,  1.00it/s]
Loading safetensors checkpoint shards:  25% Completed | 3/12 [00:03<00:08,  1.01it/s]
Loading safetensors checkpoint shards:  33% Completed | 4/12 [00:03<00:07,  1.07it/s]
Loading safetensors checkpoint shards:  42% Completed | 5/12 [00:04<00:06,  1.05it/s]
Loading safetensors checkpoint shards:  50% Completed | 6/12 [00:05<00:05,  1.18it/s]
Loading safetensors checkpoint shards:  58% Completed | 7/12 [00:06<00:04,  1.12it/s]
Loading safetensors checkpoint shards:  67% Completed | 8/12 [00:07<00:04,  1.01s/it]
Loading safetensors checkpoint shards:  75% Completed | 9/12 [00:08<00:02,  1.04it/s]
Loading safetensors checkpoint shards:  83% Completed | 10/12 [00:09<00:01,  1.03it/s]
Loading safetensors checkpoint shards:  92% Completed | 11/12

INFO 09-19 19:51:17 [default_loader.py:272] Loading weights took 11.51 seconds
INFO 09-19 19:51:18 [gpu_model_runner.py:1782] Model loading took 49.8945 GiB and 11.905837 seconds
INFO 09-19 19:51:18 [gpu_model_runner.py:2221] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 64 image items of the maximum feature size.
INFO 09-19 19:51:42 [backends.py:509] Using cache directory: /root/.cache/vllm/torch_compile_cache/7382194048/rank_0_0/backbone for vLLM's torch.compile
INFO 09-19 19:51:42 [backends.py:520] Dynamo bytecode transform time: 4.18 s
INFO 09-19 19:51:44 [backends.py:155] Directly load the compiled graph(s) for shape None from the cache, took 0.363 s
INFO 09-19 19:51:45 [monitor.py:34] torch.compile takes 4.18 s in total
INFO 09-19 19:51:46 [gpu_worker.py:232] Available KV cache memory: 109.67 GiB
INFO 09-19 19:51:46 [kv_cache_utils.py:716] GPU KV cache size: 410,704 tokens
INFO 09-19 19:51:46 [kv_cache_utils.py:720] Maximum concurrency for 65,

Capturing CUDA graphs: 100%|██████████| 67/67 [00:10<00:00,  6.47it/s]


INFO 09-19 19:51:56 [gpu_model_runner.py:2306] Graph capturing finished in 10 secs, took 0.37 GiB
INFO 09-19 19:51:56 [core.py:172] init engine (profile, create kv cache, warmup model) took 38.54 seconds
INFO 09-19 19:51:59 [chat_utils.py:421] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this.


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0% 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

--------------------------------------------------
 1 . The first image shows a mallard duck swimming in a body of water. 
 2 . The second image features a majestic lion sitting in a grassy field. 
 3 . The third image captures a blue and white bird perched on a branch, holding an insect in its beak. 
 4 . The fourth image displays a whale swimming in the ocean, viewed from above. 
 5 . The fifth image shows a starfish lying on a sandy beach. 
 6 . The sixth image depicts a snail crawling on a white surface. 
 7 . The seventh image shows a purple thistle flower with a bee on it. 
 8 . The eighth image captures two sled dogs standing in the snow. 
 9 . The ninth image shows a ginger and white cat sitting on a bed of fallen leaves. 
 1 0 . The tenth image shows a black and white guinea pig sitting on a wet surface. 
 1 1 . The eleventh image shows a brown rabbit sitting on the ground in a forest. 
 1 2 . The twelfth image shows a horse and a pony running in a field.<
--------------------

