In [1]:
import time
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple

from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser

INFO 09-25 20:06:38 [__init__.py:244] Automatically detected platform rocm.


In [2]:
class ModelRequestData(NamedTuple):
    """Everything one example needs: the engine configuration and its prompts.

    Each ``run_*`` builder in this notebook returns one of these, and ``main``
    consumes it.
    """
    # Engine configuration; ``main`` expands it into keyword arguments for ``LLM``.
    engine_args: EngineArgs
    # Prompts that ``main`` hands to ``llm.generate`` in a single batch.
    prompts: Sequence[PromptType]

#### Florence-2-large

In [3]:
def run_florence2():
    """Assemble the engine arguments and sample prompts for Florence-2-large.

    Returns:
        ModelRequestData: the engine configuration together with two image
        prompts, one in the implicit single-prompt form and one in the
        explicit encoder/decoder form.
    """
    florence_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_num_seqs=8,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    # Implicit form: only a task token, with the image attached to the prompt.
    task_token_prompt = {
        "prompt": "<DETAILED_CAPTION>",
        "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
    }

    # Explicit form: the encoder side carries text and image, and the decoder
    # prompt is left empty.
    encoder_side = {
        "prompt": "Describe in detail what is shown in the image.",
        "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
    }
    encoder_decoder_prompt = {
        "encoder_prompt": encoder_side,
        "decoder_prompt": "",
    }

    return ModelRequestData(
        florence_args,
        [task_token_prompt, encoder_decoder_prompt],
    )

#### Llama-3.2-11B-Vision-Instruct

In [4]:
def run_mllama():
    """Assemble the engine arguments and sample prompts for Llama-3.2-11B-Vision.

    Returns:
        ModelRequestData: the engine configuration together with two image
        prompts, one in the implicit single-prompt form and one in the
        explicit encoder/decoder form.
    """
    mllama_args = EngineArgs(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    # Implicit form: one prompt string with the image attached directly.
    implicit_prompt = {
        "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
        "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
    }

    # Explicit form: the image travels with the encoder prompt, and the
    # question is given as the decoder prompt.
    encoder_side = {
        "prompt": "<|image|>",
        "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
    }
    explicit_prompt = {
        "encoder_prompt": encoder_side,
        "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
    }

    return ModelRequestData(mllama_args, [implicit_prompt, explicit_prompt])

#### Whisper-large-v3-turbo

In [5]:
def run_whisper():
    """Assemble the engine arguments and sample prompts for Whisper-large-v3-turbo.

    Returns:
        ModelRequestData: the engine configuration together with two audio
        prompts, one in the implicit single-prompt form and one in the
        explicit encoder/decoder form.
    """
    whisper_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
        max_num_seqs=16,
        limit_mm_per_prompt={"audio": 1},
        dtype="half",
    )

    # Implicit form: the start-of-transcript token with the audio attached.
    implicit_prompt = {
        "prompt": "<|startoftranscript|>",
        "multi_modal_data": {
            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        },
    }

    # Explicit form: the encoder receives only the audio (empty text), and the
    # decoder is seeded with the start-of-transcript token.
    encoder_side = {
        "prompt": "",
        "multi_modal_data": {
            "audio": AudioAsset("winning_call").audio_and_sample_rate,
        },
    }
    explicit_prompt = {
        "encoder_prompt": encoder_side,
        "decoder_prompt": "<|startoftranscript|>",
    }

    return ModelRequestData(whisper_args, [implicit_prompt, explicit_prompt])

### Model Map

In [6]:
# Registry of runnable examples: the key is the model name accepted by
# ``main``, the value is the builder that returns its ModelRequestData.
# NOTE(review): the florence2 entry was annotated "bug in vLLM v0.9.2" by the
# original author; presumably that example fails on that release, so confirm
# before relying on it.
model_example_map = dict(
    florence2=run_florence2,
    mllama=run_mllama,
    whisper=run_whisper,
)

In [7]:
def main(model, seed=42):
    """Run one encoder/decoder example end to end and print its results.

    Looks up the request builder for ``model``, starts an ``LLM`` engine from
    its arguments, generates completions for all of its prompts in one batch,
    then prints each result followed by the generation time and throughput.

    Args:
        model: Key of ``model_example_map`` selecting the example to run.
        seed: Seed merged into the engine arguments. Defaults to 42.

    Raises:
        ValueError: If ``model`` is not a key of ``model_example_map``.
    """
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    req_data = model_example_map[model]()

    # Disable other modalities to save memory: every modality defaults to 0
    # and only the limits the example set explicitly override that.
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    # Flatten the EngineArgs dataclass into kwargs and add the seed on top.
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    prompts = req_data.prompts

    # skip_special_tokens=False keeps markers such as <|transcribe|> visible
    # in the printed text.
    sampling_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=64,
        skip_special_tokens=False,
    )

    # Time only the generate() call. perf_counter is monotonic, and stopping
    # the clock before printing keeps console I/O out of Duration and RPS.
    start = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    duration = time.perf_counter() - start

    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}")

    print("Duration:", duration)
    print("RPS:", len(prompts) / duration)

In [8]:
# Run the Whisper example, leaving the seed at its default.
main(model="whisper")

config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

INFO 09-25 20:07:16 [config.py:853] This model supports multiple tasks: {'score', 'reward', 'transcription', 'generate', 'classify', 'embed'}. Defaulting to 'transcription'.


tokenizer_config.json: 0.00B [00:00, ?B/s]

INFO 09-25 20:07:16 [config.py:1467] Using max model len 448
INFO 09-25 20:07:25 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.
INFO 09-25 20:07:25 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='openai/whisper-large-v3-turbo', speculative_config=None, tokenizer='openai/whisper-large-v3-turbo', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=448, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hid

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

INFO 09-25 20:07:27 [rocm.py:233] Using ROCmFlashAttention backend.
INFO 09-25 20:07:27 [parallel_state.py:1076] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 09-25 20:07:27 [model_runner.py:1171] Starting to load model openai/whisper-large-v3-turbo...
INFO 09-25 20:07:27 [weight_utils.py:292] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

INFO 09-25 20:07:29 [weight_utils.py:308] Time spent downloading weights for openai/whisper-large-v3-turbo: 1.537903 seconds
INFO 09-25 20:07:29 [weight_utils.py:345] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 09-25 20:07:30 [default_loader.py:272] Loading weights took 0.68 seconds
INFO 09-25 20:07:30 [model_runner.py:1203] Model loading took 1.9297 GiB and 2.599989 seconds
INFO 09-25 20:07:31 [enc_dec_model_runner.py:315] Starting profile run for multi-modal models.
INFO 09-25 20:07:59 [worker.py:294] Memory profiling takes 28.90 seconds
INFO 09-25 20:07:59 [worker.py:294] the current vLLM instance can use total_gpu_memory (191.69GiB) x gpu_memory_utilization (0.90) = 172.52GiB
INFO 09-25 20:07:59 [worker.py:294] model weights take 1.93GiB; non_torch_memory takes 0.68GiB; PyTorch activation peak memory takes 0.66GiB; the rest of the memory reserved for KV Cache is 169.25GiB.
INFO 09-25 20:07:59 [executor_base.py:113] # rocm blocks: 69324, # CPU blocks: 1638
INFO 09-25 20:07:59 [executor_base.py:118] Maximum concurrency for 448 tokens per request: 2475.86x
INFO 09-25 20:08:00 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 29.92 seconds


Adding requests:   0%|          | 0/2 [00:00<?, ?it/s]

Processed prompts:   0% 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Decoder prompt: '<|startoftranscript|>', Generated text: '<|transcribe|><|notimestamps|> The first words I spoke in the original phonograph, a little piece of practical poetry. Mary had a little lamb, its streets were quite as slow, and everywhere that Mary went the lamb was sure to go.'
Decoder prompt: '<|startoftranscript|>', Generated text: "<|transcribe|><|notimestamps|> And the 0-1 pitch on the way to Edgar Martinez. Swung on the line down the left field line for a base hit. Here comes Joy. Here is Junior to third base. They're going to wave him in. The throw to the plate will be late. The Mariners are going to"
Duration: 2.00260329246521
RPS: 0.9987000458478198
