In [2]:
from dataclasses import asdict
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args

from PIL.Image import Image

from vllm import LLM, EngineArgs
#from vllm.entrypoints.score_utils import ScoreMultiModalParam # supported from > v0.10.2
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser

In [4]:
class TextQuery(TypedDict):
    modality: Literal["text"]
    text: str


class ImageQuery(TypedDict):
    modality: Literal["image"]
    image: Image


class TextImageQuery(TypedDict):
    modality: Literal["text+image"]
    text: str
    image: Image


# class TextImagesQuery(TypedDict):
#     modality: Literal["text+images"]
#     text: str
#     image: ScoreMultiModalParam

In [5]:
# QueryModality = Literal["text", "image", "text+image", "text+images"]
# Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]

QueryModality = Literal["text", "image", "text+image"]
Query = Union[TextQuery, ImageQuery, TextImageQuery]

In [6]:
class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompt: Optional[str] = None
    image: Optional[Image] = None
    query: Optional[str] = None
    # documents: Optional[ScoreMultiModalParam] = None

### e5-v

In [18]:
def run_e5_v(query: Query) -> ModelRequestData:
    llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"  # noqa: E501

    if query["modality"] == "text":
        text = query["text"]
        prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ")
        image = None
    elif query["modality"] == "image":
        prompt = llama3_template.format("<image>\nSummary above image in one word: ")
        image = query["image"]
    else:
        modality = query["modality"]
        raise ValueError(f"Unsupported query modality: '{modality}'")

    engine_args = EngineArgs(
        model="royokong/e5-v",
        # runner="pooling", # supported from > v0.10.2
        task="embed",
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image=image,
    )

### VLM2Vec

In [19]:
def run_vlm2vec(query: Query) -> ModelRequestData:
    if query["modality"] == "text":
        text = query["text"]
        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
        image = None
    elif query["modality"] == "image":
        prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image."  # noqa: E501
        image = query["image"]
    elif query["modality"] == "text+image":
        text = query["text"]
        prompt = (
            f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
        )
        image = query["image"]
    else:
        modality = query["modality"]
        raise ValueError(f"Unsupported query modality: '{modality}'")

    engine_args = EngineArgs(
        model="TIGER-Lab/VLM2Vec-Full",
        # runner="pooling", # supported from > v0.10.2
        task="embed",
        max_model_len=4096,
        trust_remote_code=True,
        mm_processor_kwargs={"num_crops": 4},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image=image,
    )

### jina-reranker-m0

In [20]:
def run_jinavl_reranker(query: Query) -> ModelRequestData:
    if query["modality"] != "text+images":
        raise ValueError(f"Unsupported query modality: '{query['modality']}'")

    engine_args = EngineArgs(
        model="jinaai/jina-reranker-m0",
        runner="pooling", # supported from > v0.10.2
        max_model_len=32768,
        trust_remote_code=True,
        mm_processor_kwargs={
            "min_pixels": 3136,
            "max_pixels": 602112,
        },
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        query=query["text"],
        documents=query["image"],
    )

## Model Map

In [21]:
model_example_map = {
    "e5_v": run_e5_v,
    "vlm2vec": run_vlm2vec,
    "jinavl_reranker": run_jinavl_reranker, # supported from > v0.10.2
}

## Inference

In [22]:
def get_query(modality: QueryModality):
    if modality == "text":
        return TextQuery(modality="text", text="A dog sitting in the grass")

    if modality == "image":
        return ImageQuery(
            modality="image",
            image=fetch_image(
                "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg"  # noqa: E501
            ),
        )

    if modality == "text+image":
        return TextImageQuery(
            modality="text+image",
            text="A cat standing in the snow.",
            image=fetch_image(
                "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg"  # noqa: E501
            ),
        )

    # if modality == "text+images":
    #     return TextImagesQuery(
    #         modality="text+images",
    #         text="slm markdown",
    #         image={
    #             "content": [
    #                 {
    #                     "type": "image_url",
    #                     "image_url": {
    #                         "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
    #                     },
    #                 },
    #                 {
    #                     "type": "image_url",
    #                     "image_url": {
    #                         "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
    #                     },
    #                 },
    #             ]
    #         },
    #     )

    msg = f"Modality {modality} is not supported."
    raise ValueError(msg)

### Embedding task

In [23]:
def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
    query = get_query(modality)
    req_data = model_example_map[model](query)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    mm_data = {}
    if req_data.image is not None:
        mm_data["image"] = req_data.image

    outputs = llm.embed(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": mm_data,
        }
    )

    print("-" * 50)
    for output in outputs:
        print(output.outputs.embedding)
        print("-" * 50)

### Scoring task

In [24]:
def run_score(model: str, modality: QueryModality, seed: Optional[int]):
    query = get_query(modality)
    req_data = model_example_map[model](query)

    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    outputs = llm.score(req_data.query, req_data.documents)

    print("-" * 30)
    print([output.outputs.score for output in outputs])
    print("-" * 30)

### Examples

In [25]:
run_encode(model="e5_v", modality="image", seed=42)

config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

INFO 09-24 19:37:56 [config.py:1467] Using max model len 4096
INFO 09-24 19:37:56 [arg_utils.py:1580] (Disabling) chunked prefill by default
INFO 09-24 19:37:56 [arg_utils.py:1583] (Disabling) prefix caching by default
INFO 09-24 19:38:03 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.
INFO 09-24 19:38:03 [config.py:4581] Only "last" pooling supports chunked prefill and prefix caching; disabling both.


tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

INFO 09-24 19:38:07 [__init__.py:244] Automatically detected platform rocm.
INFO 09-24 19:38:16 [core.py:459] Waiting for init message from front-end.
INFO 09-24 19:38:16 [config.py:4581] Only "last" pooling supports chunked prefill and prefix caching; disabling both.
INFO 09-24 19:38:16 [core.py:69] Initializing a V1 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='royokong/e5-v', speculative_config=None, tokenizer='royokong/e5-v', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), o

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


INFO 09-24 19:38:18 [gpu_model_runner.py:1751] Starting to load model royokong/e5-v...
INFO 09-24 19:38:19 [gpu_model_runner.py:1756] Loading model from scratch...
INFO 09-24 19:38:19 [rocm.py:224] Using Triton Attention backend on V1 engine.
INFO 09-24 19:38:19 [config.py:4581] Only "last" pooling supports chunked prefill and prefix caching; disabling both.
INFO 09-24 19:38:19 [rocm.py:224] Using Triton Attention backend on V1 engine.
INFO 09-24 19:38:19 [weight_utils.py:292] Using model weights format ['*.safetensors']
INFO 09-24 19:38:32 [weight_utils.py:308] Time spent downloading weights for royokong/e5-v: 13.168944 seconds


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:01<00:04,  1.66s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:03<00:03,  1.74s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:05<00:01,  1.78s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00,  1.24s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00,  1.42s/it]



INFO 09-24 19:38:38 [default_loader.py:272] Loading weights took 5.72 seconds
INFO 09-24 19:38:38 [gpu_model_runner.py:1782] Model loading took 15.6055 GiB and 19.177332 seconds
INFO 09-24 19:38:38 [gpu_model_runner.py:2221] Encoder cache will be initialized with a budget of 4096 tokens, and profiled with 2 image items of the maximum feature size.
INFO 09-24 19:39:03 [backends.py:509] Using cache directory: /root/.cache/vllm/torch_compile_cache/92c636d592/rank_0_0/backbone for vLLM's torch.compile
INFO 09-24 19:39:03 [backends.py:520] Dynamo bytecode transform time: 4.85 s
INFO 09-24 19:39:19 [backends.py:181] Cache the graph of shape None for later use
INFO 09-24 19:39:19 [backends.py:193] Compiling a graph for general shape takes 14.45 s
INFO 09-24 19:39:22 [monitor.py:34] torch.compile takes 19.29 s in total
INFO 09-24 19:39:47 [gpu_worker.py:232] Available KV cache memory: 155.50 GiB
INFO 09-24 19:39:48 [kv_cache_utils.py:716] GPU KV cache size: 1,273,872 tokens
INFO 09-24 19:39:48

Capturing CUDA graphs: 100%|██████████| 67/67 [00:15<00:00,  4.39it/s]


INFO 09-24 19:40:03 [gpu_model_runner.py:2306] Graph capturing finished in 15 secs, took 0.56 GiB
INFO 09-24 19:40:03 [core.py:172] init engine (profile, create kv cache, warmup model) took 85.09 seconds


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Processed prompts:   0% 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

--------------------------------------------------
[-0.005020863376557827, -0.007925724610686302, -0.0086805559694767, 0.03563791885972023, 0.000362140970537439, -0.005002301651984453, -0.002409890526905656, 0.03875623643398285, -0.021865349262952805, 0.018982142210006714, -0.0001366970973322168, -0.024340204894542694, -0.03264334425330162, -0.02009582705795765, -0.0034276749938726425, -0.014366536401212215, -0.02050417847931385, 0.03259384632110596, -0.020553674548864365, -9.077692811843008e-05, 0.016098935157060623, 0.01502237282693386, 0.007436940912157297, -0.025614755228161812, 0.006211887579411268, -0.0031136777251958847, -0.0013371953973546624, 0.009317831136286259, 0.02546626329421997, 0.0018360334215685725, 0.012597015127539635, 0.0028863002080470324, 0.018363427370786667, -0.009416825138032436, -0.021481746807694435, -0.0017973638605326414, -0.0002461321128066629, 0.010549072176218033, 0.000602859363425523, 0.0007134544430300593, 0.00822889432311058, -0.0037834353279322386, -



In [26]:
run_score(model="e5_v", modality="image", seed=42)

INFO 09-24 19:41:15 [config.py:1467] Using max model len 4096
INFO 09-24 19:41:15 [arg_utils.py:1580] (Disabling) chunked prefill by default
INFO 09-24 19:41:15 [arg_utils.py:1583] (Disabling) prefix caching by default
INFO 09-24 19:41:15 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.
INFO 09-24 19:41:15 [config.py:4581] Only "last" pooling supports chunked prefill and prefix caching; disabling both.
INFO 09-24 19:41:18 [__init__.py:244] Automatically detected platform rocm.
INFO 09-24 19:41:27 [core.py:459] Waiting for init message from front-end.
INFO 09-24 19:41:27 [config.py:4581] Only "last" pooling supports chunked prefill and prefix caching; disabling both.
INFO 09-24 19:41:27 [core.py:69] Initializing a V1 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='royokong/e5-v', speculative_config=None, tokenizer='royokong/e5-v', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, toke

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


INFO 09-24 19:41:29 [gpu_model_runner.py:1751] Starting to load model royokong/e5-v...
INFO 09-24 19:41:29 [gpu_model_runner.py:1756] Loading model from scratch...
INFO 09-24 19:41:29 [rocm.py:224] Using Triton Attention backend on V1 engine.
INFO 09-24 19:41:29 [config.py:4581] Only "last" pooling supports chunked prefill and prefix caching; disabling both.
INFO 09-24 19:41:29 [rocm.py:224] Using Triton Attention backend on V1 engine.
INFO 09-24 19:41:29 [weight_utils.py:292] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:01<00:05,  1.67s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:03<00:03,  1.77s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:05<00:01,  1.80s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00,  1.25s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00,  1.44s/it]



INFO 09-24 19:41:35 [default_loader.py:272] Loading weights took 5.77 seconds
INFO 09-24 19:41:35 [gpu_model_runner.py:1782] Model loading took 15.6055 GiB and 6.044245 seconds
INFO 09-24 19:41:36 [gpu_model_runner.py:2221] Encoder cache will be initialized with a budget of 4096 tokens, and profiled with 2 image items of the maximum feature size.
INFO 09-24 19:41:52 [backends.py:509] Using cache directory: /root/.cache/vllm/torch_compile_cache/92c636d592/rank_0_0/backbone for vLLM's torch.compile
INFO 09-24 19:41:52 [backends.py:520] Dynamo bytecode transform time: 4.45 s
INFO 09-24 19:41:55 [backends.py:155] Directly load the compiled graph(s) for shape None from the cache, took 0.364 s
INFO 09-24 19:41:55 [monitor.py:34] torch.compile takes 4.45 s in total
INFO 09-24 19:42:19 [gpu_worker.py:232] Available KV cache memory: 155.52 GiB
INFO 09-24 19:42:19 [kv_cache_utils.py:716] GPU KV cache size: 1,274,032 tokens
INFO 09-24 19:42:19 [kv_cache_utils.py:720] Maximum concurrency for 4,096

Capturing CUDA graphs: 100%|██████████| 67/67 [00:08<00:00,  7.67it/s]


INFO 09-24 19:42:28 [gpu_model_runner.py:2306] Graph capturing finished in 9 secs, took 0.56 GiB
INFO 09-24 19:42:28 [core.py:172] init engine (profile, create kv cache, warmup model) took 52.54 seconds


AttributeError: 'ModelRequestData' object has no attribute 'documents'