In [1]:
from vllm import LLM, SamplingParams
from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt, zip_enc_dec_prompts

INFO 09-25 19:46:25 [__init__.py:244] Automatically detected platform rocm.


In [2]:
def create_prompts(tokenizer):
    """Build the mixed batch of prompts used by the encoder/decoder demo.

    The batch contains, in order: a raw string, a ``TextPrompt``, a
    ``TokensPrompt``, three ``ExplicitEncoderDecoderPrompt`` objects that
    pair those forms in different encoder/decoder combinations, and
    whatever ``zip_enc_dec_prompts`` returns for two parallel string lists.

    Args:
        tokenizer: Object whose ``encode(prompt=...)`` result is used as
            ``prompt_token_ids`` of the ``TokensPrompt``.

    Returns:
        A list holding the six hand-built prompts followed by the zipped
        encoder/decoder prompts.
    """
    # The three single-prompt forms: plain string, TextPrompt, TokensPrompt.
    raw_text = "Hello, my name is"
    text_form = TextPrompt(prompt="The president of the United States is")
    capital_ids = tokenizer.encode(prompt="The capital of France is")
    token_form = TokensPrompt(prompt_token_ids=capital_ids)

    # Explicit (encoder, decoder) pairings, mixing the forms above:
    #   1. raw string  -> tokens
    #   2. TextPrompt  -> raw string
    #   3. tokens      -> TextPrompt
    pairings = (
        (raw_text, token_form),
        (text_form, raw_text),
        (token_form, text_form),
    )
    explicit_pairs = [
        ExplicitEncoderDecoderPrompt(encoder_prompt=enc, decoder_prompt=dec)
        for enc, dec in pairings
    ]

    # Parallel encoder/decoder string lists, paired up by zip_enc_dec_prompts.
    encoder_texts = ["An encoder prompt", "Another encoder prompt"]
    decoder_texts = ["A decoder prompt", "Another decoder prompt"]
    zipped_pairs = zip_enc_dec_prompts(encoder_texts, decoder_texts)

    return [raw_text, text_form, token_form] + explicit_pairs + zipped_pairs


In [3]:
def create_sampling_params():
    """Return the sampling configuration used for the demo's generation call.

    Returns:
        A ``SamplingParams`` built with temperature 0, ``top_p`` 1.0, a
        minimum of 0 tokens, and at most 20 generated tokens.
    """
    settings = {
        "temperature": 0,
        "top_p": 1.0,
        "min_tokens": 0,
        "max_tokens": 20,
    }
    return SamplingParams(**settings)

In [4]:
def print_outputs(outputs):
    """Print each result's encoder prompt, decoder prompt, and generated text.

    A 50-dash rule is printed first and again after every record.

    Args:
        outputs: Iterable of result objects exposing ``prompt``,
            ``encoder_prompt``, and ``outputs``; only ``outputs[0].text``
            is read from the latter.
    """
    rule = "-" * 50
    print(rule)
    for number, result in enumerate(outputs, start=1):
        decoder_text, encoder_text = result.prompt, result.encoder_prompt
        completion = result.outputs[0].text  # only the first completion is shown
        report = (
            f"Encoder prompt: {encoder_text!r}\n"
            f"Decoder prompt: {decoder_text!r}\n"
            f"Generated text: {completion!r}"
        )
        print(f"Output {number}:", report, rule, sep="\n")

In [5]:
def main():
    """Run the encoder/decoder generation demo end to end.

    Creates an ``LLM`` for ``facebook/bart-large-cnn`` with dtype
    ``"float"``, builds the prompt batch with the engine's tokenizer group,
    generates with the demo sampling parameters, and prints the results.
    """
    engine = LLM(model="facebook/bart-large-cnn", dtype="float")

    # create_prompts needs a tokenizer to produce the token-id prompt.
    prompt_batch = create_prompts(engine.llm_engine.get_tokenizer_group())

    results = engine.generate(prompt_batch, create_sampling_params())
    print_outputs(results)

In [6]:
# Run the demo: load the model, generate for every prompt, print the results.
main()

INFO 09-25 19:46:50 [config.py:853] This model supports multiple tasks: {'reward', 'embed', 'classify', 'generate', 'score'}. Defaulting to 'generate'.
INFO 09-25 19:46:50 [config.py:1467] Using max model len 1024
INFO 09-25 19:46:57 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.
INFO 09-25 19:46:57 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='facebook/bart-large-cnn', speculative_config=None, tokenizer='facebook/bart-large-cnn', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float32, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=Fals

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 09-25 19:46:59 [default_loader.py:272] Loading weights took 0.66 seconds
INFO 09-25 19:46:59 [model_runner.py:1203] Model loading took 2.0527 GiB and 1.015113 seconds
INFO 09-25 19:47:16 [worker.py:294] Memory profiling takes 15.94 seconds
INFO 09-25 19:47:16 [worker.py:294] the current vLLM instance can use total_gpu_memory (191.69GiB) x gpu_memory_utilization (0.90) = 172.52GiB
INFO 09-25 19:47:16 [worker.py:294] model weights take 2.05GiB; non_torch_memory takes 1.08GiB; PyTorch activation peak memory takes 0.61GiB; the rest of the memory reserved for KV Cache is 168.78GiB.
INFO 09-25 19:47:16 [executor_base.py:113] # rocm blocks: 115218, # CPU blocks: 2730
INFO 09-25 19:47:16 [executor_base.py:118] Maximum concurrency for 1024 tokens per request: 1800.28x
INFO 09-25 19:47:17 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 17.19 seconds


Adding requests:   0%|          | 0/8 [00:00<?, ?it/s]

Processed prompts:   0% 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

--------------------------------------------------
Output 1:
Encoder prompt: 'Hello, my name is'
Decoder prompt: None
Generated text: "Hello, my name is John. I'm a writer."
--------------------------------------------------
Output 2:
Encoder prompt: 'The president of the United States is'
Decoder prompt: None
Generated text: 'The president of the United States is.'
--------------------------------------------------
Output 3:
Encoder prompt: None
Decoder prompt: None
Generated text: 'The capital of France is Paris.'
--------------------------------------------------
Output 4:
Encoder prompt: 'Hello, my name is'
Decoder prompt: None
Generated text: 'is, a city in the south of France. The capital is the largest city in the country.'
--------------------------------------------------
Output 5:
Encoder prompt: 'The president of the United States is'
Decoder prompt: 'Hello, my name is'
Generated text: ". I'm the president of the United States. I'm also the president of the United Nations."
