In [12]:
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
import time

In [3]:
prefix = (
    "You are an expert school principal, skilled in effectively managing "
    "faculty and staff. Draft 10-15 questions for a potential first grade "
    "Head Teacher for my K-12, all-girls', independent school that emphasizes "
    "community, joyful discovery, and life-long learning. The candidate is "
    "coming in for a first-round panel interview for a 8th grade Math "
    "teaching role. They have 5 years of previous teaching experience "
    "as an assistant teacher at a co-ed, public school with experience "
    "in middle school math teaching. Based on these information, fulfill "
    "the following paragraph: "
)

In [4]:
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

In [5]:
generating_prompts = [prefix + prompt for prompt in prompts]

In [6]:
sampling_params = SamplingParams(temperature=0.0)

### Prefix caching disabled

In [15]:
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)

print("Results without `enable_prefix_caching`")

start_time = time.time()
outputs = regular_llm.generate(generating_prompts, sampling_params)
end_time = time.time()

regular_generated_texts = []
print("-" * 50)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    regular_generated_texts.append(generated_text)
    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print(f"Generation time: {end_time - start_time} seconds.")
    print("-" * 50)

INFO 09-05 20:51:41 [config.py:853] This model supports multiple tasks: {'generate', 'reward', 'embed', 'classify', 'score'}. Defaulting to 'generate'.
INFO 09-05 20:51:41 [config.py:1467] Using max model len 2048
INFO 09-05 20:51:41 [config.py:2267] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 09-05 20:51:41 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.
INFO 09-05 20:51:43 [__init__.py:244] Automatically detected platform rocm.
INFO 09-05 20:51:52 [core.py:459] Waiting for init message from front-end.
INFO 09-05 20:51:53 [core.py:69] Initializing a V1 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='facebook/opt-125m', speculative_config=None, tokenizer='facebook/opt-125m', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, t

Loading pt checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.65it/s]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.65it/s]



INFO 09-05 20:51:54 [gpu_model_runner.py:1782] Model loading took 0.2500 GiB and 0.358850 seconds
INFO 09-05 20:51:55 [backends.py:509] Using cache directory: /root/.cache/vllm/torch_compile_cache/c36373a52c/rank_0_0/backbone for vLLM's torch.compile
INFO 09-05 20:51:55 [backends.py:520] Dynamo bytecode transform time: 1.37 s
INFO 09-05 20:51:57 [backends.py:155] Directly load the compiled graph(s) for shape None from the cache, took 0.808 s
INFO 09-05 20:51:57 [monitor.py:34] torch.compile takes 1.37 s in total
INFO 09-05 20:52:09 [gpu_worker.py:232] Available KV cache memory: 73.79 GiB
INFO 09-05 20:52:10 [kv_cache_utils.py:716] GPU KV cache size: 2,149,280 tokens
INFO 09-05 20:52:10 [kv_cache_utils.py:720] Maximum concurrency for 2,048 tokens per request: 1049.45x
INFO 09-05 20:52:10 [rocm.py:224] Using Triton Attention backend on V1 engine.


Capturing CUDA graphs:  99%|█████████▊| 66/67 [00:06<00:00, 10.74it/s]

INFO 09-05 20:52:16 [gpu_model_runner.py:2306] Graph capturing finished in 7 secs, took 0.19 GiB
INFO 09-05 20:52:16 [core.py:172] init engine (profile, create kv cache, warmup model) took 22.72 seconds
Results without `enable_prefix_caching`


Capturing CUDA graphs: 100%|██████████| 67/67 [00:06<00:00,  9.92it/s]


Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0% 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

--------------------------------------------------
Prompt: "You are an expert school principal, skilled in effectively managing faculty and staff. Draft 10-15 questions for a potential first grade Head Teacher for my K-12, all-girls', independent school that emphasizes community, joyful discovery, and life-long learning. The candidate is coming in for a first-round panel interview for a 8th grade Math teaching role. They have 5 years of previous teaching experience as an assistant teacher at a co-ed, public school with experience in middle school math teaching. Based on these information, fulfill the following paragraph: Hello, my name is"
Generated text: ' Sarah, I am a teacher at a private school in the city of New York'
Generation time: 0.028345346450805664 seconds.
--------------------------------------------------
Prompt: "You are an expert school principal, skilled in effectively managing faculty and staff. Draft 10-15 questions for a potential first grade Head Teacher for my K-1

In [16]:
del regular_llm
cleanup_dist_env_and_memory()



### Prefix caching enabled

In [17]:
prefix_cached_llm = LLM(
    model="facebook/opt-125m",
    enable_prefix_caching=True,
    gpu_memory_utilization=0.4,
)

# Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params)

# Generate with prefix caching.
start_time = time.time()
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
end_time = time.time()

print("Results with `enable_prefix_caching`")

cached_generated_texts = []
print("-" * 50)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    cached_generated_texts.append(generated_text)
    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print(f"Generation time: {end_time - start_time} seconds.")
    print("-" * 50)

INFO 09-05 20:55:57 [config.py:853] This model supports multiple tasks: {'generate', 'reward', 'embed', 'classify', 'score'}. Defaulting to 'generate'.
INFO 09-05 20:55:57 [config.py:1467] Using max model len 2048
INFO 09-05 20:55:57 [config.py:2267] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 09-05 20:55:57 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.
INFO 09-05 20:56:00 [__init__.py:244] Automatically detected platform rocm.
INFO 09-05 20:56:09 [core.py:459] Waiting for init message from front-end.
INFO 09-05 20:56:09 [core.py:69] Initializing a V1 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='facebook/opt-125m', speculative_config=None, tokenizer='facebook/opt-125m', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, t

Loading pt checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.87it/s]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.87it/s]



INFO 09-05 20:56:10 [gpu_model_runner.py:1782] Model loading took 0.2500 GiB and 0.318966 seconds
INFO 09-05 20:56:12 [backends.py:509] Using cache directory: /root/.cache/vllm/torch_compile_cache/c36373a52c/rank_0_0/backbone for vLLM's torch.compile
INFO 09-05 20:56:12 [backends.py:520] Dynamo bytecode transform time: 1.36 s
INFO 09-05 20:56:13 [backends.py:155] Directly load the compiled graph(s) for shape None from the cache, took 0.812 s
INFO 09-05 20:56:13 [monitor.py:34] torch.compile takes 1.36 s in total
INFO 09-05 20:56:26 [gpu_worker.py:232] Available KV cache memory: 73.79 GiB
INFO 09-05 20:56:26 [kv_cache_utils.py:716] GPU KV cache size: 2,149,280 tokens
INFO 09-05 20:56:26 [kv_cache_utils.py:720] Maximum concurrency for 2,048 tokens per request: 1049.45x
INFO 09-05 20:56:26 [rocm.py:224] Using Triton Attention backend on V1 engine.


Capturing CUDA graphs: 100%|██████████| 67/67 [00:06<00:00,  9.92it/s]


INFO 09-05 20:56:33 [gpu_model_runner.py:2306] Graph capturing finished in 7 secs, took 0.19 GiB
INFO 09-05 20:56:33 [core.py:172] init engine (profile, create kv cache, warmup model) took 22.42 seconds


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0% 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0% 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Results with `enable_prefix_caching`
--------------------------------------------------
Prompt: "You are an expert school principal, skilled in effectively managing faculty and staff. Draft 10-15 questions for a potential first grade Head Teacher for my K-12, all-girls', independent school that emphasizes community, joyful discovery, and life-long learning. The candidate is coming in for a first-round panel interview for a 8th grade Math teaching role. They have 5 years of previous teaching experience as an assistant teacher at a co-ed, public school with experience in middle school math teaching. Based on these information, fulfill the following paragraph: Hello, my name is"
Generated text: ' Sarah, I am a teacher at a private school in the city of New York'
Generation time: 0.02359747886657715 seconds.
--------------------------------------------------
Prompt: "You are an expert school principal, skilled in effectively managing faculty and staff. Draft 10-15 questions for a potential

In [18]:
generated_same = all(
    [
        regular_generated_texts[i] == cached_generated_texts[i]
        for i in range(len(prompts))
    ]
)
print(f"Generated answers are the same: {generated_same}")

Generated answers are the same: True


In [19]:
del prefix_cached_llm
cleanup_dist_env_and_memory()

