In [37]:
from vllm import LLM, SamplingParams
import time

In [38]:
import gc
import ray
import torch

def clean_up(llm):
    """Drop this function's reference to ``llm`` and run the cleanup calls.

    The steps are: delete the local name, run ``gc.collect()``, call
    ``torch.cuda.empty_cache()``, and call ``ray.shutdown()``.

    Note that ``del llm`` only removes the *local* name bound to the argument.
    If the caller still holds its own reference (for example a notebook-level
    ``llm`` variable), the object stays alive until the caller deletes or
    rebinds that name as well.

    Args:
        llm: The object to release. It is not used other than being unbound.
    """
    # Unbinding a parameter that is always bound cannot raise, so the former
    # try/except around this statement was dead code and has been removed.
    del llm
    gc.collect()
    torch.cuda.empty_cache()
    ray.shutdown()

### Automatic Prefix Caching

In [39]:
# Shared prompt prefix: a short instruction followed by a 30-row markdown table.
# The query cell appends a different question to this same text for each request.
# NOTE(review): "assistant in recognizes" reads like a typo, but this is runtime
# prompt text - changing it changes what the model sees, so it is left untouched.
LONG_PROMPT = (
    "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
    + """
| ID  | Name          | Age | Occupation    | Country       | Email                  | Phone Number   | Address                       |
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
| 1   | John Doe      | 29  | Engineer      | USA           | john.doe@example.com   | 555-1234       | 123 Elm St, Springfield, IL  |
| 2   | Jane Smith    | 34  | Doctor        | Canada        | jane.smith@example.com | 555-5678       | 456 Oak St, Toronto, ON      |
| 3   | Alice Johnson | 27  | Teacher       | UK            | alice.j@example.com    | 555-8765       | 789 Pine St, London, UK      |
| 4   | Bob Brown     | 45  | Artist        | Australia     | bob.b@example.com      | 555-4321       | 321 Maple St, Sydney, NSW    |
| 5   | Carol White   | 31  | Scientist     | New Zealand   | carol.w@example.com    | 555-6789       | 654 Birch St, Wellington, NZ |
| 6   | Dave Green    | 28  | Lawyer        | Ireland       | dave.g@example.com     | 555-3456       | 987 Cedar St, Dublin, IE     |
| 7   | Emma Black    | 40  | Musician      | USA           | emma.b@example.com     | 555-1111       | 246 Ash St, New York, NY     |
| 8   | Frank Blue    | 37  | Chef          | Canada        | frank.b@example.com    | 555-2222       | 135 Spruce St, Vancouver, BC |
| 9   | Grace Yellow  | 50  | Engineer      | UK            | grace.y@example.com    | 555-3333       | 864 Fir St, Manchester, UK   |
| 10  | Henry Violet  | 32  | Artist        | Australia     | henry.v@example.com    | 555-4444       | 753 Willow St, Melbourne, VIC|
| 11  | Irene Orange  | 26  | Scientist     | New Zealand   | irene.o@example.com    | 555-5555       | 912 Poplar St, Auckland, NZ  |
| 12  | Jack Indigo   | 38  | Teacher       | Ireland       | jack.i@example.com     | 555-6666       | 159 Elm St, Cork, IE         |
| 13  | Karen Red     | 41  | Lawyer        | USA           | karen.r@example.com    | 555-7777       | 357 Cedar St, Boston, MA     |
| 14  | Leo Brown     | 30  | Chef          | Canada        | leo.b@example.com      | 555-8888       | 246 Oak St, Calgary, AB      |
| 15  | Mia Green     | 33  | Musician      | UK            | mia.g@example.com      | 555-9999       | 975 Pine St, Edinburgh, UK   |
| 16  | Noah Yellow   | 29  | Doctor        | Australia     | noah.y@example.com     | 555-0000       | 864 Birch St, Brisbane, QLD  |
| 17  | Olivia Blue   | 35  | Engineer      | New Zealand   | olivia.b@example.com   | 555-1212       | 753 Maple St, Hamilton, NZ   |
| 18  | Peter Black   | 42  | Artist        | Ireland       | peter.b@example.com    | 555-3434       | 912 Fir St, Limerick, IE     |
| 19  | Quinn White   | 28  | Scientist     | USA           | quinn.w@example.com    | 555-5656       | 159 Willow St, Seattle, WA   |
| 20  | Rachel Red    | 31  | Teacher       | Canada        | rachel.r@example.com   | 555-7878       | 357 Poplar St, Ottawa, ON    |
| 21  | Steve Green   | 44  | Lawyer        | UK            | steve.g@example.com    | 555-9090       | 753 Elm St, Birmingham, UK   |
| 22  | Tina Blue     | 36  | Musician      | Australia     | tina.b@example.com     | 555-1213       | 864 Cedar St, Perth, WA      |
| 23  | Umar Black    | 39  | Chef          | New Zealand   | umar.b@example.com     | 555-3435       | 975 Spruce St, Christchurch, NZ|
| 24  | Victor Yellow | 43  | Engineer      | Ireland       | victor.y@example.com   | 555-5657       | 246 Willow St, Galway, IE    |
| 25  | Wendy Orange  | 27  | Artist        | USA           | wendy.o@example.com    | 555-7879       | 135 Elm St, Denver, CO       |
| 26  | Xavier Green  | 34  | Scientist     | Canada        | xavier.g@example.com   | 555-9091       | 357 Oak St, Montreal, QC     |
| 27  | Yara Red      | 41  | Teacher       | UK            | yara.r@example.com     | 555-1214       | 975 Pine St, Leeds, UK       |
| 28  | Zack Blue     | 30  | Lawyer        | Australia     | zack.b@example.com     | 555-3436       | 135 Birch St, Adelaide, SA   |
| 29  | Amy White     | 33  | Musician      | New Zealand   | amy.w@example.com      | 555-5658       | 159 Maple St, Wellington, NZ |
| 30  | Ben Black     | 38  | Chef          | Ireland       | ben.b@example.com      | 555-7870       | 246 Fir St, Waterford, IE    |
"""
)

In [40]:
def get_generation_time(llm, sampling_params, prompts):
    """Run ``llm.generate`` once and print the first completion and elapsed time.

    Args:
        llm: Object exposing ``generate(prompts, sampling_params=...)``.
        sampling_params: Passed through unchanged to ``llm.generate``.
        prompts: Passed through unchanged to ``llm.generate``.

    Returns:
        None. The completion text and the timing are printed, not returned.
    """
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be skewed by wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    output = llm.generate(prompts, sampling_params=sampling_params)
    elapsed = time.perf_counter() - start_time

    print("-" * 30)
    # Only the first completion of the first returned request is shown.
    print(f"Output: {output[0].outputs[0].text}")
    print(f"Generation time: {elapsed} seconds.")
    print("-" * 30)

In [41]:
# Load the model with prefix caching switched on and use greedy decoding.
llm = LLM(model="lmsys/longchat-13b-16k", enable_prefix_caching=True)
sampling_params = SamplingParams(temperature=0, max_tokens=100)

# Both prompts begin with the same LONG_PROMPT table; only the trailing
# question differs. They are issued one after another so the two printed
# generation times can be compared.
questions = (
    # first query
    "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
    # second query
    "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
)
for question in questions:
    get_generation_time(llm, sampling_params, LONG_PROMPT + question)

INFO 09-05 20:05:03 [config.py:853] This model supports multiple tasks: {'score', 'classify', 'embed', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 09-05 20:05:03 [config.py:1467] Using max model len 16384
INFO 09-05 20:05:03 [config.py:2267] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 09-05 20:05:03 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.
INFO 09-05 20:05:06 [__init__.py:244] Automatically detected platform rocm.
INFO 09-05 20:05:15 [core.py:459] Waiting for init message from front-end.
INFO 09-05 20:05:15 [core.py:69] Initializing a V1 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='lmsys/longchat-13b-16k', speculative_config=None, tokenizer='lmsys/longchat-13b-16k', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=16384, download_dir=None, load_format=LoadFo

Loading pt checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]
Loading pt checkpoint shards:  33% Completed | 1/3 [00:05<00:10,  5.03s/it]
Loading pt checkpoint shards:  67% Completed | 2/3 [00:08<00:03,  3.89s/it]
Loading pt checkpoint shards: 100% Completed | 3/3 [00:13<00:00,  4.38s/it]
Loading pt checkpoint shards: 100% Completed | 3/3 [00:13<00:00,  4.36s/it]



INFO 09-05 20:05:29 [default_loader.py:272] Loading weights took 13.09 seconds
INFO 09-05 20:05:29 [gpu_model_runner.py:1782] Model loading took 24.3262 GiB and 13.505399 seconds
INFO 09-05 20:05:35 [backends.py:509] Using cache directory: /root/.cache/vllm/torch_compile_cache/a3ae9284dc/rank_0_0/backbone for vLLM's torch.compile
INFO 09-05 20:05:35 [backends.py:520] Dynamo bytecode transform time: 5.73 s
INFO 09-05 20:05:38 [backends.py:155] Directly load the compiled graph(s) for shape None from the cache, took 0.368 s
INFO 09-05 20:05:39 [monitor.py:34] torch.compile takes 5.73 s in total
INFO 09-05 20:05:52 [gpu_worker.py:232] Available KV cache memory: 144.97 GiB
INFO 09-05 20:05:52 [kv_cache_utils.py:716] GPU KV cache size: 190,000 tokens
INFO 09-05 20:05:52 [kv_cache_utils.py:720] Maximum concurrency for 16,384 tokens per request: 11.60x
INFO 09-05 20:05:52 [rocm.py:224] Using Triton Attention backend on V1 engine.


Capturing CUDA graphs: 100%|██████████| 67/67 [00:09<00:00,  6.90it/s]
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


INFO 09-05 20:06:02 [gpu_model_runner.py:2306] Graph capturing finished in 10 secs, took 0.29 GiB
INFO 09-05 20:06:02 [core.py:172] init engine (profile, create kv cache, warmup model) took 32.13 seconds


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0% 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

------------------------------
Output: 29.
Generation time: 0.32152247428894043 seconds.
------------------------------


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0% 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

------------------------------
Output: 30.
Generation time: 0.05277061462402344 seconds.
------------------------------


In [42]:
# Drop the notebook-level reference to the model. This is done inline (not via
# a helper) because the name being deleted lives in the notebook namespace.
try:
    del llm
except Exception as unload_error:
    # Reached e.g. when `llm` is not defined because this cell was re-run.
    print(f"Failed to unload model: {unload_error}")
finally:
    # These run whether or not the delete succeeded.
    gc.collect()
    torch.cuda.empty_cache()
    ray.shutdown()



### Context Extension

In [43]:
def create_llm(
    model="Qwen/Qwen3-0.6B",
    rope_theta=1000000,
    original_max_position_embeddings=32768,
    factor=4.0,
):
    """Build an ``LLM`` whose HF config is overridden to use YaRN rope scaling.

    Called with no arguments, this behaves exactly as before: the model is
    ``Qwen/Qwen3-0.6B`` and the overridden ``max_model_len`` is
    ``int(32768 * 4.0) == 131072``.

    Args:
        model: Model name passed to ``LLM(model=...)``.
        rope_theta: Value written to the ``rope_theta`` override.
        original_max_position_embeddings: Value written to
            ``rope_scaling["original_max_position_embeddings"]``; also the
            base of the ``max_model_len`` override.
        factor: Value written to ``rope_scaling["factor"]``; also the
            multiplier of the ``max_model_len`` override.

    Returns:
        The constructed ``LLM`` instance.
    """
    # The overridden context length is the original window scaled by `factor`.
    extended_max_model_len = int(original_max_position_embeddings * factor)

    hf_overrides = {
        "rope_theta": rope_theta,
        "rope_scaling": {
            "rope_type": "yarn",
            "factor": factor,
            "original_max_position_embeddings": original_max_position_embeddings,
        },
        "max_model_len": extended_max_model_len,
    }

    return LLM(model=model, hf_overrides=hf_overrides)

In [44]:
def run_llm_chat(llm):
    """Send a fixed three-message conversation through ``llm.chat``.

    Sampling is configured with ``temperature=0.8``, ``top_p=0.95`` and
    ``max_tokens=128``; the progress bar is disabled via ``use_tqdm=False``.

    Returns:
        Whatever ``llm.chat`` returns for that conversation.
    """
    # NOTE(review): the conversation ends on an assistant message - confirm
    # that this is the intended prompt shape.
    messages = [
        dict(role="system", content="You are a helpful assistant"),
        dict(role="user", content="Hello"),
        dict(role="assistant", content="Hello! How can I assist you today?"),
    ]
    chat_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=128)
    return llm.chat(messages, chat_params, use_tqdm=False)

In [45]:
def print_outputs(outputs):
    """Print the prompt and first generated text of every item in ``outputs``.

    A banner and an 80-dash rule are printed first; each item is then followed
    by another rule. Both values are shown via ``repr``.

    Args:
        outputs: Iterable of objects with a ``prompt`` attribute and an
            ``outputs`` sequence whose first element has a ``text`` attribute.
    """
    rule = "-" * 80
    print(f"\nGenerated Outputs:\n{rule}")
    for request_output in outputs:
        # Read both fields up front so nothing is printed for a malformed item.
        prompt, generated_text = request_output.prompt, request_output.outputs[0].text
        print(f"Prompt: {prompt!r}\n")
        print(f"Generated text: {generated_text!r}")
        print(rule)

In [46]:
# Build the context-extended model, run the sample chat once, and print the result.
# `llm` is kept as a notebook-level name so the cleanup cell below can delete it.
llm = create_llm()
outputs = run_llm_chat(llm)
print_outputs(outputs)

INFO 09-05 20:07:43 [config.py:853] This model supports multiple tasks: {'score', 'classify', 'embed', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 09-05 20:07:43 [config.py:1467] Using max model len 131072
INFO 09-05 20:07:43 [config.py:2267] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 09-05 20:07:43 [config.py:4566] full_cuda_graph is not supported with cascade attention. Disabling cascade attention.
INFO 09-05 20:07:46 [__init__.py:244] Automatically detected platform rocm.
INFO 09-05 20:07:55 [core.py:459] Waiting for init message from front-end.
INFO 09-05 20:07:55 [core.py:69] Initializing a V1 LLM engine (v0.9.2.dev364+gb432b7a28) with config: model='Qwen/Qwen3-0.6B', speculative_config=None, tokenizer='Qwen/Qwen3-0.6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, 

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.17it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.17it/s]



INFO 09-05 20:07:57 [default_loader.py:272] Loading weights took 0.50 seconds
INFO 09-05 20:07:57 [gpu_model_runner.py:1782] Model loading took 1.2324 GiB and 1.000104 seconds
INFO 09-05 20:08:02 [backends.py:509] Using cache directory: /root/.cache/vllm/torch_compile_cache/3096bb6dc7/rank_0_0/backbone for vLLM's torch.compile
INFO 09-05 20:08:02 [backends.py:520] Dynamo bytecode transform time: 4.92 s
INFO 09-05 20:08:05 [backends.py:155] Directly load the compiled graph(s) for shape None from the cache, took 0.702 s
INFO 09-05 20:08:05 [monitor.py:34] torch.compile takes 4.92 s in total
INFO 09-05 20:08:18 [gpu_worker.py:232] Available KV cache memory: 164.94 GiB
INFO 09-05 20:08:18 [kv_cache_utils.py:716] GPU KV cache size: 1,544,160 tokens
INFO 09-05 20:08:18 [kv_cache_utils.py:720] Maximum concurrency for 131,072 tokens per request: 11.78x
INFO 09-05 20:08:18 [rocm.py:224] Using Triton Attention backend on V1 engine.


Capturing CUDA graphs: 100%|██████████| 67/67 [00:08<00:00,  7.94it/s]


INFO 09-05 20:08:26 [gpu_model_runner.py:2306] Graph capturing finished in 8 secs, took 0.20 GiB
INFO 09-05 20:08:26 [core.py:172] init engine (profile, create kv cache, warmup model) took 28.95 seconds

Generated Outputs:
--------------------------------------------------------------------------------
Prompt: None

Generated text: '<think>\nOkay, the user just said "Hello." I need to respond appropriately. Since they\'re asking hello, my job is to acknowledge it. I should make sure to keep the response friendly and helpful. Maybe add a smiley to keep it warm. Let me check if there\'s any context I\'m missing. No, there\'s nothing given. So I\'ll go with a simple greeting and offer assistance. Let me make sure the response is clear and matches the tone. Alright, time to send it.\n'
--------------------------------------------------------------------------------


In [50]:
# Drop the notebook-level reference to the context-extended model. This stays
# inline because the name being deleted lives in the notebook namespace.
try:
    del llm
except Exception as unload_error:
    # Reached e.g. when `llm` is not defined because this cell was re-run.
    print(f"Failed to unload model: {unload_error}")
finally:
    # These run whether or not the delete succeeded.
    gc.collect()
    torch.cuda.empty_cache()
    ray.shutdown()

