In [None]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

In [None]:
import torch
from vllm import LLM, SamplingParams
from vllm.attention.backends.abstract import AttentionType, SharedSelfAttentionType

  from .autonotebook import tqdm as notebook_tqdm
2024-10-10 15:14:49,423	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


### Tests

In [None]:
# enforce xformers attention backend - changes are made here
os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS_CLA"

```python
# Inside LlamaAttention.forward
# Change q to fixed input for testing.
if self.cache_config.debug_kv_sharing: q = torch.ones_like(q)
attn_output = self.attn(q, k, v, kv_cache, attn_metadata, compute_new_kv=compute_new_kv_map)
if self.cache_config.debug_kv_sharing: self.attn_outputs.append(attn_output)
```

### TEST 1

In [None]:
### test 1: No prefix caching, no quantization, v2 block manager, no CUDA graphs ###
ENABLE_PREFIX_CACHING = False
KV_CACHE_DTYPE = "auto"
USE_V2_BLOCK_MANAGER = True
ENFORCE_EAGER = True  # eager mode, i.e. no CUDA graph capture
DEBUG_KV_SHARING = True  # record per-layer attention outputs and metadata for the assertions below


# This is the base LLM generator with KV cache sharing, where layer 1 gets its KV cache from layer 0.
test_llm_generator = LLM(model="/home/k/models/dummy-llama", 
                         tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct", 
                         enforce_eager=ENFORCE_EAGER, 
                         kv_cache_dtype=KV_CACHE_DTYPE,
                         enable_prefix_caching=ENABLE_PREFIX_CACHING, 
                         use_v2_block_manager=USE_V2_BLOCK_MANAGER,
                         tensor_parallel_size=1, 
                         dtype="bfloat16", 
                         max_model_len=1024*1, 
                         kv_cache_map={0:0, 1:0},  # layers 0 and 1 both use layer 0's KV cache
                         gpu_memory_utilization=0.2,
                         debug_kv_sharing=DEBUG_KV_SHARING)

In [None]:
model = test_llm_generator.llm_engine.model_executor.driver_worker.model_runner.model
worker = test_llm_generator.llm_engine.model_executor.driver_worker

In [None]:
# Debugging is on.
assert model.model.cache_config.debug_kv_sharing

In [None]:
def reset_attn_outputs_and_metadatas(model, layer_indices=(0, 1)):
    """Clear the debug buffers recorded by earlier forward passes (e.g. warmup).

    Args:
        model: Object whose ``model.model`` exposes ``layers`` (each with a
            ``self_attn.attn_outputs`` list) and an ``attn_metadatas`` list.
        layer_indices: Indices of the layers whose recorded attention outputs
            are cleared. Defaults to ``(0, 1)``, the two layers that share a
            KV cache in this notebook's tests.
    """
    inner_model = model.model
    for layer_idx in layer_indices:
        # Rebind to a fresh list instead of clearing in place, so a list
        # captured before the reset keeps its contents.
        inner_model.layers[layer_idx].self_attn.attn_outputs = []
    inner_model.attn_metadatas = []

In [None]:
# Just prefill.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=1, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_output_layer0 = model.model.layers[0].self_attn.attn_outputs[0]
attn_output_layer1 = model.model.layers[1].self_attn.attn_outputs[0]
assert torch.equal(attn_output_layer0, attn_output_layer1)
# 1 attention metadata for a single model forward pass
assert len(model.model.attn_metadatas) == 1 
attn_metadata = model.model.attn_metadatas[0]
# No decode request with max_tokens=1.
assert attn_metadata.decode_metadata is None
# Layer 0 creates new KV and layer 1 reuses KV from cache during prefill.
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected

Processed prompts: 100%|██████████████████████████████████████████████| 2/2 [00:00<00:00, 262.64it/s, est. speed input: 799.52 toks/s, output: 266.04 toks/s]


In [None]:
# 3 tokens decode.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=3, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_outputs_layer0 = model.model.layers[0].self_attn.attn_outputs
attn_outputs_layer1 = model.model.layers[1].self_attn.attn_outputs
# zip() stops at the shorter list and yields nothing for empty lists, so make sure
# both layers recorded the same, non-zero number of outputs before comparing them.
assert len(attn_outputs_layer0) > 0
assert len(attn_outputs_layer0) == len(attn_outputs_layer1)
for attn_output_layer0, attn_output_layer1 in zip(attn_outputs_layer0, attn_outputs_layer1):
    assert torch.equal(attn_output_layer0, attn_output_layer1)

# 3 attention metadatas for 3 model forward passes (1 prefill + 2 decode).
assert len(model.model.attn_metadatas) == 3
# Prefill phase.
attn_metadata = model.model.attn_metadatas[0]
assert attn_metadata.decode_metadata is None
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected
# Decode phases 1 and 2.
expected = [SharedSelfAttentionType.DECODE_KV_NEW.name, SharedSelfAttentionType.DECODE_KV_SHARED.name] # [layer 0, layer 1]
for attn_metadata in model.model.attn_metadatas[1:]:
    assert attn_metadata.prefill_metadata is None
    assert attn_metadata.decode_metadata.shared_self_attention_types == expected

Processed prompts: 100%|██████████████████████████████████████████████| 2/2 [00:00<00:00, 124.73it/s, est. speed input: 376.85 toks/s, output: 376.54 toks/s]


### TEST 2 (fp8)

```python
if kv_cache_dtype in ("fp8", "fp8_e4m3"):
    target_dtype = torch.float8_e4m3fn
elif kv_cache_dtype == "fp8_e5m2":
    target_dtype = torch.float8_e5m2
```

Quantization call stack:
- PagedAttention.write_to_paged_cache
- _custom_ops.reshape_and_cache
- torch.ops._C_cache_ops.reshape_and_cache
- cache_kernels.cu/reshape_and_cache
- CALL_RESHAPE_AND_CACHE (macro)
- reshape_and_cache_kernel
- fp8::scaled_convert
- scaled_vec_conversion

In [None]:
### test 2: No prefix caching, fp8 KV-cache quantization, v2 block manager, no CUDA graphs ###
ENABLE_PREFIX_CACHING = False
KV_CACHE_DTYPE = "fp8"  # store the KV cache in fp8
USE_V2_BLOCK_MANAGER = True
ENFORCE_EAGER = True  # eager mode, i.e. no CUDA graph capture
DEBUG_KV_SHARING = True  # record per-layer attention outputs and metadata for the assertions below


# This is the base LLM generator with KV cache sharing, where layer 1 gets its KV cache from layer 0.
test_llm_generator = LLM(model="/home/k/models/dummy-llama", 
                         tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct", 
                         enforce_eager=ENFORCE_EAGER, 
                         kv_cache_dtype=KV_CACHE_DTYPE,
                         enable_prefix_caching=ENABLE_PREFIX_CACHING, 
                         use_v2_block_manager=USE_V2_BLOCK_MANAGER,
                         tensor_parallel_size=1, 
                         dtype="bfloat16",
                         max_model_len=1024*1, 
                         kv_cache_map={0:0, 1:0},  # layers 0 and 1 both use layer 0's KV cache
                         gpu_memory_utilization=0.2, 
                         debug_kv_sharing=DEBUG_KV_SHARING)

INFO 10-10 15:15:14 config.py:629] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor
INFO 10-10 15:15:14 llm_engine.py:237] Initializing an LLM engine (v0.1.dev2968+g2fa3c83) with config: model='/home/k/models/dummy-llama', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=fp8, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_trace

Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/k/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/.no_exist/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer.model'
ERROR:huggingface_hub.file_download:Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/k/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/.no_exist/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer.model'


INFO 10-10 15:15:15 selector.py:122] Using XFormers CLA backend.


  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 10-10 15:15:16 model_runner.py:1049] Starting to load model /home/k/models/dummy-llama...
INFO 10-10 15:15:16 selector.py:122] Using XFormers CLA backend.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.31it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.31it/s]



INFO 10-10 15:15:17 model_runner.py:1060] Loading model weights took 2.8008 GB
INFO 10-10 15:15:18 gpu_executor.py:122] # GPU blocks: 11874, # CPU blocks: 131072


In [None]:
model = test_llm_generator.llm_engine.model_executor.driver_worker.model_runner.model
worker = test_llm_generator.llm_engine.model_executor.driver_worker

In [None]:
# Debugging is on.
assert model.model.cache_config.debug_kv_sharing

In [None]:
def reset_attn_outputs_and_metadatas(model, layer_indices=(0, 1)):
    """Clear the debug buffers recorded by earlier forward passes (e.g. warmup).

    Args:
        model: Object whose ``model.model`` exposes ``layers`` (each with a
            ``self_attn.attn_outputs`` list) and an ``attn_metadatas`` list.
        layer_indices: Indices of the layers whose recorded attention outputs
            are cleared. Defaults to ``(0, 1)``, the two layers that share a
            KV cache in this notebook's tests.
    """
    inner_model = model.model
    for layer_idx in layer_indices:
        # Rebind to a fresh list instead of clearing in place, so a list
        # captured before the reset keeps its contents.
        inner_model.layers[layer_idx].self_attn.attn_outputs = []
    inner_model.attn_metadatas = []

In [None]:
# Just prefill.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=1, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_output_layer0 = model.model.layers[0].self_attn.attn_outputs[0]
attn_output_layer1 = model.model.layers[1].self_attn.attn_outputs[0]
assert torch.allclose(attn_output_layer0, attn_output_layer1, atol=1e-2, rtol=1e-2)
# 1 attention metadata for a single model forward pass
assert len(model.model.attn_metadatas) == 1 
attn_metadata = model.model.attn_metadatas[0]
# No decode request with max_tokens=1.
assert attn_metadata.decode_metadata is None
# Layer 0 creates new KV and layer 1 reuses KV from cache during prefill.
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected

Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 128.44it/s, est. speed input: 388.09 toks/s, output: 129.25 toks/s]


In [None]:
# 3 tokens decode.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=3, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be close, Q is fixed to 1s during debug mode.
# A tolerance is used instead of exact equality because the KV cache is stored in fp8 in this test.
attn_outputs_layer0 = model.model.layers[0].self_attn.attn_outputs
attn_outputs_layer1 = model.model.layers[1].self_attn.attn_outputs
# zip() stops at the shorter list and yields nothing for empty lists, so make sure
# both layers recorded the same, non-zero number of outputs before comparing them.
assert len(attn_outputs_layer0) > 0
assert len(attn_outputs_layer0) == len(attn_outputs_layer1)
for attn_output_layer0, attn_output_layer1 in zip(attn_outputs_layer0, attn_outputs_layer1):
    assert torch.allclose(attn_output_layer0, attn_output_layer1, atol=1e-2, rtol=1e-2)

# 3 attention metadatas for 3 model forward passes (1 prefill + 2 decode).
assert len(model.model.attn_metadatas) == 3
# Prefill phase.
attn_metadata = model.model.attn_metadatas[0]
assert attn_metadata.decode_metadata is None
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected
# Decode phases 1 and 2.
expected = [SharedSelfAttentionType.DECODE_KV_NEW.name, SharedSelfAttentionType.DECODE_KV_SHARED.name] # [layer 0, layer 1]
for attn_metadata in model.model.attn_metadatas[1:]:
    assert attn_metadata.prefill_metadata is None
    assert attn_metadata.decode_metadata.shared_self_attention_types == expected

Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 120.05it/s, est. speed input: 362.65 toks/s, output: 362.37 toks/s]


### TEST 3 (cuda graphs)

In [None]:
### test 3: No prefix caching, no quantization, v2 block manager, CUDA graph mode ###
ENABLE_PREFIX_CACHING = False
KV_CACHE_DTYPE = "auto"
USE_V2_BLOCK_MANAGER = True
ENFORCE_EAGER = False  # eager mode off, so the model is captured for CUDA graphs
DEBUG_KV_SHARING = True  # record per-layer attention outputs and metadata for the assertions below


# This is the base LLM generator with KV cache sharing, where layer 1 gets its KV cache from layer 0.
test_llm_generator = LLM(model="/home/k/models/dummy-llama", 
                         tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct", 
                         enforce_eager=ENFORCE_EAGER, 
                         kv_cache_dtype=KV_CACHE_DTYPE,
                         enable_prefix_caching=ENABLE_PREFIX_CACHING, 
                         use_v2_block_manager=USE_V2_BLOCK_MANAGER,
                         tensor_parallel_size=1, 
                         dtype="bfloat16", 
                         max_model_len=1024*1, 
                         kv_cache_map={0:0, 1:0},  # layers 0 and 1 both use layer 0's KV cache
                         gpu_memory_utilization=0.2,
                         debug_kv_sharing=DEBUG_KV_SHARING)

INFO 10-09 16:44:55 llm_engine.py:237] Initializing an LLM engine (v0.1.dev2968+g2fa3c83) with config: model='/home/k/models/dummy-llama', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/home/k/models/dummy-llama, use_v2_block_manager=True, num_scheduler_steps=1, chunked_prefill_

Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/k/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/.no_exist/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer.model'
ERROR:huggingface_hub.file_download:Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/k/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/.no_exist/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer.model'


INFO 10-09 16:44:57 model_runner.py:1049] Starting to load model /home/k/models/dummy-llama...


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.22it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.21it/s]



INFO 10-09 16:44:58 model_runner.py:1060] Loading model weights took 2.7696 GB
INFO 10-09 16:44:58 gpu_executor.py:122] # GPU blocks: 8049, # CPU blocks: 65536
INFO 10-09 16:44:58 model_runner.py:1383] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 10-09 16:44:58 model_runner.py:1387] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 10-09 16:45:31 model_runner.py:1511] Graph capturing finished in 33 secs.


In [None]:
model = test_llm_generator.llm_engine.model_executor.driver_worker.model_runner.model
worker = test_llm_generator.llm_engine.model_executor.driver_worker

In [None]:
# Debugging is on.
assert model.model.cache_config.debug_kv_sharing

In [None]:
def reset_attn_outputs_and_metadatas(model, layer_indices=(0, 1)):
    """Clear the debug buffers recorded by earlier forward passes (e.g. warmup).

    Args:
        model: Object whose ``model.model`` exposes ``layers`` (each with a
            ``self_attn.attn_outputs`` list) and an ``attn_metadatas`` list.
        layer_indices: Indices of the layers whose recorded attention outputs
            are cleared. Defaults to ``(0, 1)``, the two layers that share a
            KV cache in this notebook's tests.
    """
    inner_model = model.model
    for layer_idx in layer_indices:
        # Rebind to a fresh list instead of clearing in place, so a list
        # captured before the reset keeps its contents.
        inner_model.layers[layer_idx].self_attn.attn_outputs = []
    inner_model.attn_metadatas = []

In [None]:
# Just prefill.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=1, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_output_layer0 = model.model.layers[0].self_attn.attn_outputs[0]
attn_output_layer1 = model.model.layers[1].self_attn.attn_outputs[0]
assert torch.equal(attn_output_layer0, attn_output_layer1)
# 1 attention metadata for a single model forward pass
assert len(model.model.attn_metadatas) == 1 
attn_metadata = model.model.attn_metadatas[0]
# No decode request with max_tokens=1.
assert attn_metadata.decode_metadata is None
# Layer 0 creates new KV and layer 1 reuses KV from cache during prefill.
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected

Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 163.54it/s, est. speed input: 495.73 toks/s, output: 165.06 toks/s]


In [None]:
# 3 tokens decode.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=3, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_outputs_layer0 = model.model.layers[0].self_attn.attn_outputs
attn_outputs_layer1 = model.model.layers[1].self_attn.attn_outputs
# zip() stops at the shorter list and yields nothing for empty lists, so make sure
# both layers recorded the same, non-zero number of outputs before comparing them.
# This matters most here: per the note below, Python-side state is not updated
# during CUDA-graph replay, so fewer outputs than forward passes may be recorded.
assert len(attn_outputs_layer0) > 0
assert len(attn_outputs_layer0) == len(attn_outputs_layer1)
for attn_output_layer0, attn_output_layer1 in zip(attn_outputs_layer0, attn_outputs_layer1):
    assert torch.equal(attn_output_layer0, attn_output_layer1)


#     NOTE: Any python object stored here is not updated when it is
#     cuda-graph replayed. If you have values that need to be changed
#     dynamically, it should be stored in tensor. The tensor has to be
#     updated from `CUDAGraphRunner.forward` API.

# The per-step metadata assertions used in the other tests are kept disabled here,
# presumably because of the note above (TODO confirm).
# # 3 attention metadata for a 3 model forward pass (1 prefill + 2 decode)
# assert len(model.model.attn_metadatas) == 3 
# # Prefill phase.
# attn_metadata = model.model.attn_metadatas[0]
# assert attn_metadata.decode_metadata is None
# expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
# assert attn_metadata.prefill_metadata.shared_self_attention_types == expected
# # Decode phase 1.
# attn_metadata = model.model.attn_metadatas[1]
# assert attn_metadata.prefill_metadata is None
# expected = [SharedSelfAttentionType.DECODE_KV_NEW.name, SharedSelfAttentionType.DECODE_KV_SHARED.name] # [layer 0, layer 1]
# assert attn_metadata.decode_metadata.shared_self_attention_types == expected
# # Decode phase 2.
# attn_metadata = model.model.attn_metadatas[2]
# assert attn_metadata.prefill_metadata is None
# expected = [SharedSelfAttentionType.DECODE_KV_NEW.name, SharedSelfAttentionType.DECODE_KV_SHARED.name] # [layer 0, layer 1]
# assert attn_metadata.decode_metadata.shared_self_attention_types == expected

Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 79.58it/s, est. speed input: 239.93 toks/s, output: 239.80 toks/s]


### TEST 4 (block manager v1)

In [None]:
### test 4: No prefix caching, no quantization, v1 block manager, no CUDA graphs ###
ENABLE_PREFIX_CACHING = False
KV_CACHE_DTYPE = "auto"
USE_V2_BLOCK_MANAGER = False  # the only test in this notebook that runs the v1 block manager
ENFORCE_EAGER = True  # eager mode, i.e. no CUDA graph capture
DEBUG_KV_SHARING = True  # record per-layer attention outputs and metadata for the assertions below


# This is the base LLM generator with KV cache sharing, where layer 1 gets its KV cache from layer 0.
test_llm_generator = LLM(model="/home/k/models/dummy-llama", 
                         tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct", 
                         enforce_eager=ENFORCE_EAGER, 
                         kv_cache_dtype=KV_CACHE_DTYPE,
                         enable_prefix_caching=ENABLE_PREFIX_CACHING, 
                         use_v2_block_manager=USE_V2_BLOCK_MANAGER,
                         tensor_parallel_size=1, 
                         dtype="bfloat16", 
                         max_model_len=1024*1,
                         kv_cache_map={0:0, 1:0},  # layers 0 and 1 both use layer 0's KV cache
                         gpu_memory_utilization=0.2,
                         debug_kv_sharing=DEBUG_KV_SHARING)

INFO 10-09 16:47:27 llm_engine.py:237] Initializing an LLM engine (v0.1.dev2968+g2fa3c83) with config: model='/home/k/models/dummy-llama', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/home/k/models/dummy-llama, use_v2_block_manager=False, num_scheduler_steps=1, chunked_prefill_

Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/k/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/.no_exist/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer.model'
ERROR:huggingface_hub.file_download:Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/k/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/.no_exist/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer.model'


INFO 10-09 16:47:29 model_runner.py:1049] Starting to load model /home/k/models/dummy-llama...


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.29it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.28it/s]



INFO 10-09 16:47:29 model_runner.py:1060] Loading model weights took 2.7696 GB
INFO 10-09 16:47:30 gpu_executor.py:122] # GPU blocks: 8049, # CPU blocks: 65536


In [None]:
model = test_llm_generator.llm_engine.model_executor.driver_worker.model_runner.model
worker = test_llm_generator.llm_engine.model_executor.driver_worker

In [None]:
# Debugging is on.
assert model.model.cache_config.debug_kv_sharing

In [None]:
def reset_attn_outputs_and_metadatas(model, layer_indices=(0, 1)):
    """Clear the debug buffers recorded by earlier forward passes (e.g. warmup).

    Args:
        model: Object whose ``model.model`` exposes ``layers`` (each with a
            ``self_attn.attn_outputs`` list) and an ``attn_metadatas`` list.
        layer_indices: Indices of the layers whose recorded attention outputs
            are cleared. Defaults to ``(0, 1)``, the two layers that share a
            KV cache in this notebook's tests.
    """
    inner_model = model.model
    for layer_idx in layer_indices:
        # Rebind to a fresh list instead of clearing in place, so a list
        # captured before the reset keeps its contents.
        inner_model.layers[layer_idx].self_attn.attn_outputs = []
    inner_model.attn_metadatas = []

In [None]:
# Just prefill.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=1, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_output_layer0 = model.model.layers[0].self_attn.attn_outputs[0]
attn_output_layer1 = model.model.layers[1].self_attn.attn_outputs[0]
assert torch.equal(attn_output_layer0, attn_output_layer1)
# 1 attention metadata for a single model forward pass
assert len(model.model.attn_metadatas) == 1 
attn_metadata = model.model.attn_metadatas[0]
# No decode request with max_tokens=1.
assert attn_metadata.decode_metadata is None
# Layer 0 creates new KV and layer 1 reuses KV from cache during prefill.
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected

Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 121.16it/s, est. speed input: 366.24 toks/s, output: 121.98 toks/s]


In [None]:
# 3 tokens decode.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=3, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_outputs_layer0 = model.model.layers[0].self_attn.attn_outputs
attn_outputs_layer1 = model.model.layers[1].self_attn.attn_outputs
# zip() stops at the shorter list and yields nothing for empty lists, so make sure
# both layers recorded the same, non-zero number of outputs before comparing them.
assert len(attn_outputs_layer0) > 0
assert len(attn_outputs_layer0) == len(attn_outputs_layer1)
for attn_output_layer0, attn_output_layer1 in zip(attn_outputs_layer0, attn_outputs_layer1):
    assert torch.equal(attn_output_layer0, attn_output_layer1)

# 3 attention metadatas for 3 model forward passes (1 prefill + 2 decode).
assert len(model.model.attn_metadatas) == 3
# Prefill phase.
attn_metadata = model.model.attn_metadatas[0]
assert attn_metadata.decode_metadata is None
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected
# Decode phases 1 and 2.
expected = [SharedSelfAttentionType.DECODE_KV_NEW.name, SharedSelfAttentionType.DECODE_KV_SHARED.name] # [layer 0, layer 1]
for attn_metadata in model.model.attn_metadatas[1:]:
    assert attn_metadata.prefill_metadata is None
    assert attn_metadata.decode_metadata.shared_self_attention_types == expected

Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 113.05it/s, est. speed input: 342.08 toks/s, output: 341.75 toks/s]


### TEST 5 (prefix caching)

In [None]:
from vllm.core.block.interfaces import Device

In [None]:
### test 5: Prefix caching enabled, no quantization, v2 block manager, no CUDA graphs ###
ENABLE_PREFIX_CACHING = True
KV_CACHE_DTYPE = "auto"
USE_V2_BLOCK_MANAGER = True
ENFORCE_EAGER = True  # eager mode, i.e. no CUDA graph capture
DEBUG_KV_SHARING = True  # record per-layer attention outputs and metadata for the assertions below


# This is the base LLM generator with KV cache sharing, where layer 1 gets its KV cache from layer 0.
test_llm_generator = LLM(model="/home/k/models/dummy-llama", 
                         tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct", 
                         enforce_eager=ENFORCE_EAGER, 
                         kv_cache_dtype=KV_CACHE_DTYPE,
                         enable_prefix_caching=ENABLE_PREFIX_CACHING, 
                         use_v2_block_manager=USE_V2_BLOCK_MANAGER,
                         tensor_parallel_size=1,
                         dtype="bfloat16",
                         max_model_len=1024*1,
                         kv_cache_map={0:0, 1:0},  # layers 0 and 1 both use layer 0's KV cache
                         gpu_memory_utilization=0.2,
                         debug_kv_sharing=DEBUG_KV_SHARING)

INFO 10-09 18:24:18 llm_engine.py:237] Initializing an LLM engine (v0.1.dev2968+g2fa3c83) with config: model='/home/k/models/dummy-llama', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/home/k/models/dummy-llama, use_v2_block_manager=True, num_scheduler_steps=1, chunked_prefill_e

Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/k/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/.no_exist/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer.model'
ERROR:huggingface_hub.file_download:Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/home/k/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/.no_exist/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer.model'


INFO 10-09 18:24:19 model_runner.py:1049] Starting to load model /home/k/models/dummy-llama...


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.34it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.34it/s]



INFO 10-09 18:24:20 model_runner.py:1060] Loading model weights took 2.7696 GB
INFO 10-09 18:24:21 gpu_executor.py:122] # GPU blocks: 8305, # CPU blocks: 65536


In [None]:
model = test_llm_generator.llm_engine.model_executor.driver_worker.model_runner.model
worker = test_llm_generator.llm_engine.model_executor.driver_worker

In [None]:
scheduler = test_llm_generator.llm_engine.scheduler[0]
gpu_allocator = scheduler.block_manager.block_allocator._allocators[Device.GPU]
cpu_allocator = scheduler.block_manager.block_allocator._allocators[Device.CPU]
gpu_allocator._cached_blocks, cpu_allocator._cached_blocks

({}, {})

In [None]:
# Debugging is on.
assert model.model.cache_config.debug_kv_sharing

In [None]:
def reset_attn_outputs_and_metadatas(model, layer_indices=(0, 1)):
    """Clear the debug buffers recorded by earlier forward passes (e.g. warmup).

    Args:
        model: Object whose ``model.model`` exposes ``layers`` (each with a
            ``self_attn.attn_outputs`` list) and an ``attn_metadatas`` list.
        layer_indices: Indices of the layers whose recorded attention outputs
            are cleared. Defaults to ``(0, 1)``, the two layers that share a
            KV cache in this notebook's tests.
    """
    inner_model = model.model
    for layer_idx in layer_indices:
        # Rebind to a fresh list instead of clearing in place, so a list
        # captured before the reset keeps its contents.
        inner_model.layers[layer_idx].self_attn.attn_outputs = []
    inner_model.attn_metadatas = []

In [None]:
# Just prefill.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=1, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_output_layer0 = model.model.layers[0].self_attn.attn_outputs[0]
attn_output_layer1 = model.model.layers[1].self_attn.attn_outputs[0]
assert torch.equal(attn_output_layer0, attn_output_layer1)
# 1 attention metadata for a single model forward pass
assert len(model.model.attn_metadatas) == 1 
attn_metadata = model.model.attn_metadatas[0]
# No decode request with max_tokens=1.
assert attn_metadata.decode_metadata is None
# Layer 0 creates new KV and layer 1 reuses KV from cache during prefill.
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected

Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 123.85it/s, est. speed input: 374.75 toks/s, output: 124.81 toks/s]


In [None]:
scheduler = test_llm_generator.llm_engine.scheduler[0]
gpu_allocator = scheduler.block_manager.block_allocator._allocators[Device.GPU]
cpu_allocator = scheduler.block_manager.block_allocator._allocators[Device.CPU]
gpu_allocator._cached_blocks, cpu_allocator._cached_blocks

({}, {})

In [None]:
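# Re-run the same prompts (prefill + 2 decode steps, since max_tokens=3) to try to hit the prefix cache.
# NOTE(review): both prompts are far shorter than the 16-token block size, and the allocator's
# cached blocks and hit rate stay empty / 0.0 in the cells that follow, so this run presumably never
# exercises a prefix-cache hit. A prompt spanning at least one full block is likely needed - TODO confirm.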
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=3, ignore_eos=True))

Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 86.09it/s, est. speed input: 259.70 toks/s, output: 259.55 toks/s]


In [None]:
scheduler = test_llm_generator.llm_engine.scheduler[0]
gpu_allocator = scheduler.block_manager.block_allocator._allocators[Device.GPU]
cpu_allocator = scheduler.block_manager.block_allocator._allocators[Device.CPU]
gpu_allocator._cached_blocks, cpu_allocator._cached_blocks

({}, {})

In [None]:
scheduler.block_manager.block_allocator

<vllm.core.block.cpu_gpu_block_allocator.CpuGpuBlockAllocator>

In [None]:
gpu_allocator._block_size

16

In [None]:
len(gpu_allocator._block_pool._pool)

33220

In [None]:
gpu_allocator._block_pool._pool[0].token_ids

[128000, 15339, 1917, 0, 220, 12451]

In [None]:
gpu_allocator._block_pool._pool[1].token_ids

[128000, 15339, 12451, 37364]

In [None]:
gpu_allocator._block_pool._pool[2].token_ids

[]

In [None]:
scheduler.block_manager.block_allocator.get_prefix_cache_hit_rate(Device.GPU)

0.0