In [None]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

In [None]:
import torch
from vllm import LLM, SamplingParams
from vllm.attention.backends.abstract import AttentionType, SharedSelfAttentionType

  from .autonotebook import tqdm as notebook_tqdm
2024-10-08 19:58:36,896	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


### Tests

In [None]:
# enforce xformers attention backend - changes are made here
os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS"

```python
# Inside LlamaAttention.forward
# Change q to fixed input for testing.
if self.cache_config.debug_kv_sharing: q = torch.ones_like(q)
attn_output = self.attn(q, k, v, kv_cache, attn_metadata, compute_new_kv=compute_new_kv_map)
if self.cache_config.debug_kv_sharing: self.attn_outputs.append(attn_output)
```

### TEST 1

In [None]:
### test 1: No prefix caching, no quantization, use v1 block manager, no cuda graphs ###
ENABLE_PREFIX_CACHING = False
KV_CACHE_DTYPE = "auto"
USE_V2_BLOCK_MANAGER = True
ENFORCE_EAGER = True
DEBUG_KV_SHARING = True


# This the base llm generator with KV cache sharing, where layer 1 gets its KV cache from layer 0.
test_llm_generator = LLM(model="/workspace/models/dummy-llama", tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct", 
						 enforce_eager=ENFORCE_EAGER, kv_cache_dtype=KV_CACHE_DTYPE,
       					 enable_prefix_caching=ENABLE_PREFIX_CACHING, use_v2_block_manager=USE_V2_BLOCK_MANAGER,
						 tensor_parallel_size=1, dtype="bfloat16", max_model_len=1024*1, kv_cache_map={0:0, 1:0}, gpu_memory_utilization=0.2,
						 debug_kv_sharing=DEBUG_KV_SHARING)

INFO 10-08 19:58:43 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='/workspace/models/dummy-llama', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/workspace/models/dummy-llama, use_v2_block_manager=True, enable_prefix_caching=False)
INFO 10-08 19:58:43 selector.py:116] Using XForme

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 10-08 19:58:44 model_runner.py:879] Starting to load model /workspace/models/dummy-llama...
INFO 10-08 19:58:44 selector.py:116] Using XFormers backend.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.26it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.25it/s]



INFO 10-08 19:58:45 model_runner.py:890] Loading model weights took 2.8008 GB
INFO 10-08 19:58:46 gpu_executor.py:121] # GPU blocks: 5937, # CPU blocks: 65536


In [None]:
model = test_llm_generator.llm_engine.model_executor.driver_worker.model_runner.model
worker = test_llm_generator.llm_engine.model_executor.driver_worker

In [None]:
# Debugging is on.
assert model.model.cache_config.debug_kv_sharing

In [None]:
def reset_attn_outputs_and_metadatas(model):
	# Reset attn outputs and attn metadatas from warmup.
	if len(model.model.layers[0].self_attn.attn_outputs) > 0:
		model.model.layers[0].self_attn.attn_outputs = []
	if len(model.model.layers[1].self_attn.attn_outputs) > 0:
		model.model.layers[1].self_attn.attn_outputs = []
	if len(model.model.attn_metadatas) > 0:
		model.model.attn_metadatas = []

In [None]:
# Just prefill.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=1, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_output_layer0 = model.model.layers[0].self_attn.attn_outputs[0]
attn_output_layer1 = model.model.layers[1].self_attn.attn_outputs[0]
assert torch.equal(attn_output_layer0, attn_output_layer1)
# 1 attention metadata for a single model forward pass
assert len(model.model.attn_metadatas) == 1 
attn_metadata = model.model.attn_metadatas[0]
# No decode request with max_tokens=1.
assert attn_metadata.decode_metadata is None
# Layer 0 creates new KV and layer 1 reuses KV from cache during prefill.
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  7.73it/s, est. speed input: 23.20 toks/s, output: 7.73 toks/s]


In [None]:
# 3 tokens decode.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=3, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
for attn_output_layer0, attn_output_layer1 in zip(model.model.layers[0].self_attn.attn_outputs, 
                                                  model.model.layers[1].self_attn.attn_outputs):
    assert torch.equal(attn_output_layer0, attn_output_layer1)

# 3 attention metadata for a 3 model forward pass (1 prefill + 2 decode)
assert len(model.model.attn_metadatas) == 3 
# Prefill phase.
attn_metadata = model.model.attn_metadatas[0]
assert attn_metadata.decode_metadata is None
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected
# Decode phase 1.
attn_metadata = model.model.attn_metadatas[1]
assert attn_metadata.prefill_metadata is None
expected = [SharedSelfAttentionType.DECODE_KV_NEW.name, SharedSelfAttentionType.DECODE_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.decode_metadata.shared_self_attention_types == expected
# Decode phase 2.
attn_metadata = model.model.attn_metadatas[2]
assert attn_metadata.prefill_metadata is None
expected = [SharedSelfAttentionType.DECODE_KV_NEW.name, SharedSelfAttentionType.DECODE_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.decode_metadata.shared_self_attention_types == expected

Processed prompts: 100%|██████████| 2/2 [00:00<00:00, 17.83it/s, est. speed input: 53.53 toks/s, output: 53.52 toks/s]


### TEST 2 (fp8)

In [None]:
### test 1: No prefix caching, fp8 quantization, use v1 block manager, no cuda graphs ###
ENABLE_PREFIX_CACHING = False
KV_CACHE_DTYPE = "fp8"
USE_V2_BLOCK_MANAGER = True
ENFORCE_EAGER = True
DEBUG_KV_SHARING = True


# This the base llm generator with KV cache sharing, where layer 1 gets its KV cache from layer 0.
test_llm_generator = LLM(model="/workspace/models/dummy-llama", tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct", 
						 enforce_eager=ENFORCE_EAGER, kv_cache_dtype=KV_CACHE_DTYPE,
       					 enable_prefix_caching=ENABLE_PREFIX_CACHING, use_v2_block_manager=USE_V2_BLOCK_MANAGER,
						 tensor_parallel_size=1, dtype="bfloat16", max_model_len=1024*1, kv_cache_map={0:0, 1:0}, gpu_memory_utilization=0.2,
						 debug_kv_sharing=DEBUG_KV_SHARING)

INFO 10-08 19:59:59 config.py:576] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor
INFO 10-08 19:59:59 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='/workspace/models/dummy-llama', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=fp8, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_ti

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.27it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.27it/s]



INFO 10-08 20:00:00 model_runner.py:890] Loading model weights took 2.7696 GB
INFO 10-08 20:00:01 gpu_executor.py:121] # GPU blocks: 16610, # CPU blocks: 131072


In [None]:
model = test_llm_generator.llm_engine.model_executor.driver_worker.model_runner.model
worker = test_llm_generator.llm_engine.model_executor.driver_worker

In [None]:
# Debugging is on.
assert model.model.cache_config.debug_kv_sharing

In [None]:
def reset_attn_outputs_and_metadatas(model):
	# Reset attn outputs and attn metadatas from warmup.
	if len(model.model.layers[0].self_attn.attn_outputs) > 0:
		model.model.layers[0].self_attn.attn_outputs = []
	if len(model.model.layers[1].self_attn.attn_outputs) > 0:
		model.model.layers[1].self_attn.attn_outputs = []
	if len(model.model.attn_metadatas) > 0:
		model.model.attn_metadatas = []

In [None]:
# Just prefill.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=1, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_output_layer0 = model.model.layers[0].self_attn.attn_outputs[0]
attn_output_layer1 = model.model.layers[1].self_attn.attn_outputs[0]
assert torch.equal(attn_output_layer0, attn_output_layer1)
# 1 attention metadata for a single model forward pass
assert len(model.model.attn_metadatas) == 1 
attn_metadata = model.model.attn_metadatas[0]
# No decode request with max_tokens=1.
assert attn_metadata.decode_metadata is None
# Layer 0 creates new KV and layer 1 reuses KV from cache during prefill.
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected

Processed prompts: 100%|██████████| 2/2 [00:00<00:00, 70.06it/s, est. speed input: 211.72 toks/s, output: 70.53 toks/s]


AssertionError: 

In [None]:
attn_output_layer0, attn_output_layer1

(tensor([[ 0.0109, -0.0005,  0.0287,  ...,  0.0007, -0.0010,  0.0004],
         [ 0.0013,  0.0067,  0.0334,  ...,  0.0557, -0.0334,  0.0114],
         [ 0.0029,  0.0021,  0.0306,  ...,  0.0140, -0.0356,  0.0200],
         [ 0.0019,  0.0014,  0.0311,  ...,  0.0135, -0.0352,  0.0198],
         [ 0.0109, -0.0005,  0.0287,  ...,  0.0007, -0.0010,  0.0004],
         [ 0.0013,  0.0067,  0.0334,  ...,  0.0557, -0.0334,  0.0114]],
        device='cuda:0', dtype=torch.bfloat16),
 tensor([[ 0.0137, -0.0078,  0.0293,  ...,  0.0078, -0.0088,  0.0078],
         [ 0.0031,  0.0021,  0.0339,  ...,  0.0564, -0.0339,  0.0134],
         [ 0.0047, -0.0020,  0.0312,  ...,  0.0142, -0.0369,  0.0204],
         [ 0.0035, -0.0027,  0.0317,  ...,  0.0135, -0.0366,  0.0203],
         [ 0.0137, -0.0078,  0.0293,  ...,  0.0078, -0.0088,  0.0078],
         [ 0.0031,  0.0021,  0.0339,  ...,  0.0564, -0.0339,  0.0134]],
        device='cuda:0', dtype=torch.bfloat16))

### TEST 3 (cuda graphs)

In [None]:
### test 1: No prefix caching, bf16 quantization, use v1 block manager, cuda graph mode ###
ENABLE_PREFIX_CACHING = False
KV_CACHE_DTYPE = "auto"
USE_V2_BLOCK_MANAGER = True
ENFORCE_EAGER = False
DEBUG_KV_SHARING = True


# This the base llm generator with KV cache sharing, where layer 1 gets its KV cache from layer 0.
test_llm_generator = LLM(model="/workspace/models/dummy-llama", tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct", 
						 enforce_eager=ENFORCE_EAGER, kv_cache_dtype=KV_CACHE_DTYPE,
       					 enable_prefix_caching=ENABLE_PREFIX_CACHING, use_v2_block_manager=USE_V2_BLOCK_MANAGER,
						 tensor_parallel_size=1, dtype="bfloat16", max_model_len=1024*1, kv_cache_map={0:0, 1:0}, gpu_memory_utilization=0.2,
						 debug_kv_sharing=DEBUG_KV_SHARING)

INFO 10-08 20:01:59 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='/workspace/models/dummy-llama', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/workspace/models/dummy-llama, use_v2_block_manager=True, enable_prefix_caching=False)
INFO 10-08 20:01:59 model_runner.py:879] Startin

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.26it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.26it/s]



INFO 10-08 20:02:01 model_runner.py:890] Loading model weights took 2.7696 GB
INFO 10-08 20:02:02 gpu_executor.py:121] # GPU blocks: 53681, # CPU blocks: 65536
INFO 10-08 20:02:02 model_runner.py:1181] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 10-08 20:02:02 model_runner.py:1185] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 10-08 20:02:34 model_runner.py:1300] Graph capturing finished in 32 secs.


In [None]:
model = test_llm_generator.llm_engine.model_executor.driver_worker.model_runner.model
worker = test_llm_generator.llm_engine.model_executor.driver_worker

In [None]:
# Debugging is on.
assert model.model.cache_config.debug_kv_sharing

In [None]:
def reset_attn_outputs_and_metadatas(model):
	# Reset attn outputs and attn metadatas from warmup.
	if len(model.model.layers[0].self_attn.attn_outputs) > 0:
		model.model.layers[0].self_attn.attn_outputs = []
	if len(model.model.layers[1].self_attn.attn_outputs) > 0:
		model.model.layers[1].self_attn.attn_outputs = []
	if len(model.model.attn_metadatas) > 0:
		model.model.attn_metadatas = []

In [None]:
# Just prefill.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=1, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_output_layer0 = model.model.layers[0].self_attn.attn_outputs[0]
attn_output_layer1 = model.model.layers[1].self_attn.attn_outputs[0]
assert torch.equal(attn_output_layer0, attn_output_layer1)
# 1 attention metadata for a single model forward pass
assert len(model.model.attn_metadatas) == 1 
attn_metadata = model.model.attn_metadatas[0]
# No decode request with max_tokens=1.
assert attn_metadata.decode_metadata is None
# Layer 0 creates new KV and layer 1 reuses KV from cache during prefill.
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected

Processed prompts: 100%|██████████| 2/2 [00:00<00:00, 182.40it/s, est. speed input: 555.64 toks/s, output: 184.84 toks/s]


In [None]:
# 3 tokens decode.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=3, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
for attn_output_layer0, attn_output_layer1 in zip(model.model.layers[0].self_attn.attn_outputs, 
                                                  model.model.layers[1].self_attn.attn_outputs):
    assert torch.equal(attn_output_layer0, attn_output_layer1)

# 3 attention metadata for a 3 model forward pass (1 prefill + 2 decode)
assert len(model.model.attn_metadatas) == 3 
# Prefill phase.
attn_metadata = model.model.attn_metadatas[0]
assert attn_metadata.decode_metadata is None
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected
# Decode phase 1.
attn_metadata = model.model.attn_metadatas[1]
assert attn_metadata.prefill_metadata is None
expected = [SharedSelfAttentionType.DECODE_KV_NEW.name, SharedSelfAttentionType.DECODE_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.decode_metadata.shared_self_attention_types == expected
# Decode phase 2.
attn_metadata = model.model.attn_metadatas[2]
assert attn_metadata.prefill_metadata is None
expected = [SharedSelfAttentionType.DECODE_KV_NEW.name, SharedSelfAttentionType.DECODE_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.decode_metadata.shared_self_attention_types == expected

Processed prompts: 100%|██████████| 2/2 [00:00<00:00, 103.80it/s, est. speed input: 314.16 toks/s, output: 313.80 toks/s]


AssertionError: 

In [None]:
len(model.model.attn_metadatas)

1

### TEST 4 (block manager v1)

In [None]:
### test 1: No prefix caching, bf16 quantization, use v1 block manager, cuda graph mode ###
ENABLE_PREFIX_CACHING = False
KV_CACHE_DTYPE = "auto"
USE_V2_BLOCK_MANAGER = False
ENFORCE_EAGER = True
DEBUG_KV_SHARING = True


# This the base llm generator with KV cache sharing, where layer 1 gets its KV cache from layer 0.
test_llm_generator = LLM(model="/workspace/models/dummy-llama", tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct", 
						 enforce_eager=ENFORCE_EAGER, kv_cache_dtype=KV_CACHE_DTYPE,
       					 enable_prefix_caching=ENABLE_PREFIX_CACHING, use_v2_block_manager=USE_V2_BLOCK_MANAGER,
						 tensor_parallel_size=1, dtype="bfloat16", max_model_len=1024*1, kv_cache_map={0:0, 1:0}, gpu_memory_utilization=0.2,
						 debug_kv_sharing=DEBUG_KV_SHARING)

INFO 10-08 20:07:31 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='/workspace/models/dummy-llama', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/workspace/models/dummy-llama, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 10-08 20:07:31 model_runner.py:879] Startin

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.25it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.25it/s]



INFO 10-08 20:07:33 model_runner.py:890] Loading model weights took 2.7696 GB
INFO 10-08 20:07:33 gpu_executor.py:121] # GPU blocks: 53425, # CPU blocks: 65536


In [None]:
model = test_llm_generator.llm_engine.model_executor.driver_worker.model_runner.model
worker = test_llm_generator.llm_engine.model_executor.driver_worker

In [None]:
# Debugging is on.
assert model.model.cache_config.debug_kv_sharing

In [None]:
def reset_attn_outputs_and_metadatas(model):
	# Reset attn outputs and attn metadatas from warmup.
	if len(model.model.layers[0].self_attn.attn_outputs) > 0:
		model.model.layers[0].self_attn.attn_outputs = []
	if len(model.model.layers[1].self_attn.attn_outputs) > 0:
		model.model.layers[1].self_attn.attn_outputs = []
	if len(model.model.attn_metadatas) > 0:
		model.model.attn_metadatas = []

In [None]:
# Just prefill.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=1, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_output_layer0 = model.model.layers[0].self_attn.attn_outputs[0]
attn_output_layer1 = model.model.layers[1].self_attn.attn_outputs[0]
assert torch.equal(attn_output_layer0, attn_output_layer1)
# 1 attention metadata for a single model forward pass
assert len(model.model.attn_metadatas) == 1 
attn_metadata = model.model.attn_metadatas[0]
# No decode request with max_tokens=1.
assert attn_metadata.decode_metadata is None
# Layer 0 creates new KV and layer 1 reuses KV from cache during prefill.
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected

Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  5.40it/s, est. speed input: 16.22 toks/s, output: 5.41 toks/s]


In [None]:
# 3 tokens decode.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=3, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
for attn_output_layer0, attn_output_layer1 in zip(model.model.layers[0].self_attn.attn_outputs, 
                                                  model.model.layers[1].self_attn.attn_outputs):
    assert torch.equal(attn_output_layer0, attn_output_layer1)

# 3 attention metadata for a 3 model forward pass (1 prefill + 2 decode)
assert len(model.model.attn_metadatas) == 3 
# Prefill phase.
attn_metadata = model.model.attn_metadatas[0]
assert attn_metadata.decode_metadata is None
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected
# Decode phase 1.
attn_metadata = model.model.attn_metadatas[1]
assert attn_metadata.prefill_metadata is None
expected = [SharedSelfAttentionType.DECODE_KV_NEW.name, SharedSelfAttentionType.DECODE_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.decode_metadata.shared_self_attention_types == expected
# Decode phase 2.
attn_metadata = model.model.attn_metadatas[2]
assert attn_metadata.prefill_metadata is None
expected = [SharedSelfAttentionType.DECODE_KV_NEW.name, SharedSelfAttentionType.DECODE_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.decode_metadata.shared_self_attention_types == expected

Processed prompts: 100%|██████████| 2/2 [00:00<00:00, 94.08it/s, est. speed input: 284.54 toks/s, output: 284.25 toks/s]


### TEST 5 (prefix caching)

In [None]:
### test 1: No prefix caching, bf16 quantization, use v1 block manager, cuda graph mode ###
ENABLE_PREFIX_CACHING = True
KV_CACHE_DTYPE = "auto"
USE_V2_BLOCK_MANAGER = False
ENFORCE_EAGER = True
DEBUG_KV_SHARING = True


# This the base llm generator with KV cache sharing, where layer 1 gets its KV cache from layer 0.
test_llm_generator = LLM(model="/workspace/models/dummy-llama", tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct", 
						 enforce_eager=ENFORCE_EAGER, kv_cache_dtype=KV_CACHE_DTYPE,
       					 enable_prefix_caching=ENABLE_PREFIX_CACHING, use_v2_block_manager=USE_V2_BLOCK_MANAGER,
						 tensor_parallel_size=1, dtype="bfloat16", max_model_len=1024*1, kv_cache_map={0:0, 1:0}, gpu_memory_utilization=0.2,
						 debug_kv_sharing=DEBUG_KV_SHARING)

INFO 10-08 20:09:13 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='/workspace/models/dummy-llama', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/workspace/models/dummy-llama, use_v2_block_manager=False, enable_prefix_caching=True)
INFO 10-08 20:09:13 model_runner.py:879] Starting

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.18it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.18it/s]



INFO 10-08 20:09:14 model_runner.py:890] Loading model weights took 2.7696 GB
INFO 10-08 20:09:15 gpu_executor.py:121] # GPU blocks: 53425, # CPU blocks: 65536
INFO 10-08 20:09:15 block_manager_v1.py:263] Automatic prefix caching is enabled.


In [None]:
# Just prefill.
reset_attn_outputs_and_metadatas(model)
output = test_llm_generator.generate(["hello world!", "hello"], SamplingParams(temperature=0.0, max_tokens=1, ignore_eos=True))

# Attention outputs of layer 0 and layer 1 should be equal, Q is fixed to 1s during debug mode.
attn_output_layer0 = model.model.layers[0].self_attn.attn_outputs[0]
attn_output_layer1 = model.model.layers[1].self_attn.attn_outputs[0]
assert torch.equal(attn_output_layer0, attn_output_layer1)
# 1 attention metadata for a single model forward pass
assert len(model.model.attn_metadatas) == 1 
attn_metadata = model.model.attn_metadatas[0]
# No decode request with max_tokens=1.
assert attn_metadata.decode_metadata is None
# Layer 0 creates new KV and layer 1 reuses KV from cache during prefill.
expected = [SharedSelfAttentionType.PREFILL_KV_NEW.name, SharedSelfAttentionType.PREFILL_KV_SHARED.name] # [layer 0, layer 1]
assert attn_metadata.prefill_metadata.shared_self_attention_types == expected

Processed prompts: 100%|██████████| 2/2 [00:00<00:00, 95.46it/s, est. speed input: 288.79 toks/s, output: 96.16 toks/s]


IndexError: list index out of range