In [1]:
import copy
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache
from safetensors.torch import save_file, load_file

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "Qwen/Qwen2.5-0.5B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights as float16, then move the model onto the selected device.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
).to(device)

In [3]:
# Shared prompt prefix. It is run through the model once to build the KV cache
# and is also prepended to every follow-up prompt, so the text must stay
# exactly the same between the cached run and the later generation runs.
INITIAL_PROMPT="""
Prompt caching + persistent prompt db

# Goal

- Release a library that can be used in conjunction with any HF model, that provides the following:
    - cache_activation(model, prompt)
    - run_with_activation(model, cached_prompt, prompt_suffix)
    - The cached activations should be stored in a persistent database
- I really like one of the extensions—making a publicly available prompt cache api
"""

In [4]:
# Tokenize the shared prefix and place the tensors on the same device as the model.
inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to(device)

# Start from an empty cache; the forward pass below hands back the populated one.
original_prompt_cache = DynamicCache()

# Gradients are not needed here: only the resulting key/value cache is kept.
with torch.no_grad():
    original_prompt_cache = model(
        past_key_values=original_prompt_cache,
        **inputs_initial_prompt,
    ).past_key_values

In [7]:
from itertools import chain

def save_cache_to_disk(prompt_cache: DynamicCache, path: str):
    """Write the per-layer key and value tensors of ``prompt_cache`` to ``path``.

    Each tensor is stored under the name ``k_<layer index>`` (from
    ``prompt_cache.key_cache``) or ``v_<layer index>`` (from
    ``prompt_cache.value_cache``) and the whole mapping is handed to
    ``save_file``.

    Args:
        prompt_cache: Cache whose ``key_cache`` and ``value_cache`` sequences
            are saved.
        path: Destination file path passed to ``save_file``.
    """
    tensors = {}

    # We should investigate why the KV tensors are not contiguous
    for layer_index, key_states in enumerate(prompt_cache.key_cache):
        tensors[f"k_{layer_index}"] = key_states.contiguous()

    for layer_index, value_states in enumerate(prompt_cache.value_cache):
        tensors[f"v_{layer_index}"] = value_states.contiguous()

    save_file(tensors, path)


def load_cache_from_disk(path: str) -> DynamicCache:
    """Rebuild a ``DynamicCache`` from a file written by ``save_cache_to_disk``.

    The file is expected to hold one ``k_<layer index>`` and one
    ``v_<layer index>`` tensor per layer. Every tensor is moved to the
    module-level ``device`` before being added to the cache.

    Args:
        path: File path passed to ``load_file``.

    Returns:
        A new ``DynamicCache`` updated layer by layer with the stored tensors.
    """
    restored = DynamicCache()
    stored = load_file(path)

    # One key tensor and one value tensor per layer, so half the entries are layers.
    layer_count = len(stored) // 2

    for idx in range(layer_count):
        restored.update(
            stored[f"k_{idx}"].to(device),
            stored[f"v_{idx}"].to(device),
            idx,
        )

    return restored

# Persist the prefilled prompt cache so it can be reloaded later.
save_cache_to_disk(
    prompt_cache=original_prompt_cache,
    path="prompt_cache.safetensors",
)

In [8]:
# Read the cache back from disk instead of reusing the in-memory object.
reloaded_prompt_cache = load_cache_from_disk(
    path="prompt_cache.safetensors",
)

In [9]:
# Bare expression: the notebook shows the repr of the reloaded cache object.
reloaded_prompt_cache

DynamicCache()

In [10]:
# Follow-up prompts. Each one is appended to INITIAL_PROMPT, whose key/value
# states are supplied through the reloaded cache.
prompts = ["\n# Project Name", "\n# Next Steps", "\n# Potential issues"]
responses = []

for prompt in prompts:
    full_prompt = INITIAL_PROMPT + prompt
    # Use the notebook-wide `device` instead of a hard-coded "cuda": the model
    # and the reloaded cache were placed on `device`, which is "cpu" when no
    # GPU is available.
    new_inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    # We need to make a copy of the original cache for each prompt, since
    # it gets modified by each of the generation runs in `prompts`
    cache = copy.deepcopy(reloaded_prompt_cache)
    outputs = model.generate(**new_inputs, past_key_values=cache, max_new_tokens=25)
    response = tokenizer.batch_decode(outputs)[0]
    responses.append(response)

for prompt, response in zip(prompts, responses):
    full_prompt = INITIAL_PROMPT + prompt
    print(prompt)
    # Drop the leading prompt characters to show only the continuation.
    # NOTE(review): assumes the decoded text begins with `full_prompt` verbatim.
    print(response[len(full_prompt):])

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



# Project Name


- [PromptCache](https://github.com/alexisbregman/promptcache)

# Installation

- Clone

# Next Steps


- [ ] Add a prompt cache api
- [ ] Add a prompt cache api
- [ ] Add a prompt

# Potential issues


- The cache should be able to handle multiple prompts
- The cache should be able to handle multiple prompts with different prompts
