## MLX example

https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/examples/chat.py

In [1]:
# https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/examples/chat.py

from mlx_lm import generate, load
from mlx_lm.models.cache import load_prompt_cache, make_prompt_cache, save_prompt_cache

model, tokenizer = load("mlx-community/Qwen2.5-32B-Instruct-4bit")

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

In [2]:
# Make the initial prompt cache for the model
prompt_cache = make_prompt_cache(model)

In [3]:
def chat(prompt: str):
    messages = [{"role": "user", "content": prompt}]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    response = generate(model, tokenizer, prompt=prompt, 
        verbose=True, temp=0.0, 
        prompt_cache=prompt_cache
    ) 
    return response

In [4]:
response = chat("Hi my name is <Name>.")

Prompt: <|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Hi my name is <Name>.<|im_end|>
<|im_start|>assistant

Hello <Name>! It's nice to meet you. How can I assist you today?
Prompt: 36 tokens, 44.617 tokens-per-sec
Generation: 20 tokens, 13.596 tokens-per-sec
Peak memory: 18.530 GB


In [5]:
response = chat("What's my name?")

Prompt: <|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
What's my name?<|im_end|>
<|im_start|>assistant

You told me your name is <Name>, but since <Name> is a placeholder, I didn't actually receive your specific name. Could you please tell me your name again?
Prompt: 34 tokens, 44.430 tokens-per-sec
Generation: 37 tokens, 13.188 tokens-per-sec
Peak memory: 18.530 GB


In [6]:
# Save the prompt cache to disk to reuse it at a later time
save_prompt_cache("mistral_prompt.safetensors", prompt_cache)

# Load the prompt cache from disk
prompt_cache = load_prompt_cache("mistral_prompt.safetensors")
prompt_cache

[<mlx_lm.models.cache.KVCache at 0x107c41630>,
 <mlx_lm.models.cache.KVCache at 0x107c41a20>,
 <mlx_lm.models.cache.KVCache at 0x107c41b00>,
 <mlx_lm.models.cache.KVCache at 0x107c41a90>,
 <mlx_lm.models.cache.KVCache at 0x107c41400>,
 <mlx_lm.models.cache.KVCache at 0x72d2fdfd0>,
 <mlx_lm.models.cache.KVCache at 0x72d2fda20>,
 <mlx_lm.models.cache.KVCache at 0x72d2fd5c0>,
 <mlx_lm.models.cache.KVCache at 0x72d2fd780>,
 <mlx_lm.models.cache.KVCache at 0x72d2fc210>,
 <mlx_lm.models.cache.KVCache at 0x72d2fc1a0>,
 <mlx_lm.models.cache.KVCache at 0x72d2fc050>,
 <mlx_lm.models.cache.KVCache at 0x72d2fde10>,
 <mlx_lm.models.cache.KVCache at 0x72d2fc0c0>,
 <mlx_lm.models.cache.KVCache at 0x72d2fc130>,
 <mlx_lm.models.cache.KVCache at 0x72d2fdcc0>,
 <mlx_lm.models.cache.KVCache at 0x72d2fe0b0>,
 <mlx_lm.models.cache.KVCache at 0x72d2fde80>,
 <mlx_lm.models.cache.KVCache at 0x72d2fe190>,
 <mlx_lm.models.cache.KVCache at 0x72d2fdc50>,
 <mlx_lm.models.cache.KVCache at 0x72d2fe3c0>,
 <mlx_lm.mode