In [1]:
#transformers fork: https://github.com/AnswerDotAI/transformers/tree/cla-llama

# vscode jupyter kill hanging process:
#  ps aux | grep "/workspace/py_venvs/cla/bin/python -m ipykernel_launcher" | awk '{print $2}' | xargs kill -9

In [13]:
import os
from pathlib import Path
import safetensors
import safetensors.torch
from safetensors.torch import save_file
from transformers import AutoConfig
import json

### Prepare VLLM Weights (Full Finetune)

These models are trained with different CLA factors: key-value states are shared between consecutive layers. For example, with CLA factor 2, layer 0 and layer 1 share key-value states, as do layer 2 and layer 3, and so on; with CLA factor 3, the groups are (layer 0, layer 1, layer 2), (layer 3, layer 4, layer 5), and so on.

For fast evaluation benchmarking, we can simply copy the key-value projection weights of the first layer in each group over to the other layers in that group, which makes the checkpoint compatible with vLLM.

In the actual efficient inference implementation, we will use the shared key-value states to reduce the memory footprint and KV cache.

In [26]:
# Root directory of the local checkpoints, and the CLA checkpoint to convert.
models_dir = Path("/workspace/models/")
model_path = "llama-3-8b-instruct-hqq-plus-dataset-CLA-3"
# Show what the checkpoint directory contains.
list((models_dir / model_path).glob("*"))

[PosixPath('/workspace/models/llama-3-8b-instruct-hqq-plus-dataset-CLA-3/model_state_dict.safetensors')]

In [27]:
# Load the checkpoint's tensors; `weights` is used below as a name -> tensor mapping.
weights = safetensors.torch.load_file(str(models_dir/model_path/'model_state_dict.safetensors'))

In [28]:
# Number of tensors in the loaded checkpoint.
len(weights)

291

In [29]:
def share_kv_proj_weights(weights, cla_factor):
    """Return a copy of ``weights`` where each CLA group reuses its first layer's K/V projections.

    Layers are grouped in runs of ``cla_factor`` consecutive layers
    (0..cla_factor-1, cla_factor..2*cla_factor-1, ...). For every tensor whose
    name contains ``k_proj`` or ``v_proj``, layers that are not the first of
    their group receive a clone of the first layer's tensor. All other tensors
    are passed through unchanged (same objects, not copies).

    Args:
        weights: mapping of parameter name -> tensor. K/V projection names are
            expected to look like ``model.layers.<idx>....``, i.e. the layer
            index is the third dot-separated field.
        cla_factor: number of consecutive layers sharing key-value states; must be >= 1.

    Returns:
        A new dict with the same keys as ``weights``.

    Raises:
        ValueError: if ``cla_factor`` is smaller than 1.
        KeyError: if the first layer of a group has no matching K/V tensor.
    """
    if cla_factor < 1:
        raise ValueError(f"cla_factor must be >= 1, got {cla_factor}")

    fixed = {}
    for name, param in weights.items():
        if "k_proj" in name or "v_proj" in name:
            layer_idx = int(name.split(".")[2])
            group_first_idx = layer_idx - layer_idx % cla_factor
            if group_first_idx != layer_idx:
                # The trailing dot and count=1 make sure only the layer-index field is rewritten.
                source_name = name.replace(
                    f"model.layers.{layer_idx}.", f"model.layers.{group_first_idx}.", 1
                )
                # Clone so the saved tensors do not alias the same storage
                # (presumably required by safetensors' save_file -- confirm).
                param = weights[source_name].clone()
        fixed[name] = param
    return fixed


cla_factor = 3
cla_fixed_weights = share_kv_proj_weights(weights, cla_factor)

In [30]:
# Output directory: the source checkpoint's folder name with a "-vllm" suffix.
save_model_dir = models_dir / f"{model_path}-vllm"
os.makedirs(save_model_dir, exist_ok=True)

In [31]:
# Save the adjusted state dict into the output directory, reusing the source checkpoint's file name.
save_file(cla_fixed_weights, save_model_dir/"model_state_dict.safetensors")

In [32]:
# save model config
# Start from the base model's config and override only the RoPE scaling settings.
model_config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# `rope_scaling` can be None on a loaded config, in which case item assignment
# would raise TypeError; build the dict explicitly while keeping any existing keys.
rope_scaling = dict(model_config.rope_scaling or {})
rope_scaling.update({"type": "dynamic", "factor": 2.0})
model_config.rope_scaling = rope_scaling
model_config_filename = save_model_dir/"config.json"
# Plain write mode is enough: the file is only written, never read back here.
with open(model_config_filename, "w") as f:
    json.dump(model_config.to_dict(), f)

In [33]:
# List the output directory to confirm which files were written.
list(save_model_dir.glob("*"))

[PosixPath('/workspace/models/llama-3-8b-instruct-hqq-plus-dataset-CLA-3-vllm/config.json'),
 PosixPath('/workspace/models/llama-3-8b-instruct-hqq-plus-dataset-CLA-3-vllm/model_state_dict.safetensors')]

### Test VLLM

In [1]:
import vllm
from vllm import LLM, SamplingParams

In [2]:
from transformers import AutoTokenizer

In [4]:
# FIXME: "/workspace/models/llama-3-8b-instruct-hqq-plus-dataset-CLA-2-vllm" (the checkpoint with copied key-value weights):
# Copying over the key-value weights doesn't work; model training might have gone wrong.
# Ideally, the key-value projection weights shouldn't be updated during training for the layers
# that share key-value states from previous layers.

# It looks like the kv weights didn't get updated for the expected layers, yet copying them over still doesn't work.
# Does that mean the kv weights weren't trained but were still used for that layer, i.e. layer sharing wasn't successful?

# TODO: Copying weights won't work because the hidden states passed to the kv projections also change at every layer,
# so the only way is to share the kv activations themselves.

# NOTE: the path loaded below is the plain CLA-2 checkpoint, not the "-vllm" copy named in the FIXME above.
llm = LLM(model="/workspace/models/llama-3-8b-instruct-hqq-plus-dataset-CLA-2", 
          tokenizer="meta-llama/Meta-Llama-3-8B-Instruct", 
          dtype="bfloat16",
          tensor_parallel_size=1,
          enforce_eager=False, 
          gpu_memory_utilization=0.9)

INFO 07-18 13:48:31 llm_engine.py:169] Initializing an LLM engine (v0.5.0.post1) with config: model='/workspace/models/llama-3-8b-instruct-hqq-plus-dataset-CLA-2', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/workspace/models/llama-3-8b-instruct-hqq-plus-dataset-CLA-2)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 07-18 13:49:23 model_runner.py:234] Loading model weights took 14.9614 GB
INFO 07-18 13:49:26 gpu_executor.py:83] # GPU blocks: 11714, # CPU blocks: 2048
INFO 07-18 13:49:30 model_runner.py:864] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-18 13:49:30 model_runner.py:868] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-18 13:49:44 model_runner.py:1022] Graph capturing finished in 14 secs.


In [5]:
# Load the tokenizer from the same repo id that is passed as `tokenizer=` when constructing the LLM;
# it is used below to render the chat template.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Single-turn user request; the commented alternatives are other prompts to try.
prompt = "Say hello world 10 times as a numbered list"
# prompt = "Write a poem about love and robots"
# prompt = "Continue the fib series: 1 1 2 3 5 8"
messages = [dict(role="user", content=prompt)]
# Render the conversation through the tokenizer's chat template as text (not token ids),
# with the generation prompt appended.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
# temperature=0.0 -- presumably greedy decoding; confirm against vLLM's SamplingParams docs.
outputs = llm.generate(prompt, SamplingParams(temperature=0.0, max_tokens=256))
# Text of the first completion for the first (and only) prompt.
response_text = outputs[0].outputs[0].text

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.92s/it, est. speed input: 10.95 toks/s, output: 33.36 toks/s]


In [7]:
# Print the generated text so its newlines render as separate lines in the output.
print(response_text)

Here is the list of "Hello World" said 10 times:

1. Hello World
2. Hello World
3. Hello World
4. Hello World
5. Hello World
6. Hello World
7. Hello World
8. Hello World
9. Hello World
10. Hello World


In [8]:
# Check the key/value projection weights of the CLA checkpoint against the original pretrained weights

In [26]:
import torch
from transformers import AutoTokenizer, AutoConfig
from transformers.utils import hub, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME

In [10]:
# Hub repo id of the base model whose weights serve as the reference for the comparison below.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
# Locate the safetensors shard index file for that repo
# (presumably resolved through the local Hugging Face cache -- confirm against transformers.utils.hub).
idx = hub.cached_file(MODEL_NAME, SAFE_WEIGHTS_INDEX_NAME)
# The first returned value is iterated below as the shard files to load; the second is discarded.
pretrained_files, _ = hub.get_checkpoint_shard_files(MODEL_NAME, idx)

In [22]:
# Merge every shard of the reference checkpoint into one flat name -> tensor dict.
orig_weights = {}
for shard_path in pretrained_files:
    shard_tensors = safetensors.torch.load_file(shard_path)
    orig_weights.update(shard_tensors)

In [23]:
# Number of tensors in the merged reference checkpoint.
len(orig_weights)

291

In [34]:
# Load the CLA-2 checkpoint's tensors; they are compared against `orig_weights` below.
CLA2_weights = safetensors.torch.load_file("/workspace/models/llama-3-8b-instruct-hqq-plus-dataset-CLA-2/model_state_dict.safetensors")

In [35]:
# Compare every k_proj / v_proj tensor of the CLA-2 checkpoint with the reference weights.
# Expectation encoded by the asserts:
#   * layers that are NOT the first of their CLA group (layer_idx % cla_factor != 0)
#     must be element-wise equal to the reference within tolerance;
#   * the first layer of each group must differ from the reference in at least one element.
# Tensors other than k_proj / v_proj are not compared.
cla_factor = 2
for name in orig_weights:
    if "k_proj" not in name and "v_proj" not in name:
        continue

    # Names look like "model.layers.<idx>....": the layer index is the third field.
    layer_idx = int(name.split('.')[2])
    # Fraction of elements that match the reference within tolerance (1.0 == all match).
    closeness = torch.isclose(
        CLA2_weights[name], orig_weights[name], rtol=1e-5, atol=1e-5
    ).float().mean().item()

    if layer_idx % cla_factor != 0:
        assert closeness == 1.0, f"{name}: expected unchanged K/V weights, closeness={closeness}"
        print("CLA KV shared:", name, closeness)
    else:
        assert closeness != 1.0, f"{name}: expected updated K/V weights, but all elements match"
        print("Trained:", name, closeness)

Trained: model.layers.0.self_attn.k_proj.weight 0.7948763370513916
Trained: model.layers.0.self_attn.v_proj.weight 0.4667935371398926
CLA KV shared: model.layers.1.self_attn.k_proj.weight 1.0
CLA KV shared: model.layers.1.self_attn.v_proj.weight 1.0
Trained: model.layers.2.self_attn.k_proj.weight 0.8456332683563232
Trained: model.layers.2.self_attn.v_proj.weight 0.524782657623291
CLA KV shared: model.layers.3.self_attn.k_proj.weight 1.0
CLA KV shared: model.layers.3.self_attn.v_proj.weight 1.0
Trained: model.layers.4.self_attn.k_proj.weight 0.8604786396026611
Trained: model.layers.4.self_attn.v_proj.weight 0.6191747188568115
CLA KV shared: model.layers.5.self_attn.k_proj.weight 1.0
CLA KV shared: model.layers.5.self_attn.v_proj.weight 1.0
Trained: model.layers.6.self_attn.k_proj.weight 0.871098518371582
Trained: model.layers.6.self_attn.v_proj.weight 0.5928778648376465
CLA KV shared: model.layers.7.self_attn.k_proj.weight 1.0
CLA KV shared: model.layers.7.self_attn.v_proj.weight 1.0
Tr