In [1]:
import safetensors
from safetensors.torch import save_file
from pathlib import Path
from transformers.utils import hub, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
import copy,os,sys
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
import torch
from glob import glob

In [2]:
from transformers.generation.configuration_utils import GenerationConfig

In [3]:
# import anthropic

In [4]:
from fastcore.parallel import parallel
from tqdm import tqdm
import time

In [5]:
from bitsandbytes.nn import Linear4bit, Params4bit
import torch.nn as nn
from typing import List
from accelerate import init_empty_weights

In [6]:
models_dir = Path("/weka/home-keremturgutlu/models/")

In [7]:
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
idx = hub.cached_file(MODEL_NAME, SAFE_WEIGHTS_INDEX_NAME)
files, _ = hub.get_checkpoint_shard_files(MODEL_NAME, idx)

In [8]:
files

['/admin/home-keremturgutlu/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423/model-00001-of-00002.safetensors',
 '/admin/home-keremturgutlu/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423/model-00002-of-00002.safetensors']

In [9]:
os.environ.get("CUDA_VISIBLE_DEVICES")

In [10]:
from vllm import LLM, SamplingParams

In [11]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [12]:
sys.path.append("../")

**TODO:** vllm bnb quantized custom model

### Load Dataset

In [13]:
# !pip install datasets
# !pip install fastcore

In [14]:
from datasets import load_dataset

In [15]:
dataset = load_dataset("microsoft/orca-math-word-problems-200k")['train'].shuffle(seed=42)
# train with 10k for starters. Then 100k.
# dataset = dataset.select(range(0,100000))

# select last 5k as validation
dataset = dataset.select(range(len(dataset)-5000,len(dataset)))

In [16]:
dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 5000
})

In [17]:
dataset[21]

{'question': 'Charlie decided to make metal supports for the wings. He needs 635 lbs of metal in total and needs to buy an additional 359 lbs of metal. How much metal does he have in storage?',
 'answer': 'Charlie needs a total of 635 lbs of metal and has to buy an additional 359 lbs of metal. To find out how much metal he already has in storage, we subtract the amount he needs to buy from the total amount he needs.\n\nSo, 635 lbs (total needed) - 359 lbs (amount to buy) = 276 lbs.\n\nCharlie has 276 lbs of metal in storage.'}

In [21]:
EXTRACT_ANSWER_PROMPT = """Given a passage with a detailed answer to a question, extract the final answer as "Answer:<the final answer>\n"

<example>
To solve this problem, we need to find out how much of the job each worker can do in one hour, and then add those amounts together to find out how much of the job they can do together in one hour.

Worker A can do 1/10 of the job in one hour.
Worker B can do 1/12 of the job in one hour.
Worker C can do 1/15 of the job in one hour.

Now, let's add these fractions to find out how much of the job they can do together in one hour:

1/10 + 1/12 + 1/15

To add these fractions, we need a common denominator. The least common multiple (LCM) of 10, 12, and 15 is 60. So we convert each fraction to have a denominator of 60:

(6/60) + (5/60) + (4/60)

Now we can add the fractions:

6/60 + 5/60 + 4/60 = 15/60

This simplifies to 1/4, which means that together, workers A, B, and C can do 1/4 of the job in one hour.

To find out how long it will take them to complete the entire job, we take the reciprocal of 1/4:

1 / (1/4) = 4

So, it will take workers A, B, and C working together 4 hours to complete the job.

Answer: 4 hours
</example>

<example>
If Mr. Roberts chooses the installment plan, he would pay a down payment of $120 and then $30 a month for 12 months. 

The total cost for the installment plan would be:
Down payment: $120
Monthly payments: $30/month * 12 months = $360

Total cost with installment plan: $120 + $360 = $480

If he pays cash, the television costs $400.

To find out how much he can save by paying cash, we subtract the cash price from the total cost of the installment plan:

Savings by paying cash: $480 - $400 = $80

Mr. Roberts can save $80 by paying cash for the television.

Answer $80
</example>

<example>
Let's denote the total capacity of the reservoir as T gallons.

According to the information given, 6 million gallons is 60% of the total capacity. So we can write the following equation:

0.60 * T = 6 million gallons

From this, we can solve for T:

T = 6 million gallons / 0.60
T = 10 million gallons

Now, we are told that the normal level is 5 million gallons short of the total capacity. Therefore, the normal level (N) is:

N = T - 5 million gallons
N = 10 million gallons - 5 million gallons
N = 5 million gallons

Now we need to find the ratio of the amount of water in the reservoir at the end of the month (6 million gallons) to the normal level (5 million gallons):

Ratio = Amount at the end of the month / Normal level
Ratio = 6 million gallons / 5 million gallons
Ratio = 6 / 5
Ratio = 1.2

So the ratio of the amount of water in the reservoir at the end of the month to the normal level is 1.2:1.

Answer: 1.2:1
</example>

Now it is your turn:

{text}
"""

In [22]:
# client = anthropic.Anthropic(
#     api_key=open("/weka/home-keremturgutlu/claude-api.key").read().strip(),
# )

In [23]:
# def extract_short_answer(answer_text):
#     message = client.messages.create(
#         model="claude-3-haiku-20240307",
#         max_tokens=20,
#         temperature=0.0,
#         messages=[
#             {"role": "user", "content": EXTRACT_ANSWER_PROMPT.format(text=answer_text)},
#             {"role": "assistant", "content": "Answer:"}
#         ],
#     )
    
#     return message.content[0].text.strip()

In [24]:
# short_answers_gt = []

In [25]:
# for i, a in tqdm(enumerate(dataset[:100]['answer'])):
#     if len(short_answers_gt) > i: continue
#     short_answers_gt.append(extract_short_answer(a))
#     if i % 10 == 0: time.sleep(3)

In [26]:
# !pip install openai

In [18]:
import re

def extract_last_number_or_ratio(s):
    # Find all sequences of digits, possibly with leading currency symbols, decimal points, and ratios
    patterns = re.findall(r'[\$€£]?\d+(?:\.\d+)?(?:\:\d+(?:\.\d+)?)?', s)
    
    # Return the last pattern found, or None if there are no matches
    if patterns:
        return patterns[-1]
    else:
        return None

# Example usage
examples = [
    "The item costs $123.45, but with a discount of $10.00, the final price is $113.45.",
    "The ratio of water to concentrate is 5.5:1 for the mixture.",
    "The investment return was 10:1.",
    "Answer is 42.3.\nAnswer is 42"
]

for s in examples:
    print(f"The last occurring number or ratio in \"{s}\" is: {extract_last_number_or_ratio(s)}")


The last occurring number or ratio in "The item costs $123.45, but with a discount of $10.00, the final price is $113.45." is: $113.45
The last occurring number or ratio in "The ratio of water to concentrate is 5.5:1 for the mixture." is: 5.5:1
The last occurring number or ratio in "The investment return was 10:1." is: 10:1
The last occurring number or ratio in "Answer is 42.3.
Answer is 42" is: 42


In [19]:
short_answers_gt = parallel(extract_last_number_or_ratio, dataset['answer'], progress=True)

In [20]:
len(short_answers_gt)

5000

In [21]:
short_answers_gt[:5]

(#5) ['4','$80','1.2:1','50','2:1']

In [22]:
torch.tensor([True if a == '' else False for a in short_answers_gt]).float().mean()

tensor(0.)

### Baseline

#### Zero-shot:

`n=500, exact_match_score=0.068`

In [26]:
llm = LLM(model=MODEL_NAME, tokenizer=MODEL_NAME, dtype="bfloat16")

INFO 03-26 10:16:10 llm_engine.py:87] Initializing an LLM engine with config: model='meta-llama/Llama-2-7b-hf', tokenizer='meta-llama/Llama-2-7b-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 03-26 10:16:15 weight_utils.py:163] Using model weights format ['*.safetensors']
INFO 03-26 10:18:21 llm_engine.py:357] # GPU blocks: 2820, # CPU blocks: 512
INFO 03-26 10:18:23 model_runner.py:684] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-26 10:18:23 model_runner.py:688] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consid

In [29]:
inputs = [f"###Question:\n{question}\n###Answer:\n" for question in dataset[:500]['question']]

In [31]:
outputs = llm.generate(inputs, SamplingParams(temperature=0.0, stop_token_ids=[tokenizer.eos_token_id], max_tokens=1024))

Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████| 500/500 [04:04<00:00,  2.05it/s]


In [32]:
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]

In [33]:
sum(p==g for p,g in zip(short_answers_pred, short_answers_gt))/len(short_answers_pred)

0.068

### 5-shot

`n=500, exact_match_score=`

In [37]:
llm = LLM(model=MODEL_NAME, tokenizer=MODEL_NAME, dtype="bfloat16")

INFO 03-27 09:16:50 llm_engine.py:87] Initializing an LLM engine with config: model='meta-llama/Llama-2-7b-hf', tokenizer='meta-llama/Llama-2-7b-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 03-27 09:16:55 weight_utils.py:163] Using model weights format ['*.safetensors']
INFO 03-27 09:20:52 llm_engine.py:357] # GPU blocks: 2820, # CPU blocks: 512
INFO 03-27 09:20:56 model_runner.py:684] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-27 09:20:56 model_runner.py:688] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consid

In [29]:
few_shot_examples = [f"###Question:\n{ex['question']}\n###Answer:{ex['answer']}" for ex in dataset.select(range(len(dataset)-5,len(dataset)))]

In [32]:
few_shot_prompt = "\n\n".join(few_shot_examples)

In [35]:
inputs = [few_shot_prompt + "\n\n" + f"###Question:\n{question}\n###Answer:\n" for question in dataset[:500]['question']]

In [None]:
outputs = llm.generate(inputs, SamplingParams(temperature=0.0, stop_token_ids=[tokenizer.eos_token_id], max_tokens=1024))

Processed prompts:  67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 333/500 [11:15<07:12,  2.59s/it]

In [None]:
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]

In [None]:
sum(p==g for p,g in zip(short_answers_pred, short_answers_gt))/len(short_answers_pred)

### Full Finetune 

`n=500, exact_match_score=0.182`

#### VLLM

In [31]:
llm = LLM(model=str(models_dir/"llama-7b-orca-math-10k-full"), tokenizer=MODEL_NAME, dtype="bfloat16")

INFO 03-26 09:58:27 llm_engine.py:87] Initializing an LLM engine with config: model='/weka/home-keremturgutlu/models/llama-7b-orca-math-10k-full', tokenizer='meta-llama/Llama-2-7b-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 03-26 09:58:41 llm_engine.py:357] # GPU blocks: 2867, # CPU blocks: 512
INFO 03-26 09:58:43 model_runner.py:684] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-26 09:58:43 model_runner.py:688] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing

In [80]:
inputs = [f"###Question:\n{question}\n###Answer:\n" for question in dataset[:500]['question']]

In [81]:
outputs = llm.generate(inputs, SamplingParams(temperature=0.0, stop_token_ids=[tokenizer.eos_token_id], max_tokens=1024))

Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:35<00:00,  5.22it/s]


In [82]:
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]

In [83]:
sum(p==g for p,g in zip(short_answers_pred, short_answers_gt))/len(short_answers_pred)

0.182

In [77]:
%%time
output = llm.generate(inputs[0], SamplingParams(temperature=0.0, stop_token_ids=[tokenizer.eos_token_id], max_tokens=1024))

Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.82s/it]

CPU times: user 5.8 s, sys: 10.4 ms, total: 5.81 s
Wall time: 5.82 s





In [78]:
o = output[0].outputs[0]

In [79]:
len(o.token_ids)/5.82

77.31958762886597

In [70]:
metrics = output[0].metrics

In [73]:
metrics

RequestMetrics(arrival_time=1673463.661995299, last_token_time=1673463.661995299, first_scheduled_time=1711447501.5359676, first_token_time=1711447501.5566397, time_in_queue=1709774037.8739722, finished_time=1711447507.3578482)

#### HF (Much Slower)

In [26]:
# model_state_dict = safetensors.torch.load_file(model_path/"model_state_dict.safetensors")

In [48]:
# list(model_state_dict.keys())[:4]

In [49]:
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     use_cache=False,
#     torch_dtype=torch.bfloat16,
#     _attn_implementation="sdpa",
# )

In [50]:
# model.load_state_dict(model_state_dict);

In [51]:
# model.to("cuda");

In [52]:
# tokenizer.pad_token_id = tokenizer.eos_token_id

In [53]:
# sample = dataset[5]

In [None]:
# question = sample['question']
# output = model.generate(torch.tensor(tokenizer.encode(f"###Question:\n{question}\n###Answer:\n")).view(1,-1).cuda(),
#                          do_sample=False)

In [54]:
# extract_last_number_or_ratio(tokenizer.decode(output[0]))

In [55]:
# short_answers_pred = []
# for i in tqdm(range(0,len(dataset),4)):
    
#     inputs = [f"###Question:\n{question}\n###Answer:\n" for question in dataset[i:i+4]['question']]
#     input_ids = tokenizer(inputs)['input_ids']
    
#     max_toks = max(len(toks) for toks in input_ids)
    
#     b = torch.stack([torch.tensor(((max_toks-len(toks))*[tokenizer.unk_token_id])+toks) for toks in input_ids])
    
#     input_lens = [len(toks) for toks in input_ids]
    
#     output = model.generate(b.cuda(), 
#                             do_sample=False, 
#                             use_cache=True,
#                             pad_token_id=tokenizer.unk_token_id, 
#                             eos_token_id=tokenizer.eos_token_id, 
#                             max_new_tokens=1024).cpu()
    
#     pred = [extract_last_number_or_ratio(tokenizer.decode(o[o!=tokenizer.unk_token_id][n:])) for o,n in zip(output,input_lens)]

#     short_answers_pred.extend(pred)
#     if i > 20: break

In [56]:
# sum(p==g for p,g in zip(short_answers_pred, short_answers_gt))/len(short_answers_pred)

### Full Finetune + Post Quantize 

vLLM only supports AWQ quantization at the moment.

#### HF BnB 4bit

In [32]:
def replace_linear(model:nn.Module, linear_replacement:nn.Module, quant_config:dict|None=None,
                   skip_modules:List[str]=["lm_head"], **kwargs):
    """
    Replace linear modules with a new Linear module.
    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        linear_replacement (`torch.nn.Module`):
            The linear module that replaces the old one. Only expects standard arguments.
            If other arguments need to be passed, use a lambda.
        skip_modules (`List[str]`, *optional*, defaults to `lm_head`):
            List of modules names not to convert. Defaults to `lm_head`.
    """
    for name, module in model.named_children():
        if name in skip_modules:
            print(f"Skipping {name}")
            continue
        
        if len(list(module.children())) > 0:
            replace_linear(module, linear_replacement, quant_config, skip_modules, **kwargs)

        if isinstance(module, torch.nn.Linear):
            if issubclass(linear_replacement, Linear4bit):
                model._modules[name] = linear_replacement(
                    module.in_features,
                    module.out_features,
                    module.bias is not None,
                    **kwargs
                )
            # elif issubclass(linear_replacement, HQQLinear):
            #     model._modules[name] = linear_replacement(module, quant_config, **kwargs)
            else:
                raise ValueError(f"Unsupported linear replacement: {type(linear_replacement)}")
    return model

In [33]:
def load_and_quantize(module:nn.Module, name:str, value:torch.Tensor, device:torch.device=None, dtype:torch.dtype=None,
                      skip_names:list[str]=[], is_meta_rank:bool=False, low_memory:bool=True, verbose:bool=False,
                      quant_method:str='bnb', is_dora:bool=False):
    """
    Loads `value` tensor into submodule of `module`, optionally skipping `skip_names` and converting to `dtype`.

    Quantizes `Params4bit` on `device` then places on "cpu" if low_memory=True or "meta" if is_meta_rank=True.
    """
    def place_on_device(value):
        if is_meta_rank:
            device = 'meta'
        elif low_memory:
            device = 'cpu'
        return value.to(device=device, dtype=dtype)

    if any([skip_name in name for skip_name in skip_names]):
        if verbose:
            print(f"Skipping {name} because it is in skip_names")
        return

    module_key, _, value_key = name.rpartition('.')
    try:
        submodule = module.get_submodule(module_key)
    except AttributeError as e:
        print(f"Module {module_key} not found:\n{e}")
        return

    try:
        if quant_method=='bnb':
            param = submodule.get_parameter(value_key)
            if isinstance(param, Params4bit):
                # With `sync_module_states=True`, a meta device Params4bit needs to be the same
                # shape as the quantized Params4bit with an initialized quant_state. However,
                # FSDP only syncs parameters and buffers, so the quant_state isn't copied. This
                # workaround quantizes Params4bit to initialize quant_state on all ranks, then
                # replaces Params4bit's data with a meta tensor to free memory on non-rank 0.
                if is_dora:
                    setattr(submodule, "dora_scale", value.norm(p=2, dim=1).to(dtype=dtype).to("cpu"))                
                    print("DORA scale initialized")
                value = type(param)(value.to(device=device, dtype=dtype).data, **param.__dict__).cuda(device)
                if is_meta_rank:
                    value = type(param)(value.data.to("meta"), **value.__dict__)
                elif low_memory:
                    value = type(param)(value.data.to("cpu"), **value.__dict__)
                # print("Loaded quantized layer")
            else:
                value = type(param)(place_on_device(value).data)
                # print("Loaded regular layer")
    except AttributeError:
        # it's a buffer
        value = place_on_device(value)
        pass
    setattr(submodule, value_key, value)

def load_and_quantize_parallel(name_param, model, **kwargs):
    name, param = name_param
    load_and_quantize(model, name, param, **kwargs)

In [100]:
cfg = AutoConfig.from_pretrained(MODEL_NAME)
cfg._attn_implementation = "sdpa"
skip_modules = ["lm_head"]
load_param_skip_names = ['inv_freq']
compute_dtype = torch_dtype = torch.bfloat16

In [101]:
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(cfg)
    model.model = replace_linear(model.model, Linear4bit, compute_dtype=compute_dtype,
                                 quant_type='nf4', quant_storage=torch_dtype, skip_modules=skip_modules)
model.is_loaded_in_4bit = True

In [102]:
weights = safetensors.torch.load_file(glob(str(models_dir/"llama-7b-orca-math-10k-full/*.safetensors"))[0])

In [None]:
parallel(load_and_quantize_parallel, 
         iter(weights.items()), 
         n_workers=8, 
         threadpool=True,
         model=model, 
         dtype=torch_dtype, 
         device=torch.cuda.current_device(),
         skip_names=load_param_skip_names,
         is_meta_rank=False,
         verbose=True,
         quant_method="bnb",
         is_dora=False)

In [106]:
model.cuda();

In [124]:
%%time
question = dataset[5]['question']
output = model.generate(torch.tensor(tokenizer.encode(f"###Question:\n{question}\n###Answer:\n")).view(1,-1).cuda(),
                         do_sample=False, max_new_tokens=1024, use_cache=True).cpu()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


CPU times: user 8.32 s, sys: 1.9 ms, total: 8.32 s
Wall time: 8.32 s


In [125]:
print(tokenizer.decode(output[0]))

<s> ###Question:
A certain tax rate is some amount per $100.00. The rate, expressed as a percent, is 65%. What is the tax rate in dollars per $100.00?
###Answer:
To find the tax rate in dollars per $100.00, we need to convert the rate from a percent to a decimal and then multiply it by $100.00.

The tax rate expressed as a percent is 65%. To convert this to a decimal, we divide by 100:

65% / 100 = 0.65

Now, we multiply this decimal by $100.00 to find the tax rate in dollars per $100.00:

0.65 * $100.00 = $65.00

Therefore, the tax rate in dollars per $100.00 is $65.00.</s>


In [119]:
torch.tensor(tokenizer.encode(f"###Question:\n{question}\n###Answer:\n")).view(1,-1).shape

torch.Size([1, 56])

In [123]:
(output[0].shape[0] - 56) / 8.45

19.644970414201186

In [54]:
extract_last_number_or_ratio(tokenizer.decode(output[0]))

In [131]:
valid_dataset = dataset.select(range(500))

In [132]:
tokenizer.pad_token_id = tokenizer.eos_token_id

In [133]:
short_answers_pred = []
bs = 16
for i in tqdm(range(0,len(valid_dataset),bs)):
    
    inputs = [f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset[i:i+bs]['question']]
    input_ids = tokenizer(inputs)['input_ids']
    
    max_toks = max(len(toks) for toks in input_ids)
    b = torch.stack([torch.tensor(((max_toks-len(toks))*[tokenizer.unk_token_id])+toks) for toks in input_ids])
    input_lens = [len(toks) for toks in input_ids]
    
    output = model.generate(b.cuda(), 
                            do_sample=False, 
                            use_cache=True,
                            pad_token_id=tokenizer.unk_token_id, 
                            eos_token_id=tokenizer.eos_token_id, 
                            max_new_tokens=1024).cpu()
    
    pred = [extract_last_number_or_ratio(tokenizer.decode(o[o!=tokenizer.unk_token_id][n:])) for o,n in zip(output,input_lens)]
    short_answers_pred.extend(pred)

    if i > 0: break

  3%|██▎                                                                      | 1/32 [02:53<1:29:31, 173.28s/it]


In [137]:
sum(p==g for p,g in zip(short_answers_pred, short_answers_gt))/len(short_answers_pred)

0.16666666666666666

#### VLLM BnB 4bit (Custom Model)

### QLoRA 

In [27]:
from peft import get_peft_model, LoraConfig, TaskType

In [28]:
idx = hub.cached_file(MODEL_NAME, SAFE_WEIGHTS_INDEX_NAME)
pretrained_files, _ = hub.get_checkpoint_shard_files(MODEL_NAME, idx)

In [29]:
glob(str(models_dir/"llama-7b-orca-math-10k-bnb-qlora/*.safetensors"))[0]

'/weka/home-keremturgutlu/models/llama-7b-orca-math-10k-bnb-qlora/model_state_dict.safetensors'

In [30]:
trained_weights = safetensors.torch.load_file(glob(str(models_dir/"llama-7b-orca-math-10k-bnb-qlora/*.safetensors"))[0])

In [31]:
cfg = AutoConfig.from_pretrained(MODEL_NAME)
cfg._attn_implementation = "sdpa"
skip_modules = ["lm_head"]
load_param_skip_names = ['inv_freq']
compute_dtype = torch_dtype = torch.bfloat16

In [38]:
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(cfg)
    model.model = replace_linear(model.model, Linear4bit, compute_dtype=compute_dtype,
                                 quant_type='nf4', quant_storage=torch_dtype, skip_modules=skip_modules)
model.is_loaded_in_4bit = True

In [39]:
for filename in pretrained_files:
    weights = safetensors.torch.load_file(filename)
    parallel(load_and_quantize_parallel, 
             iter(weights.items()), 
             n_workers=8, 
             threadpool=True,
             model=model, 
             dtype=torch_dtype, 
             device=torch.cuda.current_device(),
             skip_names=load_param_skip_names,
             is_meta_rank=False,
             verbose=True,
             quant_method="bnb",
             is_dora=False)

Skipping model.layers.0.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.1.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.10.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.11.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.12.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.13.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.14.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.15.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.16.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.17.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.18.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.19.self_attn.rotary_emb.inv_freq because it is in skip_names
Skippi

In [40]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False,
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["k_proj", "q_proj", "v_proj", "up_proj", "down_proj", "gate_proj"],
)

In [41]:
model = get_peft_model(model, peft_config)

In [None]:
for n,p in model.named_parameters():
    if "lora" in n: 
        print(n)
        p.data.copy_(trained_weights[n])

In [42]:
model.eval().cuda();

In [43]:
valid_dataset = dataset.select(range(500))

In [44]:
tokenizer.pad_token_id = tokenizer.eos_token_id

In [45]:
short_answers_pred = []
bs = 16
for i in tqdm(range(0,len(valid_dataset),bs)):
    
    inputs = [f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset[i:i+bs]['question']]
    input_ids = tokenizer(inputs)['input_ids']
    
    max_toks = max(len(toks) for toks in input_ids)
    b = torch.stack([torch.tensor(((max_toks-len(toks))*[tokenizer.unk_token_id])+toks) for toks in input_ids])
    input_lens = [len(toks) for toks in input_ids]
    
    output = model.generate(b.cuda(), 
                            do_sample=False, 
                            use_cache=True,
                            pad_token_id=tokenizer.unk_token_id, 
                            eos_token_id=tokenizer.eos_token_id, 
                            max_new_tokens=1024).cpu()
    
    pred = [extract_last_number_or_ratio(tokenizer.decode(o[o!=tokenizer.unk_token_id][n:])) for o,n in zip(output,input_lens)]
    short_answers_pred.extend(pred)

    if i > 0: break

  3%|██▊                                                                                        | 1/32 [03:41<1:54:18, 221.24s/it]


In [46]:
sum(p==g for p,g in zip(short_answers_pred, short_answers_gt))/len(short_answers_pred)

0.03125

### QDoRA

In [47]:
from dora import BNBDORA

In [34]:
idx = hub.cached_file(MODEL_NAME, SAFE_WEIGHTS_INDEX_NAME)
pretrained_files, _ = hub.get_checkpoint_shard_files(MODEL_NAME, idx)

In [35]:
glob(str(models_dir/"llama-7b-orca-math-10k-bnb-qdora/*.safetensors"))[0]

'/weka/home-keremturgutlu/models/llama-7b-orca-math-10k-bnb-qdora/model_state_dict.safetensors'

In [36]:
trained_weights = safetensors.torch.load_file(glob(str(models_dir/"llama-7b-orca-math-10k-bnb-qdora/*.safetensors"))[0])

In [40]:
cfg = AutoConfig.from_pretrained(MODEL_NAME)
cfg._attn_implementation = "sdpa"
skip_modules = ["lm_head"]
load_param_skip_names = ['inv_freq']
compute_dtype = torch_dtype = torch.bfloat16

In [41]:
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(cfg)
    model.model = replace_linear(model.model, Linear4bit, compute_dtype=compute_dtype,
                                 quant_type='nf4', quant_storage=torch_dtype, skip_modules=skip_modules)
model.is_loaded_in_4bit = True

In [42]:
for filename in pretrained_files:
    weights = safetensors.torch.load_file(filename)
    parallel(load_and_quantize_parallel, 
             iter(weights.items()), 
             n_workers=8, 
             threadpool=True,
             model=model, 
             dtype=torch_dtype, 
             device=torch.cuda.current_device(),
             skip_names=load_param_skip_names,
             is_meta_rank=False,
             verbose=True,
             quant_method="bnb",
             is_dora=True)

Skipping model.layers.0.self_attn.rotary_emb.inv_freq because it is in skip_names
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
Skipping model.layers.1.self_attn.rotary_emb.inv_freq because it is in skip_names
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
Skipping model.layers.10.self_attn.rotary_emb.inv_freq because it is in skip_names
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
DORA scale initialized
Skipping model.layers.11.self_attn.rotary_emb.inv_freq because it is in skip_names
DORA scale initialized
Skipping model.layers.12.self_attn.rotary_emb.inv

In [48]:
# Create LORA layers.
lora_target_modules = ["k_proj", "q_proj", "v_proj", "up_proj", "down_proj", "gate_proj"]
lora_rank=64
lora_alpha=16
lora_dropout=0.1
lora_cls = BNBDORA 

for name, _ in model.named_modules():
    module_key, _, value_key = name.rpartition('.')
    if value_key in lora_target_modules:
        m = model.get_submodule(name)
        qlora_layer = lora_cls(m, lora_rank, lora_alpha, lora_dropout)
        parent_module = model.get_submodule(module_key)
        setattr(parent_module, value_key, qlora_layer)

In [52]:
list(trained_weights.keys())

['model.layers.0.mlp.down_proj.dora_layer.lora_A.weight',
 'model.layers.0.mlp.down_proj.dora_layer.lora_B.weight',
 'model.layers.0.mlp.down_proj.magnitude_layer.magnitude',
 'model.layers.0.mlp.gate_proj.dora_layer.lora_A.weight',
 'model.layers.0.mlp.gate_proj.dora_layer.lora_B.weight',
 'model.layers.0.mlp.gate_proj.magnitude_layer.magnitude',
 'model.layers.0.mlp.up_proj.dora_layer.lora_A.weight',
 'model.layers.0.mlp.up_proj.dora_layer.lora_B.weight',
 'model.layers.0.mlp.up_proj.magnitude_layer.magnitude',
 'model.layers.0.self_attn.k_proj.dora_layer.lora_A.weight',
 'model.layers.0.self_attn.k_proj.dora_layer.lora_B.weight',
 'model.layers.0.self_attn.k_proj.magnitude_layer.magnitude',
 'model.layers.0.self_attn.q_proj.dora_layer.lora_A.weight',
 'model.layers.0.self_attn.q_proj.dora_layer.lora_B.weight',
 'model.layers.0.self_attn.q_proj.magnitude_layer.magnitude',
 'model.layers.0.self_attn.v_proj.dora_layer.lora_A.weight',
 'model.layers.0.self_attn.v_proj.dora_layer.lora_B.

In [53]:
for n,p in model.named_parameters():
    if ("dora_layer" in n) or ("magnitude_layer" in n): 
        print(n)
        p.data.copy_(trained_weights[n])

model.layers.0.self_attn.q_proj.magnitude_layer.magnitude
model.layers.0.self_attn.q_proj.dora_layer.lora_A.weight
model.layers.0.self_attn.q_proj.dora_layer.lora_B.weight
model.layers.0.self_attn.k_proj.magnitude_layer.magnitude
model.layers.0.self_attn.k_proj.dora_layer.lora_A.weight
model.layers.0.self_attn.k_proj.dora_layer.lora_B.weight
model.layers.0.self_attn.v_proj.magnitude_layer.magnitude
model.layers.0.self_attn.v_proj.dora_layer.lora_A.weight
model.layers.0.self_attn.v_proj.dora_layer.lora_B.weight
model.layers.0.mlp.gate_proj.magnitude_layer.magnitude
model.layers.0.mlp.gate_proj.dora_layer.lora_A.weight
model.layers.0.mlp.gate_proj.dora_layer.lora_B.weight
model.layers.0.mlp.up_proj.magnitude_layer.magnitude
model.layers.0.mlp.up_proj.dora_layer.lora_A.weight
model.layers.0.mlp.up_proj.dora_layer.lora_B.weight
model.layers.0.mlp.down_proj.magnitude_layer.magnitude
model.layers.0.mlp.down_proj.dora_layer.lora_A.weight
model.layers.0.mlp.down_proj.dora_layer.lora_B.weight
m

In [54]:
model.eval().cuda();

In [55]:
valid_dataset = dataset.select(range(500))

In [56]:
tokenizer.pad_token_id = tokenizer.eos_token_id

In [57]:
short_answers_pred = []
bs = 16
for i in tqdm(range(0,len(valid_dataset),bs)):
    
    inputs = [f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset[i:i+bs]['question']]
    input_ids = tokenizer(inputs)['input_ids']
    
    max_toks = max(len(toks) for toks in input_ids)
    b = torch.stack([torch.tensor(((max_toks-len(toks))*[tokenizer.unk_token_id])+toks) for toks in input_ids])
    input_lens = [len(toks) for toks in input_ids]
    
    output = model.generate(b.cuda(), 
                            do_sample=False, 
                            use_cache=True,
                            pad_token_id=tokenizer.unk_token_id, 
                            eos_token_id=tokenizer.eos_token_id, 
                            max_new_tokens=1024).cpu()
    
    pred = [extract_last_number_or_ratio(tokenizer.decode(o[o!=tokenizer.unk_token_id][n:])) for o,n in zip(output,input_lens)]
    short_answers_pred.extend(pred)

    if i > 0: break

  3%|██▎                                                                      | 1/32 [05:41<2:56:15, 341.15s/it]


In [58]:
sum(p==g for p,g in zip(short_answers_pred, short_answers_gt))/len(short_answers_pred)

0.1875

#### Merging

In [51]:
trained_weights = safetensors.torch.load_file(str(models_dir/"test-hqq-dora/model_state_dict.safetensors"))

In [52]:
merged_directory = models_dir/"test-hqq-dora/merged"
os.makedirs(merged_directory, exist_ok=True)

In [53]:
# Merge and Save
for filename in files:
    weights = safetensors.torch.load_file(filename)
    weights_copy = copy.deepcopy(weights)

    for k, frozen_weight in iter(weights.items()):
        module_key, _, value_key = k.rpartition(".")
        if value_key == "weight" and module_key + ".dora_layer.lora_A.weight" in trained_weights:
            print("Merging:", module_key)
    
            lora_A_weight = trained_weights[module_key + ".dora_layer.lora_A.weight"]
            lora_B_weight = trained_weights[module_key + ".dora_layer.lora_B.weight"]
            magnitude = trained_weights[module_key + ".magnitude_layer.magnitude"]
    
            weight = (frozen_weight + lora_B_weight @ lora_A_weight)
            norm_adapted = weight / weight.norm(p=2, dim=1).view(-1,1)
            new_weight = norm_adapted * magnitude.view(-1,1)
    
            weights_copy[k] = new_weight
    save_file(weights_copy, merged_directory/Path(filename).name)

Merging: model.layers.0.mlp.down_proj
Merging: model.layers.0.mlp.gate_proj
Merging: model.layers.0.mlp.up_proj
Merging: model.layers.0.self_attn.k_proj
Merging: model.layers.0.self_attn.q_proj
Merging: model.layers.0.self_attn.v_proj
Merging: model.layers.1.mlp.down_proj
Merging: model.layers.1.mlp.gate_proj
Merging: model.layers.1.mlp.up_proj
Merging: model.layers.1.self_attn.k_proj
Merging: model.layers.1.self_attn.q_proj
Merging: model.layers.1.self_attn.v_proj


KeyboardInterrupt: 

### Quantized-Llama-Pro

In [70]:
cfg = AutoConfig.from_pretrained(MODEL_NAME)
cfg._attn_implementation = "sdpa"
skip_modules = ["lm_head"]
load_param_skip_names = ['inv_freq']
compute_dtype = torch_dtype = torch.bfloat16

In [71]:
llama_pro_path = Path("/weka/home-keremturgutlu/models/meta-llama/Llama-2-7b-hf_blk_exp-32-35/")
num_original_layers, num_expanded_layers = llama_pro_path.name.split("blk_exp-")[1].split("-")
num_original_layers, num_expanded_layers = int(num_original_layers), int(num_expanded_layers)
total_new_layers = num_expanded_layers - num_original_layers
split = int(num_original_layers / (num_expanded_layers - num_original_layers))
new_layer_ids = [split+(split+1)*n for n in range(total_new_layers)]
new_layer_names = [f"layers.{i}" for i in new_layer_ids]
skip_modules += [str(lid) for lid in new_layer_ids]
cfg.num_hidden_layers = num_expanded_layers

In [72]:
skip_modules

['lm_head', '10', '21', '32']

In [73]:
trained_weights = safetensors.torch.load_file(glob(str(models_dir/"llama-7b-orca-math-10k-bnb-llama-pro/*.safetensors"))[0])

In [74]:
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(cfg)
    model.model = replace_linear(model.model, Linear4bit, compute_dtype=compute_dtype,
                                 quant_type='nf4', quant_storage=torch_dtype, skip_modules=skip_modules)
model.is_loaded_in_4bit = True

Skipping 10
Skipping 21
Skipping 32


In [75]:
pretrained_files = glob(str(llama_pro_path/"*.safetensors"))

In [76]:
for filename in pretrained_files:
    weights = safetensors.torch.load_file(filename)
    parallel(load_and_quantize_parallel, 
             iter(weights.items()), 
             n_workers=8, 
             threadpool=True,
             model=model, 
             dtype=torch_dtype, 
             device=torch.cuda.current_device(),
             skip_names=load_param_skip_names,
             is_meta_rank=False,
             verbose=True,
             quant_method="bnb",
             is_dora=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Skipping model.layers.26.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.27.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.28.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.29.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.30.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.31.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.32.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.33.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.34.self_attn.rotary_emb.inv_freq because it is in skip_names


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Skipping model.layers.0.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.1.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.10.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.11.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.12.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.13.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.14.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.15.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.16.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.17.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.18.self_attn.rotary_emb.inv_freq because it is in skip_names
Skipping model.layers.19.self_attn.rotary_emb.inv_freq because it is in skip_names
Skippi

In [79]:
for n,p in model.named_parameters():
    if n in trained_weights: 
        print(n)
        p.data.copy_(trained_weights[n])

model.layers.10.self_attn.q_proj.weight
model.layers.10.self_attn.k_proj.weight
model.layers.10.self_attn.v_proj.weight
model.layers.10.self_attn.o_proj.weight
model.layers.10.mlp.gate_proj.weight
model.layers.10.mlp.up_proj.weight
model.layers.10.mlp.down_proj.weight
model.layers.10.input_layernorm.weight
model.layers.10.post_attention_layernorm.weight
model.layers.21.self_attn.q_proj.weight
model.layers.21.self_attn.k_proj.weight
model.layers.21.self_attn.v_proj.weight
model.layers.21.self_attn.o_proj.weight
model.layers.21.mlp.gate_proj.weight
model.layers.21.mlp.up_proj.weight
model.layers.21.mlp.down_proj.weight
model.layers.21.input_layernorm.weight
model.layers.21.post_attention_layernorm.weight
model.layers.32.self_attn.q_proj.weight
model.layers.32.self_attn.k_proj.weight
model.layers.32.self_attn.v_proj.weight
model.layers.32.self_attn.o_proj.weight
model.layers.32.mlp.gate_proj.weight
model.layers.32.mlp.up_proj.weight
model.layers.32.mlp.down_proj.weight
model.layers.32.inp

In [80]:
model.eval().cuda();

In [81]:
valid_dataset = dataset.select(range(500))

In [82]:
tokenizer.pad_token_id = tokenizer.eos_token_id

In [83]:
short_answers_pred = []
bs = 16
for i in tqdm(range(0,len(valid_dataset),bs)):
    
    inputs = [f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset[i:i+bs]['question']]
    input_ids = tokenizer(inputs)['input_ids']
    
    max_toks = max(len(toks) for toks in input_ids)
    b = torch.stack([torch.tensor(((max_toks-len(toks))*[tokenizer.unk_token_id])+toks) for toks in input_ids])
    input_lens = [len(toks) for toks in input_ids]
    
    output = model.generate(b.cuda(), 
                            do_sample=False, 
                            use_cache=True,
                            pad_token_id=tokenizer.unk_token_id, 
                            eos_token_id=tokenizer.eos_token_id, 
                            max_new_tokens=1024).cpu()
    
    pred = [extract_last_number_or_ratio(tokenizer.decode(o[o!=tokenizer.unk_token_id][n:])) for o,n in zip(output,input_lens)]
    short_answers_pred.extend(pred)

    if i > 0: break

  3%|██▎                                                                      | 1/32 [02:49<1:27:37, 169.59s/it]


In [84]:
sum(p==g for p,g in zip(short_answers_pred, short_answers_gt))/len(short_answers_pred)

0.21875

In [4]:
args = {"model_name" : 'meta-llama/Llama-2-7b-hf'}
args['expansion_rate'] = 0.1

In [None]:
idx = hub.cached_file(args["model_name"], SAFE_WEIGHTS_INDEX_NAME)
files, _ = hub.get_checkpoint_shard_files(args["model_name"], idx)

In [None]:
files

['/admin/home-keremturgutlu/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423/model-00001-of-00002.safetensors',
 '/admin/home-keremturgutlu/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8a0442e81540efaeb1a0fe3e95477b5e0edfd423/model-00002-of-00002.safetensors']

In [None]:
cfg = AutoConfig.from_pretrained(args["model_name"])
num_original_layers = cfg.num_hidden_layers
num_new_layers = num_original_layers + int(num_original_layers * args['expansion_rate'])
split = int(num_original_layers / (num_new_layers - num_original_layers))
layer_cnt = 0

In [None]:
new_layers, num_original_layers, split

(35, 32, 10)

In [65]:
expanded_weights = {}
for filename in files:
    weights = safetensors.torch.load_file(filename)
    for k,v in iter(weights.items()):
        if 'layers' in k:
            layer_no = int(k.split('layers.')[1].split('.')[0])
            # shift existing layers by previously added layers
            new_layer_no = layer_no + layer_no // split
            if new_layer_no != layer_no:
                print(f"Moving {k} to {new_layer_no}")
            new_k = k.replace(f'layers.{layer_no}', f'layers.{new_layer_no}')
            expanded_weights[new_k] = v
            # add new layers
            if (layer_no+1) % split == 0:
                new_layer_no += 1
                new_k = k.replace(f'layers.{layer_no}', f'layers.{new_layer_no}')
                if 'down_proj' in k or 'o_proj' in k:
                    expanded_weights[new_k] = torch.zeros_like(v)     
                else:
                    expanded_weights[new_k] = v
        else:
            expanded_weights[k] = v

Moving model.layers.10.input_layernorm.weight to 11
Moving model.layers.10.mlp.down_proj.weight to 11
Moving model.layers.10.mlp.gate_proj.weight to 11
Moving model.layers.10.mlp.up_proj.weight to 11
Moving model.layers.10.post_attention_layernorm.weight to 11
Moving model.layers.10.self_attn.k_proj.weight to 11
Moving model.layers.10.self_attn.o_proj.weight to 11
Moving model.layers.10.self_attn.q_proj.weight to 11
Moving model.layers.10.self_attn.rotary_emb.inv_freq to 11
Moving model.layers.10.self_attn.v_proj.weight to 11
Moving model.layers.11.input_layernorm.weight to 12
Moving model.layers.11.mlp.down_proj.weight to 12
Moving model.layers.11.mlp.gate_proj.weight to 12
Moving model.layers.11.mlp.up_proj.weight to 12
Moving model.layers.11.post_attention_layernorm.weight to 12
Moving model.layers.11.self_attn.k_proj.weight to 12
Moving model.layers.11.self_attn.o_proj.weight to 12
Moving model.layers.11.self_attn.q_proj.weight to 12
Moving model.layers.11.self_attn.rotary_emb.inv_

In [64]:
# list(expanded_weights.keys())

In [45]:
# set([k.split("layers.")[1].split(".")[0] for k in expanded_weights.keys() if "layers" in k])

In [66]:
torch.equal(expanded_weights['model.layers.31.mlp.up_proj.weight'],expanded_weights['model.layers.32.mlp.up_proj.weight'])

True

In [70]:
expanded_weights['model.layers.32.mlp.down_proj.weight'].min(), expanded_weights['model.layers.32.mlp.down_proj.weight'].max()

(tensor(0., dtype=torch.float16), tensor(0., dtype=torch.float16))

In [71]:
expanded_weights['model.layers.32.self_attn.o_proj.weight'].min(), expanded_weights['model.layers.32.self_attn.o_proj.weight'].max()

(tensor(0., dtype=torch.float16), tensor(0., dtype=torch.float16))

In [6]:
llama_pro_path = Path("/weka/home-keremturgutlu/models/meta-llama/Llama-2-7b-hf_blk_exp-32-35")

In [8]:
filenames = glob(str(llama_pro_path/"*.safetensors")); filenames

['/weka/home-keremturgutlu/models/meta-llama/Llama-2-7b-hf_blk_exp-32-35/model-00002-of-00002.safetensors',
 '/weka/home-keremturgutlu/models/meta-llama/Llama-2-7b-hf_blk_exp-32-35/model-00001-of-00002.safetensors']

In [36]:
layer_nums = set()
for filename in filenames:
    keys = list(safetensors.torch.load_file(str(filename)).keys())
    layer_nums = layer_nums.union(set([int(k.split("layers.")[1].split(".")[0]) for k in keys if 'layers' in k]))

In [16]:
num_original_layers, num_expanded_layers = llama_pro_path.name.split("blk_exp-")[1].split("-")
num_original_layers, num_expanded_layers = int(num_original_layers), int(num_expanded_layers)

In [18]:
split = int(num_original_layers / (num_expanded_layers - num_original_layers))

10

In [48]:
total_new_layers = num_expanded_layers - num_original_layers

In [50]:
new_layer_ids = [split + (split + 1)*n for n in range(total_new_layers)]

In [51]:
for layer_no in range(num_expanded_layers):
    if layer_no in new_layer_ids:
        print(f"Layer {layer_no} is new")

Layer 10 is new
Layer 21 is new
Layer 32 is new


In [53]:
verify_weights = {}
for filename in filenames:
    weights = safetensors.torch.load_file(str(filename))
    for k,v in iter(weights.items()):
        if any(((f"layers.{i}" in k) or (f"layers.{i-1}" in k) for i in new_layer_ids)):
            verify_weights[k] = v

In [58]:
for k,v in verify_weights.items():
    if any(((f"layers.{i}" in k) for i in new_layer_ids)):
        if 'down_proj' in k or 'o_proj' in k:
            assert torch.equal(v, torch.zeros_like(v))
        else:
            lid = int(k.split("layers.")[1].split(".")[0])
            assert torch.equal(verify_weights[k.replace(f"layers.{lid}", f"layers.{lid-1}")], v)
        

In [5]:
cfg = AutoConfig.from_pretrained(args["model_name"])

In [9]:
from accelerate import init_empty_weights
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(cfg)

In [64]:
def recursive_children(model):
    for n,c in model.named_children():
        # print(n,  c.__class__)
        if n in [str(i) for i in [10,21,32]] + ["lm_head"]: 
            print("skipped", n, c.__class__)
            continue
        if len(list(model.children())) > 0:
            # if n in [str(i) for i in range(32)]: 
            #     print("replaced:", n)
            recursive_children(c)

In [65]:
recursive_children(model)

skipped 10 <class 'transformers.models.llama.modeling_llama.LlamaDecoderLayer'>
skipped 21 <class 'transformers.models.llama.modeling_llama.LlamaDecoderLayer'>
skipped lm_head <class 'torch.nn.modules.linear.Linear'>


### ORCA-Math

In [42]:
import datasets
from torch.utils.data import Dataset
import pandas as pd

In [10]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

In [46]:
ds = datasets.load_dataset("microsoft/orca-math-word-problems-200k")['train']

In [47]:
ds = ds.shuffle(seed=42)

In [15]:
ntrain = 10000
train_ds = ds.select(range(0, ntrain))
valid_ds = ds.select(range(ntrain, len(ds)))

In [17]:
train_ds, valid_ds

(Dataset({
     features: ['question', 'answer'],
     num_rows: 10000
 }),
 Dataset({
     features: ['question', 'answer'],
     num_rows: 190035
 }))

In [18]:
train_ds[0]

{'question': 'Sally had 13 peaches at her roadside fruit dish.  She went to the orchard and picked peaches to stock up. She picked 55 peaches. There are _____ peaches now.',
 'answer': 'Sally originally had 13 peaches. She picked 55 more peaches. To find out the total number of peaches she has now, we add the two amounts together:\n\n13 (original peaches) + 55 (picked peaches) = 68 peaches\n\nSo, there are 68 peaches now.'}

In [34]:
class InstructionDataset(Dataset):
    def __init__(self, dataset, tokenizer, style="alpaca"):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.style = style

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        IGNORE_INDEX = -100  # The default setting in CrossEntropyLoss
        if self.style == "guanaco":
            prompt = self.dataset[index]["text"].split("### Assistant: ")[0]
            example = self.dataset[index]["text"]
        elif self.style == "qna":
            prompt_template = "###Context:\n{context}\n###Question:\n{question}\n###Answer:\n"
            sample = self.dataset[index]
            prompt = prompt_template.format_map(sample)
            example = prompt + sample['answer']
        elif self.style == "qna_no_ctx":
            prompt_template = "###Question:\n{question}\n###Answer:\n"
            sample = self.dataset[index]
            prompt = prompt_template.format_map(sample)
            example = prompt + sample['answer']            
        else: # Alpaca
            ann = self.dataset[index]
            if ann.get("input", "") == "":
                prompt = PROMPT_DICT["prompt_no_input"].format_map(ann)
            else:
                prompt = PROMPT_DICT["prompt_input"].format_map(ann)
            example = prompt + ann["output"]

        prompt = torch.tensor(
            self.tokenizer.encode(prompt), dtype=torch.int64
        )
        example = self.tokenizer.encode(example)
        example.append(self.tokenizer.eos_token_id)
        example = torch.tensor(
            example, dtype=torch.int64
        )
        labels = copy.deepcopy(example)
        labels[: len(prompt)] = -1
        example_mask = example.ge(0)
        label_mask = labels.ge(0)
        example[~example_mask] = 0
        labels[~label_mask] = IGNORE_INDEX

        return {
            "input_ids": example.tolist(),
            "labels": labels.tolist(),
            "attention_mask":example_mask.tolist(),
        }

In [35]:
train_dataset = InstructionDataset(train_ds, tokenizer, style="qna_no_ctx")

In [41]:
seqlens = [len(d['input_ids'] + d['labels']) for d in train_dataset]

In [45]:
pd.value_counts(seqlens, bins=10).sort_index()

  pd.value_counts(seqlens, bins=10).sort_index()


(65.425, 325.4]      844
(325.4, 582.8]      2189
(582.8, 840.2]      2846
(840.2, 1097.6]     2114
(1097.6, 1355.0]    1203
(1355.0, 1612.4]     503
(1612.4, 1869.8]     202
(1869.8, 2127.2]      70
(2127.2, 2384.6]      20
(2384.6, 2642.0]       9
Name: count, dtype: int64