Evaluations with zero-shot, 5-shot, full fine-tune, merged LoRA, merged DoRA.

In [30]:
import torch
from pathlib import Path
import os, json
from safetensors.torch import save_file
import copy
from tqdm import tqdm
import safetensors
import safetensors.torch
from glob import glob
from transformers import AutoConfig, AutoTokenizer
from transformers.utils import hub, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
from hqq.core.quantize import HQQLinear, HQQBackend, BaseQuantizeConfig, Quantizer

from hqq.core.quantize import HQQLinear, BaseQuantizeConfig, Quantizer
from hqq.backends.torchao import patch_hqq_to_aoint4
from fastcore.script import *

# import bitsandbytes as bnb
# from bitsandbytes.nn.modules import Params4bit
# import torch

In [4]:
# Load the fine-tuned state dict from the HQQ + LoRA run so its keys and shapes can be inspected.
weights = safetensors.torch.load_file("/workspace/models/llama-3-8b-instruct-orca-math-10k-hqq-lora-ln/model_state_dict.safetensors")

In [15]:
# Print the name and shape of every tensor that belongs to decoder layer 0.
for name, tensor in weights.items():
    if "layers.0" not in name:
        continue
    print(name, tensor.shape)

model.layers.0.input_layernorm.weight torch.Size([4096])
model.layers.0.mlp.down_proj.lora_A.weight torch.Size([64, 14336])
model.layers.0.mlp.down_proj.lora_B.weight torch.Size([4096, 64])
model.layers.0.mlp.gate_proj.lora_A.weight torch.Size([64, 4096])
model.layers.0.mlp.gate_proj.lora_B.weight torch.Size([14336, 64])
model.layers.0.mlp.up_proj.lora_A.weight torch.Size([64, 4096])
model.layers.0.mlp.up_proj.lora_B.weight torch.Size([14336, 64])
model.layers.0.post_attention_layernorm.weight torch.Size([4096])
model.layers.0.self_attn.k_proj.lora_A.weight torch.Size([64, 4096])
model.layers.0.self_attn.k_proj.lora_B.weight torch.Size([1024, 64])
model.layers.0.self_attn.o_proj.lora_A.weight torch.Size([64, 4096])
model.layers.0.self_attn.o_proj.lora_B.weight torch.Size([4096, 64])
model.layers.0.self_attn.q_proj.lora_A.weight torch.Size([64, 4096])
model.layers.0.self_attn.q_proj.lora_B.weight torch.Size([4096, 64])
model.layers.0.self_attn.v_proj.lora_A.weight torch.Size([64, 4096])

### Prepare Merged Weights

In [8]:
# Directory of the fine-tuned HQQ + LoRA run; the training settings are stored in its
# config.json under the "qlora_config" key.
model_dir = Path("/workspace/models/llama-3-8b-instruct-orca-math-10k-hqq-lora-ln")
# Read through Path.read_text so the file is closed immediately; the previous
# `json.load(open(...))` left the file handle open.
config_dict = json.loads((model_dir/"config.json").read_text())["qlora_config"]

In [9]:
# Display the parsed "qlora_config" section for inspection.
config_dict

{'lora_target_modules': ['q_proj',
  'k_proj',
  'v_proj',
  'o_proj',
  'gate_proj',
  'up_proj',
  'down_proj'],
 'compute_dtype': 'bfloat16',
 'lora_rank': 64,
 'lora_alpha': 16,
 'lora_dropout': 0.1,
 'layer_nbits': {'q_proj': 4,
  'k_proj': 4,
  'v_proj': 4,
  'o_proj': 4,
  'gate_proj': 4,
  'up_proj': 4,
  'down_proj': 4},
 'layer_groupsizes': {'q_proj': 64,
  'k_proj': 64,
  'v_proj': 64,
  'o_proj': 64,
  'gate_proj': 64,
  'up_proj': 64,
  'down_proj': 64},
 'skip_dora_all': False,
 'skip_dora_4bit': False,
 'train_layernorms': 1}

In [11]:
# Pull the settings used by the merge step out of the training config,
# falling back to the defaults given here when a key is absent.
lora_layers = config_dict.get("lora_target_modules", [])
layer_nbits = config_dict.get("layer_nbits", {})  # per-projection bit width, keyed by module name
layer_groupsizes = config_dict.get("layer_groupsizes", {})  # per-projection group size, keyed by module name
# Resolve the dtype name (e.g. "bfloat16") to the corresponding torch attribute.
dtype = getattr(torch, config_dict.get("compute_dtype", "bfloat16"))
lora_rank = config_dict.get("lora_rank", 64)
lora_alpha = config_dict.get("lora_alpha", 16)

In [13]:
# Locate the base model's safetensors shards through the Hugging Face hub helpers.
# `pretrained_files` is iterated later as the list of shard files to load.
# NOTE(review): presumably this downloads the files if they are not cached yet - confirm.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
idx = hub.cached_file(MODEL_NAME, SAFE_WEIGHTS_INDEX_NAME)
pretrained_files, _ = hub.get_checkpoint_shard_files(MODEL_NAME, idx)

In [14]:
# Fine-tuned state dict that the merge loop reads adapter and layer-norm tensors from.
# This is the same file that was loaded into `weights` above.
dora_weights = safetensors.torch.load_file(model_dir/"model_state_dict.safetensors")

In [16]:
# Display the module names that carry LoRA adapters.
lora_layers

['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']

In [36]:
# Here assume quantized layers are same as lora layers.
quantized_layers = lora_layers

In [37]:
# When layer norms were trained, collect their tensor names from the fine-tuned state
# dict so the merge loop can substitute them for the pretrained ones.
# `.get` matches how the other config fields are read and keeps configs written
# without a "train_layernorms" entry working (treated as "not trained").
if config_dict.get("train_layernorms", False):
    layernorm_layers = {k for k in dora_weights if "layernorm" in k}
else:
    layernorm_layers = set()

In [47]:
# Settings consumed by the merge/export loop.
existing_quantized_dir = None  # None -> quantize every target layer from the pretrained weights
bitblas_dtype = torch.half     # only read by the "bitblas" export path
args = {"infer_type": "merged_hqq_lora"}

In [48]:
# Display the LoRA rank and alpha; the merge step scales the update by alpha / rank.
lora_rank, lora_alpha

(64, 16)

In [57]:
# Output directory for the merged checkpoint; created (with parents) if it does not exist yet.
save_dir = Path("/workspace/models/llama-3-8b-instruct-orca-math-10k-hqq-lora-ln-merged-vllm")
save_dir.mkdir(parents=True, exist_ok=True)

In [58]:
# Build the exported checkpoint one pretrained shard at a time.
#
# For every tensor in a shard:
#   * weights of the target projections are HQQ-quantized and dequantized again (`W_est`),
#     then either merged with the trained LoRA update (infer_type "merged_hqq_lora") or
#     exported as packed weights plus DoRA factors ("tinygemm" / "bitblas");
#   * tensors named in `layernorm_layers` are replaced by their fine-tuned versions;
#   * "inv_freq" buffers are dropped and everything else is copied through in `dtype`.
# Each output shard is written to `save_dir` under the original shard file name.
#
# NOTE(review): the "bitblas" branch uses `bitblas`, `BITBLAS_OPT_M` and
# `_get_or_create_bitblas_operator`, none of which are defined in the visible cells.
# NOTE(review): nothing here loads weights from `existing_quantized_dir`; with a non-None
# value the asserts below look up bare keys such as "qweight", which this loop never inserts.

infer_type = args["infer_type"]
# The LoRA scaling factor is identical for every layer, so compute it once.
scaling = lora_alpha / lora_rank

for filename in pretrained_files:

    quantized_state_dict = {}
    file_shard_name = Path(filename).name
    pretrained_weights = safetensors.torch.load_file(filename)

    # Pass the items view itself (it has a length) so tqdm can report a total per shard.
    for n, p in tqdm(pretrained_weights.items(), desc=file_shard_name):
        if "inv_freq" in n: continue
        p = p.to(dtype)
        if any(l in n for l in quantized_layers) and "weight" in n:
            if existing_quantized_dir is None:
                # The second-to-last name component is the projection name (e.g. "q_proj"),
                # which keys the per-layer quantization settings.
                layer_key = n.split(".")[-2]
                NBITS = layer_nbits[layer_key]
                GROUPSIZE = layer_groupsizes[layer_key]

                # Get layer-wise quant config.
                quant_config = BaseQuantizeConfig(nbits=NBITS,
                                                  group_size=GROUPSIZE,
                                                  quant_zero=False,
                                                  quant_scale=False,
                                                  offload_meta=False,
                                                  view_as_float=False,
                                                  axis=1)

                # Prepare HQQ weights and quantize.
                # `p` is (out_features, in_features); Linear expects (in, out) arguments.
                linear = torch.nn.Linear(*p.T.shape, bias=False)
                linear.weight.data.copy_(p)
                hqq_linear = HQQLinear(linear, quant_config, compute_dtype=dtype)
                W_est = hqq_linear.dequantize()

                # Tinygemm weights.
                if infer_type == "tinygemm":
                    patched_hqq_linear = patch_hqq_to_aoint4(hqq_linear, None) # patching deletes `hqq_linear.W_q`.
                    quantized_state_dict[n.replace(".weight", ".qweight")] = patched_hqq_linear.weight_int4pack
                    quantized_state_dict[n.replace(".weight", ".scales_and_zeros")] = patched_hqq_linear.scales_and_zeros
                elif infer_type == "merged_hqq_lora":
                    # Nothing to export here: the merged dense weight is produced below.
                    pass
                # Bitblas weights.
                elif infer_type == "bitblas":
                    W_q_unpacked = Quantizer.unpack[hqq_linear.meta['packing']](hqq_linear.W_q)
                    scale, zero, shape = hqq_linear.meta['scale'], hqq_linear.meta['zero'], hqq_linear.meta['shape']
                    scale = scale.to(bitblas_dtype)
                    zero = zero.to(bitblas_dtype)

                    # BitBLAS engine.
                    print(f"Tuning BitBLAS for {hqq_linear.in_features}x{hqq_linear.out_features}")
                    matmul_config = bitblas.MatmulConfig(M=BITBLAS_OPT_M,
                                                         N=hqq_linear.out_features,
                                                         K=hqq_linear.in_features,
                                                         A_dtype="float16",
                                                         W_dtype={4:"uint4",2:"uint2"}[NBITS],
                                                         accum_dtype="float16",
                                                         out_dtype="float16",
                                                         layout="nt",
                                                         with_bias=False,
                                                         group_size=GROUPSIZE,
                                                         with_scaling=True,
                                                         with_zeros=True,
                                                         zeros_mode="original",
                                                         #fast_decoding=True,
                                                         )
                    matmul_eng_4bit = _get_or_create_bitblas_operator(matmul_config)

                    Wq_bitblas_4bit = matmul_eng_4bit.transform_weight(W_q_unpacked.reshape(shape))
                    meta_shape_bitblas = (hqq_linear.out_features, hqq_linear.in_features // GROUPSIZE)
                    scales_bitblas_4bit = scale.view(meta_shape_bitblas)
                    zeros_bitblas_4bit = zero.view(meta_shape_bitblas)

                    quantized_state_dict[n.replace(".weight", ".qweight")] = Wq_bitblas_4bit
                    quantized_state_dict[n.replace(".weight", ".scales")]  = scales_bitblas_4bit
                    quantized_state_dict[n.replace(".weight", ".zeros")]   = zeros_bitblas_4bit
                else:
                    raise ValueError("Invalid inference type.")
            else:
                assert "qweight" in quantized_state_dict
                if infer_type == "tinygemm":
                    assert "scales_and_zeros" in quantized_state_dict
                elif infer_type == "bitblas":
                    assert "scales" in quantized_state_dict and "zeros" in quantized_state_dict
                else:
                    raise ValueError("Invalid inference type.")

            if infer_type == "merged_hqq_lora":
                # Fold the LoRA update into the dequantized weight: W_est + (B @ A) * alpha / rank.
                print(f"Merging layer {n} with LORA")
                lora_a = dora_weights[n.replace(".weight",".lora_A.weight")].cuda()
                lora_b = dora_weights[n.replace(".weight",".lora_B.weight")].cuda()
                merged = W_est + (lora_b @ lora_a) * scaling
                # Cast to `dtype` so merged tensors match the copied ones in the shard even if
                # the adapters were saved in a different precision.
                quantized_state_dict[n] = merged.to(dtype).cpu()
                del W_est, merged; torch.cuda.empty_cache()
            else:
                # DoRA weights: keep the low-rank factors and a per-output-row rescale
                # (trained magnitude divided by the L2 norm of each row of W_est + B @ A).
                lora_a = dora_weights[n.replace(".weight",".dora_layer.lora_A.weight")].cuda()
                lora_b = dora_weights[n.replace(".weight",".dora_layer.lora_B.weight")].cuda()
                magnitude = dora_weights[n.replace(".weight",".magnitude_layer.magnitude")]
                rescale = magnitude / (W_est + lora_b @ lora_a).norm(p=2, dim=1).detach().cpu()
                lora_a, lora_b = lora_a.cpu(), lora_b.cpu()
                del W_est; torch.cuda.empty_cache()
                if infer_type == "bitblas":
                    lora_a = lora_a.to(bitblas_dtype)
                    lora_b = lora_b.to(bitblas_dtype)
                    rescale = rescale.to(bitblas_dtype)
                quantized_state_dict[n.replace(".weight", ".lora_A")] = lora_a
                quantized_state_dict[n.replace(".weight", ".lora_B")] = lora_b
                quantized_state_dict[n.replace(".weight", ".rescale")] = rescale

        elif n in layernorm_layers:
            # Use the fine-tuned layer norm, in the same dtype as the other tensors.
            quantized_state_dict[n] = dora_weights[n].to(dtype)
            print(f"Replacing {n}.")

        else:
            quantized_state_dict[n] = p
            print(f"Copying {n}.")

    # Save quantized state_dict.
    # The tensors are made contiguous before being handed to safetensors.
    quantized_state_dict = {k: v.contiguous() for k, v in quantized_state_dict.items()}
    safetensors.torch.save_file(quantized_state_dict, save_dir/file_shard_name)

1it [00:00,  7.73it/s]

Copying model.embed_tokens.weight.
Replacing model.layers.0.input_layernorm.weight.


3it [00:00,  5.52it/s]

Merging layer model.layers.0.mlp.down_proj.weight with LORA


4it [00:00,  3.69it/s]

Merging layer model.layers.0.mlp.gate_proj.weight with LORA


8it [00:01,  5.82it/s]

Merging layer model.layers.0.mlp.up_proj.weight with LORA
Replacing model.layers.0.post_attention_layernorm.weight.
Merging layer model.layers.0.self_attn.k_proj.weight with LORA
Merging layer model.layers.0.self_attn.o_proj.weight with LORA


9it [00:01,  5.09it/s]

Merging layer model.layers.0.self_attn.q_proj.weight with LORA
Merging layer model.layers.0.self_attn.v_proj.weight with LORA
Replacing model.layers.1.input_layernorm.weight.


12it [00:02,  5.54it/s]

Merging layer model.layers.1.mlp.down_proj.weight with LORA


13it [00:02,  4.33it/s]

Merging layer model.layers.1.mlp.gate_proj.weight with LORA


17it [00:03,  5.79it/s]

Merging layer model.layers.1.mlp.up_proj.weight with LORA
Replacing model.layers.1.post_attention_layernorm.weight.
Merging layer model.layers.1.self_attn.k_proj.weight with LORA
Merging layer model.layers.1.self_attn.o_proj.weight with LORA


18it [00:03,  6.01it/s]

Merging layer model.layers.1.self_attn.q_proj.weight with LORA
Merging layer model.layers.1.self_attn.v_proj.weight with LORA
Replacing model.layers.2.input_layernorm.weight.


21it [00:04,  5.09it/s]

Merging layer model.layers.2.mlp.down_proj.weight with LORA


22it [00:04,  4.12it/s]

Merging layer model.layers.2.mlp.gate_proj.weight with LORA


23it [00:05,  3.51it/s]

Merging layer model.layers.2.mlp.up_proj.weight with LORA
Replacing model.layers.2.post_attention_layernorm.weight.
Merging layer model.layers.2.self_attn.k_proj.weight with LORA


27it [00:05,  5.63it/s]

Merging layer model.layers.2.self_attn.o_proj.weight with LORA
Merging layer model.layers.2.self_attn.q_proj.weight with LORA
Merging layer model.layers.2.self_attn.v_proj.weight with LORA
Replacing model.layers.3.input_layernorm.weight.


30it [00:06,  5.80it/s]

Merging layer model.layers.3.mlp.down_proj.weight with LORA


31it [00:06,  4.49it/s]

Merging layer model.layers.3.mlp.gate_proj.weight with LORA


35it [00:07,  5.17it/s]

Merging layer model.layers.3.mlp.up_proj.weight with LORA
Replacing model.layers.3.post_attention_layernorm.weight.
Merging layer model.layers.3.self_attn.k_proj.weight with LORA
Merging layer model.layers.3.self_attn.o_proj.weight with LORA


36it [00:07,  5.44it/s]

Merging layer model.layers.3.self_attn.q_proj.weight with LORA
Merging layer model.layers.3.self_attn.v_proj.weight with LORA
Replacing model.layers.4.input_layernorm.weight.


39it [00:07,  5.76it/s]

Merging layer model.layers.4.mlp.down_proj.weight with LORA


40it [00:08,  4.52it/s]

Merging layer model.layers.4.mlp.gate_proj.weight with LORA


44it [00:09,  5.82it/s]

Merging layer model.layers.4.mlp.up_proj.weight with LORA
Replacing model.layers.4.post_attention_layernorm.weight.
Merging layer model.layers.4.self_attn.k_proj.weight with LORA
Merging layer model.layers.4.self_attn.o_proj.weight with LORA


45it [00:09,  5.98it/s]

Merging layer model.layers.4.self_attn.q_proj.weight with LORA
Merging layer model.layers.4.self_attn.v_proj.weight with LORA
Replacing model.layers.5.input_layernorm.weight.


48it [00:09,  6.05it/s]

Merging layer model.layers.5.mlp.down_proj.weight with LORA


49it [00:10,  4.63it/s]

Merging layer model.layers.5.mlp.gate_proj.weight with LORA


53it [00:10,  5.93it/s]

Merging layer model.layers.5.mlp.up_proj.weight with LORA
Replacing model.layers.5.post_attention_layernorm.weight.
Merging layer model.layers.5.self_attn.k_proj.weight with LORA
Merging layer model.layers.5.self_attn.o_proj.weight with LORA


54it [00:10,  6.11it/s]

Merging layer model.layers.5.self_attn.q_proj.weight with LORA
Merging layer model.layers.5.self_attn.v_proj.weight with LORA
Replacing model.layers.6.input_layernorm.weight.


57it [00:11,  6.13it/s]

Merging layer model.layers.6.mlp.down_proj.weight with LORA


58it [00:11,  4.66it/s]

Merging layer model.layers.6.mlp.gate_proj.weight with LORA


62it [00:12,  5.43it/s]

Merging layer model.layers.6.mlp.up_proj.weight with LORA
Replacing model.layers.6.post_attention_layernorm.weight.
Merging layer model.layers.6.self_attn.k_proj.weight with LORA
Merging layer model.layers.6.self_attn.o_proj.weight with LORA


63it [00:12,  5.67it/s]

Merging layer model.layers.6.self_attn.q_proj.weight with LORA
Merging layer model.layers.6.self_attn.v_proj.weight with LORA
Replacing model.layers.7.input_layernorm.weight.


66it [00:13,  5.89it/s]

Merging layer model.layers.7.mlp.down_proj.weight with LORA


67it [00:13,  4.58it/s]

Merging layer model.layers.7.mlp.gate_proj.weight with LORA


71it [00:14,  5.89it/s]

Merging layer model.layers.7.mlp.up_proj.weight with LORA
Replacing model.layers.7.post_attention_layernorm.weight.
Merging layer model.layers.7.self_attn.k_proj.weight with LORA
Merging layer model.layers.7.self_attn.o_proj.weight with LORA


72it [00:14,  6.13it/s]

Merging layer model.layers.7.self_attn.q_proj.weight with LORA
Merging layer model.layers.7.self_attn.v_proj.weight with LORA
Replacing model.layers.8.input_layernorm.weight.


75it [00:14,  6.17it/s]

Merging layer model.layers.8.mlp.down_proj.weight with LORA


76it [00:15,  4.70it/s]

Merging layer model.layers.8.mlp.gate_proj.weight with LORA


80it [00:16,  5.99it/s]

Merging layer model.layers.8.mlp.up_proj.weight with LORA
Replacing model.layers.8.post_attention_layernorm.weight.
Merging layer model.layers.8.self_attn.k_proj.weight with LORA
Merging layer model.layers.8.self_attn.o_proj.weight with LORA


82it [00:16,  5.06it/s]


Merging layer model.layers.8.self_attn.q_proj.weight with LORA
Merging layer model.layers.8.self_attn.v_proj.weight with LORA


1it [00:00,  6.05it/s]

Replacing model.layers.10.input_layernorm.weight.


2it [00:00,  3.23it/s]

Merging layer model.layers.10.mlp.down_proj.weight with LORA


3it [00:01,  2.46it/s]

Merging layer model.layers.10.mlp.gate_proj.weight with LORA


4it [00:01,  2.37it/s]

Merging layer model.layers.10.mlp.up_proj.weight with LORA
Replacing model.layers.10.post_attention_layernorm.weight.
Merging layer model.layers.10.self_attn.k_proj.weight with LORA


8it [00:01,  5.39it/s]

Merging layer model.layers.10.self_attn.o_proj.weight with LORA
Merging layer model.layers.10.self_attn.q_proj.weight with LORA
Merging layer model.layers.10.self_attn.v_proj.weight with LORA
Replacing model.layers.11.input_layernorm.weight.


11it [00:02,  5.78it/s]

Merging layer model.layers.11.mlp.down_proj.weight with LORA


12it [00:02,  4.42it/s]

Merging layer model.layers.11.mlp.gate_proj.weight with LORA


16it [00:03,  5.92it/s]

Merging layer model.layers.11.mlp.up_proj.weight with LORA
Replacing model.layers.11.post_attention_layernorm.weight.
Merging layer model.layers.11.self_attn.k_proj.weight with LORA
Merging layer model.layers.11.self_attn.o_proj.weight with LORA


17it [00:03,  6.12it/s]

Merging layer model.layers.11.self_attn.q_proj.weight with LORA
Merging layer model.layers.11.self_attn.v_proj.weight with LORA
Replacing model.layers.12.input_layernorm.weight.


20it [00:04,  6.11it/s]

Merging layer model.layers.12.mlp.down_proj.weight with LORA


21it [00:04,  4.62it/s]

Merging layer model.layers.12.mlp.gate_proj.weight with LORA


25it [00:05,  5.94it/s]

Merging layer model.layers.12.mlp.up_proj.weight with LORA
Replacing model.layers.12.post_attention_layernorm.weight.
Merging layer model.layers.12.self_attn.k_proj.weight with LORA
Merging layer model.layers.12.self_attn.o_proj.weight with LORA


26it [00:05,  6.11it/s]

Merging layer model.layers.12.self_attn.q_proj.weight with LORA
Merging layer model.layers.12.self_attn.v_proj.weight with LORA
Replacing model.layers.13.input_layernorm.weight.


29it [00:05,  5.89it/s]

Merging layer model.layers.13.mlp.down_proj.weight with LORA


30it [00:06,  4.35it/s]

Merging layer model.layers.13.mlp.gate_proj.weight with LORA


34it [00:06,  5.68it/s]

Merging layer model.layers.13.mlp.up_proj.weight with LORA
Replacing model.layers.13.post_attention_layernorm.weight.
Merging layer model.layers.13.self_attn.k_proj.weight with LORA
Merging layer model.layers.13.self_attn.o_proj.weight with LORA


35it [00:07,  5.90it/s]

Merging layer model.layers.13.self_attn.q_proj.weight with LORA
Merging layer model.layers.13.self_attn.v_proj.weight with LORA
Replacing model.layers.14.input_layernorm.weight.


38it [00:07,  5.99it/s]

Merging layer model.layers.14.mlp.down_proj.weight with LORA


39it [00:08,  4.60it/s]

Merging layer model.layers.14.mlp.gate_proj.weight with LORA


43it [00:08,  5.86it/s]

Merging layer model.layers.14.mlp.up_proj.weight with LORA
Replacing model.layers.14.post_attention_layernorm.weight.
Merging layer model.layers.14.self_attn.k_proj.weight with LORA
Merging layer model.layers.14.self_attn.o_proj.weight with LORA


44it [00:08,  6.09it/s]

Merging layer model.layers.14.self_attn.q_proj.weight with LORA
Merging layer model.layers.14.self_attn.v_proj.weight with LORA
Replacing model.layers.15.input_layernorm.weight.


47it [00:09,  6.08it/s]

Merging layer model.layers.15.mlp.down_proj.weight with LORA


48it [00:09,  4.63it/s]

Merging layer model.layers.15.mlp.gate_proj.weight with LORA


52it [00:10,  5.90it/s]

Merging layer model.layers.15.mlp.up_proj.weight with LORA
Replacing model.layers.15.post_attention_layernorm.weight.
Merging layer model.layers.15.self_attn.k_proj.weight with LORA
Merging layer model.layers.15.self_attn.o_proj.weight with LORA


53it [00:10,  6.09it/s]

Merging layer model.layers.15.self_attn.q_proj.weight with LORA
Merging layer model.layers.15.self_attn.v_proj.weight with LORA
Replacing model.layers.16.input_layernorm.weight.


56it [00:11,  6.03it/s]

Merging layer model.layers.16.mlp.down_proj.weight with LORA


57it [00:11,  4.48it/s]

Merging layer model.layers.16.mlp.gate_proj.weight with LORA


58it [00:12,  3.69it/s]

Merging layer model.layers.16.mlp.up_proj.weight with LORA
Replacing model.layers.16.post_attention_layernorm.weight.
Merging layer model.layers.16.self_attn.k_proj.weight with LORA


62it [00:12,  5.84it/s]

Merging layer model.layers.16.self_attn.o_proj.weight with LORA
Merging layer model.layers.16.self_attn.q_proj.weight with LORA
Merging layer model.layers.16.self_attn.v_proj.weight with LORA
Replacing model.layers.17.input_layernorm.weight.


65it [00:12,  5.94it/s]

Merging layer model.layers.17.mlp.down_proj.weight with LORA


66it [00:13,  4.57it/s]

Merging layer model.layers.17.mlp.gate_proj.weight with LORA


70it [00:13,  5.83it/s]

Merging layer model.layers.17.mlp.up_proj.weight with LORA
Replacing model.layers.17.post_attention_layernorm.weight.
Merging layer model.layers.17.self_attn.k_proj.weight with LORA
Merging layer model.layers.17.self_attn.o_proj.weight with LORA


71it [00:14,  6.03it/s]

Merging layer model.layers.17.self_attn.q_proj.weight with LORA
Merging layer model.layers.17.self_attn.v_proj.weight with LORA
Replacing model.layers.18.input_layernorm.weight.


74it [00:14,  6.06it/s]

Merging layer model.layers.18.mlp.down_proj.weight with LORA


75it [00:15,  4.61it/s]

Merging layer model.layers.18.mlp.gate_proj.weight with LORA


79it [00:15,  5.85it/s]

Merging layer model.layers.18.mlp.up_proj.weight with LORA
Replacing model.layers.18.post_attention_layernorm.weight.
Merging layer model.layers.18.self_attn.k_proj.weight with LORA
Merging layer model.layers.18.self_attn.o_proj.weight with LORA


80it [00:15,  6.05it/s]

Merging layer model.layers.18.self_attn.q_proj.weight with LORA
Merging layer model.layers.18.self_attn.v_proj.weight with LORA
Replacing model.layers.19.input_layernorm.weight.


83it [00:16,  6.05it/s]

Merging layer model.layers.19.mlp.down_proj.weight with LORA


84it [00:16,  4.61it/s]

Merging layer model.layers.19.mlp.gate_proj.weight with LORA


88it [00:17,  5.84it/s]

Merging layer model.layers.19.mlp.up_proj.weight with LORA
Replacing model.layers.19.post_attention_layernorm.weight.
Merging layer model.layers.19.self_attn.k_proj.weight with LORA
Merging layer model.layers.19.self_attn.o_proj.weight with LORA


89it [00:17,  6.03it/s]

Merging layer model.layers.19.self_attn.q_proj.weight with LORA
Merging layer model.layers.19.self_attn.v_proj.weight with LORA


93it [00:18,  6.23it/s]

Merging layer model.layers.20.mlp.gate_proj.weight with LORA
Merging layer model.layers.20.self_attn.k_proj.weight with LORA
Merging layer model.layers.20.self_attn.o_proj.weight with LORA


94it [00:18,  6.20it/s]

Merging layer model.layers.20.self_attn.q_proj.weight with LORA
Merging layer model.layers.20.self_attn.v_proj.weight with LORA
Replacing model.layers.9.input_layernorm.weight.


97it [00:18,  6.14it/s]

Merging layer model.layers.9.mlp.down_proj.weight with LORA


98it [00:19,  4.63it/s]

Merging layer model.layers.9.mlp.gate_proj.weight with LORA


99it [00:19,  3.75it/s]

Merging layer model.layers.9.mlp.up_proj.weight with LORA
Replacing model.layers.9.post_attention_layernorm.weight.
Merging layer model.layers.9.self_attn.k_proj.weight with LORA
Merging layer model.layers.9.self_attn.o_proj.weight with LORA


104it [00:20,  5.13it/s]


Merging layer model.layers.9.self_attn.q_proj.weight with LORA
Merging layer model.layers.9.self_attn.v_proj.weight with LORA


1it [00:00,  5.62it/s]

Replacing model.layers.20.input_layernorm.weight.


2it [00:00,  3.10it/s]

Merging layer model.layers.20.mlp.down_proj.weight with LORA


3it [00:01,  2.67it/s]

Merging layer model.layers.20.mlp.up_proj.weight with LORA
Replacing model.layers.20.post_attention_layernorm.weight.
Replacing model.layers.21.input_layernorm.weight.


6it [00:01,  4.48it/s]

Merging layer model.layers.21.mlp.down_proj.weight with LORA


7it [00:01,  3.70it/s]

Merging layer model.layers.21.mlp.gate_proj.weight with LORA


11it [00:02,  5.65it/s]

Merging layer model.layers.21.mlp.up_proj.weight with LORA
Replacing model.layers.21.post_attention_layernorm.weight.
Merging layer model.layers.21.self_attn.k_proj.weight with LORA
Merging layer model.layers.21.self_attn.o_proj.weight with LORA


12it [00:02,  5.95it/s]

Merging layer model.layers.21.self_attn.q_proj.weight with LORA
Merging layer model.layers.21.self_attn.v_proj.weight with LORA
Replacing model.layers.22.input_layernorm.weight.


15it [00:03,  6.04it/s]

Merging layer model.layers.22.mlp.down_proj.weight with LORA


16it [00:03,  4.58it/s]

Merging layer model.layers.22.mlp.gate_proj.weight with LORA


20it [00:04,  5.92it/s]

Merging layer model.layers.22.mlp.up_proj.weight with LORA
Replacing model.layers.22.post_attention_layernorm.weight.
Merging layer model.layers.22.self_attn.k_proj.weight with LORA
Merging layer model.layers.22.self_attn.o_proj.weight with LORA


21it [00:04,  6.14it/s]

Merging layer model.layers.22.self_attn.q_proj.weight with LORA
Merging layer model.layers.22.self_attn.v_proj.weight with LORA
Replacing model.layers.23.input_layernorm.weight.


24it [00:04,  6.13it/s]

Merging layer model.layers.23.mlp.down_proj.weight with LORA


25it [00:05,  4.68it/s]

Merging layer model.layers.23.mlp.gate_proj.weight with LORA


29it [00:05,  5.97it/s]

Merging layer model.layers.23.mlp.up_proj.weight with LORA
Replacing model.layers.23.post_attention_layernorm.weight.
Merging layer model.layers.23.self_attn.k_proj.weight with LORA
Merging layer model.layers.23.self_attn.o_proj.weight with LORA


30it [00:06,  6.16it/s]

Merging layer model.layers.23.self_attn.q_proj.weight with LORA
Merging layer model.layers.23.self_attn.v_proj.weight with LORA
Replacing model.layers.24.input_layernorm.weight.


33it [00:06,  6.17it/s]

Merging layer model.layers.24.mlp.down_proj.weight with LORA


34it [00:06,  4.75it/s]

Merging layer model.layers.24.mlp.gate_proj.weight with LORA


35it [00:07,  3.86it/s]

Merging layer model.layers.24.mlp.up_proj.weight with LORA
Replacing model.layers.24.post_attention_layernorm.weight.
Merging layer model.layers.24.self_attn.k_proj.weight with LORA
Merging layer model.layers.24.self_attn.o_proj.weight with LORA


39it [00:07,  6.11it/s]

Merging layer model.layers.24.self_attn.q_proj.weight with LORA
Merging layer model.layers.24.self_attn.v_proj.weight with LORA
Replacing model.layers.25.input_layernorm.weight.


42it [00:08,  6.20it/s]

Merging layer model.layers.25.mlp.down_proj.weight with LORA


43it [00:08,  4.73it/s]

Merging layer model.layers.25.mlp.gate_proj.weight with LORA


47it [00:09,  5.74it/s]

Merging layer model.layers.25.mlp.up_proj.weight with LORA
Replacing model.layers.25.post_attention_layernorm.weight.
Merging layer model.layers.25.self_attn.k_proj.weight with LORA
Merging layer model.layers.25.self_attn.o_proj.weight with LORA


48it [00:09,  5.95it/s]

Merging layer model.layers.25.self_attn.q_proj.weight with LORA
Merging layer model.layers.25.self_attn.v_proj.weight with LORA
Replacing model.layers.26.input_layernorm.weight.


51it [00:10,  5.98it/s]

Merging layer model.layers.26.mlp.down_proj.weight with LORA


52it [00:10,  4.64it/s]

Merging layer model.layers.26.mlp.gate_proj.weight with LORA


56it [00:11,  5.95it/s]

Merging layer model.layers.26.mlp.up_proj.weight with LORA
Replacing model.layers.26.post_attention_layernorm.weight.
Merging layer model.layers.26.self_attn.k_proj.weight with LORA
Merging layer model.layers.26.self_attn.o_proj.weight with LORA


57it [00:11,  6.12it/s]

Merging layer model.layers.26.self_attn.q_proj.weight with LORA
Merging layer model.layers.26.self_attn.v_proj.weight with LORA
Replacing model.layers.27.input_layernorm.weight.


60it [00:11,  6.13it/s]

Merging layer model.layers.27.mlp.down_proj.weight with LORA


61it [00:12,  4.68it/s]

Merging layer model.layers.27.mlp.gate_proj.weight with LORA


65it [00:12,  5.92it/s]

Merging layer model.layers.27.mlp.up_proj.weight with LORA
Replacing model.layers.27.post_attention_layernorm.weight.
Merging layer model.layers.27.self_attn.k_proj.weight with LORA
Merging layer model.layers.27.self_attn.o_proj.weight with LORA


66it [00:12,  6.11it/s]

Merging layer model.layers.27.self_attn.q_proj.weight with LORA
Merging layer model.layers.27.self_attn.v_proj.weight with LORA
Replacing model.layers.28.input_layernorm.weight.


69it [00:13,  6.09it/s]

Merging layer model.layers.28.mlp.down_proj.weight with LORA


70it [00:13,  4.66it/s]

Merging layer model.layers.28.mlp.gate_proj.weight with LORA


74it [00:14,  5.73it/s]

Merging layer model.layers.28.mlp.up_proj.weight with LORA
Replacing model.layers.28.post_attention_layernorm.weight.
Merging layer model.layers.28.self_attn.k_proj.weight with LORA
Merging layer model.layers.28.self_attn.o_proj.weight with LORA


75it [00:14,  5.96it/s]

Merging layer model.layers.28.self_attn.q_proj.weight with LORA
Merging layer model.layers.28.self_attn.v_proj.weight with LORA
Replacing model.layers.29.input_layernorm.weight.


78it [00:15,  5.94it/s]

Merging layer model.layers.29.mlp.down_proj.weight with LORA


79it [00:15,  4.59it/s]

Merging layer model.layers.29.mlp.gate_proj.weight with LORA


83it [00:16,  5.87it/s]

Merging layer model.layers.29.mlp.up_proj.weight with LORA
Replacing model.layers.29.post_attention_layernorm.weight.
Merging layer model.layers.29.self_attn.k_proj.weight with LORA
Merging layer model.layers.29.self_attn.o_proj.weight with LORA


84it [00:16,  6.07it/s]

Merging layer model.layers.29.self_attn.q_proj.weight with LORA
Merging layer model.layers.29.self_attn.v_proj.weight with LORA
Replacing model.layers.30.input_layernorm.weight.


87it [00:16,  6.05it/s]

Merging layer model.layers.30.mlp.down_proj.weight with LORA


88it [00:17,  4.65it/s]

Merging layer model.layers.30.mlp.gate_proj.weight with LORA


92it [00:18,  5.89it/s]

Merging layer model.layers.30.mlp.up_proj.weight with LORA
Replacing model.layers.30.post_attention_layernorm.weight.
Merging layer model.layers.30.self_attn.k_proj.weight with LORA
Merging layer model.layers.30.self_attn.o_proj.weight with LORA


93it [00:18,  6.08it/s]

Merging layer model.layers.30.self_attn.q_proj.weight with LORA
Merging layer model.layers.30.self_attn.v_proj.weight with LORA


95it [00:18,  5.21it/s]

Merging layer model.layers.31.mlp.gate_proj.weight with LORA


96it [00:19,  4.13it/s]

Merging layer model.layers.31.mlp.up_proj.weight with LORA
Merging layer model.layers.31.self_attn.k_proj.weight with LORA


100it [00:19,  5.06it/s]

Merging layer model.layers.31.self_attn.o_proj.weight with LORA
Merging layer model.layers.31.self_attn.q_proj.weight with LORA
Merging layer model.layers.31.self_attn.v_proj.weight with LORA



1it [00:00,  5.88it/s]

Copying lm_head.weight.
Replacing model.layers.31.input_layernorm.weight.


5it [00:00,  8.77it/s]

Merging layer model.layers.31.mlp.down_proj.weight with LORA
Replacing model.layers.31.post_attention_layernorm.weight.
Copying model.norm.weight.





In [60]:
# save model config.
model_config = AutoConfig.from_pretrained(MODEL_NAME).to_dict()
# save model config
# model_config['rope_scaling'] = {"type" :"dynamic", "factor": 2.0}
model_config_filename = save_dir/"config.json"
with open(model_config_filename, "w+") as f: json.dump(model_config, f)

### Evaluate

In [61]:
import re
from datasets import load_dataset
from fastcore.parallel import parallel
from vllm import LLM, SamplingParams

2024-08-20 11:42:22,139	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [62]:
def extract_last_number_or_ratio(s):
    """Return the last number-like token found in ``s``, or ``None`` if there is none.

    A token is a run of digits with an optional leading currency symbol
    (``$``, ``€`` or ``£``), an optional decimal part, and an optional
    ``:``-separated second number (a ratio such as ``3:4``). The token is
    returned as the matched substring, not converted to a numeric type.
    """
    matches = re.findall(r'[\$€£]?\d+(?:\.\d+)?(?:\:\d+(?:\.\d+)?)?', s)
    return matches[-1] if matches else None

In [63]:
def exact_match_score(preds, labels):
    """Return the fraction of predictions that are equal to their label.

    Predictions and labels are paired positionally with ``zip`` and the number
    of equal pairs is divided by ``len(preds)``.

    Args:
        preds: Sized sequence of predicted answers.
        labels: Sequence of reference answers, in the same order as ``preds``.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``preds`` is empty (instead of
        raising ``ZeroDivisionError``).
    """
    n_preds = len(preds)
    if n_preds == 0:
        return 0.0
    return sum(p == g for p, g in zip(preds, labels)) / n_preds

In [64]:
# Shuffle the train split with a fixed seed and keep its last 5000 rows as the held-out pool.
dataset = load_dataset("microsoft/orca-math-word-problems-200k")['train'].shuffle(seed=42)
dataset = dataset.select(range(len(dataset)-5000,len(dataset)))
# Reduce every reference answer to its last number/ratio token (run through fastcore's `parallel`).
short_answers_gt = parallel(extract_last_number_or_ratio, dataset['answer'], progress=True)

In [65]:
# Evaluate on the first 500 rows of the held-out pool.
valid_dataset = dataset.select(range(500))

In [66]:
# Plain-text prompts in the "###Question: / ###Answer:" format, one per evaluation question.
inputs = [f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset['question']]

In [67]:
# Reference answers for the evaluation subset. Slice by the subset's length rather than a
# second hard-coded 500 so the labels stay aligned with `valid_dataset` if its size changes.
labels = short_answers_gt[:len(valid_dataset)]

In [68]:
# Sanity check: there should be as many prompts as labels.
len(inputs), len(labels)

(500, 500)

In [69]:
# Number of CUDA devices reported by torch; shown as the cell output.
NUM_GPUS = torch.cuda.device_count()
NUM_GPUS

1

In [70]:
# Tokenizer checkpoint used for chat templating and passed to vLLM.
# The base-model option is kept as a commented alternative; previously it was assigned
# and then overwritten on the very next line.
# TOKENIZER = "meta-llama/Meta-Llama-3-8B"
TOKENIZER = "meta-llama/Meta-Llama-3-8B-Instruct"

In [71]:
# Tokenizer instance; `convert_to_chat_input` uses its chat template.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

In [72]:
def convert_to_chat_input(question):
    """Render ``question`` as a chat prompt string with a math-assistant system turn.

    Uses the notebook-level ``tokenizer`` and its chat template. The result is
    untokenized text that ends with the generation prompt, so the model
    continues with the assistant's reply.

    Args:
        question: The problem text placed in the user turn.

    Returns:
        The formatted prompt produced by ``tokenizer.apply_chat_template``.
    """
    system_turn = {"role": "system", "content": "You are an AI assistant that excels in solving math problems."}
    user_turn = {"role": "user", "content": question}
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

In [73]:
# Chat-template prompts for the evaluation questions, in the same order as valid_dataset.
chat_inputs = [convert_to_chat_input(question) for question in valid_dataset['question']]

In [74]:
# Sanity check: expect one chat prompt per evaluation question.
len(chat_inputs)

500

#### FINETUNED 

In [18]:
# Base directory that is joined with MODEL_NAME to locate a local checkpoint.
# NOTE(review): hard-coded absolute path; adjust for the local environment.
model_dir = "/workspace/models/"

In [75]:
# Checkpoint to evaluate. Only the last, uncommented MODEL_NAME assignment takes effect;
# the commented entries are other checkpoints kept for reference. The trailing numbers
# look like exact-match scores from those runs — TODO confirm.
# Entries written as absolute paths also work with os.path.join(model_dir, MODEL_NAME),
# because os.path.join discards earlier components when a later one is absolute.
# MODEL_NAME = "llama-3-8b-orca-math-10k-full" # 0.4
# MODEL_NAME = "llama-3-8b-orca-math-10k-bnb-qlora-merged" # 0.276
# MODEL_NAME = "llama-3-8b-orca-math-10k-bnb-qdora-merged" # 0.458

# MODEL_NAME = "llama-3-8b-orca-math-100k-bnb-qlora-merged"
# MODEL_NAME = "llama-3-8b-orca-math-100k-bnb-qdora-merged" # 0.558

# MODEL_NAME = "llama-3-8b-instruct-orca-math-10k-bnb-qlora-merged"
# MODEL_NAME = "llama-3-8b-instruct-orca-math-10k-bnb-qdora-merged"

# MODEL_NAME = "llama-3-8b-instruct-orca-math-10k-hqq-qdora-plus-merged"
# MODEL_NAME = "llama-3-8b-instruct-orca-math-10k-hqq-qdora-loftq-init-merged"
# MODEL_NAME = "llama-3-8b-instruct-orca-math-10k-hqq-qdora-plus-loftq-init-merged"

# MODEL_NAME = "/workspace/models/llama-3-8b-instruct-orca-math-10k-hqq-qlora-plus-merged"
# MODEL_NAME = "/workspace/models/llama-3-8b-instruct-orca-math-10k-hqq-qlora-loftq-init-merged"
# MODEL_NAME = "/workspace/models/llama-3-8b-instruct-orca-math-10k-hqq-qlora-plus-loftq-init-merged"

# MODEL_NAME = "/workspace/models/llama-3-8b-instruct-orca-math-10k-hqq-qdora-axis-1-merged"
# MODEL_NAME = "/workspace/models/llama-3-8b-instruct-orca-math-10k-full"
MODEL_NAME = "llama-3-8b-instruct-orca-math-10k-hqq-lora-ln-merged-vllm"

In [76]:
# Build the vLLM engine from the local checkpoint at model_dir/MODEL_NAME. The tokenizer is
# passed separately via TOKENIZER, weights are loaded in bfloat16, and tensor parallelism
# spans all visible GPUs.
llm = LLM(model=os.path.join(model_dir,MODEL_NAME), tokenizer=TOKENIZER, 
          tensor_parallel_size=NUM_GPUS, dtype="bfloat16")

INFO 08-20 11:43:05 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='/workspace/models/llama-3-8b-instruct-orca-math-10k-hqq-lora-ln-merged-vllm', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/workspace/models/llama-3-8b-instruct-orca-math-10k-hqq-lora-ln-merged-vllm, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 08-20 11:43:06

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:01,  2.03it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.71it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:01<00:00,  1.64it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  2.12it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.96it/s]


INFO 08-20 11:43:08 model_runner.py:732] Loading model weights took 14.9595 GB





INFO 08-20 11:43:10 gpu_executor.py:102] # GPU blocks: 28017, # CPU blocks: 2048
INFO 08-20 11:43:13 model_runner.py:1019] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-20 11:43:13 model_runner.py:1023] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-20 11:43:25 model_runner.py:1220] Graph capturing finished in 13 secs.


In [22]:
# base model: plain-completion prompts (kept for reference; uncomment to evaluate a non-chat checkpoint)
# outputs = llm.generate(inputs, SamplingParams(temperature=0.0, max_tokens=1024))

In [23]:
# chat model: greedy decoding (temperature=0.0) over the chat-template prompts, at most
# 1024 new tokens per prompt, stopping early at the "<|eot_id|>" marker.
outputs = llm.generate(chat_inputs, SamplingParams(temperature=0.0, max_tokens=1024, stop=["<|eot_id|>"]))

Processed prompts: 100%|██████████| 500/500 [01:26<00:00,  5.78it/s]


In [24]:
# Reduce each completion (first candidate only) to its last number/ratio string;
# entries are None when the generated text contains no number.
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]

In [25]:
# Fraction of the evaluation questions whose extracted answer equals the label string exactly.
exact_match_score(short_answers_pred, labels)

0.528

#### N-SHOT

In [None]:
# Baseline for the n-shot runs: the model identified by this Hugging Face id, with no fine-tuning.
# This rebinds both MODEL_NAME and llm.
# NOTE(review): a previously built `llm` is not released here; presumably this cell was run
# in a fresh kernel — confirm before running the notebook top to bottom.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
llm = LLM(model=MODEL_NAME, tensor_parallel_size=NUM_GPUS, dtype="bfloat16")

In [17]:
# zero-shot with the plain "###Question / ###Answer" prompt (no chat template)
inputs = [
    f"###Question:\n{question}\n###Answer:\n"
    for question in valid_dataset['question']
]
# Greedy decoding, capped at 1024 new tokens per prompt.
outputs = llm.generate(
    inputs,
    SamplingParams(max_tokens=1024, temperature=0.0),
)
short_answers_pred = [
    extract_last_number_or_ratio(completion.outputs[0].text)
    for completion in outputs
]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [01:20<00:00,  6.18it/s]


0.228

In [17]:
# zero-shot (instruct): same questions, rendered through the chat template
outputs = llm.generate(
    chat_inputs,
    SamplingParams(max_tokens=1024, temperature=0.0),
)
short_answers_pred = [
    extract_last_number_or_ratio(completion.outputs[0].text)
    for completion in outputs
]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [01:07<00:00,  7.41it/s]


0.454

In [None]:
# 5-shot with plain prompts: the last five rows of `dataset` serve as worked examples,
# each terminated by "<stop>" so generation can be cut off after one answer.
few_shot_examples = [
    f"###Question:\n{ex['question']}\n###Answer:\n{ex['answer']}<stop>"
    for ex in dataset.select(range(len(dataset)-5,len(dataset)))
]
few_shot_prompt = "\n\n".join(few_shot_examples)
inputs = [
    few_shot_prompt + "\n\n" + f"###Question:\n{question}\n###Answer:\n"
    for question in valid_dataset['question']
]
# Greedy decoding; stop on the tokenizer's EOS id or on the "<stop>" delimiter.
outputs = llm.generate(
    inputs,
    SamplingParams(
        temperature=0.0,
        max_tokens=1024,
        stop=["<stop>"],
        stop_token_ids=[tokenizer.eos_token_id],
    ),
)
short_answers_pred = [
    extract_last_number_or_ratio(completion.outputs[0].text)
    for completion in outputs
]
exact_match_score(short_answers_pred, labels)

Processed prompts:   9%|▉         | 47/500 [01:06<02:45,  2.73it/s] 

In [30]:
# Worked examples for few-shot prompting: the last five rows of `dataset`, each formatted as
# question + full reference answer and terminated by "<stop>".
few_shot_examples = [f"###Question:\n{ex['question']}\n###Answer:\n{ex['answer']}<stop>" for ex in 
                     dataset.select(range(len(dataset)-5,len(dataset)))]

In [32]:
# Join the worked examples into one block, separated by blank lines.
few_shot_prompt = "\n\n".join(few_shot_examples)

In [37]:
def fewshot_chat_input(question, answer=None):
    """Render ``question`` as a chat prompt whose system turn embeds the few-shot examples.

    Uses the notebook-level ``tokenizer`` and ``few_shot_prompt``. The result is
    untokenized text that ends with the generation prompt.

    Args:
        question: The problem text placed in the user turn.
        answer: Unused; accepted only so existing call signatures keep working.

    Returns:
        The formatted prompt produced by ``tokenizer.apply_chat_template``.
    """
    system_turn = {
        "role": "system",
        "content": f"You are an AI assistant that excels in solving math problems. Here are few examples of math problems:\n\n{few_shot_prompt}",
    }
    user_turn = {"role": "user", "content": question}
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

In [38]:
# Few-shot chat prompts for the evaluation questions, in the same order as valid_dataset.
fewshot_chat_inputs = [fewshot_chat_input(question)  for question in valid_dataset['question']]

In [40]:
# Few-shot (instruct): greedy decoding over the few-shot chat prompts; stop on the
# tokenizer's EOS id or on the "<|eot_id|>" marker.
outputs = llm.generate(
    fewshot_chat_inputs,
    SamplingParams(
        temperature=0.0,
        max_tokens=1024,
        stop=["<|eot_id|>"],
        stop_token_ids=[tokenizer.eos_token_id],
    ),
)
short_answers_pred = [
    extract_last_number_or_ratio(completion.outputs[0].text)
    for completion in outputs
]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [02:32<00:00,  3.27it/s]


0.452