Evaluations with zero-shot, 5-shot, full fine-tune, merged LoRA, merged DoRA.

In [1]:
import torch
from pathlib import Path
import os, json
from safetensors.torch import save_file
import copy
from tqdm import tqdm
import safetensors
import safetensors.torch
from glob import glob
from transformers import AutoConfig, AutoTokenizer
from transformers.utils import hub, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
from hqq.core.quantize import HQQLinear, HQQBackend, BaseQuantizeConfig, Quantizer

from hqq.core.quantize import HQQLinear, BaseQuantizeConfig, Quantizer
from hqq.backends.torchao import patch_hqq_to_aoint4
from fastcore.script import *


### Evaluate

In [2]:
import re
from datasets import load_dataset
from fastcore.parallel import parallel
from vllm import LLM, SamplingParams

In [3]:
def extract_last_number_or_ratio(s):
    """Return the final numeric token that appears in ``s``, or ``None``.

    A token is a run of digits with an optional decimal part, optionally
    preceded by one of ``$``, ``€``, ``£`` and optionally followed by
    ``:<number>`` (a ratio such as ``3:4``). The matched text is returned
    unchanged, so a leading currency symbol is kept in the result.

    Signs and thousands separators are not part of the pattern: ``"-5"``
    yields ``"5"`` and ``"1,200"`` yields ``"200"``.
    """
    number_or_ratio = r'[\$€£]?\d+(?:\.\d+)?(?:\:\d+(?:\.\d+)?)?'

    # Walk every match and keep only the most recent one.
    last_match = None
    for last_match in re.finditer(number_or_ratio, s):
        pass

    if last_match is None:
        return None
    return last_match.group(0)

In [4]:
def exact_match_score(preds, labels):
    """Fraction of predictions that are equal to their paired label.

    Args:
        preds: Sized sequence of predicted answers.
        labels: Iterable of reference answers, paired with ``preds`` by position.

    Returns:
        Number of positions where ``pred == label`` divided by ``len(preds)``,
        or ``0.0`` when ``preds`` is empty (instead of raising
        ``ZeroDivisionError``).

    Notes:
        * Pairing uses ``zip``, so extra entries in the longer argument are
          ignored; the denominator is always ``len(preds)``.
        * Comparison is plain ``==``, so a ``None`` prediction paired with a
          ``None`` label counts as a match.
    """
    total = len(preds)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    return sum(p == g for p, g in zip(preds, labels)) / total

In [5]:
# Shuffle the train split with a fixed seed, then keep only its last 5000 rows.
# NOTE(review): presumably these rows are held out from fine-tuning — confirm.
dataset = load_dataset("microsoft/orca-math-word-problems-200k")['train'].shuffle(seed=42)
dataset = dataset.select(range(len(dataset)-5000,len(dataset)))
# Reduce each reference answer to its final number/ratio token (None when it has none).
short_answers_gt = parallel(extract_last_number_or_ratio, dataset['answer'], progress=True)

In [6]:
# Evaluation subset: the first 500 rows of the 5000-row slice held in `dataset`.
valid_dataset = dataset.select(range(500))

In [7]:
# One plain-text prompt per question in the "###Question / ###Answer" layout (no chat template).
inputs = [f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset['question']]

In [8]:
# Reference answers for the first 500 rows, i.e. the rows selected into valid_dataset.
# Assumes parallel() returned results in input order — TODO confirm.
labels = short_answers_gt[:500]

In [9]:
# Sanity check: there should be as many prompts as labels.
len(inputs), len(labels)

(500, 500)

In [10]:
# Number of CUDA devices visible to torch; used as tensor_parallel_size for the vLLM engines.
NUM_GPUS = torch.cuda.device_count(); NUM_GPUS

1

In [11]:
# Hub id of the tokenizer used to build chat prompts and passed to the fine-tuned vLLM engine.
TOKENIZER = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [12]:
# Tokenizer instance; its chat template and eos_token_id are used by the prompt/generation cells.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

In [13]:
def convert_to_chat_input(question):
    """Render ``question`` as a chat prompt string ready for generation.

    The conversation is a fixed system message followed by ``question`` as the
    user turn. It is passed through the module-level ``tokenizer``'s chat
    template with ``tokenize=False`` and ``add_generation_prompt=True``, and
    the result of that call is returned as-is.
    """
    system_turn = {
        "role": "system",
        "content": "You are an AI assistant that excels in solving math problems.",
    }
    user_turn = {"role": "user", "content": question}
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

In [14]:
# Chat-template version of every evaluation question, in valid_dataset order.
chat_inputs = [convert_to_chat_input(question) for question in valid_dataset['question']]

In [15]:
# Sanity check: one chat prompt per evaluation question.
len(chat_inputs)

500

#### FINETUNED 

In [16]:
# Root directory that holds the fine-tuned checkpoints; MODEL_NAME is joined onto it.
# Override with the MODEL_DIR environment variable instead of editing this cell when
# the checkpoints live somewhere else; the default is the original location.
model_dir = os.environ.get("MODEL_DIR", "/workspace/models/")

In [17]:
# TODO: Don't cast to float16 for bitblas

In [18]:
# Checkpoint to evaluate, given as a path relative to model_dir. Leave exactly one
# MODEL_NAME assignment uncommented. The number at the end of each line is a
# previously recorded score for that checkpoint (presumably exact match from the
# scoring cell of this section — confirm before quoting).

# 4bit HQQ+DORA
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit/step_250/vllm_tinygemm" # 0.593
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit/step_1000/vllm_tinygemm" # 0.610

# 4/2bit HQQ+DORA
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit/step_250/vllm_bitblas" # 0.290
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit/step_1000/vllm_bitblas" # 0.380

# 4/2bit HQQ+DORA (merged)
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit/step_250/merged" # 0.242
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit/step_1000/merged" # 0.350

# 4bit HQQ+DORA+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit-ln/step_250/vllm_tinygemm" # 0.562
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4bit-ln/step_1000/vllm_tinygemm" # 0.585

# 4bit HQQ+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4bit-ln/step_250/vllm_tinygemm" # 0.605
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4bit-ln/step_1000/vllm_tinygemm" # 0.55

# 4/2bit HQQ+DORA+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_250/vllm_bitblas" # 0.270
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_1000/vllm_bitblas" # 0.42

# 4/2bit HQQ+DORA+LN (merged)
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_250/merged" # 0.270
MODEL_NAME = "llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_1000/merged" # 0.343

# 4(HQQ)bit 2(HQQ+DORA)bit
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-dora-2bit/step_250/vllm_bitblas" # 0.257
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-dora-2bit/step_1000/vllm_bitblas" # 0.330

# 4(HQQ)bit 2(HQQ+DORA)bit+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-dora-2bit-ln/step_250/vllm_bitblas" # 0.265
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-dora-2bit-ln/step_1000/vllm_bitblas" # 0.375

# 4(HQQ)bit 2(HQQ)bit+LN
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-hqq-2bit-ln/step_250/vllm_bitblas" # 0.02
# MODEL_NAME = "llama-3-1-8b-instruct-hqq-4-hqq-2bit-ln/step_1000/vllm_bitblas" # 0.03


In [19]:
# Build the vLLM engine for the selected checkpoint under model_dir. The tokenizer is
# loaded from TOKENIZER rather than from the checkpoint directory, weights are run in
# bfloat16, and the context length is capped at 8192 tokens.
# NOTE(review): the commented-out quantization="bitblas" argument is presumably required
# for the "vllm_bitblas" checkpoints — confirm before evaluating those.
llm = LLM(model=os.path.join(model_dir,MODEL_NAME), 
          tokenizer=TOKENIZER, 
          tensor_parallel_size=NUM_GPUS, 
          max_model_len=8192,
        #   quantization="bitblas",
          dtype="bfloat16"
          )

INFO 08-20 14:46:49 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='/workspace/models/llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_1000/merged', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/workspace/models/llama-3-1-8b-instruct-hqq-dora-4-2bit-ln/step_1000/merged, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 08-20 14:46:

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 08-20 14:46:53 model_runner.py:732] Loading model weights took 14.9888 GB
INFO 08-20 14:46:54 gpu_executor.py:102] # GPU blocks: 27947, # CPU blocks: 2048
INFO 08-20 14:46:56 model_runner.py:1019] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-20 14:46:56 model_runner.py:1023] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-20 14:47:07 model_runner.py:1220] Graph capturing finished in 11 secs.


In [20]:
# base model
# outputs = llm.generate(inputs, SamplingParams(temperature=0.0, max_tokens=1024))

In [21]:
# chat model
# Greedy decoding (temperature 0.0) on the first 128 chat prompts only, capped at
# 1024 new tokens and stopped at the "<|eot_id|>" string. Scores derived from
# `outputs` therefore cover 128 questions, not the full 500-question set.
outputs = llm.generate(chat_inputs[:128], SamplingParams(temperature=0.0, max_tokens=1024, stop=["<|eot_id|>"]))

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 128/128 [00:20<00:00,  6.32it/s, est. speed input: 684.14 toks/s, output: 1828.81 toks/s] 


In [22]:
# Take the first completion of each request and reduce it to its final number/ratio token.
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]

In [23]:
# `outputs` may cover only a prefix of the evaluation prompts, so score against the
# matching prefix of `labels` explicitly instead of relying on zip() silently
# dropping the surplus labels. Assumes llm.generate returns outputs in prompt order.
exact_match_score(short_answers_pred, labels[:len(short_answers_pred)])

0.34375

#### N-SHOT

In [None]:
# Baseline (not fine-tuned) model for the zero-/few-shot runs, loaded straight from the hub id.
# NOTE(review): this id is Llama-3, while TOKENIZER (used to build chat_inputs) is
# Llama-3.1 — confirm the mismatch is intended for the baseline numbers.
# NOTE(review): this rebinds `llm` and MODEL_NAME while the engine created in the
# FINETUNED section may still be alive; a second engine might not fit on one GPU —
# confirm, or restart the kernel before running this section.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
llm = LLM(model=MODEL_NAME, tensor_parallel_size=NUM_GPUS, dtype="bfloat16")

In [17]:
# zero-shot
# Plain "###Question / ###Answer" prompts (no chat template) for all evaluation
# questions, greedy decoding capped at 1024 new tokens. The last expression is the
# exact-match score shown as the cell output.
inputs = [f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset['question']]
outputs = llm.generate(inputs, SamplingParams(temperature=0.0, max_tokens=1024))
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [01:20<00:00,  6.18it/s]


0.228

In [17]:
# zero-shot (instruct)
# Same evaluation on the chat-template prompts. No explicit stop string is passed
# here, so stopping relies on vLLM's defaults and the 1024-token cap.
outputs = llm.generate(chat_inputs, SamplingParams(temperature=0.0, max_tokens=1024))
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [01:07<00:00,  7.41it/s]


0.454

In [None]:
# 5-shot
# The demonstrations are the last 5 rows of `dataset`, each rendered as a
# question/answer pair and terminated with the ad-hoc "<stop>" marker.
few_shot_examples = [f"###Question:\n{ex['question']}\n###Answer:\n{ex['answer']}<stop>" for ex in 
                     dataset.select(range(len(dataset)-5,len(dataset)))]
few_shot_prompt = "\n\n".join(few_shot_examples)
# Every evaluation prompt is the shared demonstration block followed by the new question.
inputs = [few_shot_prompt + "\n\n" + f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset['question']]
# Generation stops on the tokenizer's EOS token id or when the model emits "<stop>".
outputs = llm.generate(inputs, SamplingParams(temperature=0.0, 
                                              stop_token_ids=[tokenizer.eos_token_id], 
                                              stop=["<stop>"], 
                                              max_tokens=1024))
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
exact_match_score(short_answers_pred, labels)

Processed prompts:   9%|▉         | 47/500 [01:06<02:45,  2.73it/s] 

In [30]:
# Demonstrations for the chat few-shot run: the last 5 rows of `dataset`, each rendered
# as a "###Question / ###Answer" pair ending with the "<stop>" marker.
few_shot_examples = [f"###Question:\n{ex['question']}\n###Answer:\n{ex['answer']}<stop>" for ex in 
                     dataset.select(range(len(dataset)-5,len(dataset)))]

In [32]:
# Single text block holding all demonstrations, separated by blank lines.
few_shot_prompt = "\n\n".join(few_shot_examples)

In [37]:
def fewshot_chat_input(question, answer=None):
    """Render ``question`` as a chat prompt whose system turn carries the few-shot examples.

    The system message is the fixed instruction sentence followed by the
    module-level ``few_shot_prompt``; ``question`` becomes the user turn. The
    conversation is rendered by the module-level ``tokenizer``'s chat template
    with ``tokenize=False`` and ``add_generation_prompt=True``.

    ``answer`` is accepted but not used by this function.
    """
    system_text = f"You are an AI assistant that excels in solving math problems. Here are few examples of math problems:\n\n{few_shot_prompt}"
    conversation = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": question},
    ]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

In [38]:
# Few-shot chat prompt for every evaluation question, in valid_dataset order.
fewshot_chat_inputs = [fewshot_chat_input(question)  for question in valid_dataset['question']]

In [40]:
# Few-shot chat evaluation: greedy decoding capped at 1024 new tokens, stopping on the
# tokenizer's EOS token id or the "<|eot_id|>" string. The last expression is the
# exact-match score shown as the cell output.
outputs = llm.generate(fewshot_chat_inputs, SamplingParams(temperature=0.0, 
                                              stop_token_ids=[tokenizer.eos_token_id], 
                                              stop=["<|eot_id|>"],
                                              max_tokens=1024))
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [02:32<00:00,  3.27it/s]


0.452