In [1]:
import re
import torch
import numpy as np
from datasets import load_dataset
from fastcore.parallel import parallel
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

In [2]:
def convert_to_chat_input(question):
    """Render one multiple-choice question as a chat-formatted prompt string.

    The question becomes the user turn of a two-message conversation whose
    system turn tells the model to reply with the answer letter only. The
    conversation is rendered with the notebook-level `tokenizer`, which must
    already be defined by the time this function is called.

    Args:
        question: Text of the question; used unchanged as the user message.

    Returns:
        Whatever `tokenizer.apply_chat_template` returns when called with
        `tokenize=False` and `add_generation_prompt=True`.
    """
    # Two adjacent literals spell out the original two-line instruction; the
    # space in front of the newline belongs to the original text and is kept.
    system_prompt = (
        "You are an AI assistant, answering multiple choice questions. \n"
        "Only output the letter (A,B,C,D,E,etc..) of the answer and nothing else."
    )
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=question),
    ]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

In [3]:
# Tokenizer of the instruct model; `convert_to_chat_input` reads this global
# to apply the chat template.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Count of CUDA devices reported by torch; passed as `tensor_parallel_size`
# when the vLLM engines are created further down.
NUM_GPUS = torch.cuda.device_count()
NUM_GPUS

1

In [5]:
# Load the benchmark and keep only its `dharma_g1i5_shuffled` entry.
eval_ds = load_dataset("pharaouk/dharma-2")["dharma_g1i5_shuffled"]

In [6]:
# Distinct gold answers present in the `output` column.
set(eval_ds["output"])

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'}

In [7]:
# Inspect a single record (index 11) to see which fields a row carries.
eval_ds[11]

{'input': 'The following are multiple choice questions (with answers) about  conceptual physics.\n\nA magnetic force can act on an electron even when it\nA. is at rest\nB. moves parallel to magnetic field lines\nC. Both of these\nD. Neither of these\nAnswer:',
 'output': 'D',
 'subject': 'MMLU'}

In [8]:
# Print the same record's question so the embedded newlines render as line breaks.
print(eval_ds[11]["input"])

The following are multiple choice questions (with answers) about  conceptual physics.

A magnetic force can act on an electron even when it
A. is at rest
B. moves parallel to magnetic field lines
C. Both of these
D. Neither of these
Answer:


In [9]:
# One chat-formatted prompt per question, in dataset order.
prompts = list(map(convert_to_chat_input, eval_ds["input"]))

### llama-3-8b-instruct

This is the baseline llama chat model.

`acc:0.59`

In [10]:
# Baseline engine: the instruct checkpoint in bfloat16 with no quantization
# argument, using as many tensor-parallel ranks as there are GPUs.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
llm = LLM(
    model=MODEL_NAME,
    dtype="bfloat16",
    tensor_parallel_size=NUM_GPUS,
)

INFO 06-10 16:10:48 llm_engine.py:103] Initializing an LLM engine (v0.4.2) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-10 16:10:49 selector.py:37] Using FlashAttention-2 backend.
INFO 06-10 16:10:50 weight_utils.py:199] Using model weights format ['*.safetensors']
INFO 06-10 16:10:54 model_runner.py:145] Loading model weights took 14.9595 GB
INFO 06-10 16:10:56 gpu_executor.py:83] # GPU blocks: 11780, # CPU blocks: 2048
INFO 06-10 16:10:58 model_runner.py:818] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-10 16:10:58 model_runner.py:822] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-10 16:11:05 model_runner.py:888] Graph capturing finished in 7 secs.


In [11]:
# Zero temperature and a single generated token: the system prompt asks for
# the answer letter and nothing else.
outputs = llm.generate(
    prompts,
    SamplingParams(
        temperature=0.0,
        max_tokens=1,
        stop=["<|eot_id|>"],
    ),
)

Processed prompts: 100%|██████████| 300/300 [00:06<00:00, 44.27it/s, Generation Speed: 44.27 toks/s]


In [12]:
# Take the text of the first completion for every prompt. Surrounding
# whitespace is stripped so that a completion such as " A" or "A\n" still
# equals the bare gold letter in the exact-match comparison that follows.
preds = [o.outputs[0].text.strip() for o in outputs]

In [13]:
# Exact-match accuracy: share of predictions identical to the gold answer.
acc = np.mean([
    pred == gold
    for pred, gold in zip(preds, eval_ds["output"])
])
acc

0.59

### llama-3-8b-instruct-hqq

This is the model with just HQQ quantization.

`acc:0.6`

In [10]:
# Engine for the HQQ-only checkpoint stored locally. The tokenizer is taken
# from the original instruct model rather than from the checkpoint directory.
model_dir = "/workspace/models/llama-3-8b-instruct-hqq-dora-plus-plus-only-hqq-vllm"
llm = LLM(
    model=model_dir,
    tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
    quantization="torchao",
    dtype="bfloat16",
    tensor_parallel_size=NUM_GPUS,
    gpu_memory_utilization=0.9,
    enforce_eager=False,  # leaves CUDA-graph capture on (see the engine log)
)

INFO 06-10 16:00:06 llm_engine.py:103] Initializing an LLM engine (v0.4.2) with config: model='/workspace/models/llama-3-8b-instruct-hqq-dora-plus-plus-only-hqq-vllm', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=torchao, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=/workspace/models/llama-3-8b-instruct-hqq-dora-plus-plus-only-hqq-vllm)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-10 16:00:07 selector.py:37] Using FlashAttention-2 backend.
INFO 06-10 16:00:12 model_runner.py:145] Loading model weights took 9.3189 GB
INFO 06-10 16:00:44 gpu_executor.py:83] # GPU blocks: 14326, # CPU blocks: 2048
INFO 06-10 16:00:46 model_runner.py:818] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-10 16:00:46 model_runner.py:822] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-10 16:01:58 model_runner.py:888] Graph capturing finished in 72 secs.


In [14]:
# Same decoding setup as for the baseline: zero temperature, one generated token.
outputs = llm.generate(
    prompts,
    SamplingParams(
        temperature=0.0,
        max_tokens=1,
        stop=["<|eot_id|>"],
    ),
)

Processed prompts: 100%|██████████| 300/300 [00:06<00:00, 44.42it/s, Generation Speed: 44.42 toks/s]


In [12]:
# Take the text of the first completion for every prompt. Surrounding
# whitespace is stripped so that a completion such as " A" or "A\n" still
# equals the bare gold letter in the exact-match comparison that follows.
preds = [o.outputs[0].text.strip() for o in outputs]

In [13]:
# Exact-match accuracy: share of predictions identical to the gold answer.
acc = np.mean([
    pred == gold
    for pred, gold in zip(preds, eval_ds["output"])
])
acc

0.6

### llama-3-8b-instruct-hqq-dora

This is the model with HQQ quantization plus DoRA fine-tuning on the HQQ++ dataset.

`acc:0.6`

In [10]:
# Engine for the locally stored HQQ + DoRA checkpoint. The tokenizer is taken
# from the original instruct model rather than from the checkpoint directory.
model_dir = "/workspace/models/llama-3-8b-instruct-hqq-dora-plus-plus-qdora-vllm/"
llm = LLM(
    model=model_dir,
    tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
    quantization="torchao",
    dtype="bfloat16",
    tensor_parallel_size=NUM_GPUS,
    gpu_memory_utilization=0.9,
    enforce_eager=False,  # leaves CUDA-graph capture on (see the engine log)
)

INFO 06-10 16:05:18 llm_engine.py:103] Initializing an LLM engine (v0.4.2) with config: model='/workspace/models/llama-3-8b-instruct-hqq-dora-plus-plus-qdora-vllm/', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=torchao, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=/workspace/models/llama-3-8b-instruct-hqq-dora-plus-plus-qdora-vllm/)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-10 16:05:19 selector.py:37] Using FlashAttention-2 backend.
INFO 06-10 16:05:38 model_runner.py:145] Loading model weights took 9.5822 GB


[rank0]:W0610 16:05:43.355000 139982587503744 torch/_dynamo/convert_frame.py:824] WON'T CONVERT dora_layer /workspace/git/vllm_fork/vllm/model_executor/layers/quantization/torchao.py line 474 
[rank0]:W0610 16:05:43.355000 139982587503744 torch/_dynamo/convert_frame.py:824] due to: 
[rank0]:W0610 16:05:43.355000 139982587503744 torch/_dynamo/convert_frame.py:824] Traceback (most recent call last):
[rank0]:W0610 16:05:43.355000 139982587503744 torch/_dynamo/convert_frame.py:824]   File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py", line 786, in _convert_frame
[rank0]:W0610 16:05:43.355000 139982587503744 torch/_dynamo/convert_frame.py:824]     result = inner_convert(
[rank0]:W0610 16:05:43.355000 139982587503744 torch/_dynamo/convert_frame.py:824]   File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py", line 400, in _convert_frame_assert
[rank0]:W0610 16:05:43.355000 139982587503744 torch/_dynamo/convert_frame.py:824]     return _compil

INFO 06-10 16:05:58 gpu_executor.py:83] # GPU blocks: 13866, # CPU blocks: 2048
INFO 06-10 16:05:59 model_runner.py:818] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-10 16:05:59 model_runner.py:822] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-10 16:07:14 model_runner.py:888] Graph capturing finished in 75 secs.


In [15]:
# Same decoding setup as for the other models: zero temperature, one generated token.
outputs = llm.generate(
    prompts,
    SamplingParams(
        temperature=0.0,
        max_tokens=1,
        stop=["<|eot_id|>"],
    ),
)

Processed prompts: 100%|██████████| 300/300 [00:11<00:00, 26.32it/s, Generation Speed: 26.32 toks/s]


In [12]:
# Take the text of the first completion for every prompt. Surrounding
# whitespace is stripped so that a completion such as " A" or "A\n" still
# equals the bare gold letter in the exact-match comparison that follows.
preds = [o.outputs[0].text.strip() for o in outputs]

In [13]:
# Exact-match accuracy: share of predictions identical to the gold answer.
acc = np.mean([
    pred == gold
    for pred, gold in zip(preds, eval_ds["output"])
])
acc

0.6033333333333334