Evaluates Llama-3-8B on Orca-Math word problems in five settings: zero-shot, 5-shot, full fine-tune, merged LoRA, and merged DoRA.

In [1]:
from pathlib import Path
import os, json
from safetensors.torch import save_file
import copy
from tqdm import tqdm
import safetensors
import safetensors.torch
from glob import glob
from transformers import AutoConfig, AutoTokenizer
from transformers.utils import hub, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
from hqq.core.quantize import HQQLinear, HQQBackend, BaseQuantizeConfig, Quantizer
from fastcore.script import *

import bitsandbytes as bnb
from bitsandbytes.nn.modules import Params4bit
import torch

### Evaluate

In [2]:
import re
from datasets import load_dataset
from fastcore.parallel import parallel
from vllm import LLM, SamplingParams

In [3]:
def extract_last_number_or_ratio(s):
    """Return the last number-like token found in `s`, or None if there is none.

    A token is a run of digits with an optional leading currency symbol
    ($, € or £), an optional decimal part, and an optional ":<number>" ratio
    suffix. The currency symbol, when present, is kept in the returned string.

    Note: the pattern does not treat "," as part of a number and does not
    capture a leading minus sign, so "1,234" yields "234" and "-5" yields "5".
    """
    number_pattern = r'[\$€£]?\d+(?:\.\d+)?(?:\:\d+(?:\.\d+)?)?'
    found = re.findall(number_pattern, s)
    # The final answer is taken to be whatever number appears last in the text.
    return found[-1] if found else None

In [4]:
def exact_match_score(preds, labels):
    """Return the fraction of predictions that equal their label exactly.

    Args:
        preds: sequence of predicted answers (compared with `==`).
        labels: sequence of reference answers, aligned with `preds`.

    Returns:
        A float in [0, 1]; 0.0 when there are no predictions.

    Raises:
        ValueError: if `preds` and `labels` have different lengths. Without
            this check `zip` would silently drop the unmatched tail and the
            score would be computed over misaligned or missing pairs.

    Note: comparison is plain equality, so a `None` prediction paired with a
    `None` label counts as a match.
    """
    preds, labels = list(preds), list(labels)
    if len(preds) != len(labels):
        raise ValueError(
            f"preds and labels must have the same length, got {len(preds)} and {len(labels)}"
        )
    if not preds:
        # Avoid ZeroDivisionError on an empty evaluation set.
        return 0.0
    return sum(p == g for p, g in zip(preds, labels)) / len(preds)

In [5]:
# Shuffle the train split with a fixed seed, then keep only its final 5000 rows
# as the evaluation pool.
dataset = load_dataset("microsoft/orca-math-word-problems-200k")['train']
dataset = dataset.shuffle(seed=42)
dataset = dataset.select(range(len(dataset) - 5000, len(dataset)))

# Reduce every reference answer in the pool to its last number/ratio token.
short_answers_gt = parallel(
    extract_last_number_or_ratio,
    dataset['answer'],
    progress=True,
)

In [6]:
# Evaluate on the first 500 rows of the 5000-row pool; `labels` takes the same
# [:500] slice of short_answers_gt, so the two must be kept in sync.
valid_dataset = dataset.select(range(500))

In [7]:
# Wrap each question in the ###Question / ###Answer template, leaving the
# answer section open for the model to complete.
inputs = [f"###Question:\n{question}\n###Answer:\n" for question in valid_dataset['question']]

In [8]:
# Reference short answers for the same first 500 rows selected into valid_dataset.
# Assumes parallel() returned results in input order — TODO confirm.
labels = short_answers_gt[:500]

In [9]:
# Sanity check: there should be exactly one label per prompt.
len(inputs), len(labels)

(500, 500)

In [10]:
# Number of CUDA devices visible to this process; passed to vLLM as
# tensor_parallel_size when the models are loaded.
NUM_GPUS = torch.cuda.device_count()
NUM_GPUS

4

In [11]:
# Hub id of the tokenizer: loaded directly via AutoTokenizer and also passed as
# `tokenizer=` when loading the fine-tuned checkpoints with vLLM.
TOKENIZER = "meta-llama/Meta-Llama-3-8B"

In [12]:
# Needed by the 5-shot cell, which uses tokenizer.eos_token_id as a stop token.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### FINETUNED 

In [12]:
# Directory holding the fine-tuned / merged checkpoints. Defaults to the
# original location but can be overridden with the MODEL_DIR environment
# variable so the notebook is not tied to one machine's filesystem layout.
model_dir = os.environ.get("MODEL_DIR", "/workspace/models/")

In [None]:
# Checkpoint to evaluate: uncomment exactly one line. The name is joined onto
# model_dir when the model is loaded.
# NOTE(review): the trailing numbers are presumably exact-match scores from
# earlier runs of this notebook — confirm before quoting them.
# MODEL_NAME = "llama-3-8b-orca-math-10k-full" # 0.4
# MODEL_NAME = "llama-3-8b-orca-math-10k-bnb-qlora-merged" # 0.276
# MODEL_NAME = "llama-3-8b-orca-math-10k-bnb-qdora-merged" # 0.458

MODEL_NAME = "llama-3-8b-orca-math-100k-bnb-qlora-merged"
# MODEL_NAME = "llama-3-8b-orca-math-100k-bnb-qdora-merged" # 0.558

In [None]:
# Load the selected local checkpoint with vLLM, sharded across all visible GPUs.
# The tokenizer is taken from the base model id rather than the checkpoint dir.
llm = LLM(
    model=os.path.join(model_dir, MODEL_NAME),
    tokenizer=TOKENIZER,
    tensor_parallel_size=NUM_GPUS,
    dtype="bfloat16",
)

In [15]:
# temperature=0.0 selects greedy decoding; each completion is capped at 1024 new tokens.
# NOTE(review): no stop string is set here, so generation presumably ends only
# at EOS or the token cap — confirm that is intended.
outputs = llm.generate(inputs, SamplingParams(temperature=0.0, max_tokens=1024))

Processed prompts: 100%|██████████| 500/500 [01:19<00:00,  6.33it/s]


In [16]:
# Reduce each generation to its last number/ratio token (None when the text has
# no digits). o.outputs[0] is the first completion returned for that prompt.
short_answers_pred = [extract_last_number_or_ratio(o.outputs[0].text) for o in outputs]

In [17]:
# Fraction of prompts whose extracted final number equals the reference's
# extracted final number (string equality).
exact_match_score(short_answers_pred, labels)

0.322

#### N-SHOT

In [13]:
# Base (not fine-tuned) model, loaded by its Hub id. This rebinds MODEL_NAME and
# llm, replacing whatever the FINETUNED section assigned to them.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B"
llm = LLM(
    model=MODEL_NAME,
    tensor_parallel_size=NUM_GPUS,
    dtype="bfloat16",
)

2024-05-02 12:10:55,912	INFO worker.py:1749 -- Started a local Ray instance.


INFO 05-02 12:10:57 llm_engine.py:98] Initializing an LLM engine (v0.4.1) with config: model='meta-llama/Meta-Llama-3-8B', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-02 12:11:22 utils.py:608] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
[36m(RayWorkerWrapper pid=208091)[0m INFO 05-02 12:11:22 utils.py:608] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 05-02 12:11:23 selector.py:28] Using FlashAttention backend.
[36m(RayWorkerWrapper pid=208310)[0m INFO 05-02 12:11:27 selector.py:28] Using FlashAttention backend.
[36m(RayWorkerWrapper pid=208310)[0m INFO 05-02 12:11:22 utils.py:608] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
INFO 05-02 12:11:28 pynccl_utils.py:43] vLLM is using nccl==2.18.1
[36m(RayWorkerWrapper pid=208091)[0m INFO 05-02 12:11:28 pynccl_utils.py:43] vLLM is using nccl==2.18.1
INFO 05-02 1

In [17]:
# zero-shot: the bare prompt template with no worked examples
inputs = [
    f"###Question:\n{question}\n###Answer:\n"
    for question in valid_dataset['question']
]

# Greedy decoding, at most 1024 new tokens per prompt.
outputs = llm.generate(
    inputs,
    SamplingParams(temperature=0.0, max_tokens=1024),
)

# Keep only the last number/ratio from each completion, then score against labels.
short_answers_pred = [
    extract_last_number_or_ratio(completion.outputs[0].text)
    for completion in outputs
]
exact_match_score(short_answers_pred, labels)

Processed prompts: 100%|██████████| 500/500 [01:20<00:00,  6.18it/s]


0.228

In [None]:
# 5-shot: prepend five solved examples to every question.
# The examples are the last five rows of the 5000-row pool, so they do not
# overlap the first 500 rows used for evaluation. Each example ends with a
# literal "<stop>" marker that doubles as the stop string during generation.
few_shot_examples = [
    f"###Question:\n{ex['question']}\n###Answer:\n{ex['answer']}<stop>"
    for ex in dataset.select(range(len(dataset) - 5, len(dataset)))
]
few_shot_prompt = "\n\n".join(few_shot_examples)

inputs = [
    few_shot_prompt + "\n\n" + f"###Question:\n{question}\n###Answer:\n"
    for question in valid_dataset['question']
]

# Greedy decoding; stop at EOS or as soon as the model emits "<stop>".
outputs = llm.generate(
    inputs,
    SamplingParams(
        temperature=0.0,
        stop_token_ids=[tokenizer.eos_token_id],
        stop=["<stop>"],
        max_tokens=1024,
    ),
)

short_answers_pred = [
    extract_last_number_or_ratio(completion.outputs[0].text)
    for completion in outputs
]
exact_match_score(short_answers_pred, labels)

Processed prompts:   9%|▉         | 47/500 [01:06<02:45,  2.73it/s] 