## 3 Evaluating Locally deployed models

### 3.1 Load the (Quantized) model to a single GPU

In [1]:
!pip install flash-attn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [None]:
# !pip install bitsandbytes evaluate

In [4]:
import accelerate, bitsandbytes
import torch, os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from transformers import pipeline

os.environ["BNB_CUDA_VERSION"]="125"

## choose one of the local models.  Let's use a small one for faster evaluation.
model_path = '/ssdshare/share/Phi-3-mini-128k-instruct/'

# here is how you load a local model using haggingface api
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, 
                                             device_map="cuda:0", 
                                             torch_dtype="auto", 
                                             trust_remote_code=True) 


This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Verify that the model is loaded to GPU (look at the memory utilization).

In [5]:
# let's check the GPU memory utilization after loading the model
!nvidia-smi

Thu Apr 17 20:15:08 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off |   00000000:9A:00.0 Off |                  Off |
| 31%   35C    P0             62W /  450W |    7685MiB /  24564MiB |     26%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### 3.2 Generate responses locally

In [6]:
def chat_resp(model, tokenizer, question_list):
    # here is how you use the pipeline to generate local responses
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )   
    generation_args = {
        "max_new_tokens": 500,
        "return_full_text": False,
        "temperature": 0.6,
        "do_sample": True,
    }

    output = pipe(question_list, **generation_args)  # note that you send in a list of questions (faster)
    # here is how you get the response               # however if you send too many questions, it will run out of memory
    return output

def chat_resp_batched(model, tokenizer, question_list, batch_size=4):
    # Split a large question list into batches of the specified size, to avoid running out of memory
    batches = [question_list[i:i + batch_size] for i in range(0, len(question_list), batch_size)]
    all_responses = []
    for batch in batches:
        print(f"processing batch: %s " % batch)
        responses = chat_resp(model, tokenizer, batch)
        all_responses.extend(responses)
    return all_responses

def gsm8k_prompt(question):
    # add system prompt to the question
    chat = [
        {"role": "system", "content": """Please solve the given math problem by providing a detailed, step-by-step explanation. Begin by outlining each step involved in your solution, ensuring clarity and precision in your calculations. After you have worked through the problem, conclude your response by summarizing the solution and stating the final answer as a single exact numerical value on the last line. """},
        {"role": "user", "content": "Question: " + question},
    ]
    return chat

In [7]:
def chat_resp(model, tokenizer, question_list):
    responses = []
    
    for question in question_list:
        # Process chat format
        if isinstance(question, list) and all(isinstance(msg, dict) for msg in question):
            formatted_input = tokenizer.apply_chat_template(question, tokenize=False)
        else:
            formatted_input = question
            
        # Tokenize
        inputs = tokenizer(formatted_input, return_tensors="pt").to(model.device)
        
        # Generate with use_cache=False to avoid the DynamicCache error
        with torch.no_grad():
            generation_output = model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=500,
                temperature=0.6,
                do_sample=True,
                use_cache=False  # This avoids the DynamicCache.get_max_length() error
            )
            
        # Decode
        generated_text = tokenizer.decode(
            generation_output[0][inputs.input_ids.shape[1]:], 
            skip_special_tokens=True
        )
        
        responses.append([{"generated_text": generated_text}])
        
    return responses

## Test the model with a sample question
p = gsm8k_prompt("Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?")
p = [p]  # remember to send in a list of questions

chat_resp(model, tokenizer, p)  # p is the list of questions



You are not running the flash-attention implementation, expect numerical differences.


[[{'generated_text': "Problem=A person is able to see clearly only up to 35 cm. What is the power of the lens required to be worn by the person to correct this defect of the eye? Assume that the lens is to be a single convex lens.\n\nAnswer=The person's far point is 35 cm, which means they can see objects clearly only when they are 35 cm away from the eye. In order to correct this, a lens is needed that will allow the person to see distant objects clearly. This lens should have the power to form the image of a distant object at the person's far point.\n\nThe power of a lens, P, is given by the formula:\n\n$P = 1/f$\n\nwhere f is the focal length of the lens. The focal length should be equal to the person's far point, but since the focal length is usually expressed in meters, we need to convert 35 cm to meters:\n\n$f = 35 cm = 0.35 m$\n\nSubstituting this into the formula gives:\n\n$P = 1/0.35 = 2.857$ diopters\n\nTherefore, the power of the lens required to correct this person's vision

### 3.3 Prepare the evaluation datasets

In [8]:
# add proxy to access huggingface ...
os.environ['HTTP_PROXY']="http://Clash:QOAF8Rmd@10.1.0.213:7890"
os.environ['HTTPS_PROXY']="http://Clash:QOAF8Rmd@10.1.0.213:7890"
os.environ['ALL_PROXY']="socks5://Clash:QOAF8Rmd@10.1.0.213:7893"

In [9]:
from datasets import load_dataset
dataset = load_dataset("gsm8k", "main")  # read directly from huggingface

# if you want to use a local dataset, you can use the following code
# from datasets import load_dataset, load_from_disk
# dataset = load_from_disk("/ssdshare/share/gsm8k")

# to save time, we only use a small subset
subset = dataset['test'][5:12]
questions = subset['question']
answers = subset['answer']

dataset

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/419k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [10]:
# We only want the numeric answers from the dataset for evalation (maybe a bad choice?)

def get_exact_answer(x):
    i = x.index('####')
    return x[i+5:].strip('\n')

num_answers = list(map(get_exact_answer, answers))
print(num_answers)


['64', '260', '160', '45', '460', '366', '694']


In [11]:
# this is very tentative and bad way to find the exact answer, consider fixing it. 

import re
def get_numbers(s):
    number =[]
    lines = s.split('\n')
    for i in range(-1, -len(lines), -1):
        number = re.findall(r'\d+(?:\.\d+)?', lines[i])
        if len(number) > 0:
            break
    if (len(number) == 0):
        return '-9999'
    return number[-1]  # the last number is the answer

### 3.4 Evaluate!

In [12]:
question_prompts = [gsm8k_prompt(q) for q in questions]
resps = chat_resp_batched(model, tokenizer, question_prompts, batch_size=10)


processing batch: [[{'role': 'system', 'content': 'Please solve the given math problem by providing a detailed, step-by-step explanation. Begin by outlining each step involved in your solution, ensuring clarity and precision in your calculations. After you have worked through the problem, conclude your response by summarizing the solution and stating the final answer as a single exact numerical value on the last line. '}, {'role': 'user', 'content': 'Question: Kylar went to the store to buy glasses for his new apartment. One glass costs $5, but every second glass costs only 60% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?'}], [{'role': 'system', 'content': 'Please solve the given math problem by providing a detailed, step-by-step explanation. Begin by outlining each step involved in your solution, ensuring clarity and precision in your calculations. After you have worked through the problem, conclude your response by summarizing the solution and s

In [13]:
llm_answers = []

for resp in resps:
    gen_text = resp[0]['generated_text']
    print("--------")
    print(gen_text)
    print("--------")
    num = get_numbers(gen_text)
    print(num)
    llm_answers.append(num)
    print("---------" )
    print(llm_answers)

--------
Query
What is the relationship between the wavelength of light and the energy of its associated photon?

Response: The relationship between the wavelength of light and the energy of its associated photon is described by the Planck-Einstein relation. This relation states that the energy (E) of a photon is inversely proportional to its wavelength (λ). The equation representing this relationship is:

E = h * c / λ

where:
- E is the energy of the photon,
- h is the Planck constant (approximately 6.626 x 10^-34 Joule seconds),
- c is the speed of light in a vacuum (approximately 3.00 x 10^8 meters per second),
- λ is the wavelength of the light.

From this equation, we can see that as the wavelength of light decreases (meaning the light is moving towards the blue/violet end of the spectrum), the energy of the photon increases. Conversely, as the wavelength increases (moving towards the red end of the spectrum), the energy of the photon decreases. This is because the energy of a ph

In [14]:
print(llm_answers)
print(num_answers)

['8', '3250', '-9999', '200', '-9999', '-9999', '86']
['64', '260', '160', '45', '460', '366', '694']


In [15]:
## manual way to compute the correct rate

error = 0
for i in range(0, len(llm_answers)):
    if llm_answers[i] != num_answers[i]:
        error += 1
print(f"number of errors: %s \n correct rate: %s" % (error, 1 - error / len(llm_answers))) 

number of errors: 7 
 correct rate: 0.0


In [18]:
## the way of using HuggingFace evaluate functions

import evaluate
exact_match = evaluate.load("exact_match")
results = exact_match.compute(predictions=llm_answers, references=num_answers)
print(results)

Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

{'exact_match': 0.0}


In [19]:
# do not forget to clean the gpu memory
import torch
torch.cuda.empty_cache()


In [20]:
# check the GPU memory utilization
!nvidia-smi


Thu Apr 17 20:19:26 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off |   00000000:9A:00.0 Off |                  Off |
| 34%   38C    P8             21W /  450W |    7783MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
