In [1]:
import re
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm 
from time import sleep
from sklearn.metrics import mean_absolute_error
from huggingface_hub import notebook_login

In [2]:
# Helper function for debugging
def dprint(s, debug):
    """Print ``s`` when ``debug`` is truthy; otherwise do nothing."""
    if not debug:
        return
    print(s)

In [8]:
# --- Select the compute device ---
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cpu


In [11]:
# --- Model Loading ---
# The models are run locally with Hugging Face's transformers library
# (on an HPC cluster, Google Colab, or Lightning.ai).
# The Llama models are gated: request access on their Hugging Face pages first.
# Once access is granted, log in here so the model weights can be downloaded.

# The first time you run this notebook, run this command in your terminal:
# git config --global credential.helper store

print("Please log in to your Hugging Face account.")
notebook_login()

Please log in to your Hugging Face account.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Hugging Face Hub id of the primary model used for most of the assignment.
model_id_1 = "meta-llama/Llama-2-7b-chat-hf"

In [12]:
print(f"\nLoading tokenizer for {model_id_1}...")
# The tokenizer turns our text prompt into token ids the model can consume.
tokenizer = AutoTokenizer.from_pretrained(model_id_1)

print(f"Loading model: {model_id_1}...")
# This downloads the model weights to your environment.
# torch_dtype=torch.bfloat16 loads the weights as 16-bit floats to save memory.
# device_map="auto" lets the library place the model on the GPU if available.
# NOTE: passing `device_map` requires the `accelerate` package; without it
# `from_pretrained` raises a ValueError (install with `pip install accelerate`).
model = AutoModelForCausalLM.from_pretrained(
    model_id_1,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
print(f"{model_id_1} model loaded successfully!")


Loading tokenizer for meta-llama/Llama-2-7b-chat-hf...


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Loading model: meta-llama/Llama-2-7b-chat-hf...


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

ValueError: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`

In [8]:
def call_model(prompt, student_configs, post_processing_fn, model_obj, tokenizer_obj, debug=False):
    """
    Generate a completion for ``prompt`` with a local Hugging Face model and post-process it.

    Args:
        prompt (str): Full text prompt sent to the model.
        student_configs (dict): Generation settings. ``'max_tokens'`` is renamed to
            ``'max_new_tokens'`` and ``'stop'`` is dropped; every other key is forwarded
            to ``model_obj.generate`` unchanged. The dict passed in is not modified.
        post_processing_fn (callable): Maps the decoded completion string to the final answer.
        model_obj: Loaded model exposing ``generate`` (and normally ``device``).
        tokenizer_obj: Tokenizer matching ``model_obj``.
        debug (bool): If True, print the prompt, the raw completion and the final output.

    Returns:
        Whatever ``post_processing_fn`` returns for the newly generated text.
    """
    # 1. Tokenize the prompt and move the tensors to the device of the model that
    #    was actually passed in, rather than relying on a notebook-level global.
    inputs = tokenizer_obj(prompt, return_tensors="pt")
    target_device = getattr(model_obj, "device", None)
    if target_device is not None:
        inputs = inputs.to(target_device)

    # Work on a copy so the caller's config dict can be reused across runs.
    hf_configs = student_configs.copy()
    if 'max_tokens' in hf_configs:
        # `generate` uses `max_new_tokens` to bound the length of the output
        hf_configs['max_new_tokens'] = hf_configs.pop('max_tokens')
    # Stop sequences are not supported by this helper; drop the key if present.
    hf_configs.pop('stop', None)

    # 2. Generate output tokens (decoding below does not need a device move)
    outputs = model_obj.generate(**inputs, **hf_configs)

    # 3. Decode only the newly generated tokens: the returned sequence starts
    #    with the prompt tokens, so skip the first `prompt_len` positions.
    prompt_len = inputs['input_ids'].shape[1]
    result_new = tokenizer_obj.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    dprint("************ Prompt ************", debug)
    dprint(prompt, debug)
    dprint("\n************ Raw Response ************", debug)
    dprint(result_new, debug)

    # 4. Apply post-processing to extract the final answer
    final_output = post_processing_fn(result_new)

    dprint("\n************ Final Output ************", debug)
    dprint(final_output, debug)

    return final_output

In [9]:
def get_addition_pairs(lower_bound, upper_bound, rng):
    """
    Draw two integers for an addition problem.

    Each operand is a float sampled with ``rng.uniform(lower_bound, upper_bound)``
    and rounded up to the next integer. The first operand is drawn before the second.
    """
    def _draw():
        return int(np.ceil(rng.uniform(lower_bound, upper_bound)))

    first = _draw()
    second = _draw()
    return first, second

def test_range(added_prompt, prompt_configs, rng, model_obj, tokenizer_obj, n_sample=30,
               lower_bound=1, upper_bound=10, fixed_pairs=None,
               pre_processing=lambda x:x, post_processing=lambda y:y,
               debug=False):
    """
    Tests a language model's addition performance over a range of numbers.

    Args:
        added_prompt (tuple): ``(prefix, suffix)`` strings placed around the ``"a+b"`` expression.
        prompt_configs (dict): Generation settings forwarded to ``call_model``.
        rng (numpy.random.Generator): Random generator used to sample operand pairs.
        model_obj (transformers.PreTrainedModel): The loaded Hugging Face model object.
        tokenizer_obj (transformers.PreTrainedTokenizer): The loaded Hugging Face tokenizer object.
        n_sample (int): Number of random pairs to generate when ``fixed_pairs`` is None.
        lower_bound (int): Lower bound for number generation.
        upper_bound (int): Upper bound for number generation.
        fixed_pairs (list, optional): Specific ``(a, b)`` integer tuples to test instead of
            random pairs.
        pre_processing (function): Applied to the ``"a+b"`` string before it is wrapped in
            the prompt.
        post_processing (function): Extracts the answer from the model's raw output. It must
            return a number, because the result is compared with ``a + b`` and used to
            compute the mean absolute error.
        debug (bool): If True, prints detailed debugging information.

    Returns:
        dict: ``acc`` (fraction of exact matches), ``mae`` (mean absolute error),
        ``prompt_length`` (``len(prefix) + len(suffix)``) and
        ``res`` (``acc / prompt_length * (1 - mae / 10**4)``).

    Raises:
        ValueError: If there is no test case to evaluate (``n_sample`` yields no
            iterations or ``fixed_pairs`` is empty).
        ZeroDivisionError: If both prefix and suffix are empty strings.
    """
    # The prompt template does not change between samples, so unpack it once;
    # this also keeps `prompt_length` independent of the loop having run.
    prefix, suffix = added_prompt
    prompt_length = len(prefix) + len(suffix)

    # --- Determine the test cases ---
    iterations = range(n_sample) if fixed_pairs is None else fixed_pairs

    # One dict per evaluated pair; turned into a DataFrame after the loop.
    records = []
    for v in tqdm(iterations):
        if fixed_pairs is None:
            # Sample a fresh pair lazily so the RNG is consumed once per iteration
            int_a, int_b = get_addition_pairs(lower_bound=lower_bound, upper_bound=upper_bound, rng=rng)
        else:
            # Use the provided fixed pair
            int_a, int_b = v

        # --- Construct the prompt for two numbers ---
        prompt = prefix + pre_processing(f'{int_a}+{int_b}') + suffix

        # --- Get the model's (post-processed) response ---
        model_response = call_model(prompt, prompt_configs, post_processing, model_obj, tokenizer_obj, debug=debug)

        # --- Ground truth and per-sample result ---
        answer = int_a + int_b
        records.append({
            'int_a': int_a,
            'int_b': int_b,
            'prompt': prompt,
            'answer': answer,
            'response': model_response,
            'correct': (answer == model_response),
        })

    if not records:
        # Fail early with a clear message instead of an obscure metrics error.
        raise ValueError("test_range needs at least one addition pair; check n_sample / fixed_pairs.")

    # --- Create a DataFrame to display the results ---
    df = pd.DataFrame(records, columns=['int_a', 'int_b', 'prompt', 'answer', 'response', 'correct'])
    print(df)

    # --- Calculate and return performance metrics ---
    mae = mean_absolute_error(df['answer'], df['response'])
    acc = df['correct'].sum() / len(df)
    # Score rewards accuracy, shorter prompts, and a small error (scaled by 10**4).
    res = acc * (1 / prompt_length) * (1 - mae / (1 * 10**4))

    return {'res': res, 'acc': acc, 'mae': mae, 'prompt_length': prompt_length}

###  Part 1. Zero Shot Addition

**Example: Zero-shot single-digit addition**

In [None]:
# Zero-shot prompt template: (prefix, suffix) wrapped around the "a+b" expression.
# The suffix uses a real newline ("\n") so the answer cue starts on its own line,
# matching the in-context prompts used later in the notebook.
added_prompt = ('Question: What is ', '?\nAnswer: ')

# Generation settings. `call_model` renames 'max_tokens' to `max_new_tokens`
# and drops 'stop' before calling `generate`.
prompt_config = {'max_tokens': 2,
                'temperature': 0.7,
                'top_k': 50,
                'top_p': 0.6,
                'repetition_penalty': 1,
                'stop': []}

def your_pre_processing(input_string):
    """Identity pre-processing hook: hand the expression string back unchanged."""
    processed = input_string
    return processed

def your_post_processing(output_string):
    """
    Extract an integer from raw model output by keeping only its digit characters.

    Every non-digit character is removed and the remaining digits are parsed as one
    integer, so digits belonging to separate numbers in the text are concatenated
    (e.g. "12 and 3" -> 123).

    Args:
        output_string (str): Raw text generated by the model.

    Returns:
        int: The parsed integer, or 0 when the digits cannot be parsed
        (for example when the text contains no digits at all).
    """
    only_digits = re.sub(r"\D", "", output_string)
    try:
        return int(only_digits)
    except ValueError:
        # Only the parse failure is treated as "no answer"; other errors propagate.
        return 0

# Zero-shot single-digit run on the already-loaded `model` / `tokenizer`;
# `model_id_1` is only used for the log line below.
print(f"Testing model: {model_id_1}")

# Fixed seed so the sampled addition pairs are reproducible.
seed = 0
rng = np.random.default_rng(seed)

res = test_range(
    added_prompt,
    prompt_config,
    rng,
    model,
    tokenizer,
    n_sample=10,
    lower_bound=1,
    upper_bound=10,
    fixed_pairs=None,
    pre_processing=your_pre_processing,
    post_processing=your_post_processing,
    debug=False,
)
print(res)

**Example: Zero-shot 7-digit addition**

In [None]:

# Seven-digit operands: allow up to 8 generated tokens and restart the RNG
# from `seed` so the sampled pairs are reproducible.
prompt_config['max_tokens'] = 8
rng = np.random.default_rng(seed)

res = test_range(
    added_prompt,
    prompt_config,
    rng,
    model,        # the loaded model object
    tokenizer,    # the loaded tokenizer object
    n_sample=10,
    lower_bound=1000000,
    upper_bound=9999999,
    fixed_pairs=None,
    pre_processing=your_pre_processing,
    post_processing=your_post_processing,
    debug=False,
)

print(res)

-----------

**Q1a.** In your opinion, what are some factors that cause language model performance to deteriorate from 1 digit to 7 digits?

Answer: 

-----------

**Q1b**. Play around with the config parameters ('max_tokens','temperature','top_k','top_p','repetition_penalty')
* What does each parameter represent?
* How does increasing each parameter change the generation?

Answer: 

**Q1c.** Do 7-digit addition with Qwen3 8B.

* How does the performance change?
* What are some factors that cause this change?

In [None]:
# --- Before loading Qwen 3, offload Llama 2 to free up VRAM ---

# 1. Drop the notebook's references to the Llama 2 model and tokenizer.
# NOTE: `del` raises NameError if this cell is re-run before they are loaded again.
del model
del tokenizer

# 2. Run Python's garbage collector, then empty PyTorch's CUDA cache.
# Both steps are needed to actually release the GPU memory.
gc.collect()
torch.cuda.empty_cache()

print("Llama 2 model offloaded and GPU memory cleared.")

In [None]:
# --- Load Qwen 3 8B ---
# This is a different model, so it needs its own tokenizer and weights.
model_id_2 = "Qwen/Qwen3-8B"

print(f"\nLoading tokenizer for {model_id_2}...")
tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2)

print(f"Loading model: {model_id_2}...")
# Same loading options as for the first model: bfloat16 weights and
# device_map="auto" (which requires the `accelerate` package to be installed).
model_2 = AutoModelForCausalLM.from_pretrained(
    model_id_2,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
print(f"{model_id_2} model loaded successfully!")

In [None]:
# --- 7-digit addition with the second model ---
# Same output budget as the Llama 2 run: at most 8 generated tokens.
prompt_config['max_tokens'] = 8

print(prompt_config)

# Restart the RNG from `seed` so the sampled pairs are reproducible.
rng = np.random.default_rng(seed)
res_2 = test_range(
    added_prompt,
    prompt_config,
    rng,
    model_2,        # the loaded model object
    tokenizer_2,    # the loaded tokenizer object
    n_sample=10,
    lower_bound=1000000,
    upper_bound=9999999,
    fixed_pairs=None,
    pre_processing=your_pre_processing,
    post_processing=your_post_processing,
    debug=False,
)
print(res_2)

-----------

Answer: 

-----------

**Q1d.** So far we have given the language model the prior that the sum of two 7-digit numbers has at most 8 digits (by setting `max_tokens=8`). What if we remove this prior by increasing `max_tokens` to 20?
* Does the model perform well?
* What are some reasons why?

Answer: 

In [None]:

# Relax the output-length prior: allow up to 20 generated tokens.
prompt_config['max_tokens'] = 20

# Restart the RNG from `seed` so the sampled pairs are reproducible.
rng = np.random.default_rng(seed)
res_2 = test_range(
    added_prompt,
    prompt_config,
    rng,
    model_2,        # the loaded model object
    tokenizer_2,    # the loaded tokenizer object
    n_sample=10,
    lower_bound=1000000,
    upper_bound=9999999,
    fixed_pairs=None,
    pre_processing=your_pre_processing,
    post_processing=your_post_processing,
    debug=False,
)
print(res_2)

In [None]:
# 1. Drop the notebook's references to the second model and its tokenizer.
# NOTE: `del` raises NameError if this cell is re-run before they are loaded again.
del model_2
del tokenizer_2

# 2. Run Python's garbage collector, then empty PyTorch's CUDA cache.
# Both steps are needed to actually release the GPU memory.
gc.collect()
torch.cuda.empty_cache()

print(f"{model_id_2} offloaded and GPU memory cleared.")

### Part 2. In Context Learning

We will try to improve the performance of 7-digit addition via in-context learning.
We will use Llama 2 7B Chat (`meta-llama/Llama-2-7b-chat-hf`) again. Below is a simple example.

In [None]:
print(f"\nLoading tokenizer for {model_id_1}...")
# The tokenizer turns our text prompt into token ids the model can consume.
tokenizer = AutoTokenizer.from_pretrained(model_id_1)

print(f"Loading model: {model_id_1}...")
# Load the weights of the first model again for the in-context experiments.
# torch_dtype=torch.bfloat16 loads the weights as 16-bit floats to save memory.
# device_map="auto" lets the library place the model on the GPU if available
# (this option requires the `accelerate` package to be installed).
model = AutoModelForCausalLM.from_pretrained(
    model_id_1,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
print(f"{model_id_1} model loaded successfully!")

In [None]:

# One-shot prompt: a single worked 1-digit example, then the real question.
# Each "Question:" starts at the beginning of its line (no leading space), so
# the example and the real question share exactly the same format.
added_prompt = ('Question: What is 3+7?\nAnswer: 10\nQuestion: What is ', '?\nAnswer: ')

# Generation settings; at most 8 new tokens for the 7-digit sums.
prompt_config = {'max_tokens': 8,
                'temperature': 0.7,
                'top_k': 50,
                'top_p': 0.6,
                'repetition_penalty': 1,
                'stop': []}
# Restart the RNG from `seed` so the sampled pairs are reproducible.
rng = np.random.default_rng(seed)

res = test_range(
    added_prompt,
    prompt_config,
    rng,
    model,        # the loaded model object
    tokenizer,    # the loaded tokenizer object
    n_sample=10,
    lower_bound=1000000,
    upper_bound=9999999,
    fixed_pairs=None,
    pre_processing=your_pre_processing,
    post_processing=your_post_processing,
    debug=False,
)

print(res)

**Q2a**.
* How does the performance change with the baseline in-context learning prompt? (compare with "Example: Zero-shot 7-digit addition" in Q1)
* What are some factors that cause this change?

Answer: 

-----------

Now we will remove the prior on output length and re-evaluate the performance of our baseline one-shot learning prompt. We need to modify our post processing function to extract the answer from the output sequence. In this case, it is the number in the first line that starts with "Answer: ".

**Q2b**.
* Modify the post processing function
* How does the performance change when we relax the output length constraint? (compare with Q2a)
* What are some factors that cause this change?

Answer: 

In [None]:
#Write your updated post processing function here


In [None]:
# No assumption about the answer length any more: raise the budget from 8 to 50 tokens.
prompt_config['max_tokens'] = 50

# Restart the RNG from `seed` so the sampled pairs are reproducible.
rng = np.random.default_rng(seed)
res = test_range(
    added_prompt,
    prompt_config,
    rng,
    model,        # the loaded model object
    tokenizer,    # the loaded tokenizer object
    n_sample=10,
    lower_bound=1000000,
    upper_bound=9999999,
    fixed_pairs=None,
    pre_processing=your_pre_processing,
    post_processing=your_post_processing,
    debug=False,
)
print(res)

-----------

**Q2c.** Let's change our one-shot learning example to something more "in-distribution". Previously we were using 1-digit addition as an example. Let's change it to 7-digit addition (1234567+1234567=2469134). 
* Evaluate the performance with max_tokens = 8.
* Evaluate the performance with max_tokens = 50.
* How does the performance change from 1-digit example to 7-digit example?

Answer: 

In [None]:

# One-shot prompt with an in-distribution (7-digit) worked example, short output budget.
prompt_config['max_tokens'] = 8
added_prompt = ('Question: What is 1234567+1234567?\nAnswer: 2469134\nQuestion: What is ', '?\nAnswer: ')

# Restart the RNG from `seed`, as the other experiment cells do, so this run is
# scored on the same sampled pairs and the results stay comparable.
rng = np.random.default_rng(seed)
res = test_range(
    added_prompt=added_prompt,
    prompt_configs=prompt_config,
    rng=rng,
    model_obj=model,              # Pass the loaded model object
    tokenizer_obj=tokenizer,      # Pass the loaded tokenizer object
    n_sample=10,
    lower_bound=1000000,
    upper_bound=9999999,
    fixed_pairs=None,
    pre_processing=your_pre_processing,
    post_processing=your_post_processing,
    debug=False
)
print(res)

In [None]:

# Same in-distribution one-shot prompt, but with the longer 50-token budget.
prompt_config['max_tokens'] = 50

# Restart the RNG from `seed`, as the other experiment cells do, so this run is
# scored on the same sampled pairs and the results stay comparable.
rng = np.random.default_rng(seed)
res = test_range(
    added_prompt=added_prompt,
    prompt_configs=prompt_config,
    rng=rng,
    model_obj=model,              # Pass the loaded model object
    tokenizer_obj=tokenizer,      # Pass the loaded tokenizer object
    n_sample=10,
    lower_bound=1000000,
    upper_bound=9999999,
    fixed_pairs=None,
    pre_processing=your_pre_processing,
    post_processing=your_post_processing,
    debug=False
)
print(res)

-----------

**Q2d.** Let's look at a specific example with large absolute error. 
* Run the cell at least 5 times. Does the error change each time? Why?
* Can you think of a prompt to reduce the error?
* Why do you think it would work?
* Does it work in practice? Why or why not?

Answer:

In [None]:
# Single hand-picked pair with debug output enabled; `rng` is passed but not
# used for sampling because `fixed_pairs` is given.
test_range(
    added_prompt,
    prompt_config,
    rng,
    model,
    tokenizer,
    fixed_pairs=[(9090909,1010101)],
    pre_processing=your_pre_processing,
    post_processing=your_post_processing,
    debug=True,
)