In [18]:
!pip install transformers datasets accelerate protobuf sentencepiece

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [19]:
from huggingface_hub import login 
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# You can change this to mistral
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16
).eval()

print(f"Loaded {model_id}")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loaded meta-llama/Meta-Llama-3-8B-Instruct


In [22]:
from datasets import load_dataset

gsm8k = load_dataset("gsm8k", "main", split="test[:5]")
strategyqa = load_dataset("tasksource/strategy-qa", split="train[:5]")
arc = load_dataset("ai2_arc", "ARC-Challenge", split="test[:5]")

print("Loaded GSM8K, StrategyQA, ARC-Challenge")


Loaded GSM8K, StrategyQA, ARC-Challenge


In [23]:
def generate_answer(prompt, max_new_tokens=128):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [24]:
import re

def extract_number(text):
    matches = re.findall(r"\b\d+\b", text)
    return matches[-1] if matches else "?"

gsm8k_outputs = []
for ex in gsm8k:
    prompt = f"<|user|>\n{ex['question']}\n<|assistant|>\n"
    full_output = generate_answer(prompt)
    model_answer = extract_number(full_output)
    
    gsm8k_outputs.append({
        "question": ex["question"],
        "expected_answer": ex["answer"].strip().split("####")[-1].strip(),  # gold number
        "model_reasoning": full_output,
        "model_answer": model_answer
    })

print("GSM8K Outputs:")
gsm8k_outputs


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


GSM8K Outputs:


[{'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
  'expected_answer': '18',
  'model_reasoning': "<|user|>\nJanet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\n<|assistant|>\nLet's first find out how many eggs Janet has left after eating 3 for breakfast and baking 4 for muffins.\n\nJanet's ducks lay 16 eggs per day. She eats 3 eggs for breakfast, so she has 16 - 3 = 13 eggs left. Then, she bakes 4 eggs for muffins, so she has 13 - 4 = 9 eggs left.\n\nJanet sells the remaining 9 eggs at the farmers' market. She sells

In [25]:
gsm8k_correct = sum(x["model_answer"] == x["expected_answer"] for x in gsm8k_outputs)
gsm8k_acc = gsm8k_correct / len(gsm8k_outputs)
print(f"GSM8K Accuracy: {gsm8k_acc:.2%} ({gsm8k_correct}/{len(gsm8k_outputs)})")

GSM8K Accuracy: 80.00% (4/5)


In [26]:
def extract_yes_no(text):
    text = text.lower()
    if "yes" in text:
        return "yes"
    elif "no" in text:
        return "no"
    return "?"

strategyqa_outputs = []
for ex in strategyqa:
    q = ex["question"]
    correct = "yes" if ex["answer"] else "no"
    prompt = f"<|user|>\n{q}\nAnswer Yes or No.\n<|assistant|>\n"
    full_output = generate_answer(prompt)
    model_answer = extract_yes_no(full_output)

    strategyqa_outputs.append({
        "question": q,
        "expected_answer": correct,
        "model_reasoning": full_output,
        "model_answer": model_answer
    })

print("StrategyQA outputs:")
strategyqa_outputs


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


StrategyQA outputs:


[{'question': 'Are more people today related to Genghis Khan than Julius Caesar?',
  'expected_answer': 'yes',
  'model_reasoning': '<|user|>\nAre more people today related to Genghis Khan than Julius Caesar?\nAnswer Yes or No.\n<|assistant|>\nYes. According to a 2003 study published in the journal "European Journal of Human Genetics", it is estimated that around 16 million men alive today are direct descendants of Genghis Khan, making him one of the most widely genealogically connected individuals in history. This is because Genghis Khan and his descendants intermarried and had many children, spreading their genetic lineage across a vast territory. In contrast, Julius Caesar\'s direct descendants are believed to be relatively few, and his genetic legacy is not as widespread. So, yes, more people today are likely related to Genghis Khan than Julius Caesar.',
  'model_answer': 'yes'},
 {'question': 'Could the members of The Police perform lawful arrests?',
  'expected_answer': 'no',
  '

In [27]:
strategyqa_correct = sum(x["model_answer"] == x["expected_answer"] for x in strategyqa_outputs)
strategyqa_acc = strategyqa_correct / len(strategyqa_outputs)
print(f"StrategyQA Accuracy: {strategyqa_acc:.2%} ({strategyqa_correct}/{len(strategyqa_outputs)})")

StrategyQA Accuracy: 40.00% (2/5)


In [28]:
def extract_letter(text):
    text = text.strip().upper()
    for ch in ["A", "B", "C", "D", "E"]:
        if ch in text:
            return ch
    return "?"

arc_outputs = []
for ex in arc:
    q = ex['question']
    choices_text = ex['choices']['text']
    choices_label = ex['choices']['label']
    correct = ex["answerKey"]
    
    # Format: label: choice text
    formatted_choices = "\n".join([f"{label}. {text}" for label, text in zip(choices_label, choices_text)])
    
    prompt = f"<|user|>\n{q}\n{formatted_choices}\nAnswer with A, B, C, D, or E.\n<|assistant|>\n"
    full_output = generate_answer(prompt)
    model_answer = extract_letter(full_output)

    arc_outputs.append({
        "question": q,
        "expected_answer": correct,
        "model_reasoning": full_output,
        "model_answer": model_answer
    })

print("ARC outputs:")
arc_outputs


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


ARC outputs:


[{'question': 'An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?',
  'expected_answer': 'C',
  'model_reasoning': "<|user|>\nAn astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?\nA. Planetary density will decrease.\nB. Planetary years will become longer.\nC. Planetary days will become shorter.\nD. Planetary gravity will become stronger.\nAnswer with A, B, C, D, or E.\n<|assistant|>\nThe correct answer is C. Planetary days will become shorter.\n\nWhen a meteorite impacts a planet, it can transfer momentum to the planet, causing it to increase its rotation rate. This is because the meteorite's momentum is transferred to the planet, causing it to spin faster. As a result, the planet's rotation period (i.e., the length of its day) will decrease, making its days shorter.\n\nThe other options are not directly related 

In [29]:
arc_correct = sum(x["model_answer"] == x["expected_answer"] for x in arc_outputs)
arc_acc = arc_correct / len(arc_outputs)
print(f"ARC Accuracy: {arc_acc:.2%} ({arc_correct}/{len(arc_outputs)})")

ARC Accuracy: 0.00% (0/5)


In [30]:
import os, json

os.makedirs("outputs_llama", exist_ok=True)

with open("outputs_llama/gsm8k_outputs.json", "w") as f:
    json.dump(gsm8k_outputs, f, indent=2)

with open("outputs_llama/strategyqa_outputs.json", "w") as f:
    json.dump(strategyqa_outputs, f, indent=2)

with open("outputs_llama/arc_outputs.json", "w") as f:
    json.dump(arc_outputs, f, indent=2)

print("LLaMA-3 outputs saved to outputs_llama/")


LLaMA-3 outputs saved to outputs_llama/
