In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ============================================
# Benchmark meta-llama/Llama-3.2-3B-Instruct on GSM-Symbolic with HF token
# ============================================

!pip install -q transformers torch tqdm accelerate bitsandbytes huggingface_hub pandas

import os
import json
import time
import re
import random
import numpy as np
from tqdm import tqdm
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from huggingface_hub import login

# ---------- Setup ----------
os.makedirs("/kaggle/working/results", exist_ok=True)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Set random seeds for reproducibility
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

# ---------- Hugging Face Token ----------
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", "")
login(token=HF_TOKEN)

# ---------- Utility functions ----------
def normalize_answer(ans: str) -> str:
    return ans.strip().lower().replace(",", "")

def is_answer_correct(pred, gold):
    """Compare numeric answers, ignoring formatting and symbols like % or commas."""
    pred_nums = re.findall(r"[-+]?\d*\.?\d+", pred)
    gold_nums = re.findall(r"[-+]?\d*\.?\d+", gold)

    if not pred_nums or not gold_nums:
        return pred.strip().lower() == gold.strip().lower()

    try:
        pred_val = float(pred_nums[-1])
        gold_val = float(gold_nums[-1])
        return abs(pred_val - gold_val) < 1e-3
    except ValueError:
        return False

# ---------- Benchmark class ----------
class LlamaBenchmark:
    def __init__(self, model_name="meta-llama/Llama-3.2-3B-Instruct",
                 device=None, max_new_tokens=512, temperature=0.0, hf_token=None):
        print(f"🔹 Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_auth_token=hf_token
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
            use_auth_token=hf_token
        )
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature

    def make_prompt(self, question):
        return (
            "You are a math reasoning assistant. "
            "Solve the problem step by step and end with a numeric answer'Answer: {number}'.\n\n"
            f"Problem: {question}\n\nSolution:\n"
        )

    def extract_answer(self, text):
        """Extract final numeric or text answer robustly."""
        if re.search(r"(final answer|answer is)[:=]", text, re.I):
            ans = re.split(r"(final answer|answer is)[:=]", text, flags=re.I)[-1].strip()
            return ans.split("\n")[0]
        if "Answer:" in text:
            ans = text.split("Answer:")[-1].strip()
            return ans.split("\n")[0]
        nums = re.findall(r"[-+]?\d*\.?\d+", text)
        return nums[-1].strip() if nums else text.strip().split("\n")[-1]

    @torch.inference_mode()
    def run_one(self, question):
        prompt = self.make_prompt(question)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        start = time.time()

        # --- Deterministic first pass ---
        gen_config = GenerationConfig(
            max_new_tokens=self.max_new_tokens,
            do_sample=False,
            pad_token_id=self.tokenizer.eos_token_id
        )

        output = self.model.generate(
            **inputs,
            generation_config=gen_config
        )
        latency = time.time() - start
        decoded = self.tokenizer.decode(output[0], skip_special_tokens=True)
        pred = self.extract_answer(decoded)

        # --- Retry with sampling if output is junk ---
        if len(pred) < 2 or pred in {"'", '"', ".", ":", ";"}:
            gen_config_retry = GenerationConfig(
                max_new_tokens=self.max_new_tokens,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

            output = self.model.generate(
                **inputs,
                generation_config=gen_config_retry
            )
            decoded = self.tokenizer.decode(output[0], skip_special_tokens=True)
            pred = self.extract_answer(decoded)

        return decoded, pred, latency

    def benchmark(self, dataset, save_path, q_key="question", a_key="answer", num_examples=None, checkpoint_interval=50):
        correct, latencies = 0, []
        total_examples = num_examples or len(dataset)
        os.makedirs(os.path.dirname(save_path), exist_ok=True)

        # Resume support
        processed = set()
        if os.path.exists(save_path):
            with open(save_path, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        data = json.loads(line)
                        processed.add(data["index"])
                        if data.get("correct"):
                            correct += 1
                        latencies.append(data.get("latency", 0))
                    except:
                        continue
            print(f"🔄 Resuming from {len(processed)} completed examples...")

        with open(save_path, "a", encoding="utf-8") as f:
            for i, ex in enumerate(tqdm(dataset, desc=f"Running {save_path}")):
                if i in processed:
                    continue
                if num_examples and i >= num_examples:
                    break

                q, gold = ex[q_key], str(ex[a_key])
                decoded, pred, t = self.run_one(q)
                ok = is_answer_correct(pred, gold)

                result = {
                    "index": i,
                    "question": q,
                    "gold_answer": gold,
                    "prediction": pred,
                    "correct": ok,
                    "latency": t,
                }

                f.write(json.dumps(result) + "\n")
                f.flush()
                os.fsync(f.fileno())

                if i % checkpoint_interval == 0:
                    print(json.dumps(result, indent=2))

                correct += ok
                latencies.append(t)

                # Save interim summaries
                if i % 500 == 0 and i > 0:
                    interim = {"processed": i, "correct": correct, "total": len(latencies)}
                    with open(save_path.replace(".jsonl", "_summary.json"), "w") as s:
                        json.dump(interim, s, indent=2)

        total = len(latencies)
        acc = round(100 * correct / total, 2) if total else 0
        avg_t = round(sum(latencies) / total, 3) if total else 0
        print(f"\n✅ {save_path}: {correct}/{total} correct → {acc}% | avg latency {avg_t}s")

        return {"total": total, "correct": correct, "accuracy": acc, "avg_latency": avg_t}

# ---------- Load GSM-Symbolic dataset using pandas ----------
hf_path = "hf://datasets/apple/GSM-Symbolic/main/test.jsonl"
gsm_df = pd.read_json(hf_path, lines=True)
print("Columns:", gsm_df.columns)

# Convert to list of dicts for benchmark
gsm_symbolic = gsm_df.to_dict(orient="records")

# ---------- Run benchmark ----------
bench = LlamaBenchmark("meta-llama/Llama-3.2-3B-Instruct", hf_token=HF_TOKEN)

gsm_metrics = bench.benchmark(
    gsm_symbolic,
    "/kaggle/working/results/llama3_gsm_symbolic.jsonl",
    q_key="question",
    a_key="answer",
    checkpoint_interval=200
)

# ---------- Save summary ----------
summary = {"gsm_symbolic": gsm_metrics}
with open("/kaggle/working/results/summary_llama3.json", "w") as f:
    json.dump(summary, f, indent=2)

print("\n📂 All results stored in /kaggle/working/results/")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Columns: Index(['id', 'instance', 'question', 'answer', 'original_id',
       'original_question', 'original_answer', 'canary'],
      dtype='object')
🔹 Loading model: meta-llama/Llama-3.2-3B-Instruct




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

🔄 Resuming from 826 completed examples...


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  20%|██        | 1001/5000 [23:32<6:29:47,  5.85s/it]

{
  "index": 1000,
  "question": "Liam picks 205 pineapples on Wednesday. Then he picks 97 pineapples on Thursday. On Friday, he picks quadruple the number of pineapples he did on Wednesday. How many pineapples does Liam have?",
  "gold_answer": "Combining Wednesday and Thursday, Liam has 205 pineapples + 97 pineapples = 302 pineapples.\nOn Friday, he picks 4 * 205 pineapples = 820 pineapples.\nAltogether, Liam has 302 pineapples + 820 pineapples = 1122 pineapples.\n#### 1122",
  "prediction": "1122",
  "correct": true,
  "latency": 2.8977835178375244
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  24%|██▍       | 1201/5000 [43:05<5:03:26,  4.79s/it] 

{
  "index": 1200,
  "question": "According to its nutritional info, a bag of breads has 475 calories per serving. If a 275 ounces bag has 5 servings, how many ounces can you eat if your daily calorie target is 2395 and you have already consumed 1350 calories?",
  "gold_answer": "If the total calorie target is 2395 and I have consumed 1350 calories then I have 2395-1350 = 1045 calories left to eat\nIf each serving of breads has 475 calories and I only have 1045 calories left to eat, then I can only eat 1045/475 of a serving = 11/5 of a serving\nWe also know that a 275 ounces bag of breads has 5 servings, hence each serving has 275 ounces/5 = 55 ounces\nIf I can only eat 11/5 of a serving, then I can eat only 55 * 11/5 = 55*11/5 = 121 ounces\n#### 121",
  "prediction": "605 {number}",
  "correct": false,
  "latency": 4.6278111934661865
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  28%|██▊       | 1401/5000 [57:30<4:10:46,  4.18s/it]

{
  "index": 1400,
  "question": "A clinic has a capacity of 1800 rooms with 1/4 occupied. Due to the natural disaster, 40 patients are admitted into the clinic each day. Calculate the total number of unoccupied rooms in the clinic after 3 weeks.",
  "gold_answer": "If 1/4 of the total capacity of the clinic rooms is occupied, it means 1/4 * 1800 = 450 rooms have patients using them.\nThe total number of rooms in the clinic without new admissions is 1800 rooms - 450 rooms = 1350 rooms.\nIf 40 people are admitted each day, the total number of patients in the clinic after one week is 40 patients/day * 7 days/week = 280 patients.\nAfter 3 weeks, the total number of patients admitted into the clinic is 280 patients/week * 3 weeks = 840 patients, who each use one room.\nIf there were 1350 unoccupied rooms in the clinic before the new admissions, the total number is reduced to 1350 rooms - 840 rooms = 510 unoccupied rooms.\n#### 510",
  "prediction": "510 {number}",
  "correct": true,
  "lat

Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  32%|███▏      | 1601/5000 [1:13:07<4:21:39,  4.62s/it]

{
  "index": 1600,
  "question": "Mrs. Lee is looking for a townhouse that will not go beyond her $360000 budget. She saw a property that has a selling price of $340000. On top of that, the buyer has to pay a brokerage fee which is 3% of the selling price, and also the transfer fee that is 10% of the selling price. How much more is the total price of the townhouse than Mrs. Lee's budget?",
  "gold_answer": "The brokerage fee is $340000 x 3/100 = $10200.\nThe transfer fee is $340000 x 10/100 = $34000.\nThe total price of the townhouse is $340000 + $10200 + $34000 = $384200.\nSo, it is $384200 - $360000 = $24200 more than Mrs. Lee's budget.\n#### 24200",
  "prediction": "14000",
  "correct": false,
  "latency": 4.897337198257446
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  36%|███▌      | 1801/5000 [1:34:10<4:00:40,  4.51s/it] 

{
  "index": 1800,
  "question": "Ava went to the cafe and bought various types of treats. She bought 1 dozen danishes which cost $17 per dozen, 8 dozen tarts which cost $74 per dozen, and 3 dozen scones for $113 per dozen. How much was the total cost?",
  "gold_answer": "The total charge for the danishes was 1 x $17 = $17.\nThe total charge for the tarts was 8 x $74 = $592.\nThe total charge for the scones was 3 x $113 = $339.\nTherefore the total amount Ava paid for the treats was $17 + $592 + $339 = $948.\n#### 948",
  "prediction": "948",
  "correct": true,
  "latency": 5.2434375286102295
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  40%|████      | 2001/5000 [1:53:18<6:45:50,  8.12s/it] 

{
  "index": 2000,
  "question": "Daphne went to the coast for vacation. Her parents gave her \u20a31400 to buy whatever she wanted. At the seaside store, cotton candy was on sale for \"Buy 15 pounds at \u20a323 per pound, get 5 pounds 1/4 off.\" She scooped up 22 pounds. She also bought a mixed bag of coral pieces for \u20a311.75 and 11 pins that were \u20a321.0 each. How much money does Daphne have left?",
  "gold_answer": "cotton candy is 15 pounds for \u20a3 23 and gets 5 pounds 1/4 off. So 1/4 off of 5 pounds is \u20a35/4*23 = \u20a3115/4. The rest of 2 pounds does not have discount and come at 46 so total is 15*23 + 5*3/4*23 + 2*23 = 1909/4\n11 pins at \u20a321.0 each is 11*21.0=\u20a3231.0\nWhen you add all her purchases, \u20a31909/4+\u20a311.75+\u20a3231.0 = \u20a3720)\nShe had \u20a31400 and spent \u20a3720 so she had \u20a31400-\u20a3720 = \u20a3680 left over\n#### 680",
  "prediction": "726.0",
  "correct": false,
  "latency": 11.814285039901733
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  44%|████▍     | 2201/5000 [2:15:03<2:51:34,  3.68s/it]

{
  "index": 2200,
  "question": "There are 18 girls in the park. If there are two-thirds the number of boys in the park, how many kids are in the park?",
  "gold_answer": "There are 18 girls x 2/3 boys/girls = 12 boys in the park.\nIn total there are 18 girls + 12 boys =  30 kids in the park\n#### 30",
  "prediction": "30",
  "correct": true,
  "latency": 3.8752365112304688
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  48%|████▊     | 2401/5000 [2:40:23<5:27:24,  7.56s/it] 

{
  "index": 2400,
  "question": "Kayla can peel 6 mushrooms a minute and saute 40 mushrooms in 17 minutes. How long will it take her to peel and saute 120 mushrooms?",
  "gold_answer": "First find how long it takes Kayla to peel the mushroom: 120 mushroom / 6 mushroom/minute = 20 minutes\nThen find how many batches of mushroom she needs to cook: 120 mushroom / 40 mushroom/batch = 3 batches\nThen multiply the number of batches by the time per batch to find the total cook time: 3 batches * 17 minutes/batch = 51 minutes\nThen add the peeling time to find the total time Kayla spends: 51 minutes + 20 minutes = 71 minutes\n#### 71",
  "prediction": "23 {number}",
  "correct": false,
  "latency": 7.177589178085327
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  52%|█████▏    | 2601/5000 [3:03:43<4:11:29,  6.29s/it]

{
  "index": 2600,
  "question": "Qasim has 5 \u20a350 bills. He buys 6 coloring books for \u20a38 each. He also buys 5 packs of trading cards for \u20a34 each. How much money does he have left?",
  "gold_answer": "Qasim starts off with 5 * \u20a350 = \u20a3250.\nQasim spends 6 coloring books * \u20a38 = \u20a348 on coloring books.\nQasim spends 5 packs of trading cards * \u20a34 = \u20a320 on trading cards.\nTotal Qasim has spent \u20a348 + \u20a320 = \u20a368.\nQasim has \u20a3250 - \u20a368 = \u20a3182 remaining.\n#### 182",
  "prediction": "182",
  "correct": true,
  "latency": 5.9665367603302
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  56%|█████▌    | 2801/5000 [3:18:59<2:23:55,  3.93s/it]

{
  "index": 2800,
  "question": "A carton of lemonade is 13 ozs of lemonade. Pavel drinks 4 cartons of lemonade. If lemonade has 8 calories per oz how many calories did he consume?",
  "gold_answer": "He drank 13*4=52 ozs of lemonade.\nSo he drank 52*8=416 calories of lemonade\n#### 416",
  "prediction": "416 {number}",
  "correct": true,
  "latency": 2.6859335899353027
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  60%|██████    | 3001/5000 [3:37:31<3:38:03,  6.54s/it]

{
  "index": 3000,
  "question": "Sanjay works a 7-hour shift each day, 3 days a week. He earns \u20ba17 per hour and gets a \u20ba582 bonus each week if the company performs well. How much money did Sanjay make in June if the company performed very well for the whole month?",
  "gold_answer": "In a day, Sanjay makes 7 * 17 = \u20ba119\nIf he works 3 days a week, the total number of days for the whole month is 3 * 4= 12 days.\nSince he makes \u20ba119 per day, the total amount for the whole month is 12 * 119= \u20ba1428.\nHe also got a 582 * 4 = \u20ba2328 bonus because the company performed well in all the weeks of June.\nAt the end of June, he earned 1428 + 2328 = \u20ba3756.\n#### 3756",
  "prediction": "3756",
  "correct": true,
  "latency": 6.2485671043396
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  64%|██████▍   | 3201/5000 [3:52:45<2:24:29,  4.82s/it]

{
  "index": 3200,
  "question": "Beatriz has 9 jars of chocolate chips in her storage room. Each jar of chocolate chips can decorate 11 muffins. Beatriz wants to bake enough muffins to use up all of her chocolate chips. If each rack holds 11 muffins, how many racks worth of muffins should she bake?",
  "gold_answer": "She has enough chocolate chips for 9 * 11 = 99 muffins.\nShe needs 99 / 11 = 9 racks to bake all of the muffins.\n#### 9",
  "prediction": "9 {number} = 9.",
  "correct": true,
  "latency": 3.478898286819458
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  68%|██████▊   | 3401/5000 [4:08:45<1:57:45,  4.42s/it]

{
  "index": 3400,
  "question": "It takes Sakura 28 minutes to finish a brain teaser and 4 minutes to finish a chess puzzle. Over the vacation she solved 4 brain teasers and 7 chess puzzles. How much time did she spend playing these games?",
  "gold_answer": "It takes 28 minutes to complete a brain teaser and she completed 4 for a total of 28*4 = 112 minutes\nIt takes 4 minutes to complete a chess puzzle and she completed 7 for a total of 4*7 = 28 minutes\nShe spent 112 minutes on brain teasers and 28 minutes on chess puzzles for a total of 112+28 = 140 minutes\n#### 140",
  "prediction": "140 {number}",
  "correct": true,
  "latency": 5.016479253768921
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  76%|███████▌  | 3801/5000 [4:48:07<1:26:13,  4.32s/it]

{
  "index": 3800,
  "question": "Wei caught 4 trouts last Sunday, the first trout he caught weighs 70 kilograms, the second trout he caught weighs 33 kilograms, and the last trout he caught weighs 32 kilograms. If a kilogram of trout costs \u20ac1.0, how much will he earn after selling all the trouts to the market?",
  "gold_answer": "Wei will earn 70 x \u20ac1.0 = \u20ac70.0 from the first trout.\nHe will earn 33 x \u20ac1.0 = \u20ac33.0 for the second trout.\nThe rest of the trouts are 4-2 = 2. He will earn 32 x \u20ac1.0 = \u20ac32.0 per each of them. So he will earn 2 * 32.0 = =64.0\nTherefore, the total amount he will earn for all the trouts is \u20ac70.0 + \u20ac33.0 + \u20ac64.0= \u20ac167.\n#### 167",
  "prediction": "135 {number}",
  "correct": false,
  "latency": 3.4385569095611572
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  80%|████████  | 4001/5000 [5:01:15<1:31:38,  5.50s/it]

{
  "index": 4000,
  "question": "The vending machines sell cookies for 50 cents and pretzels for 75 cents. Jamal spent $1200 and got 6 bags of cookies and had 7% of his money left in change. How many pretzels did he buy?",
  "gold_answer": "Jamal got $84.0 in change because 1200 x 7/100 = 84\nJamal spent $1116 because 1200 - 84 = 1116\nJamal spent $3.0 on cookies because 6 x 0.5 = 3.0\nJamal spent 1113.0 on pretzels because 1116.0 - 3.0 = 1113.0\nJamal bought 1484 pretzels because 1113.0 / 0.75 = 1484\n#### 1484",
  "prediction": "120",
  "correct": false,
  "latency": 11.08226728439331
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  84%|████████▍ | 4201/5000 [5:24:17<1:02:34,  4.70s/it]

{
  "index": 4200,
  "question": "Kayla buys 10 shares of a stock for \u20ac56 each. The stock price increases 30% the first year Kayla holds it, then decreases 25% in the second year. What is the final value of all Kayla's shares?",
  "gold_answer": "First find the initial total value of Kayla's purchase: 10 shares * \u20ac56/share = \u20ac560\nThen find the amount of the first price increase: \u20ac560 * 0.3 = \u20ac168\nAdd that amount to the initial value to find the value after the first year: \u20ac560 + \u20ac168.0 = \u20ac728\nThen multiply that amount by 25% to find the amount of the decrease in the second year: \u20ac728.0 * 25% = \u20ac182\nThen subtract that amount from the value after the first year to find the final value: \u20ac728 - \u20ac182 = \u20ac546\n#### 546",
  "prediction": "546 {number}",
  "correct": true,
  "latency": 6.08382773399353
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  88%|████████▊ | 4401/5000 [5:42:18<48:08,  4.82s/it]  

{
  "index": 4400,
  "question": "Keisha can run eight times faster than she can walk, but she can skip at a rate of speed that is 3/4 as fast as she can run. If she can skip at 3 miles per hour, how many miles can she travel in 8 hours if she spends one-half of the time running and 1/2 of the time walking?",
  "gold_answer": "If Keisha can skip at 3/4 the speed she can run, then she can run at 3*4/3=4 miles per hour.\nAnd since she can run at a speed that is 8 times faster than she can walk, this means she can walk at 4/8=1/2 miles per hour.\nIf 1/2 of the time is spent walking, then she walks for 8*1/2=4 hours.\nIf 1/2 of the time is spent running, then she runs for 8-4=4 hours.\nThus, she runs for 4 hours at 4 miles per hour, or 4*4=16 miles.\nShe walks for 4 hours at 1/2 miles per hour, or 4*1/2=2 miles.\nThus, altogether, she travels 16+2=18 miles.\n#### 18",
  "prediction": "99 {number}",
  "correct": false,
  "latency": 7.040207624435425
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  92%|█████████▏| 4601/5000 [6:05:32<52:40,  7.92s/it]  

{
  "index": 4600,
  "question": "Jin slew 150 goblins with his mighty sword, while DeShawn, using a spear, slew five-sixths as many goblins as Jin. Using a rusty iron axe, Gabriel slew two times as many goblins as DeShawn. But Oscar, having forgotten his sword at home, slew three-sixths as many goblins as Gabriel using a nail file. How many goblins has Oscar slain?",
  "gold_answer": "DeShawn slew 5/6 as many goblins as Jin, or 150*5/6=125 goblins.\nGabriel slew 2 as many goblins as DeShawn, or 2*125=250 goblins.\nOscar slew 1/2 as many goblins as Gabriel, or 250*1/2=125 goblins.\n#### 125",
  "prediction": "125",
  "correct": true,
  "latency": 6.87434983253479
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl:  96%|█████████▌| 4801/5000 [6:36:01<22:31,  6.79s/it]  

{
  "index": 4800,
  "question": "Mia's aunt said that she had \u20ba300 budgeted for her birthday party. She wants to make sure she and her friends all get to play one round of laser tag, have \u20ba5 in pinball tokens, and get to ride the roller coaster seven times. A round of laser tag is \u20ba10. The roller coaster cost \u20ba5 a ride. How many friends can she invite?",
  "gold_answer": "The roller coaster will cost \u20ba35 per person because 5 x 7 = 35\nEach person costs \u20ba50 because 5 + 10 + 35 = 50\n6 total people can attend because 300 / 50 = 6\nShe can invite 5 friends because 6 - 1 =5\n#### 5",
  "prediction": "25",
  "correct": false,
  "latency": 3.5877671241760254
}


Running /kaggle/working/results/llama3_gsm_symbolic.jsonl: 100%|██████████| 5000/5000 [7:00:14<00:00,  5.04s/it]


✅ /kaggle/working/results/llama3_gsm_symbolic.jsonl: 3493/5000 correct → 69.86% | avg latency 5.703s

📂 All results stored in /kaggle/working/results/





In [4]:
!zip -r results.zip /kaggle/working/
from IPython.display import FileLink
FileLink('results.zip')

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/.virtual_documents/ (stored 0%)
  adding: kaggle/working/results/ (stored 0%)
  adding: kaggle/working/results/summary_llama3.json (deflated 21%)
  adding: kaggle/working/results/llama3_gsm_symbolic_summary.json (deflated 15%)
  adding: kaggle/working/results/llama3_gsm_symbolic.jsonl (deflated 88%)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
