# EVALUATION LOOP

In [1]:
# ------------------------------------------------------------
# 0.  Imports & config – nothing here should clash with yours
# ------------------------------------------------------------
from tqdm import tqdm
import torch
from mathbert_encoder import MathBERTEncoder
import retriever_cosine as rc
# from import retrieve_top_k_cosine, retrieve_sample_k_cosine
from response_sampler import sample_responses_per_demo
from reward_aggregator import compute_demo_accuracy
from icl_model_wrapper import OpenAIICLModel
from grpo_optimizer import grpo_step
from datasets import load_dataset
from dotenv import load_dotenv
import os
from transformers import get_linear_schedule_with_warmup
from importlib import reload

reload(rc)

load_dotenv()

API_KEY = os.getenv("OPENAI_API_KEY")
DEVICE  = "cuda" if torch.cuda.is_available() else "cpu"
K       = 3                # demos per query
NUM_SAMPLES = 5             # model completions per query
TEMPERATURE = 0.7           # keep same as training loop

# ------------------------------------------------------------
# 1.  Initialise encoder **in eval mode** (weights frozen)
# ------------------------------------------------------------
encoder = MathBERTEncoder(device=DEVICE, trainable=False)
encoder.eval()                                   # no grads!

icl_model = OpenAIICLModel(api_key=API_KEY,
                           model_name="gpt-4.1-nano",
                           temperature=TEMPERATURE)

# ------------------------------------------------------------
# 2.  Load/define the inference set
#     If you already saved a slice elsewhere, just load it.
# ------------------------------------------------------------
gsm8k_data = load_dataset("gsm8k", "main")["train"].select(range(256))  # slice first 200 examples
gsm8k_to_infer = load_dataset("gsm8k", "main")["train"] \
                     .select(range(5))          # ⬅ change as needed

# ------------------------------------------------------------
# 3.  Evaluation loop
# ------------------------------------------------------------
total, correct = 0, 0
all_losses     = []          # optional – to compare prompt quality

for idx in tqdm(range(len(gsm8k_to_infer)), desc="Baseline eval"):
    item       = gsm8k_to_infer[idx]
    Q_inf, A_gt = item["question"], item["answer"]

    # Build demo pool (everything except current query)
    demos = [(d["question"], d["answer"])
             for j, d in enumerate(gsm8k_data) if j != idx]

    # Encode query + candidate demos
    with torch.no_grad():
        q_emb     = encoder.encode([Q_inf], detach=True).squeeze(0)
        demo_embs = encoder.encode([q for (q, _) in demos], detach=True)

    # ------- ORIGINAL cosine retrieval -------------------------
    top_k, _ = rc.retrieve_top_k_cosine(
        q_emb, demo_embs, k=min(K, len(demos))
    )
    print(f"🔍 Top-K Indices: {top_k}")
    selected_demos = [demos[i] for i in top_k]          # length = 2

    # ------- Run the ICL model --------------------------------
    responses_nested = sample_responses_per_demo(
        demo_tuples = selected_demos,
        Q_inf       = Q_inf,
        icl_model   = icl_model,
        num_samples = NUM_SAMPLES,
        parallel=True
    )
    print(responses_nested)
    flat_responses = [r for demo_resps in responses_nested for r in demo_resps]
    acc = compute_demo_accuracy(flat_responses, A_gt)
    print(acc) 
    correct += acc
    total   += 1

baseline_acc = correct / total
print(f"\n📊  Baseline accuracy on GSM8K_to_infer: {baseline_acc:.3%}")


  from .autonotebook import tqdm as notebook_tqdm
Baseline eval:   0%|          | 0/5 [00:00<?, ?it/s]

🔍 Top-K Indices: [81, 159, 35]


Baseline eval:   0%|          | 0/5 [00:02<?, ?it/s]

OUTPUT:  ('72', None)
OUTPUT:  ('48 + 48/2 = 48 + 24 = 72', None)
OUTPUT:  ('144', None)
OUTPUT:  ('72', None)
OUTPUT:  ('48 + (48/2) = 48 + 24 = 72', None)





AttributeError: 'str' object has no attribute 'answer'

# TRAINING LOOP

In [None]:
from tqdm import tqdm
import torch
from mathbert_encoder import MathBERTEncoder
import retriever_cosine as rc
# from import retrieve_top_k_cosine, retrieve_sample_k_cosine
from response_sampler import sample_responses_per_demo
from reward_aggregator import compute_demo_accuracy
from icl_model_wrapper import OpenAIICLModel
from grpo_optimizer import grpo_step
from datasets import load_dataset
from dotenv import load_dotenv
import os
from transformers import get_linear_schedule_with_warmup
from importlib import reload

reload(rc)

load_dotenv()

# === Settings ===
API_KEY = os.getenv("OPENAI_API_KEY")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
K = 16
NUM_SAMPLES_PER_DEMO = 6
LEARNING_RATE =  1e-5
MAX_STEPS = 10
TEMPERATURE = 0.7

# === Init ===
encoder = MathBERTEncoder(device=DEVICE, trainable=True)
encoder.train()

icl_model = OpenAIICLModel(api_key=API_KEY, model_name="gpt-4.1-nano", temperature=TEMPERATURE)
optimizer = torch.optim.Adam(encoder.parameters(), lr=LEARNING_RATE)

gsm8k_data = load_dataset('gsm8k', 'main')['train']
gsm8k_data = gsm8k_data.select(range(256))  # slice first 200 examples
gsm8k_data_to_infer = gsm8k_data.select(range(0,20))  # slice first 200 examples

# === Training Loop ===
for step in tqdm(range(MAX_STEPS), desc="Training Steps"):
    print(f"\n=== Training Step {step+1} ===")

    for inference_index in tqdm(range(len(gsm8k_data_to_infer)), desc="Examples"):
        inference_item = gsm8k_data_to_infer[inference_index]
        demo_pool = [d for idx, d in enumerate(gsm8k_data) if idx != inference_index]

        Q_inf = inference_item["question"]
        A_gt = inference_item["answer"]
        demos = [(d["question"], d["answer"]) for d in demo_pool]

        q_emb = encoder.encode([Q_inf], detach=False).squeeze(0)
        
        # SWITCHING TO BATCH ENCODE
        # demo_embs = encoder.encode([q for (q, a) in demos], detach=False)
        demo_questions = [q for (q, a) in demos]
        demo_embs = encoder.batched_encode(demo_questions, batch_size=16, detach=False)

        
        top_k_indices, _ = rc.retrieve_top_k_cosine(q_emb, demo_embs, k=min(K, len(demos)))
        print("-------Question---------")
        print(Q_inf)
        print(f"\n🧠 Inference Index {inference_index}")
        print(f"🔍 Top-K Indices: {top_k_indices}")
        
        top_k_indices, similarities = rc.retrieve_sample_k_cosine(q_emb, demo_embs, k=min(K, len(demos)))
        selected_demos = [demos[i] for i in top_k_indices]



        all_responses = sample_responses_per_demo(
            demo_tuples=selected_demos,
            Q_inf=Q_inf,
            icl_model=icl_model,
            num_samples=NUM_SAMPLES_PER_DEMO,
            parallel=True
        )

        rewards = []
        for i, responses in enumerate(all_responses):
            reward = compute_demo_accuracy(responses, A_gt)
            rewards.append(reward)
            # print(f"    Demo {i} | Reward: {reward:.2f}")

        rewards = torch.tensor(rewards, dtype=torch.float32).to(DEVICE)

        loss = grpo_step(
            rewards,
            similarities,
            q_emb,
            demo_embs,
            optimizer
        )

        print(f"✅ Loss: {loss:.4f}")



  from .autonotebook import tqdm as notebook_tqdm
Training Steps:   0%|          | 0/10 [00:00<?, ?it/s]


=== Training Step 1 ===




-------Question---------
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

🧠 Inference Index 0
🔍 Top-K Indices: [81, 159, 30, 84, 104, 145, 109, 204, 50, 4, 122, 8, 220, 56, 136, 51, 3, 99, 211, 176, 40, 251, 147, 82, 203, 60, 194, 150, 89, 239, 59, 207]
OUTPUT:  ('72', None)
OUTPUT:  ('72', None)
OUTPUT:  ('72', None)
OUTPUT:  ('72', None)
OUTPUT:  ('72', None)
OUTPUT:  ('48 + (48/2) = 48 + 24 = 72', None)
------------Demo 1--------
Q: Irene earns $500 if she works for 40 hours a week and gets an extra $20 for every hour of overtime. If she worked 50 hours last week, calculate her total income.
A: If Irene worked 50 hours last week, the total number of hours counting as overtime is 50-40 = <<50-40=10>>10 hours.
Since she's given $20 for every hour of overtime, she earned 10*$20 = $<<10*20=200>>200 in overtime.
Her total income, including the overtime, is $500+$200= $<<500+200=700



✅ Loss: -0.2100
-------Question---------
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?

🧠 Inference Index 1
🔍 Top-K Indices: [177, 14, 12, 76, 155, 207, 221, 32, 60, 75, 210, 145, 64, 92, 142, 43, 201, 189, 164, 197, 62, 8, 174, 233, 168, 47, 204, 184, 95, 40, 205, 37]
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
OUTPUT:  ('$10', None)
------------Demo 1--------
Q: Ian won $100 in the lottery.  He decided to use the money to pay off debts. He paid $20 to Colin. He then paid twice as much to Helen, as he had paid to Colin. Then finally, he paid half as much to Benedict, as he had paid to Helen.  How much money, in dollars, does he have left after paying off debts?
A: Starting with $100, he paid $20 to Colin, leaving him with $100-$20=$<<100-20=80>>80.
Twice as much as $20 is 2*$20=$<<2*20=40>>40.
Thus, he paid $40 to Helen, leaving him with $80-$40=$<<80-4



✅ Loss: -0.0000
-------Question---------
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?

🧠 Inference Index 2
🔍 Top-K Indices: [72, 229, 147, 177, 201, 32, 226, 37, 207, 53, 231, 51, 9, 211, 19, 104, 153, 75, 238, 15, 127, 80, 14, 36, 70, 200, 8, 236, 129, 184, 138, 101]
OUTPUT:  ('50', None)
OUTPUT:  ('50', None)
OUTPUT:  ('50', None)
OUTPUT:  ('50', None)
OUTPUT:  ('50', None)
OUTPUT:  ('50', None)
------------Demo 1--------
Q: Bobby has 16 toy cars, and the number of cars he has increases by 50% every year. How many toy cars will Bobby have in three years?
A: In the first year, Bobby will acquire 16 * .5 = <<16*.5=8>>8 new cars.
After the first year, he will have a total of 16 + 8 = <<16+8=24>>24 cars.
In the second year, Bobby will acquire 24 * .5 = <<24*.5=12>>12 new cars



✅ Loss: 0.0118
-------Question---------
Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?

🧠 Inference Index 3
🔍 Top-K Indices: [56, 85, 13, 81, 194, 187, 202, 220, 149, 32, 250, 140, 74, 5, 104, 196, 110, 41, 229, 73, 120, 209, 239, 150, 207, 122, 204, 87, 233, 101, 151, 117]
OUTPUT:  ('48', None)
OUTPUT:  ('78', None)
OUTPUT:  ('48', None)
OUTPUT:  ('48', None)
OUTPUT:  ('48', None)
OUTPUT:  ('36', None)
------------Demo 1--------
Q: James earns $20 an hour while working at his main job.  He earns 20% less while working his second job.  He works 30 hours at his main job and half that much at his second job.  How much does he earn per week?
A: James earns 20*.2=$<<20*.2=4>>4 less while working his second job
So he earns 20-4=$<<20-4=16>>16 an hour
At his first job he earns 20*30=$<<20*30=600>>600
He works 30/2=<<30/2=



✅ Loss: -0.0000
-------Question---------
James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year?

🧠 Inference Index 4
🔍 Top-K Indices: [217, 175, 211, 202, 82, 234, 19, 136, 38, 238, 3, 206, 105, 248, 228, 207, 94, 87, 156, 110, 52, 150, 117, 131, 75, 71, 229, 5, 18, 128, 159, 220]
OUTPUT:  ('312', None)
OUTPUT:  ('3 × 2 × 2 × 52 = 624', None)
OUTPUT:  ('312', None)
OUTPUT:  ('3 pages × 2 friends × 2 times a week × 52 weeks = 624', None)
OUTPUT:  ('Pages per week = 3 pages × 2 friends × 2 times = 12 pages\nNumber of weeks in a year = 52\nTotal pages in a year = 12 pages × 52 weeks = 624', None)
OUTPUT:  ('3 pages × 2 friends × 2 times a week × 52 weeks = 3 × 2 × 2 × 52 = 624', None)
------------Demo 1--------
Q: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
A: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<



✅ Loss: 0.3716
-------Question---------
Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 80% more of those in purple. There are only 25% as many green flowers as there are yellow and purple flowers. How many flowers does Mark have in his garden?

🧠 Inference Index 5
🔍 Top-K Indices: [201, 37, 56, 212, 177, 30, 184, 155, 116, 207, 52, 2, 241, 115, 228, 144, 43, 3, 238, 123, 96, 229, 60, 83, 235, 166, 199, 142, 26, 161, 234, 5]
OUTPUT:  ('130', None)
OUTPUT:  ('Total yellow flowers = 10  \nPurple flowers = 80% more than yellow = 10 + 0.8 * 10 = 10 + 8 = 18  \nTotal yellow and purple = 10 + 18 = 28  \nGreen flowers = 25% of yellow and purple = 0.25 * 28 = 7  \nTotal flowers = yellow + purple + green = 10 + 18 + 7 = 35', None)
OUTPUT:  ('Yellow flowers = 10  \nPurple flowers = 80% more than yellow = 10 + 0.80 * 10 = 10 + 8 = 18  \nTotal yellow and purple = 10 + 18 = 28  \nGreen flowers = 25% as many as yellow and purpl



✅ Loss: 0.0094
-------Question---------
Albert is wondering how much pizza he can eat in one day. He buys 2 large pizzas and 2 small pizzas. A large pizza has 16 slices and a small pizza has 8 slices. If he eats it all, how many pieces does he eat that day?

🧠 Inference Index 6
🔍 Top-K Indices: [177, 226, 49, 37, 237, 220, 3, 8, 238, 30, 211, 26, 253, 89, 248, 4, 199, 119, 36, 93, 9, 167, 71, 117, 210, 79, 197, 172, 53, 234, 82, 31]
OUTPUT:  ('48', None)
OUTPUT:  ('48', None)
OUTPUT:  ('48', None)
OUTPUT:  ('48', None)
OUTPUT:  ('2 large pizzas × 16 slices = 32 slices  \n2 small pizzas × 8 slices = 16 slices  \nTotal slices eaten = 32 + 16 = 48', None)
OUTPUT:  ('48', None)
------------Demo 1--------
Q: Ian won $100 in the lottery.  He decided to use the money to pay off debts. He paid $20 to Colin. He then paid twice as much to Helen, as he had paid to Colin. Then finally, he paid half as much to Benedict, as he had paid to Helen.  How much money, in dollars, does he have left after p



✅ Loss: -0.0738
-------Question---------
Ken created a care package to send to his brother, who was away at boarding school.  Ken placed a box on a scale, and then he poured into the box enough jelly beans to bring the weight to 2 pounds.  Then, he added enough brownies to cause the weight to triple.  Next, he added another 2 pounds of jelly beans.  And finally, he added enough gummy worms to double the weight once again.  What was the final weight of the box of goodies, in pounds?

🧠 Inference Index 7
🔍 Top-K Indices: [89, 30, 82, 97, 60, 26, 226, 29, 10, 17, 174, 177, 63, 111, 234, 143, 76, 207, 70, 14, 224, 65, 204, 220, 74, 201, 157, 110, 172, 211, 176, 36]
OUTPUT:  ('8', None)
OUTPUT:  ('8', None)
OUTPUT:  ('8', None)
OUTPUT:  ('8', None)
OUTPUT:  ('8', None)
OUTPUT:  ('8', None)
------------Demo 1--------
Q: Silvia’s bakery is offering 10% on advanced orders over $50.00.  She orders 2 quiches for $15.00 each, 6 croissants at $3.00 each and 6 buttermilk biscuits for $2.00 each.  H



✅ Loss: -0.0084
-------Question---------
Alexis is applying for a new job and bought a new set of business clothes to wear to the interview. She went to a department store with a budget of $200 and spent $30 on a button-up shirt, $46 on suit pants, $38 on a suit coat, $11 on socks, and $18 on a belt. She also purchased a pair of shoes, but lost the receipt for them. She has $16 left from her budget. How much did Alexis pay for the shoes?

🧠 Inference Index 8
🔍 Top-K Indices: [25, 23, 59, 127, 82, 240, 191, 9, 24, 40, 97, 223, 16, 129, 36, 224, 196, 144, 58, 83, 136, 72, 215, 30, 228, 120, 156, 225, 115, 45, 179, 100]
OUTPUT:  ('$41', None)
OUTPUT:  ('$41', None)
OUTPUT:  ('$41', None)
OUTPUT:  ('$41', None)
OUTPUT:  ('$41', None)
OUTPUT:  ('$41', None)
------------Demo 1--------
Q: It is Roger’s turn to provide a snack for the baseball team after the game and he has decided to bring trail mix. The trail mix comes in packs of 6 individual pouches. Roger has 13 members on his baseball te



✅ Loss: -0.0916
-------Question---------
Tina makes $18.00 an hour.  If she works more than 8 hours per shift, she is eligible for overtime, which is paid by your hourly wage + 1/2 your hourly wage.  If she works 10 hours every day for 5 days, how much money does she make?

🧠 Inference Index 9
🔍 Top-K Indices: [81, 155, 197, 12, 31, 210, 233, 23, 237, 112, 17, 4, 206, 66, 72, 80, 36, 65, 228, 137, 173, 215, 234, 179, 238, 120, 102, 91, 90, 231, 52, 136]
OUTPUT:  ('$315.00', None)
OUTPUT:  ('$495.00', None)
OUTPUT:  ('$270.00', None)
OUTPUT:  ('270', None)
OUTPUT:  ('$270.00', None)
OUTPUT:  ('$405.00', None)
------------Demo 1--------
Q: James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year?
A: He writes each friend 3*2=<<3*2=6>>6 pages a week
So he writes 6*2=<<6*2=12>>12 pages every week
That means he writes 12*52=<<12*52=624>>624 pages a year
#### 624

Q: Tina makes $18.00 an hour.  If she works more than 8 hours per shift, she is eli



✅ Loss: 0.0430
-------Question---------
A deep-sea monster rises from the waters once every hundred years to feast on a ship and sate its hunger. Over three hundred years, it has consumed 847 people. Ships have been built larger over time, so each new ship has twice as many people as the last ship. How many people were on the ship the monster ate in the first hundred years?

🧠 Inference Index 10
🔍 Top-K Indices: [167, 80, 182, 136, 64, 81, 83, 211, 234, 218, 231, 23, 9, 223, 60, 172, 101, 117, 226, 135, 121, 115, 181, 233, 119, 102, 24, 71, 148, 206, 11, 159]
OUTPUT:  ('423.5', None)
OUTPUT:  ('423.5', None)
OUTPUT:  ('423.5', None)
OUTPUT:  ('423.5', None)
OUTPUT:  ('423.5', None)
OUTPUT:  ('423.5', None)
------------Demo 1--------
Q: Mira jogs every morning. She jogs 5 miles per hour. If she jogs for 2 hours every morning, how many miles can she jog for five days?
A: Mira jogs 5 x 2 = <<5*2=10>>10 miles per day.
Therefore she can jog 10 x 5 = <<10*5=50>>50 miles for 5 days.
#### 50





✅ Loss: 0.1431
-------Question---------
Tobias is buying a new pair of shoes that costs $95. He has been saving up his money each month for the past three months. He gets a $5 allowance a month. He also mows lawns and shovels driveways. He charges $15 to mow a lawn and $7 to shovel. After buying the shoes, he has $15 in change. If he mows 4 lawns, how many driveways did he shovel?

🧠 Inference Index 11
🔍 Top-K Indices: [49, 70, 25, 110, 253, 19, 181, 233, 72, 235, 111, 177, 136, 26, 64, 38, 234, 91, 2, 51, 184, 215, 166, 9, 191, 211, 80, 12, 20, 65, 89, 52]
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
OUTPUT:  ('4', None)
------------Demo 1--------
Q: Jack is stranded on a desert island. He wants some salt to season his fish. He collects 2 liters of seawater in an old bucket. If the water is 20% salt, how many ml of salt will Jack get when all the water evaporates?
A: First find how many liters of the seawater are salt: 2



✅ Loss: -0.0945
-------Question---------
Randy has 60 mango trees on his farm. He also has 5 less than half as many coconut trees as mango trees. How many trees does Randy have in all on his farm?

🧠 Inference Index 12
🔍 Top-K Indices: [234, 120, 143, 223, 114, 52, 97, 75, 9, 172, 87, 181, 113, 51, 182, 105, 111, 83, 229, 38, 16, 30, 100, 224, 27, 122, 190, 36, 129, 179, 19, 236]
OUTPUT:  ('85', None)
OUTPUT:  ('85', None)
OUTPUT:  ('85', None)
OUTPUT:  ('70', None)
OUTPUT:  ('95', None)
OUTPUT:  ('90', None)
------------Demo 1--------
Q: On Monday Buddy has 30 baseball cards. On Tuesday Buddy loses half of them. On Wednesday Buddy buys 12 baseball cards. On Thursday he buys a third of what he had on Tuesday. How many baseball cards does he have on Thursday?
A: On Tuesday Buddy has 30/2 = <<30/2=15>>15 baseball cards.
On Wednesday Buddy has 15+12 = <<15+12=27>>27 baseball cards.
On Thursday Buddy buys 15/3 = <<15/3=5>>5 baseball cards.
On Thursday Buddy has a total of 27+5 = <<27+5=32>



✅ Loss: 0.2757
-------Question---------
Jasper will serve charcuterie at his dinner party. He buys 2 pounds of cheddar cheese for $10, a pound of cream cheese that cost half the price of the cheddar cheese, and a pack of cold cuts that cost twice the price of the cheddar cheese. How much does he spend on the ingredients?

🧠 Inference Index 13
🔍 Top-K Indices: [47, 75, 119, 32, 174, 67, 81, 92, 25, 177, 43, 26, 68, 176, 184, 53, 14, 211, 232, 215, 193, 97, 87, 70, 76, 123, 40, 231, 129, 46, 52, 128]
OUTPUT:  ('$30', None)
OUTPUT:  ('$30', None)
OUTPUT:  ('$30', None)
OUTPUT:  ('$30', None)
OUTPUT:  ('$30', None)
OUTPUT:  ('$30', None)
------------Demo 1--------
Q: An earthquake caused four buildings to collapse. Experts predicted that each following earthquake would have double the number of collapsing buildings as the previous one, since each one would make the foundations less stable. After three more earthquakes, how many buildings had collapsed including those from the first earthqu



✅ Loss: -0.1099
-------Question---------
Joy can read 8 pages of a book in 20 minutes. How many hours will it take her to read 120 pages?

🧠 Inference Index 14
🔍 Top-K Indices: [3, 93, 119, 220, 5, 133, 208, 159, 203, 139, 146, 175, 104, 243, 113, 254, 128, 194, 56, 151, 120, 14, 148, 249, 22, 158, 78, 4, 202, 228, 116, 124]
OUTPUT:  ('3', None)
OUTPUT:  ('3', None)
OUTPUT:  ('20 minutes / 8 pages = 2.5 minutes per page  \n120 pages × 2.5 minutes per page = 300 minutes  \n300 minutes ÷ 60 = 5 hours', None)
OUTPUT:  ('20 minutes / 8 pages = 2.5 minutes per page  \n120 pages × 2.5 minutes per page = 300 minutes  \n300 minutes ÷ 60 = 5 hours', None)
OUTPUT:  ('20 minutes / 8 pages = 2.5 minutes per page  \n120 pages * 2.5 minutes = 300 minutes  \n300 minutes / 60 = 5 hours', None)
OUTPUT:  ('20 minutes / 8 pages = 2.5 minutes per page  \n120 pages × 2.5 minutes = 300 minutes  \n300 minutes ÷ 60 = 5 hours', None)
------------Demo 1--------
Q: An aquarium holds an equal number of clownfish 



✅ Loss: -0.0249
-------Question---------
James creates a media empire.  He creates a movie for $2000.  Each DVD cost $6 to make.  He sells it for 2.5 times that much.  He sells 500 movies a day for 5 days a week.  How much profit does he make in 20 weeks?

🧠 Inference Index 15
🔍 Top-K Indices: [177, 31, 238, 207, 71, 193, 30, 253, 92, 176, 60, 133, 210, 185, 149, 40, 36, 234, 80, 9, 122, 43, 66, 155, 237, 101, 215, 240, 222, 110, 190, 233]
OUTPUT:  ('7500', None)
OUTPUT:  ('Profit per DVD = Selling price - Cost price = (2.5 × 6) - 6 = 15 - 6 = 9 dollars  \nNumber of DVDs sold in 20 weeks = 500 DVDs/day × 5 days/week × 20 weeks = 500 × 5 × 20 = 50,000 DVDs  \nTotal profit = Profit per DVD × Total DVDs = 9 × 50,000 = 450,000 dollars', None)
OUTPUT:  ('Profit per DVD = (Selling price) - (Cost to make) = (2.5 * 6) - 6 = 15 - 6 = 9 dollars  \nNumber of DVDs sold per day = 500  \nNumber of days in 20 weeks = 20 * 5 = 100 days  \nTotal DVDs sold = 500 * 100 = 50,000  \nTotal profit from DVDs 



✅ Loss: -0.1047


Examples:  80%|████████  | 16/20 [21:29<05:22, 80.58s/it]
Training Steps:   0%|          | 0/10 [21:29<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# ------------------------------------------------------------
# 0.  Imports & config – nothing here should clash with yours
# ------------------------------------------------------------
from tqdm import tqdm
import torch
from mathbert_encoder import MathBERTEncoder
import retriever_cosine as rc
# from import retrieve_top_k_cosine, retrieve_sample_k_cosine
from response_sampler import sample_responses_per_demo
from reward_aggregator import compute_demo_accuracy
from icl_model_wrapper import OpenAIICLModel
from grpo_optimizer import grpo_step
from datasets import load_dataset
from dotenv import load_dotenv
import os
from transformers import get_linear_schedule_with_warmup
from importlib import reload

reload(rc)

load_dotenv()

API_KEY = os.getenv("OPENAI_API_KEY")
DEVICE  = "cuda" if torch.cuda.is_available() else "cpu"
K       = 3                # demos per query
NUM_SAMPLES = 5             # model completions per query
TEMPERATURE = 0.7           # keep same as training loop

# ------------------------------------------------------------
# 1.  Initialise encoder **in eval mode** (weights frozen)
# ------------------------------------------------------------
encoder.eval()                                   # no grads!

icl_model = OpenAIICLModel(api_key=API_KEY,
                           model_name="gpt-4.1-nano",
                           temperature=TEMPERATURE)

# ------------------------------------------------------------
# 2.  Load/define the inference set
#     If you already saved a slice elsewhere, just load it.
# ------------------------------------------------------------
gsm8k_data = load_dataset("gsm8k", "main")["train"].select(range(256))  # slice first 200 examples
gsm8k_to_infer = load_dataset("gsm8k", "main")["train"] \
                     .select(range(5))          # ⬅ change as needed

# ------------------------------------------------------------
# 3.  Evaluation loop
# ------------------------------------------------------------
total, correct = 0, 0
all_losses     = []          # optional – to compare prompt quality

for idx in tqdm(range(len(gsm8k_to_infer)), desc="Baseline eval"):
    item       = gsm8k_to_infer[idx]
    Q_inf, A_gt = item["question"], item["answer"]

    # Build demo pool (everything except current query)
    demos = [(d["question"], d["answer"])
             for j, d in enumerate(gsm8k_data) if j != idx]

    # Encode query + candidate demos
    with torch.no_grad():
        q_emb     = encoder.encode([Q_inf], detach=True).squeeze(0)
        demo_embs = encoder.encode([q for (q, _) in demos], detach=True)

    # ------- ORIGINAL cosine retrieval -------------------------
    top_k, _ = rc.retrieve_top_k_cosine(
        q_emb, demo_embs, k=min(K, len(demos))
    )
    print(f"🔍 Top-K Indices: {top_k}")
    selected_demos = [demos[i] for i in top_k]          # length = 2

    # ------- Run the ICL model --------------------------------
    responses_nested = sample_responses_per_demo(
        demo_tuples = selected_demos,
        Q_inf       = Q_inf,
        icl_model   = icl_model,
        num_samples = NUM_SAMPLES,
        parallel=True
    )
    print(responses_nested)
    flat_responses = [r for demo_resps in responses_nested for r in demo_resps]
    acc = compute_demo_accuracy(flat_responses, A_gt)
    print(acc) 
    correct += acc
    total   += 1

baseline_acc = correct / total
print(f"\n📊  Baseline accuracy on GSM8K_to_infer: {baseline_acc:.3%}")


Baseline eval:   0%|          | 0/5 [00:00<?, ?it/s]

🔍 Top-K Indices: [49, 138, 8]
------------Demo 1--------
Q: Gerald spends $100 a month on baseball supplies. His season is 4 months long. He wants to use the months he's not playing baseball to save up by raking, shoveling, and mowing lawns. He charges $10 for each. How many chores does he need to average a month to save up for his supplies?
A: He needs to save up $400 because 4 x 100 = <<4*100=400>>400
He has 8 months to earn this money because 12 - 4 = <<12-4=8>>8
He needs to earn $50 a month because 400 / 8 = <<400/8=50>>50
He needs to do 5 tasks a month because 50 / 10 = <<50/10=5>>5
#### 5

Q: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
A:
72


Baseline eval:  20%|██        | 1/5 [00:01<00:06,  1.55s/it]

[['72', '72', '72', '72', '72'], ['48 + 24 = 72', '48 + (48 / 2) = 48 + 24 = 72', '48 + 24 = 72', '48 + 24 = 72', '48 + (48 / 2) = 48 + 24 = 72'], ['72', '72', '72', '72', '144']]
0.9333333333333333
🔍 Top-K Indices: [12, 177, 221]
------------Demo 1--------
Q: Jasper will serve charcuterie at his dinner party. He buys 2 pounds of cheddar cheese for $10, a pound of cream cheese that cost half the price of the cheddar cheese, and a pack of cold cuts that cost twice the price of the cheddar cheese. How much does he spend on the ingredients?
A: A pound of cream cheese cost $10 / 2 = $<<10/2=5>>5.
A pack of cold cuts cost $10 x 2 = $<<10*2=20>>20.
Jasper spent $10 + $5 + $20 = $<<10+5+20=35>>35 on the ingredients.
#### 35

Q: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
A:
10


Baseline eval:  40%|████      | 2/5 [00:03<00:04,  1.50s/it]

[['10', '10', '10', '10', '10'], ['10', '10', '10', '10', '10'], ['10', '10', '10', '10', '10']]
1.0
🔍 Top-K Indices: [177, 37, 176]
------------Demo 1--------
Q: Simon wanted to buy flowers that his mom could plant for Mother's Day.  The garden center was offering 10% off all purchases.  He bought 5 pansies at $2.50 each, one hydrangea that cost $12.50 and 5 petunias that cost $1.00 each.  If he paid with a $50 bill, how much change would Simon receive back from his purchase?
A: 5 pansies at $2.50 each is 5*2.50 = $<<5*2.5=12.50>>12.50
5 petunias at $1.00 each 5*1 = $<<5*1=5.00>>5.00
All total he spends 12.50+12.50+5.00 = $<<12.50+12.50+5.00=30.00>>30.00
The sale is 10% off so 30*.10 = $<<30*.10=3.00>>3.00
The purchase total now comes to 30-3 = $<<30-3=27.00>>27.00
He pays with a $50 bill so 50-27 = $<<50-27=23.00>>23.00
#### 23

Q: Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose,

Baseline eval:  60%|██████    | 3/5 [00:05<00:03,  1.91s/it]

[['50', '50', '50', '50', '50'], ['55', '25', '25', '25', '70'], ['55', '100 - (50 + 15 + 30) = 100 - 95 = 5', '55', '55', '55']]
0.9333333333333333
🔍 Top-K Indices: [5, 49, 8]
------------Demo 1--------
Q: Albert is wondering how much pizza he can eat in one day. He buys 2 large pizzas and 2 small pizzas. A large pizza has 16 slices and a small pizza has 8 slices. If he eats it all, how many pieces does he eat that day?
A: He eats 32 from the largest pizzas because 2 x 16 = <<2*16=32>>32
He eats 16 from the small pizza because 2 x 8 = <<2*8=16>>16
He eats 48 pieces because 32 + 16 = <<32+16=48>>48
#### 48

Q: Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?
A:
54


Baseline eval:  80%|████████  | 4/5 [00:07<00:02,  2.14s/it]

[['54', '36', '48', '33', '54'], ['48', '48', '48', '48', '48'], ['54', '36', '54', '54', '54']]
0.0
🔍 Top-K Indices: [206, 156, 175]
------------Demo 1--------
Q: John volunteers at a shelter twice a month for 3 hours at a time.  How many hours does he volunteer per year?
A: He volunteers 2*12=<<2*12=24>>24 times a year
So he volunteers for 24*3=<<24*3=72>>72 hours
#### 72

Q: James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year?
A:
624


Baseline eval: 100%|██████████| 5/5 [00:09<00:00,  1.92s/it]

[['624', '312', '624', '312', '624'], ['312', '312', '312', '312', '312'], ['3 pages × 2 friends × 2 times a week × 52 weeks = 3 × 2 × 2 × 52 = 624', '312', '312', '312', '312']]
0.26666666666666666

📊  Baseline accuracy on GSM8K_to_infer: 62.667%





In [None]:
# Save the updated MathBERT model
save_path = "./updated_mathbert"  # your save directory
encoder.model.save_pretrained(save_path)
encoder.tokenizer.save_pretrained(save_path)

# LOADING

# from transformers import BertTokenizer, BertModel

# model = BertModel.from_pretrained("./updated_mathbert")
# tokenizer = BertTokenizer.from_pretrained("./updated_mathbert")
