# EVALUATION LOOP

In [1]:
# ------------------------------------------------------------
# 0.  Imports & config - nothing here should clash with yours
# ------------------------------------------------------------
from tqdm import tqdm
import torch
from mathbert_encoder import MathBERTEncoder
import retriever_cosine as rc
# from retriever_cosine import retrieve_top_k_cosine, retrieve_sample_k_cosine
from response_sampler import sample_responses_per_demo
from reward_aggregator import compute_demo_accuracy
from icl_model_wrapper import OpenAIICLModel
from grpo_optimizer import grpo_step
from datasets import load_dataset
from dotenv import load_dotenv
import os
from transformers import get_linear_schedule_with_warmup
from importlib import reload

# Re-import the retriever module so edits to it take effect without
# restarting the kernel.
reload(rc)

# Load variables from .env so OPENAI_API_KEY can be read below.
load_dotenv()

API_KEY = os.getenv("OPENAI_API_KEY")
DEVICE  = "cuda" if torch.cuda.is_available() else "cpu"
K       = 3                # demos retrieved per query
NUM_SAMPLES = 5             # completions sampled per retrieved demo
TEMPERATURE = 0.7           # keep same as training loop
DEMO_POOL_SIZE   = 256      # leading train examples used as the demo pool
NUM_EVAL_QUERIES = 5        # leading train examples used as queries; change as needed

# ------------------------------------------------------------
# 1.  Initialise encoder in eval mode (constructed with trainable=False)
# ------------------------------------------------------------
encoder = MathBERTEncoder(device=DEVICE, trainable=False)
# eval() only switches the module mode; gradient tracking is turned off
# separately by the torch.no_grad() block in the evaluation loop.
encoder.eval()

icl_model = OpenAIICLModel(api_key=API_KEY,
                           model_name="gpt-4.1-nano",
                           temperature=TEMPERATURE)

# ------------------------------------------------------------
# 2.  Load/define the inference set
#     If you already saved a slice elsewhere, just load it.
# ------------------------------------------------------------
# Load the train split once and derive both slices from it. The queries are
# a prefix of the demo pool, which is why the evaluation loop removes the
# query's own index from the pool before retrieval.
_gsm8k_train   = load_dataset("gsm8k", "main")["train"]
gsm8k_data     = _gsm8k_train.select(range(DEMO_POOL_SIZE))
gsm8k_to_infer = _gsm8k_train.select(range(NUM_EVAL_QUERIES))

# ------------------------------------------------------------
# 3.  Evaluation loop
# ------------------------------------------------------------
total, correct = 0, 0
all_losses     = []          # placeholder for comparing prompt quality; not filled in below

# Decode the demo pool once: the (question, answer) pairs are identical for
# every query, so only the leave-one-out slice has to be rebuilt per iteration.
demo_candidates = [(d["question"], d["answer"]) for d in gsm8k_data]

for idx in tqdm(range(len(gsm8k_to_infer)), desc="Baseline eval"):
    item = gsm8k_to_infer[idx]
    Q_inf, A_gt = item["question"], item["answer"]

    # Leave-one-out pool: drop the row sharing the query's index.
    # NOTE(review): this relies on gsm8k_to_infer[idx] and gsm8k_data[idx]
    # being the same example - confirm if either slice is changed.
    demos = demo_candidates[:idx] + demo_candidates[idx + 1:]

    # Encode the query and every candidate demo question without tracking gradients.
    with torch.no_grad():
        q_emb     = encoder.encode([Q_inf], detach=True).squeeze(0)
        demo_embs = encoder.encode([q for (q, _) in demos], detach=True)

    # ------- ORIGINAL cosine retrieval -------------------------
    # The returned indices point into `demos` (query already removed), not
    # into gsm8k_data, so values >= idx are shifted by one relative to it.
    top_k, _ = rc.retrieve_top_k_cosine(
        q_emb, demo_embs, k=min(K, len(demos))
    )
    print(f"üîç Top-K Indices: {top_k}")
    selected_demos = [demos[i] for i in top_k]          # min(K, len(demos)) demos

    # ------- Run the ICL model --------------------------------
    # One list of NUM_SAMPLES responses comes back per selected demo.
    responses_nested = sample_responses_per_demo(
        demo_tuples = selected_demos,
        Q_inf       = Q_inf,
        icl_model   = icl_model,
        num_samples = NUM_SAMPLES,
        parallel=True
    )
    print(responses_nested)

    # Score all responses for this query together, regardless of which demo produced them.
    flat_responses = [r for demo_resps in responses_nested for r in demo_resps]
    acc = compute_demo_accuracy(flat_responses, A_gt)
    print(acc)
    correct += acc
    total   += 1

# An empty inference set yields NaN instead of raising ZeroDivisionError.
baseline_acc = correct / total if total else float("nan")
print(f"\nüìä  Baseline accuracy on GSM8K_to_infer: {baseline_acc:.3%}")


  from .autonotebook import tqdm as notebook_tqdm
Baseline eval:   0%|          | 0/5 [00:00<?, ?it/s]

üîç Top-K Indices: [81, 159, 35]


Baseline eval:   0%|          | 0/5 [00:02<?, ?it/s]

OUTPUT:  ('72', None)
OUTPUT:  ('48 + 48/2 = 48 + 24 = 72', None)
OUTPUT:  ('144', None)
OUTPUT:  ('72', None)
OUTPUT:  ('48 + (48/2) = 48 + 24 = 72', None)





AttributeError: 'str' object has no attribute 'answer'

# TRAINING LOOP

In [None]:
from tqdm import tqdm
import torch
from mathbert_encoder import MathBERTEncoder
import retriever_cosine as rc
# from retriever_cosine import retrieve_top_k_cosine, retrieve_sample_k_cosine
from response_sampler import sample_responses_per_demo
from reward_aggregator import compute_demo_accuracy
from icl_model_wrapper import OpenAIICLModel
from grpo_optimizer import grpo_step
from datasets import load_dataset
from dotenv import load_dotenv
import os
from transformers import get_linear_schedule_with_warmup
from importlib import reload

# Refresh the retriever module in case it was edited since the last import.
reload(rc)

# Read .env so the API key lookup below can succeed.
load_dotenv()

# === Settings ===
API_KEY = os.environ.get("OPENAI_API_KEY")
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
K = 32                      # demos drawn per training query
NUM_SAMPLES_PER_DEMO = 6    # completions requested for each drawn demo
LEARNING_RATE = 1e-5
MAX_STEPS = 10              # full passes over the training queries
TEMPERATURE = 0.7

# === Init ===
# Trainable encoder, switched to train mode before the optimizer is built.
encoder = MathBERTEncoder(device=DEVICE, trainable=True)
encoder.train()

icl_model = OpenAIICLModel(
    api_key=API_KEY,
    model_name="gpt-4.1-nano",
    temperature=TEMPERATURE,
)
optimizer = torch.optim.Adam(encoder.parameters(), lr=LEARNING_RATE)

# Demo pool: first 256 train examples. Training queries: first 20 of that pool.
gsm8k_data = load_dataset('gsm8k', 'main')['train'].select(range(256))
gsm8k_data_to_infer = gsm8k_data.select(range(20))

# === Training Loop ===
# Decode the demo pool once. The rows never change between steps or examples,
# so only the leave-one-out slice is rebuilt inside the loops.
all_demos = [(d["question"], d["answer"]) for d in gsm8k_data]

for step in tqdm(range(MAX_STEPS), desc="Training Steps"):
    print(f"\n=== Training Step {step+1} ===")

    for inference_index in tqdm(range(len(gsm8k_data_to_infer)), desc="Examples"):
        inference_item = gsm8k_data_to_infer[inference_index]
        Q_inf = inference_item["question"]
        A_gt = inference_item["answer"]

        # Leave-one-out pool: every pooled example except the current query.
        # This works because gsm8k_data_to_infer is a prefix of gsm8k_data,
        # so the query's index is the same in both.
        demos = all_demos[:inference_index] + all_demos[inference_index + 1:]

        # Encode with detach=False on both sides; the embeddings are handed
        # to grpo_step below together with the optimizer.
        q_emb = encoder.encode([Q_inf], detach=False).squeeze(0)
        demo_questions = [q for (q, a) in demos]
        demo_embs = encoder.batched_encode(demo_questions, batch_size=16, detach=False)

        n_select = min(K, len(demos))

        # Greedy top-k is computed for logging only; it is not used to build prompts.
        top_k_indices, _ = rc.retrieve_top_k_cosine(q_emb, demo_embs, k=n_select)
        print("-------Question---------")
        print(Q_inf)
        print(f"\nüß† Inference Index {inference_index}")
        print(f"üîç Top-K Indices: {top_k_indices}")

        # The demos actually shown to the ICL model come from the sampling
        # retriever. Both index lists refer to positions in `demos`, not gsm8k_data.
        sampled_indices, similarities = rc.retrieve_sample_k_cosine(q_emb, demo_embs, k=n_select)
        selected_demos = [demos[i] for i in sampled_indices]

        # One list of NUM_SAMPLES_PER_DEMO responses is returned per selected demo.
        all_responses = sample_responses_per_demo(
            demo_tuples=selected_demos,
            Q_inf=Q_inf,
            icl_model=icl_model,
            num_samples=NUM_SAMPLES_PER_DEMO,
            parallel=True
        )

        # One scalar reward per selected demo, in the same order as `similarities`.
        rewards = [compute_demo_accuracy(responses, A_gt) for responses in all_responses]
        rewards = torch.tensor(rewards, dtype=torch.float32).to(DEVICE)

        # grpo_step receives the optimizer and returns the loss value that is logged below.
        loss = grpo_step(
            rewards,
            similarities,
            q_emb,
            demo_embs,
            optimizer
        )

        print(f"‚úÖ Loss: {loss:.4f}")



  from .autonotebook import tqdm as notebook_tqdm
Training Steps:   0%|          | 0/10 [00:00<?, ?it/s]


=== Training Step 1 ===




-------Question---------
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

üß† Inference Index 0
üîç Top-K Indices: [81, 79, 84, 215, 202, 21, 147, 206, 77, 35, 159, 63, 220, 194, 250, 176, 148, 5, 136, 93, 113, 52, 145, 1, 2, 110, 50, 118, 137, 171, 239, 119, 87, 254, 4, 85, 109, 105, 150, 56, 201, 133, 29, 142, 233, 235, 14, 161, 8, 11, 219, 167, 228, 178, 245, 30, 140, 195, 172, 78, 3, 51, 46, 157]
OUTPUT:  ('144', None)
OUTPUT:  ('144', None)
OUTPUT:  ('144', None)
OUTPUT:  ('144', None)
OUTPUT:  ('144', None)
OUTPUT:  ('144', None)
------------Demo 1--------
Q: Mike and Ted planted tomatoes.  In the morning, Mike planted 50 tomato seeds while Ted planted twice as much as Mike.  In the afternoon, Mike planted 60 tomato seeds while Ted planted 20 fewer tomato seeds than Mike. How many tomato seeds did they plant altogether?
A: Ted planted 2 x 50 = <<2*50=100>>100 tomato seeds



‚úÖ Loss: -0.1398
-------Question---------
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?

üß† Inference Index 1
üîç Top-K Indices: [14, 207, 95, 250, 128, 88, 201, 110, 64, 75, 222, 167, 65, 176, 219, 189, 11, 163, 137, 120, 147, 8, 235, 168, 171, 221, 109, 92, 114, 220, 94, 117, 59, 178, 28, 113, 131, 157, 68, 67, 240, 146, 175, 208, 124, 89, 136, 52, 247, 185, 1, 101, 119, 202, 179, 154, 22, 19, 81, 36, 228, 5, 196, 227]
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
OUTPUT:  ('10', None)
------------Demo 1--------
Q: Tina makes $18.00 an hour.  If she works more than 8 hours per shift, she is eligible for overtime, which is paid by your hourly wage + 1/2 your hourly wage.  If she works 10 hours every day for 5 days, how much money does she make?
A: She works 8 hours a day for $18 per hour so she makes 8*18 = $<<8*18=144.00>>144.00 per 8-hour shift
She 



‚úÖ Loss: -0.0000
-------Question---------
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?

üß† Inference Index 2
üîç Top-K Indices: [159, 222, 19, 112, 136, 201, 202, 105, 92, 40, 83, 182, 239, 120, 137, 207, 89, 244, 30, 73, 8, 125, 147, 142, 61, 87, 41, 110, 56, 101, 145, 251, 191, 167, 156, 77, 194, 228, 215, 5, 59, 75, 117, 3, 51, 79, 99, 84, 212, 149, 64, 82, 190, 225, 35, 172, 85, 65, 4, 185, 250, 233, 109, 176]
OUTPUT:  ('$55', None)
OUTPUT:  ('$55', None)
OUTPUT:  ('50', None)
OUTPUT:  ('50', None)
OUTPUT:  ('50', None)
OUTPUT:  ('25', None)
------------Demo 1--------
Q: Baez has 25 marbles. She loses 20% of them one day. Then a friend sees her and gives her double the amount that Baez has after she lost them. How many marbles does Baez end up with?
A: She loses 5 m

In [None]:
# ------------------------------------------------------------
# 0.  Imports & config - nothing here should clash with yours
# ------------------------------------------------------------
from tqdm import tqdm
import torch
from mathbert_encoder import MathBERTEncoder
import retriever_cosine as rc
# from retriever_cosine import retrieve_top_k_cosine, retrieve_sample_k_cosine
from response_sampler import sample_responses_per_demo
from reward_aggregator import compute_demo_accuracy
from icl_model_wrapper import OpenAIICLModel
from grpo_optimizer import grpo_step
from datasets import load_dataset
from dotenv import load_dotenv
import os
from transformers import get_linear_schedule_with_warmup
from importlib import reload

# Re-import the retriever module so edits to it take effect without
# restarting the kernel.
reload(rc)

# Load variables from .env so OPENAI_API_KEY can be read below.
load_dotenv()

API_KEY = os.getenv("OPENAI_API_KEY")
DEVICE  = "cuda" if torch.cuda.is_available() else "cpu"
K       = 3                # demos retrieved per query
NUM_SAMPLES = 5             # completions sampled per retrieved demo
TEMPERATURE = 0.7           # keep same as training loop
DEMO_POOL_SIZE   = 256      # leading train examples used as the demo pool
NUM_EVAL_QUERIES = 5        # leading train examples used as queries; change as needed

# ------------------------------------------------------------
# 1.  Reuse the encoder already in the kernel, switched to eval mode
# ------------------------------------------------------------
# This cell does not construct an encoder: it evaluates whichever `encoder`
# an earlier cell left behind (the training cell, when run in order).
# Fail with an explicit message if that cell has not been run.
if "encoder" not in globals():
    raise NameError(
        "name 'encoder' is not defined - run the training cell first so the "
        "fine-tuned encoder exists in this kernel"
    )
# eval() only switches the module mode; gradient tracking is turned off
# separately by the torch.no_grad() block in the evaluation loop.
encoder.eval()

icl_model = OpenAIICLModel(api_key=API_KEY,
                           model_name="gpt-4.1-nano",
                           temperature=TEMPERATURE)

# ------------------------------------------------------------
# 2.  Load/define the inference set
#     If you already saved a slice elsewhere, just load it.
# ------------------------------------------------------------
# Load the train split once and derive both slices from it. The queries are
# a prefix of the demo pool, which is why the evaluation loop removes the
# query's own index from the pool before retrieval.
_gsm8k_train   = load_dataset("gsm8k", "main")["train"]
gsm8k_data     = _gsm8k_train.select(range(DEMO_POOL_SIZE))
gsm8k_to_infer = _gsm8k_train.select(range(NUM_EVAL_QUERIES))

# ------------------------------------------------------------
# 3.  Evaluation loop
# ------------------------------------------------------------
total, correct = 0, 0
all_losses     = []          # placeholder for comparing prompt quality; not filled in below

# Decode the demo pool once: the (question, answer) pairs are identical for
# every query, so only the leave-one-out slice has to be rebuilt per iteration.
demo_candidates = [(d["question"], d["answer"]) for d in gsm8k_data]

# This pass scores the encoder left in the kernel by the training cell, so it
# is labelled "Fine-tuned" rather than "Baseline" to keep the two runs apart.
for idx in tqdm(range(len(gsm8k_to_infer)), desc="Fine-tuned eval"):
    item = gsm8k_to_infer[idx]
    Q_inf, A_gt = item["question"], item["answer"]

    # Leave-one-out pool: drop the row sharing the query's index.
    # NOTE(review): this relies on gsm8k_to_infer[idx] and gsm8k_data[idx]
    # being the same example - confirm if either slice is changed.
    demos = demo_candidates[:idx] + demo_candidates[idx + 1:]

    # Encode the query and every candidate demo question without tracking gradients.
    with torch.no_grad():
        q_emb     = encoder.encode([Q_inf], detach=True).squeeze(0)
        demo_embs = encoder.encode([q for (q, _) in demos], detach=True)

    # ------- ORIGINAL cosine retrieval -------------------------
    # The returned indices point into `demos` (query already removed), not
    # into gsm8k_data, so values >= idx are shifted by one relative to it.
    top_k, _ = rc.retrieve_top_k_cosine(
        q_emb, demo_embs, k=min(K, len(demos))
    )
    print(f"üîç Top-K Indices: {top_k}")
    selected_demos = [demos[i] for i in top_k]          # min(K, len(demos)) demos

    # ------- Run the ICL model --------------------------------
    # One list of NUM_SAMPLES responses comes back per selected demo.
    responses_nested = sample_responses_per_demo(
        demo_tuples = selected_demos,
        Q_inf       = Q_inf,
        icl_model   = icl_model,
        num_samples = NUM_SAMPLES,
        parallel=True
    )
    print(responses_nested)

    # Score all responses for this query together, regardless of which demo produced them.
    flat_responses = [r for demo_resps in responses_nested for r in demo_resps]
    acc = compute_demo_accuracy(flat_responses, A_gt)
    print(acc)
    correct += acc
    total   += 1

# Stored under its own name so the baseline figure from the first evaluation
# cell is not overwritten. An empty inference set yields NaN instead of
# raising ZeroDivisionError.
finetuned_acc = correct / total if total else float("nan")
print(f"\nüìä  Fine-tuned accuracy on GSM8K_to_infer: {finetuned_acc:.3%}")


Baseline eval:   0%|          | 0/5 [00:00<?, ?it/s]

üîç Top-K Indices: [49, 138, 8]
------------Demo 1--------
Q: Gerald spends $100 a month on baseball supplies. His season is 4 months long. He wants to use the months he's not playing baseball to save up by raking, shoveling, and mowing lawns. He charges $10 for each. How many chores does he need to average a month to save up for his supplies?
A: He needs to save up $400 because 4 x 100 = <<4*100=400>>400
He has 8 months to earn this money because 12 - 4 = <<12-4=8>>8
He needs to earn $50 a month because 400 / 8 = <<400/8=50>>50
He needs to do 5 tasks a month because 50 / 10 = <<50/10=5>>5
#### 5

Q: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
A:
72


Baseline eval:  20%|‚ñà‚ñà        | 1/5 [00:01<00:06,  1.55s/it]

[['72', '72', '72', '72', '72'], ['48 + 24 = 72', '48 + (48 / 2) = 48 + 24 = 72', '48 + 24 = 72', '48 + 24 = 72', '48 + (48 / 2) = 48 + 24 = 72'], ['72', '72', '72', '72', '144']]
0.9333333333333333
üîç Top-K Indices: [12, 177, 221]
------------Demo 1--------
Q: Jasper will serve charcuterie at his dinner party. He buys 2 pounds of cheddar cheese for $10, a pound of cream cheese that cost half the price of the cheddar cheese, and a pack of cold cuts that cost twice the price of the cheddar cheese. How much does he spend on the ingredients?
A: A pound of cream cheese cost $10 / 2 = $<<10/2=5>>5.
A pack of cold cuts cost $10 x 2 = $<<10*2=20>>20.
Jasper spent $10 + $5 + $20 = $<<10+5+20=35>>35 on the ingredients.
#### 35

Q: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
A:
10


Baseline eval:  40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [00:03<00:04,  1.50s/it]

[['10', '10', '10', '10', '10'], ['10', '10', '10', '10', '10'], ['10', '10', '10', '10', '10']]
1.0
üîç Top-K Indices: [177, 37, 176]
------------Demo 1--------
Q: Simon wanted to buy flowers that his mom could plant for Mother's Day.  The garden center was offering 10% off all purchases.  He bought 5 pansies at $2.50 each, one hydrangea that cost $12.50 and 5 petunias that cost $1.00 each.  If he paid with a $50 bill, how much change would Simon receive back from his purchase?
A: 5 pansies at $2.50 each is 5*2.50 = $<<5*2.5=12.50>>12.50
5 petunias at $1.00 each 5*1 = $<<5*1=5.00>>5.00
All total he spends 12.50+12.50+5.00 = $<<12.50+12.50+5.00=30.00>>30.00
The sale is 10% off so 30*.10 = $<<30*.10=3.00>>3.00
The purchase total now comes to 30-3 = $<<30-3=27.00>>27.00
He pays with a $50 bill so 50-27 = $<<50-27=23.00>>23.00
#### 23

Q: Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpo

Baseline eval:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [00:05<00:03,  1.91s/it]

[['50', '50', '50', '50', '50'], ['55', '25', '25', '25', '70'], ['55', '100 - (50 + 15 + 30) = 100 - 95 = 5', '55', '55', '55']]
0.9333333333333333
üîç Top-K Indices: [5, 49, 8]
------------Demo 1--------
Q: Albert is wondering how much pizza he can eat in one day. He buys 2 large pizzas and 2 small pizzas. A large pizza has 16 slices and a small pizza has 8 slices. If he eats it all, how many pieces does he eat that day?
A: He eats 32 from the largest pizzas because 2 x 16 = <<2*16=32>>32
He eats 16 from the small pizza because 2 x 8 = <<2*8=16>>16
He eats 48 pieces because 32 + 16 = <<32+16=48>>48
#### 48

Q: Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?
A:
54


Baseline eval:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [00:07<00:02,  2.14s/it]

[['54', '36', '48', '33', '54'], ['48', '48', '48', '48', '48'], ['54', '36', '54', '54', '54']]
0.0
üîç Top-K Indices: [206, 156, 175]
------------Demo 1--------
Q: John volunteers at a shelter twice a month for 3 hours at a time.  How many hours does he volunteer per year?
A: He volunteers 2*12=<<2*12=24>>24 times a year
So he volunteers for 24*3=<<24*3=72>>72 hours
#### 72

Q: James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year?
A:
624


Baseline eval: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:09<00:00,  1.92s/it]

[['624', '312', '624', '312', '624'], ['312', '312', '312', '312', '312'], ['3 pages √ó 2 friends √ó 2 times a week √ó 52 weeks = 3 √ó 2 √ó 2 √ó 52 = 624', '312', '312', '312', '312']]
0.26666666666666666

üìä  Baseline accuracy on GSM8K_to_infer: 62.667%





In [None]:
# Write the encoder's model and tokenizer to one directory so they can be
# reloaded together later.
save_path = "./updated_mathbert"  # output directory, relative to the notebook's working dir
for component_name in ("model", "tokenizer"):
    # Model first, then tokenizer; each is looked up on the encoder just before saving.
    getattr(encoder, component_name).save_pretrained(save_path)

# LOADING - uncomment the lines below to reload the model and tokenizer saved above

# from transformers import BertTokenizer, BertModel

# model = BertModel.from_pretrained("./updated_mathbert")
# tokenizer = BertTokenizer.from_pretrained("./updated_mathbert")
