In [None]:

import os
import re

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModel # Used for checking tokenizer properties if needed
from transformers.trainer_utils import get_last_checkpoint
from trl import GRPOConfig, GRPOTrainer

# --- 1. Dataset Loading and Preprocessing ---

def extract_final_answer_from_gsm8k(text: str) -> float | None:
    """Extracts the final numerical answer from a GSM8K answer string."""
    # Example format: "... The final answer is #### <number>"
    match = re.search(r"####\s*([\d\.\,]+)", text)
    if match:
        try:
            # Remove commas for thousands separator before converting to float
            return float(match.group(1).replace(',', ''))
        except ValueError:
            return None
    return None

def extract_intermediate_steps(text):
    """
    Collect calculator annotations of the form ``<<expr=result>>`` from `text`.

    Returns a list of ``(expr, value)`` tuples in order of appearance, where
    ``expr`` is stripped of surrounding whitespace and ``value`` is ``result``
    converted to float. Annotations whose result cannot be converted to float
    are dropped.
    """
    steps = []
    for found in re.finditer(r"<<\s*([^<>=]+)\s*=\s*([^<>]+)\s*>>", text):
        lhs, rhs = found.groups()
        try:
            number = float(rhs.strip())
        except ValueError:
            # right-hand side is not numeric: ignore this annotation
            continue
        steps.append((lhs.strip(), number))
    return steps


def preprocess_gsm8k(examples):
    """
    Batched mapping function: turn raw GSM8K rows into GRPO training rows.

    `examples` is a batch with parallel "question" and "answer" lists. Rows
    whose final answer cannot be parsed are left out, so the returned columns
    may be shorter than the input batch.

    Returns a dict with three parallel lists:
      - "prompt": the question wrapped as "Question: ...\\nAnswer:"
      - "answer_value": the parsed final answer (float)
      - "inter_steps": per row, a list of "expr=value" strings
    """
    columns = {"prompt": [], "answer_value": [], "inter_steps": []}
    for question, answer in zip(examples["question"], examples["answer"]):
        final_value = extract_final_answer_from_gsm8k(answer)
        parsed_steps = extract_intermediate_steps(answer)
        if final_value is None:
            continue
        columns["prompt"].append(f"Question: {question}\nAnswer:")
        columns["answer_value"].append(final_value)
        # each (expr, val) pair is flattened into one "expr=val" string
        columns["inter_steps"].append(
            [f"{expr}={val}" for expr, val in parsed_steps]
        )
    return columns

# Load the GSM8K "main" configuration, train split (columns 'question' and 'answer').
print("Loading GSM8K dataset...")
try:
    # The full train split is loaded here. For faster debugging, swap in the
    # commented-out line below, which keeps only the first 200 examples.
    # raw_train_dataset = load_dataset("gsm8k", "main", split="train").select(range(200)) # Small subset
    raw_train_dataset = load_dataset("gsm8k", "main", split="train")
except Exception as e:
    # Print a hint for the reader, then re-raise so the cell still fails loudly.
    print(f"Failed to load 'gsm8k/main' split: {e}")
    print("Please ensure the 'gsm8k' dataset is available or check its identifier on Hugging Face Datasets.")
    raise

print(f"Loaded {len(raw_train_dataset)} examples from GSM8K train split.")

# Preprocess the dataset
# GRPOTrainer expects a 'prompt' field by default.
# Other columns (like 'answer_value' here) are passed as **kwargs to the reward function(s).
# preprocess_gsm8k omits rows whose final answer cannot be parsed, so the
# mapped dataset can be shorter than the raw split.
train_dataset = raw_train_dataset.map(
    preprocess_gsm8k,
    batched=True, # Process examples in batches
    remove_columns=raw_train_dataset.column_names, # Remove original 'question' and 'answer' columns
    desc="Preprocessing GSM8K dataset"
)

# Defensive filter: preprocess_gsm8k only appends rows with a parsed answer,
# so this is expected to be a no-op; it guards against None values slipping in.
train_dataset = train_dataset.filter(
    lambda example: example['prompt'] is not None and example['answer_value'] is not None
)

# Fail early rather than handing an empty dataset to the trainer.
if len(train_dataset) == 0:
    raise ValueError(
        "Preprocessing resulted in an empty dataset. "
        "Check dataset loading and the preprocessing function."
    )
print(f"Dataset preprocessed. Number of examples: {len(train_dataset)}")
print(f"Example preprocessed data: {train_dataset[0]}")

# --- 2. Reward Function Definition ---
import re


def evaluate_intermediate_steps(predicted_steps, expected_steps, tol=1e-3):
    """
    Count how many expected ``(expr, value)`` steps occur among the predictions.

    An expected step counts as matched when some predicted step has the exact
    same expression string and a value strictly within `tol` of the expected
    value. The position of the step in either list is irrelevant.

    Returns ``(matched, total)`` where ``total == len(expected_steps)``.
    """
    def is_reproduced(step):
        want_expr, want_val = step
        # any() stops at the first matching prediction
        return any(
            want_expr == got_expr and abs(got_val - want_val) < tol
            for got_expr, got_val in predicted_steps
        )

    matched = sum(1 for step in expected_steps if is_reproduced(step))
    return matched, len(expected_steps)


def parse_step_str(step_str: str) -> tuple[str, float]:
    """Split an ``"expr=value"`` string at its first ``=`` into ``(expr, float(value))``.

    Raises ``ValueError`` when the string has no ``=`` or when the part after
    the first ``=`` cannot be converted to float.
    """
    pieces = step_str.split("=", 1)
    expression, raw_value = pieces
    return expression, float(raw_value)




# import torch
# import torch.nn.functional as F
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# # Initialize MathBERT model and tokenizer (downloaded once)
# mathberttokenizer = AutoTokenizer.from_pretrained("tbs17/MathBERT", )
# mathbertmodel = AutoModel.from_pretrained("tbs17/MathBERT", ).to(device)
# mathbertmodel.eval()

# def embed_text(text: str) -> torch.Tensor:
#     """
#     Returns a pooled embedding for the input text using MathBERT.
#     """
#     inputs = mathberttokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
#     with torch.no_grad():
#         outputs = mathbertmodel(**inputs)
#         # Use [CLS] token representation
#         return outputs.last_hidden_state[:, 0]


# def cosine_similarity(a: torch.Tensor, b: torch.Tensor) -> float:
#     """
#     Compute cosine similarity between two 1D tensors.
#     """
#         # Ensure tensors are on same device
#     a, b = a.to(device), b.to(device)
#     return F.cosine_similarity(a, b, dim=-1).item()

from typing import List, Any


def math_accuracy_reward(
    completions: List[str],
    **kwargs: Any
) -> List[float]:
    """
    Reward function combining final-answer accuracy and intermediate-step
    correctness.

    Per completion:
      * +2.0 when the number after ``####`` is within 1e-3 of the reference
        answer, otherwise -0.5 (also when either value is missing).
      * plus ``2 * correct / total`` for the reference ``<<expr=value>>`` steps
        that the completion reproduces; nothing is added when the reference
        has no steps.

    The MathBERT embedding-similarity term is currently disabled and does not
    contribute to the reward.

    Args:
        completions: Generated texts to score.
        **kwargs: Extra columns forwarded to the reward function. Only
            ``answer_value`` (reference answers) and ``inter_steps`` (lists of
            ``"expr=value"`` strings) are read; both are indexed in parallel
            with `completions`. Missing, ``None`` or too-short columns are
            treated as "no reference available" for the affected items.

    Returns:
        One float reward per completion, in the same order.
    """
    # `or []` also covers a column that is present but None.
    true_answers = kwargs.get("answer_value") or []
    expected_steps_str = kwargs.get("inter_steps") or []

    rewards: List[float] = []
    for i, comp in enumerate(completions):
        r = 0.0

        # Final answer reward
        pred_ans = extract_final_answer_from_gsm8k(comp)
        true_ans = true_answers[i] if i < len(true_answers) else None
        if pred_ans is not None and true_ans is not None and abs(pred_ans - true_ans) < 1e-3:
            r += 2.0
        else:
            r -= 0.5

        # Intermediate-step reward. The index is guarded the same way as for
        # the answers so a short or missing column cannot raise IndexError.
        item_steps = expected_steps_str[i] if i < len(expected_steps_str) else None
        if item_steps:
            predicted_steps = extract_intermediate_steps(comp)
            expected = [parse_step_str(s) for s in item_steps]
            correct, total = evaluate_intermediate_steps(predicted_steps, expected)
            if total > 0:
                r += (correct / total) * 2

        # Disabled: MathBERT embedding-similarity term.
        # pred_embedding = embed_text(comp.replace("\n", " "))
        # r += cosine_similarity(pred_embedding, embed_text(" ".join(item_steps)))

        rewards.append(r)
    return rewards


# --- 3. Model and Training Configuration ---

# Hub identifier of the SFT checkpoint that GRPO starts from.
model_name = "Ashed00/SmolMath1-SFT-gsm8k"
# NOTE(review): this tokenizer object is not passed to GRPOTrainer below (the
# trainer only receives `model_name`), so the pad-token fallback here does not
# obviously reach training — confirm whether it should be handed to the trainer.
tokenizer = AutoTokenizer.from_pretrained(model_name,)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Set tokenizer.pad_token to tokenizer.eos_token: {tokenizer.eos_token}")



# GRPO Configuration
training_args = GRPOConfig(
    # --- Basic setup ---
    # model_name contains a "/", so this is a nested relative path
    # ("Ashed00/SmolMath1-SFT-gsm8k-gsm8k-GRPO-custom").
    output_dir=f"{model_name}-gsm8k-GRPO-custom", # Directory to save model and logs


    # --- GRPO specific parameters ---
    beta=0.2,  # presumably the KL-penalty coefficient — confirm against the GRPOConfig docs

    # --- Training hyperparameters ---
    learning_rate=5.0e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=4,
    logging_steps=20,   # matches the 20-step spacing of the recorded loss table
    save_steps=4000,
    report_to="wandb",
    num_generations=4,  # presumably completions sampled per prompt — TODO confirm
    max_grad_norm = 0.8,



    # --- Technical settings ---
    # Keep the extra dataset columns ('answer_value', 'inter_steps') that the
    # reward function reads.
    remove_unused_columns=False,

)

# --- 4. GRPOTrainer Initialization and Training ---

print("Initializing GRPOTrainer...")
trainer = GRPOTrainer(
    model=model_name,
    reward_funcs=math_accuracy_reward,
    args=training_args,
    train_dataset=train_dataset,
)

# Resume only when a checkpoint actually exists. Passing
# resume_from_checkpoint=True unconditionally raises ValueError on a fresh run
# (no checkpoint in output_dir), which breaks Restart & Run All.
# get_last_checkpoint lists the directory, so guard against it not existing yet.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)

print("Starting GRPO training...")
if last_checkpoint is not None:
    print(f"Resuming from checkpoint: {last_checkpoint}")
# None means "start from scratch"; a path resumes from that checkpoint.
trainer.train(resume_from_checkpoint=last_checkpoint)

print("Training finished.")

# --- 5. Save the final model ---
final_model_path = f"{training_args.output_dir}/final_model"
trainer.save_model(final_model_path)
print(f"Final trained model saved to: {final_model_path}")

# Quick sanity check: run one prompt through the model that was just saved.
from transformers import pipeline
# Prompt in the "Question: ...\nAnswer:" shape used for the training prompts.
question = "Question: A farmer has 10 apples. He gives 3 to his friend and buys 5 more. How many apples does he have now? \nAnswer:"
# Weights are loaded from final_model_path; the tokenizer is loaded from the
# original model identifier (model_name).
text_generator = pipeline("text-generation", model=final_model_path, tokenizer=model_name)
result = text_generator(question, max_new_tokens=100) # caps the completion at 100 new tokens
print(f"\n--- Inference Example ---")
# NOTE: in the recorded output 'generated_text' already starts with the prompt,
# so the question is printed twice.
print(f"{question}")
print(f"{result[0]['generated_text']}")

Loading GSM8K dataset...


Using the latest cached version of the dataset since gsm8k couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'main' at /home/ivlabs/.cache/huggingface/datasets/gsm8k/main/0.0.0/e53f048856ff4f594e959d75785d2c2d37b678ee (last modified on Sat Jul  5 00:09:21 2025).


Loaded 7473 examples from GSM8K train split.
Dataset preprocessed. Number of examples: 7470
Example preprocessed data: {'prompt': 'Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\nAnswer:', 'answer_value': 72.0, 'inter_steps': ['48/2=24.0', '48+24=72.0']}
Initializing GRPOTrainer...


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Starting GRPO training...


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
[34m[1mwandb[0m: Currently logged in as: [33mbt22ece049[0m ([33mbt21ece003-nit-nagpur[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss
8020,0.0182
8040,0.0178
8060,0.0198
8080,0.0206
8100,0.0173
8120,0.0177
8140,0.0185
8160,0.0167
8180,0.0148
8200,0.0177


Training finished.
Final trained model saved to: Ashed00/SmolMath1-SFT-gsm8k-gsm8k-GRPO-custom/final_model


Device set to use cuda:0



--- Inference Example ---
Question: A farmer has 10 apples. He gives 3 to his friend and buys 5 more. How many apples does he have now? 
Answer:
Question: A farmer has 10 apples. He gives 3 to his friend and buys 5 more. How many apples does he have now? 
Answer: The farmer has 10 + 3 = <<10+3=13>>13 apples.
The farmer has 13 + 5 = <<13+5=18>>18 apples now.
#### 18


In [None]:
# !huggingface-cli login

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import re
from tqdm import tqdm # for a progress bar

# Load the fine-tuned model and tokenizer
# NOTE(review): this identifier is not the final_model_path written by the
# training cell above — confirm this is the checkpoint you intend to evaluate.
model_path = "Macromrit/SmolLM2-135M-GRPO-Trained-For-Reasoning"  # Model identifier/path to evaluate
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Ensure pad token is set for generation. For some models, especially causal LMs,
# the EOS token is often used as the pad token during batch inference.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    # Mirror the id explicitly so pad_token_id is defined for generate().
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Move the model to the selected device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval() # Set model to evaluation mode
print(f"Using device for evaluation: {device}")

# Load the GSM8K dataset ("main" configuration) and keep its test split
dataset = load_dataset("gsm8k", "main")
eval_dataset = dataset["test"]

# Function to extract the answer from the model's generation using regex

def extract_answer_from_generation(generation_text):
    """Extract the predicted numeric answer from generated text.

    The number after the first ``#### `` marker is preferred. When there is no
    marker followed by a number, the last number anywhere in the text is used.
    Numbers may be negative, decimal, and may use comma thousands separators
    (``1,234`` is read as 1234, not as 1 or 234).

    Args:
        generation_text: Text produced by the model (prompt already removed).

    Returns:
        The extracted value as a float, or ``None`` if the text has no number.
    """
    # Either strictly comma-grouped thousands ("1,234", "12,345,678.5") or a
    # plain integer/decimal. The (?!\d) keeps "1,2345" from being read as 1,234.
    number = r"-?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?"

    def to_float(token):
        try:
            return float(token.replace(",", ""))
        except ValueError:
            return None

    # First, try the explicit '#### <number>' pattern
    marked = re.search(r"####\s*(" + number + r")", generation_text)
    if marked:
        return to_float(marked.group(1))

    # Otherwise fall back to the last number in the entire text
    candidates = re.findall(number, generation_text)
    if candidates:
        return to_float(candidates[-1])

    return None


# Function to extract the ground truth answer
def extract_ground_truth_answer(answer_text):
    """Extract the reference answer that follows the ``####`` marker.

    Accepts negative numbers, decimals and comma thousands separators, so
    ``#### 1,000`` is read as 1000.0 rather than 1.0.

    Args:
        answer_text: A GSM8K reference answer string.

    Returns:
        The number after the first ``####`` marker as a float, or ``None``
        when no marker followed by a number is present.
    """
    match = re.search(
        r"####\s*(-?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?)",
        answer_text,
    )
    if match is None:
        return None
    try:
        # Drop thousands separators before converting.
        return float(match.group(1).replace(",", ""))
    except ValueError:
        return None

# Evaluation parameters
batch_size = 16 # Adjust batch size based on your GPU memory
max_new_tokens = 256 # Adjust as needed based on expected answer length

correct_predictions = 0
total_predictions = 0

print("Starting batched evaluation...")

# Walk the test split in contiguous slices of `batch_size` examples
for i in tqdm(range(0, len(eval_dataset), batch_size)):
    batch = eval_dataset[i:i+batch_size]
    # Slicing the dataset yields a dict of columns; copy the two we need
    questions = list(batch["question"])
    ground_truth_answer_texts = list(batch["answer"])

    # Same prompt template for every question in the batch
    prompts = [f"Question: {q.strip()}\nAnswer:" for q in questions]

    # Left-padded batch so every prompt ends at the same position
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, padding_side='left').to(device)

    # One generation per prompt, without tracking gradients
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Every row of the padded batch has the same prompt length
    prompt_len = inputs.input_ids.shape[1]
    for sequence, reference_text in zip(outputs, ground_truth_answer_texts):
        # Decode only the tokens produced after the prompt
        generated_text = tokenizer.decode(sequence[prompt_len:], skip_special_tokens=True)

        predicted_answer = extract_answer_from_generation(generated_text)
        ground_truth_answer = extract_ground_truth_answer(reference_text)
        total_predictions += 1

        # Count as correct only when both values exist and agree within 1e-6
        if (
            predicted_answer is not None
            and ground_truth_answer is not None
            and abs(predicted_answer - ground_truth_answer) < 1e-6
        ):
            correct_predictions += 1



if total_predictions == 0:
    print("No predictions were made or no answers could be extracted for comparison.")
else:
    accuracy = correct_predictions / total_predictions
    print("\nEvaluation Complete:")
    print(f"Correct Predictions: {correct_predictions}")
    print(f"Total Predictions: {total_predictions}")
    print(f"Accuracy: {accuracy:.4f}")

In [None]:
# Single-prompt inference with the `model` and `tokenizer` objects loaded in
# the evaluation cell above.
from transformers import pipeline
# Use the exact "Question: ...\nAnswer:" template from training and evaluation;
# the previous prompt had a stray space before "Answer:".
question = "Question: Nancy's old washing machine could only wash 9 pieces of clothing at a time. If she had to wash 19 shirts and 8 sweaters how many loads would she have to do?\nAnswer:"
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = text_generator(question, max_new_tokens=200) # caps the completion at 200 new tokens
print(f"\n--- Inference Example ---")
# print(f"{question}")
print(f"{result[0]['generated_text']}")