### Importing the required modules

In [None]:
import re
import sys
import torch
import pandas as pd
from pathlib import Path
from trl import GRPOTrainer, GRPOConfig
from transformers import AutoTokenizer, AutoModelForCausalLM

# Add the parent directory to the system path
sys.path.append(str(Path().resolve().parent.parent))

# Import local dependencies
from src.utils import get_device, set_seed
from src.data_processing import generate_response
from src.hf import hf_login, load_hf_dataset, dataset_to_pandas

### Setting up the environment

In [None]:
# Authenticate with Hugging Face through the project's hf_login helper (src.hf).
# NOTE(review): presumably reads a token from the environment or prompts for one — confirm in src/hf.
hf_login()

In [None]:
# Ask the project helper which compute device should be used
device = get_device()

# CUDA-specific features are enabled only when PyTorch sees a GPU
# and the selected device is itself a CUDA device
if torch.cuda.is_available():
    use_cuda = "cuda" in str(device).lower()
else:
    use_cuda = False

# Report the selected device
print(f"Detected device: {device}")

### Constants, hyperparameters and model configurations

In [None]:
# Seed handed to set_seed for reproducibility
seed = 42

# Fraction of the data reserved for the test split
test_size = 0.2

# Identifier of the base model
model_id = "Qwen/Qwen2.5-0.5B-Instruct"

# Name of the dataset on the Hugging Face Hub
dataset_name = "openai/gsm8k"

# Where the fine-tuned model is meant to be saved:
# <two directories above the working directory>/saved_models/<model name>_grpo
model_path = (
    Path().resolve().parent.parent
    / "saved_models"
    / (model_id.rsplit("/", 1)[-1] + "_grpo")
)

In [None]:
# Apply the configured seed through the project's set_seed helper (src.utils) for reproducibility.
# NOTE(review): which random number generators it seeds is defined in src/utils — confirm there.
set_seed(seed)

### Data loading

In [None]:
# Fetch the "train" and "test" splits of the "main" configuration from the Hugging Face Hub
train_dataset, test_dataset = (
    load_hf_dataset(dataset_name, config_name="main", split=split_name)
    for split_name in ("train", "test")
)

In [None]:
# Configure pandas so long text cells and wide frames are shown in full
pd.set_option("display.width", 0)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

# Turn the raw training split into a DataFrame for inspection
train_dataset_df = dataset_to_pandas(train_dataset)

# Show the first five rows
display(train_dataset_df.head(5))

In [None]:
def _normalize_answer(answer: str) -> str:
    """Strip surrounding whitespace and thousands separators ("1,000" -> "1000")."""
    return answer.strip().replace(",", "")


def reward_func(completions: list, ground_truth: list[str], **kwargs) -> list[float]:
    r"""
    Custom reward function that checks if the model's boxed answer matches the ground truth answer.

    The content of the first \boxed{...} in each completion is compared with the
    corresponding ground truth after light normalisation (surrounding whitespace and
    thousands separators are ignored), so "\boxed{ 1,000 }" matches "1000".

    Args:
        completions (list): Model completions. Each item is either a list of chat
            messages (the first message's 'content' is scored) or a plain string.
        ground_truth (list[str]): Ground truth answers, aligned with `completions`.
            A `None` entry (no reference answer available) always yields 0.0.
        **kwargs: Extra dataset columns forwarded by the trainer; unused.

    Returns:
        list[float]: List of rewards (1.0 for correct, 0.0 for incorrect).
    """

    rewards = []
    for completion, expected in zip(completions, ground_truth):
        # Conversational completions are lists of message dicts; standard ones are plain strings
        text = completion if isinstance(completion, str) else completion[0]["content"]

        # Regular expression to capture content inside \boxed{}
        match = re.search(r"\\boxed\{(.*?)\}", text)
        predicted = match.group(1) if match else ""

        # A missing reference can never be matched; otherwise compare normalised answers
        is_correct = (
            expected is not None
            and _normalize_answer(predicted) == _normalize_answer(str(expected))
        )
        rewards.append(1.0 if is_correct else 0.0)

    return rewards

In [None]:
def post_processing(example: dict) -> dict:
    """
    Post-processes a dataset example to extract the ground truth answer and format the prompt.

    The final answer is the number following the '####' marker in the 'answer' field.
    Thousands separators are removed and decimals are kept, so '#### 1,000' yields
    '1000' rather than being truncated to '1'.

    Args:
        example (dict): A dictionary containing the dataset example with 'question' and 'answer' keys.

    Returns:
        dict: The same example, modified in place, with added 'ground_truth' (str, or
        None when no '####' answer is found) and 'prompt' (list of chat messages) keys.
    """

    # Define the system prompt
    SYSTEM_PROMPT = (
        "You are a helpful assistant that solves problems step-by-step. "
        "Always include the final numeric answer inside \\boxed{}."
    )

    # Extract the ground truth answer: optional sign, digits with optional
    # thousands separators, optional decimal part; must end on a digit
    match = re.search(r"####\s*(-?[\d,]*\.?\d+)", example["answer"])
    example["ground_truth"] = match.group(1).replace(",", "") if match else None

    # Format the prompt with system and user roles
    example["prompt"] = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": example["question"]}
    ]

    # Return the modified example
    return example

In [None]:
# Add the 'ground_truth' and 'prompt' columns to the training split,
# then drop the raw text columns that are no longer needed
train_dataset = train_dataset.map(post_processing)
train_dataset = train_dataset.remove_columns(["question", "answer"])

# Same two steps for the test split
test_dataset = test_dataset.map(post_processing)
test_dataset = test_dataset.remove_columns(["question", "answer"])

In [None]:
# Convert the processed training dataset (now holding the 'ground_truth' and
# 'prompt' columns) to a pandas DataFrame for inspection
train_df = dataset_to_pandas(train_dataset)

# Display the first few rows to check the post-processing result
display(train_df.head())

### Tokenizer

In [None]:
# Load the tokenizer that belongs to the base model identified by model_id
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Fall back to the end-of-sequence token for padding when the tokenizer
# does not define a dedicated padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

### Building the model

In [None]:
# Instantiate the pretrained causal language model; device placement is left to
# the library ("auto") and the low-CPU-memory loading path is requested
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    low_cpu_mem_usage=True,
)

### Training the model

In [None]:
# Mixed precision settings: pinned memory and bfloat16 are only enabled on CUDA,
# and bfloat16 additionally requires hardware support
use_pin_memory = bool(use_cuda)
bf16 = bool(use_cuda and torch.cuda.is_bf16_supported())

# Define GRPO training configuration
config = GRPOConfig(
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps = 8,
    eval_strategy = "steps",
    # Without an explicit value, eval_steps falls back to logging_steps (2), which
    # would run a full generation-based evaluation every two optimizer steps
    eval_steps = 100,
    num_generations = 4,
    num_train_epochs = 1,
    learning_rate = 5e-6,
    logging_steps = 2,
    weight_decay = 0.01,
    lr_scheduler_type = "cosine",
    dataloader_pin_memory = use_pin_memory,
    bf16 = bf16,
    # Pass the notebook seed on: the trainer re-seeds from its own config at start-up
    seed = seed
)

In [None]:
# Wire the model, tokenizer, reward function and both dataset splits into the GRPO trainer
grpo_trainer = GRPOTrainer(
    model=model,
    args=config,
    processing_class=tokenizer,
    reward_funcs=reward_func,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run the GRPO fine-tuning loop
grpo_trainer.train()

### Evaluation

In [None]:

# Accumulators for the generated answers and their reference answers
all_preds = []
all_labels = []

# Generate one answer per test example
for example in test_dataset:
    # Each example is expected to be a plain dictionary
    assert isinstance(example, dict)

    # Pull out the chat prompt and the reference answer
    input_prompt, ground_truth = example["prompt"], example["ground_truth"]

    # Let the model answer the prompt through the project helper
    response = generate_response(model=model, tokenizer=tokenizer, full_message=input_prompt, stream=True)

    # Keep the reference and the answer; the answer is wrapped as a single
    # assistant message so it has the shape reward_func expects
    all_labels.append(ground_truth)
    all_preds.append([{"role": "assistant", "content": response}])

    # Show the reference answer
    print("Ground truth: ", ground_truth)

In [None]:

# Score every stored prediction against its label with the same reward
# function used during training (one float reward per example)
rewards = reward_func(all_preds, all_labels)

# The mean reward is reported as the evaluation accuracy, formatted as a percentage
accuracy = sum(rewards) / len(rewards)
print(f"Evaluation Accuracy: {accuracy:.2%}")