In [2]:
# Main training script
"""
Supervised Finetuning (Priming) stage

The model will go through standard instruction fine tuning.
The dataset has been artificially generated using a subset of the original dataset and a larger,
more trust worthy model (deepseek-r1-0528) that generated the reasoning and answers.
"""

# TODO

"""
Reinforcement Learning (RL) stage

Here the model will be trained via GRPO with accuracy based reward functions.
(minimizing l2 distance between predicted and actual values)

Since smaller models generally need more context length to reason to reach the same performance as larger models, we will iteratively increase the context length.
For the first training run the context length will be set to 4096, when the model tries to exceed the context length >2% of the time, we will increase the context length by 4096 and continue training.
We will repeat this pattern until the context length reaches a maximum of 12228 tokens.

This should be approximately ~2000 training steps.
"""

'\nReinforcement Learning (RL) stage\n\nHere the model will be trained via GRPO with accuracy based reward functions.\n(minimizing l2 distance between predicted and actual values)\n\nSince smaller models generally need more context length to reason to reach the same performance as larger models, we will iteratively increase the context length.\nFor the first training run the context length will be set to 4096, when the model tries to exceed the context length >2% of the time, we will increase the context length by 4096 and continue training.\nWe will repeat this pattern until the context length reaches a maximum of 12228 tokens.\n\nThis should be approximately ~2000 training steps.\n'

In [3]:
!pip3 install -q trl

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/376.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m376.2/376.2 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/494.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m121.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m87.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import re
from fractions import Fraction
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOTrainer, GRPOConfig
from datasets import load_dataset

# Configuration
MODEL_NAME = "Qwen/Qwen3-0.6B"
DATA_PATH = "/content/train.csv"
OUTPUT_DIR = "grpo_math_model"


In [5]:

# 1. Load your dataset from a local CSV
dataset = load_dataset(
    "csv",
    data_files={"train": DATA_PATH},
    split="train"
)

# 2. Preprocess: add reasoning instruction to each task
def add_instruction(example):
    example["prompt"] = (
        f"Problem: {example['task'].strip()}\n\n"
        "Please solve this step by step:\n"
        "1. First, understand what is being asked\n"
        "2. Show your reasoning\n"
        "3. Provide your final answer in brackets like [52]\n\n"
        "Your response should end with your final numerical answer in brackets."
    )
    return example

dataset = dataset.map(add_instruction)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:

# 3. Define parse and reward functions
def parse_answer(s):
    """Extracts answer from various formats, returns float."""
    if s is None:
        return None

    s = str(s)

    # Try different answer formats in order of preference
    patterns = [
        r"\[(\d+(?:\.\d+)?)\]",           # [52] format
        r"\\boxed\{(\d+(?:\.\d+)?)\}",    # \boxed{52} format
        r"boxed\{(\d+(?:\.\d+)?)\}",      # boxed{52} format
        r"answer is (\d+(?:\.\d+)?)",     # "answer is 52"
        r"(\d+(?:\.\d+)?)(?:\s*$|\s*\n)", # number at end
    ]

    for pattern in patterns:
        match = re.search(pattern, s, re.IGNORECASE)
        if match:
            val = match.group(1).strip()
            break
    else:
        return None

    if val.lower() in ['answer', 'solution', '']:
        return None

    try:
        return float(val)
    except ValueError:
        try:
            return float(Fraction(val))
        except (ValueError, ZeroDivisionError):
            return None

def reward_func(prompts, completions, **kwargs):
    """Reward based on negative absolute error to true answer."""
    rewards = []
    for prompt, completion in zip(prompts, completions):
        # Extract the answer from the original dataset by finding the example
        # that matches this prompt
        matching_example = None
        for example in dataset:
            if example["prompt"] == prompt:
                matching_example = example
                break

        if matching_example is None:
            rewards.append(-1.0)
            continue

        # Debug: print what we're trying to parse
        print(f"Raw answer from dataset: {repr(matching_example['answer'])}")
        print(f"Completion: {repr(completion[:100])}...")  # First 100 chars

        true_val = parse_answer(matching_example["answer"])
        pred = parse_answer(completion)

        print(f"Parsed true_val: {true_val}, pred: {pred}")

        if pred is None or true_val is None:
            rewards.append(-1.0)
        else:
            rewards.append(-abs(pred - true_val))
    return rewards

In [7]:

# 5. Initialize model and tokenizer
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [8]:
# 4. Configure GRPO training arguments
training_args = GRPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    logging_steps=10,
    temperature=0.6,
    top_p=0.95,
)
# 6. Initialize GRPOTrainer and train
trainer = GRPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    reward_funcs=reward_func,
)

In [9]:
trainer.train()

print("GRPO training complete!")



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdenissud[0m ([33mdenissud-msu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`generation_config` default values have been modified to match model-specific defaults: {'bos_token_id': 151643}. If this is not desired, please set these values explicitly.


Raw answer from dataset: '[52]'
Completion: ' The answer is unique.\nStep 1: Determine the total number of small cubes that touch the bottom of th'...
Parsed true_val: 52.0, pred: None
Raw answer from dataset: '[52]'
Completion: ' So, [52] and not [52] in any case.\n\nOkay, so I need to figure out how many small cubes in a 4x4x4 o'...
Parsed true_val: 52.0, pred: 52.0
Raw answer from dataset: '[52]'
Completion: ' Your answer must be in English.\nTo solve this, you may use the formula for the volume of a cube, bu'...
Parsed true_val: 52.0, pred: None
Raw answer from dataset: '[52]'
Completion: " So, if your answer is 100, then put \\boxed{100} in brackets.\n[52]\n\nOkay, let's see. I need to find "...
Parsed true_val: 52.0, pred: 52.0
Raw answer from dataset: '[52]'
Completion: " Please don't use markdown, just plain text.\n\nTo solve the problem, first, find the total number of "...
Parsed true_val: 52.0, pred: 52.0
Raw answer from dataset: '[52]'
Completion: ' [52]\n\nTo solve this pr

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.16 GiB. GPU 0 has a total capacity of 14.74 GiB of which 680.12 MiB is free. Process 2184 has 14.07 GiB memory in use. Of the allocated memory 13.49 GiB is allocated by PyTorch, and 470.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)