In this notebook, I demonstrate a training pipeline for a reasoning model (DeepSeek-R1) on a math reasoning task. I use GRPOTrainer from TRL (Hugging Face) and Unsloth for fast inference.

# Dependencies

We rely on Unsloth, vLLM, and TRL (Transformer Reinforcement Learning) as the core packages for training and inference.

In [None]:
%%capture
# `%%capture` hides the installers' output for this cell.
# NOTE(review): unsloth, vllm and pillow are installed unpinned. The run recorded
# further down reports Unsloth 2025.3.17, vLLM 0.7.3 and Transformers 4.49.0;
# pin those versions if this notebook needs to be reproduced exactly.
!pip install unsloth vllm
!pip install --upgrade pillow
# If you are running this notebook locally (not colab), you need to install `diffusers` too
# !pip install diffusers

# Temporarily install a specific TRL nightly version that supports GRPO
# (pinned to an exact commit so the GRPO API used below does not drift).
!pip install git+https://github.com/huggingface/trl.git@e95f9fb74a3c3647b86f251b7e230ec51c64b72b

In [1]:
from unsloth import FastLanguageModel, PatchFastRL

# Execute the Patch
# Applies Unsloth's GRPO patch. This cell runs before `trl` is imported in the
# Training section; presumably that ordering is required for the patch to take
# effect -- confirm against the Unsloth documentation before moving imports.
PatchFastRL("GRPO", FastLanguageModel)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-24 11:41:05 __init__.py:207] Automatically detected platform cuda.


In [2]:
import copy
import datetime
import json
import math
import os
import re

import numpy as np
import torch
import torch.nn.functional as F
from Levenshtein import ratio as levenshtein_ratio
from peft import LoraConfig, get_peft_model
from torch.utils.data import DataLoader, Dataset as TorchDataset
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AdamW,
    BitsAndBytesConfig,
    PrinterCallback
)

In [3]:
class CFG:
    """Central place for every knob a reader might want to tune."""

    # --- model ---------------------------------------------------------
    model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    # Marker used by `gen` to cut the prompt off decoded generations.
    splitter = '<｜Assistant｜>'
    USE_PEFT = True

    # --- data ----------------------------------------------------------
    # Number of (filtered) problems kept before the train/test split.
    MAX_TRAIN = 100

    # --- generation ----------------------------------------------------
    # Used both as the model's max sequence length and as the completion budget.
    MAX_TOKENS = 2048
    # Completions sampled per prompt (training and evaluation).
    NUM_GENERATIONS = 4

    # --- optimisation --------------------------------------------------
    BATCH_SIZE = 1
    MAX_STEPS = 80
    LR = 1e-5
    # Presumably the GRPO KL coefficient (`beta` in GRPOConfig) -- confirm.
    BETA = 0.04

    # --- bookkeeping ---------------------------------------------------
    # Logging and checkpoint interval, in optimiser steps.
    step_count = 10
    # When True, the baseline evaluation before training is skipped.
    DEBUG = False

# Data preparation

In [4]:
import kagglehub

# Fetch the newest published version of the IMO-style math problem set;
# `path` is the local directory that holds the downloaded files.
path = kagglehub.dataset_download("artemgoncarov/math-problems-imo")

print(f"Path to dataset files: {path}")

Path to dataset files: /home/dortp58/.cache/kagglehub/datasets/artemgoncarov/math-problems-imo/versions/1


In [5]:
import re

def extract_boxed_text(text):
    r"""Return the content of the last non-empty ``\boxed{...}`` found in ``text``.

    Braces are matched by depth, so nested LaTeX such as
    ``\boxed{\frac{1}{2}}`` yields ``\frac{1}{2}``. A lazy regex would stop at
    the first ``}`` and return the truncated ``\frac{1``.

    Args:
        text: The string to search. Anything that is not a ``str`` (for
            example a missing value) is treated as containing no box.

    Returns:
        The text inside the last box whose content is not empty, or ``""``
        when there is no such box. A box whose braces never balance is ignored.
    """
    if not isinstance(text, str):
        return ""

    # Only "oxed{" is matched (not "\boxed{"), mirroring the lenient original pattern.
    marker = 'oxed{'
    found = []
    search_from = 0
    while True:
        start = text.find(marker, search_from)
        if start == -1:
            break
        body_start = start + len(marker)
        depth = 1
        pos = body_start
        # Walk forward until the brace opened by the marker is closed again.
        while pos < len(text) and depth > 0:
            ch = text[pos]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
            pos += 1
        if depth == 0:
            found.append(text[body_start:pos - 1])
            search_from = pos
        else:
            # Unbalanced box: skip this marker but keep looking for later ones.
            search_from = body_start

    for candidate in reversed(found):
        if candidate != "":
            return candidate
    return ""

In [7]:
import pandas as pd
# Read the parquet data, expose the row number as an explicit `id` column,
# and derive the ground-truth answer from the boxed value in each solution.
df = (
    pd.read_parquet(path)
    .reset_index()
    .rename(columns={'index': 'id'})
)
df['answer'] = df['solution'].map(extract_boxed_text)

def is_valid_answer(s):
    """Tell whether ``s`` represents a whole number in the range ``[0, 1000)``.

    Args:
        s: Candidate answer, normally the string taken from a boxed answer.

    Returns:
        ``True`` if ``s`` parses as an integer ``i`` with ``0 <= i < 1000`` and
        its float value equals that integer; ``False`` otherwise, including for
        values that cannot be parsed at all (``""``, ``"abc"``, ``None``,
        infinities).
    """
    try:
        as_int = int(s)
        is_whole = float(s) == as_int
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported type; OverflowError: int(float('inf')).
        return False
    return is_whole and 0 <= as_int < 1000
    
# Keep only the problems whose extracted answer is an integer in [0, 1000).
mask = df['answer'].map(is_valid_answer)
df = df.loc[mask]

In [8]:
from datasets import Dataset

# Cap the number of problems, then hand the frame over to Hugging Face Datasets.
df = df.head(CFG.MAX_TRAIN)
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['id', 'problem', 'solution', 'answer', '__index_level_0__'],
    num_rows: 100
})

In [9]:
# A fixed seed keeps the held-out 10% identical across kernel restarts, so the
# before/after-training reward comparison is made on the same problems.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'problem', 'solution', 'answer', '__index_level_0__'],
        num_rows: 90
    })
    test: Dataset({
        features: ['id', 'problem', 'solution', 'answer', '__index_level_0__'],
        num_rows: 10
    })
})

# Training setup

I use three reward functions (output format, answer accuracy, and similarity to the reference solution) together with the system prompt from the DeepSeek-R1 paper (https://arxiv.org/abs/2501.12948).

In [10]:
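## Reward functions
# Format: the completion must start with <think>...</think> and then contain a \boxed{...} answer.
# Accuracy: the last non-empty \boxed{...} content must equal the ground-truth answer.

# Similarity: Levenshtein ratio between the text after </think> and the reference solution (defined in the next cell).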

import re

def format_reward_func(completions, **kwargs):
    """Score each completion 1.0 if it follows the expected layout, else 0.0.

    The expected layout is a ``<think>...</think>`` section at the very start
    of the completion, followed somewhere later by an ``oxed{...}`` answer.
    Extra keyword arguments are accepted and ignored.
    """
    pattern = r"^<think>.*?</think>.*?oxed{(.*?)}.*?$"
    rewards = []
    for content in completions:
        # DOTALL lets the reasoning and the answer span several lines.
        well_formed = re.match(pattern, content, re.DOTALL) is not None
        rewards.append(1.0 if well_formed else 0.0)
    return rewards


def extract_boxed_text(text):
    r"""Return the content of the last non-empty ``\boxed{...}`` found in ``text``.

    NOTE(review): this redefines the helper of the same name from the data
    preparation section and shadows it from here on; keep the two in sync or
    delete one of them.

    Braces are matched by depth, so nested LaTeX such as
    ``\boxed{\frac{1}{2}}`` yields ``\frac{1}{2}``. A lazy regex would stop at
    the first ``}`` and return the truncated ``\frac{1``.

    Args:
        text: The string to search. Anything that is not a ``str`` (for
            example a missing value) is treated as containing no box.

    Returns:
        The text inside the last box whose content is not empty, or ``""``
        when there is no such box. A box whose braces never balance is ignored.
    """
    if not isinstance(text, str):
        return ""

    # Only "oxed{" is matched (not "\boxed{"), mirroring the lenient original pattern.
    marker = 'oxed{'
    found = []
    search_from = 0
    while True:
        start = text.find(marker, search_from)
        if start == -1:
            break
        body_start = start + len(marker)
        depth = 1
        pos = body_start
        # Walk forward until the brace opened by the marker is closed again.
        while pos < len(text) and depth > 0:
            ch = text[pos]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
            pos += 1
        if depth == 0:
            found.append(text[body_start:pos - 1])
            search_from = pos
        else:
            # Unbalanced box: skip this marker but keep looking for later ones.
            search_from = body_start

    for candidate in reversed(found):
        if candidate != "":
            return candidate
    return ""

def accuracy_reward_func(completions, answer, **kwargs):
    """Score each completion 1.0 if its boxed answer equals the ground truth.

    The predicted answer is whatever ``extract_boxed_text`` returns for the
    completion; it is compared as a string with ``str(ground_truth)``.
    Extra keyword arguments are accepted and ignored.
    """
    rewards = []
    for completion, ground_truth in zip(completions, answer):
        predicted = extract_boxed_text(completion)
        # Exact string comparison: "42" matches "42" but not "42.0" or " 42".
        rewards.append(1.0 if predicted == str(ground_truth) else 0.0)
    return rewards

In [11]:
def levenshtein_reward_func(completions, solution, **kwargs):
    """Score how closely the post-reasoning text matches the reference solution.

    For a completion containing ``</think>``, the reward is the Levenshtein
    ratio between the text after the last ``</think>`` and the reference
    solution. A completion without ``</think>`` gets 0.0.
    Extra keyword arguments are accepted and ignored.
    """
    closing_tag = '</think>'
    return [
        levenshtein_ratio(completion.rpartition(closing_tag)[2], sol)
        if closing_tag in completion
        else 0.0
        for completion, sol in zip(completions, solution)
    ]

In [12]:
# Load Base Model & Tokenizer
# `max_seq_length` and the later completion budget both come from CFG.MAX_TOKENS.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = CFG.model_name,
    max_seq_length = CFG.MAX_TOKENS,                      # Can increase for longer reasoning traces
    load_in_4bit = True,                        # False for LoRA 16bit
    fast_inference = True,                      # Enable vLLM fast inference
    max_lora_rank = 64,                         # Upper bound on the LoRA rank; the adapter below uses r = 16
    gpu_memory_utilization = 0.7,               # Reduce if out of memory
)

# Prepare Model for Parameter Efficient Fine Tuning
# `model` is rebound to the LoRA-wrapped model; this is the object passed to the trainer.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,                                     # LoRA rank. Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha = 32,                            # LoRA scaling factor (alpha), not the rank; the rank is `r` above
    use_gradient_checkpointing = "unsloth",     # Enable long context finetuning
)

Unsloth: Switching from Unsloth dynamic quant to normal quant since
we do not yet support fast inference for unsloth/deepseek-r1-distill-qwen-1.5b-unsloth-bnb-4bit
==((====))==  Unsloth 2025.3.17: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA GeForce RTX 3060 Laptop GPU. Num GPUs = 1. Max memory: 5.792 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/deepseek-r1-distill-qwen-1.5b-bnb-4bit with actual GPU utilization = 63.12%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 5.79 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 128.
Unsloth: vLLM's KV Cache can use up to 1.89 GB. Also swap space = 1 GB.
IN



INFO 03-24 11:42:53 weight_utils.py:254] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-24 11:42:54 model_runner.py:1115] Loading model weights took 1.5365 GB
INFO 03-24 11:42:54 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-24 11:42:55 worker.py:267] Memory profiling takes 1.08 seconds
INFO 03-24 11:42:55 worker.py:267] the current vLLM instance can use total_gpu_memory (5.79GiB) x gpu_memory_utilization (0.63) = 3.66GiB
INFO 03-24 11:42:55 worker.py:267] model weights take 1.54GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 0.70GiB; the rest of the memory reserved for KV Cache is 1.38GiB.
INFO 03-24 11:42:55 executor_base.py:111] # cuda blocks: 3240, # CPU blocks: 2340
INFO 03-24 11:42:55 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 25.31x
INFO 03-24 11:42:56 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occu

Capturing CUDA graph shapes: 100%|██████████████| 19/19 [00:09<00:00,  2.08it/s]

INFO 03-24 11:43:05 model_runner.py:1562] Graph capturing finished in 9 secs, took 0.36 GiB
INFO 03-24 11:43:05 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 11.13 seconds



Unsloth 2025.3.17 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [13]:
def create_prompt(sample):
    """Attach a chat-formatted ``prompt`` string to one dataset example.

    The prompt is a system message describing the think/answer layout followed
    by the problem text plus an instruction to box the final answer modulo
    1000. It is rendered with the tokenizer's chat template, untokenized and
    with the generation prompt appended.

    Args:
        sample: A single example mapping; must contain the key ``'problem'``.

    Returns:
        The same ``sample`` object, with ``sample['prompt']`` set.
    """
    system_message = "A conversation between User and Assistant. The user asks a question, and the Assistant solves it.  The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>"
    user_message = sample['problem'] + ' Return final answer within \\boxed{}, after taking modulo 1000.'
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    sample['prompt'] = tokenizer.apply_chat_template(
        conversation=messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    return sample

In [14]:
dataset = dataset.map(create_prompt)  # one example at a time: create_prompt reads a single 'problem' string, so it is not written for batched=True

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [15]:
def gen(model, text, max_tokens):
    """Generate completions for one prompt or a list of prompts.

    Args:
        model: Model exposing ``generate``, ``eval`` and ``device``.
        text: A prompt string or a list of prompt strings. The batch is
            tokenized without padding, so a list must contain prompts of
            equal token length (as when one prompt is repeated).
        max_tokens: Maximum number of new tokens to generate per sequence.

    Returns:
        The decoded text after the last ``CFG.splitter`` marker: a single
        string when exactly one sequence was generated, otherwise a list of
        strings.
    """
    model_input = tokenizer(text, return_tensors='pt').to(model.device)
    model.eval()
    with torch.no_grad():
        tok = model.generate(
            **model_input,
            max_new_tokens=max_tokens,
            # Must be the pad *token* id. `pad_token_type_id` is the segment/type
            # id of padding, so sequences that stop early in a batch would be
            # filled with an ordinary vocabulary token that survives decoding.
            pad_token_id=tokenizer.pad_token_id,
        )
    outputs = [
        tokenizer.decode(sequence, skip_special_tokens=True).split(CFG.splitter)[-1]
        for sequence in tok
    ]
    return outputs[0] if len(outputs) == 1 else outputs

In [16]:
def evaluate_rewards(model, dataset, reward_functions: dict[str, callable], max_tokens: int, num_generations: int):
    """Sample completions for every example and report the mean of each reward.

    For each example, ``num_generations`` completions are generated from
    ``example['prompt']`` with ``gen``. Every other column of the example
    (except ``'completion'``) is repeated once per completion and passed to the
    reward functions as keyword arguments.

    Args:
        model: Model handed to ``gen``.
        dataset: Iterable of example mappings, each with a ``'prompt'`` key.
        reward_functions: Maps a display name to a reward function with the
            signature ``f(completions=..., **columns) -> list of float``.
        max_tokens: Generation budget per completion.
        num_generations: Number of completions sampled per example.

    Returns:
        Dict mapping each reward name to its mean over all completions, as a
        plain ``float``. Each mean is also printed.

    Raises:
        ValueError: If no completions were produced (empty dataset).
    """
    completions = []
    other_info = []
    for example in tqdm(dataset):
        txt = example['prompt']
        kw = {k: v for k, v in example.items() if k not in {'prompt', 'completion'}}
        # One metadata entry per completion, so the lists stay aligned.
        other_info.extend([kw] * num_generations)

        completion = gen(model, [txt] * num_generations, max_tokens)
        if isinstance(completion, str):
            # `gen` unwraps single-sequence results to a bare string.
            completions.append(completion)
        else:
            completions.extend(completion)

    if not other_info:
        raise ValueError("evaluate_rewards needs at least one example and one generation")

    # Turn the list of per-completion dicts into column -> list, as reward functions expect.
    kwargs = {k: [d[k] for d in other_info] for k in other_info[0].keys()}
    res = {}
    for nm, reward_func in reward_functions.items():
        mean_reward = float(np.mean(reward_func(completions=completions, **kwargs)))
        print(nm, mean_reward)
        res[nm] = mean_reward
    return res

In [17]:
# Display name -> reward function. The insertion order is also the order in
# which the functions are handed to the trainer.
reward_functions = {
    'formatting': format_reward_func,
    'accuracy': accuracy_reward_func,
    'solution_quality': levenshtein_reward_func,
}

In [22]:
# Baseline rewards before any training. This used to reference `original_model`,
# which is never defined in this notebook (it only existed as leftover kernel
# state) and fails on Restart & Run All. `model` is the object that will be
# trained; at this point its LoRA adapter is still untrained.
# NOTE(review): this assumes a freshly initialised LoRA adapter leaves the base
# model's outputs unchanged (PEFT's default initialisation) -- confirm.
if not CFG.DEBUG:
    original_rewards = evaluate_rewards(
        model=model,
        dataset=dataset['test'],
        reward_functions=reward_functions,
        max_tokens=CFG.MAX_TOKENS,
        num_generations=CFG.NUM_GENERATIONS,
    )


100%|██████████| 10/10 [16:29<00:00, 98.90s/it]

formatting 0.725
accuracy 0.6
solution_quality 0.3062495202351799





# Training

In [18]:
from trl import GRPOConfig, GRPOTrainer
# Timestamped output directory so repeated runs never overwrite each other's checkpoints.
dtstr = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
output_directory = f"./DEEPSEEK-GRPO-{dtstr}"


training_args = GRPOConfig(
    output_dir=output_directory,
    use_vllm=True,
    learning_rate=CFG.LR,
    beta=CFG.BETA,                              # KL coefficient; was defined in CFG but never passed on
    adam_beta1=0.9,                             # AdamW optimizer momentum parameter
    adam_beta2=0.99,                            # AdamW optimizer second moment parameter
    weight_decay=0.1,                           # Weight decay applied by the optimizer
    warmup_ratio=0.1,                           # Portion of training steps for learning rate warmup
    lr_scheduler_type="cosine",                 # Learning rate decay schedule type
    optim="adamw_8bit",
    # Unsloth's message for this cell says it raises a batch size of 1 to `num_generations`.
    per_device_train_batch_size=CFG.BATCH_SIZE,
    gradient_accumulation_steps=1,
    max_steps=CFG.MAX_STEPS,

    # NOTE(review): this equals the model's max_seq_length, so prompt plus
    # completion can exceed the sequence budget -- confirm this is intended.
    max_completion_length=CFG.MAX_TOKENS,  #2048
    num_generations=CFG.NUM_GENERATIONS,

    logging_steps=CFG.step_count,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=CFG.step_count,
#     eval_strategy="steps",
#     eval_steps=CFG.step_count,
#     do_eval=True,
    # gradient_checkpointing=True,  # Will crash the whole thing
    report_to="none",
    overwrite_output_dir=True,                  # was the string 'True'; the field is a boolean
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4


In [19]:
# Build the GRPO trainer around the LoRA-wrapped `model`.
# The reward functions are passed in the insertion order of `reward_functions`
# (formatting, accuracy, solution_quality). Only the train split is supplied;
# the test split is evaluated separately with `evaluate_rewards`.
trainer = GRPOTrainer(
        model=model,
        reward_funcs=list(reward_functions.values()),
        args=training_args,
        train_dataset=dataset['train'],
        callbacks=[PrinterCallback()]
    )

In [25]:
# Run GRPO for `max_steps` (CFG.MAX_STEPS) optimiser steps. Metrics are logged and
# a checkpoint is written under `output_directory` every CFG.step_count steps.
trainer.train()

Step,Training Loss
10,-0.0
20,-0.0
30,-0.0
40,0.0
50,-0.0
60,-0.0
70,-0.0
80,-0.0


{'loss': -0.0, 'grad_norm': 0.034604042768478394, 'learning_rate': 8.750000000000001e-06, 'completion_length': 1693.35, 'rewards/format_reward_func': 0.325, 'rewards/accuracy_reward_func': 0.35, 'rewards/levenshtein_reward_func': 0.1815756194293499, 'reward': 0.8565756164491176, 'reward_std': 0.3982465725392103, 'kl': -5.865097045898438e-06, 'epoch': 0.1111111111111111}
{'loss': -0.0, 'grad_norm': 0.0, 'learning_rate': 7.500000000000001e-06, 'completion_length': 1424.0, 'rewards/format_reward_func': 0.425, 'rewards/accuracy_reward_func': 0.35, 'rewards/levenshtein_reward_func': 0.18992570266127587, 'reward': 0.9649257034063339, 'reward_std': 0.35146077554672955, 'kl': -6.186962127685547e-06, 'epoch': 0.2222222222222222}
{'loss': -0.0, 'grad_norm': 0.026678849011659622, 'learning_rate': 6.25e-06, 'completion_length': 1210.225, 'rewards/format_reward_func': 0.675, 'rewards/accuracy_reward_func': 0.425, 'rewards/levenshtein_reward_func': 0.30511854588985443, 'reward': 1.4051185488700866, 

TrainOutput(global_step=80, training_loss=-2.279258296766784e-07, metrics={'train_runtime': 13210.1595, 'train_samples_per_second': 0.006, 'train_steps_per_second': 0.006, 'total_flos': 0.0, 'train_loss': -2.279258296766784e-07})

# Results

In [26]:
# Select the model to evaluate after training.
#
# The previous version called `PeftModel.from_pretrained(original_model, ...)`,
# but neither `original_model` nor `PeftModel` is defined or imported anywhere
# in this notebook, so the cell failed on a fresh kernel. `model` is already the
# LoRA-wrapped model handed to the trainer and is updated in place by
# `trainer.train()`, so it is evaluated directly.
# NOTE(review): with save_steps=CFG.step_count and max_steps=CFG.MAX_STEPS the
# last checkpoint should match the final in-memory weights -- confirm if an
# intermediate checkpoint is ever wanted here instead.
if CFG.USE_PEFT:
    CHKPT = CFG.MAX_STEPS
    adapter_model_name = f'{output_directory}/checkpoint-{CHKPT}/'
    print(f'Using trained model (adapter checkpoint saved at {adapter_model_name})')
new_model = model

Loading trained model


In [27]:
# Same held-out problems and generation settings as the baseline evaluation,
# now scored with the trained model.
rewards = evaluate_rewards(
    model=new_model,
    dataset=dataset['test'],
    reward_functions=reward_functions,
    max_tokens=CFG.MAX_TOKENS,
    num_generations=CFG.NUM_GENERATIONS,
)
rewards

100%|██████████| 10/10 [22:48<00:00, 136.87s/it]

formatting 0.675
accuracy 0.525
solution_quality 0.27533443147722725





{'formatting': 0.675,
 'accuracy': 0.525,
 'solution_quality': 0.27533443147722725}