In [None]:
!pip install unsloth vllm triton==3.2.0 datasets==3.6.0 -qq

In [2]:
import re
from typing import Union, List
import torch
from tqdm import tqdm
from datasets import load_dataset
from unsloth import FastLanguageModel, PatchFastRL
from trl import GRPOConfig, GRPOTrainer
from vllm import SamplingParams

PatchFastRL("GRPO", FastLanguageModel)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-14 10:40:24 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-14 10:40:24 [__init__.py:239] Automatically detected platform cuda.


In [3]:
# Constants
lora_rank = 64

system_prompt = """
  Respond in the following format:
  <reasoning>
  ...
  </reasoning>
  <answer>
  ...
  </answer>
"""

sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.9,
        max_tokens=4096
)

In [4]:
# Model Initialization
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Qwen/Qwen2.5-0.5B-Instruct",
    max_seq_length=1024,
    load_in_4bit=True,
    fast_inference=True,  # Enable vLLM fast inference
    max_lora_rank=lora_rank,
    dtype=torch.float16,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=lora_rank,
    random_state=42,
)

==((====))==  Unsloth 2025.7.3: Fast Qwen2 patching. Transformers: 4.53.1. vLLM: 0.8.5.post1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-0.5b-instruct-unsloth-bnb-4bit with actual GPU utilization = 49.53%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 6.77 GB. Also swap space = 0 GB.
INFO 07-14 10:41:28 [config.py:717] This model supports multiple tasks: {'embed', 'score', 'classify', 'reward', 'generate'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes confi

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

INFO 07-14 10:41:33 [cuda.py:240] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 07-14 10:41:33 [cuda.py:289] Using XFormers backend.
INFO 07-14 10:41:35 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 07-14 10:41:35 [model_runner.py:1108] Starting to load model unsloth/qwen2.5-0.5b-instruct-unsloth-bnb-4bit...
INFO 07-14 10:41:35 [loader.py:1187] Loading weights with BitsAndBytes quantization. May take a while ...
INFO 07-14 10:41:36 [weight_utils.py:265] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

INFO 07-14 10:41:44 [weight_utils.py:281] Time spent downloading weights for unsloth/qwen2.5-0.5b-instruct-unsloth-bnb-4bit: 7.650218 seconds
INFO 07-14 10:41:44 [weight_utils.py:315] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 07-14 10:41:45 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 07-14 10:41:47 [model_runner.py:1140] Model loading took 0.5747 GiB and 10.424625 seconds
INFO 07-14 10:41:56 [worker.py:287] Memory profiling takes 7.35 seconds
INFO 07-14 10:41:56 [worker.py:287] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.50) = 7.30GiB
INFO 07-14 10:41:56 [worker.py:287] model weights take 0.57GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 1.04GiB; the rest of the memory reserved for KV Cache is 5.65GiB.
INFO 07-14 10:41:56 [executor_base.py:112] # cuda blocks: 30878, # CPU blocks: 0
INFO 07-14 10:41:56 [executor_base.py:117] Maximum concurrency for 1024 tokens per request: 482.47x
INFO 07-14 10:41:56 [model_runner.py:1450] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If o

Capturing CUDA graph shapes:   0%|          | 0/27 [00:00<?, ?it/s]

INFO 07-14 10:42:51 [model_runner.py:1592] Graph capturing finished in 55 secs, took 0.38 GiB
INFO 07-14 10:42:51 [llm_engine.py:437] init engine (profile, create kv cache, warmup model) took 64.14 seconds
Unsloth: Just some info: will skip parsing ['q_norm', 'post_feedforward_layernorm', 'pre_feedforward_layernorm', 'k_norm']
Unsloth: Just some info: will skip parsing ['q_norm', 'post_feedforward_layernorm', 'pre_feedforward_layernorm', 'k_norm']


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth 2025.7.3 patched 24 layers with 24 QKV layers, 24 O layers and 24 MLP layers.


In [5]:
# Dataset Preparing for GRPO
def extract_answer(answer: str) -> Union[str, None]:
    """Get only the answer after '####' tokens (in gsm8k)"""
    return answer.split('####')[-1].strip().replace(',', '') if '####' in answer else None


def extract_reasoning(answer):
    """Get only the reasoning before '####' tokens (in gsm8k)"""
    return answer.split('####')[0].strip() if '####' in answer else None


def preproc(example):
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': example['question']}
    ]
    clear_answer = extract_answer(example['answer'])
    reasoning = extract_reasoning(example['answer'])

    return {
        'prompt': messages,
        'answer': clear_answer,
        'target_reasoning': reasoning
    }


dataset = load_dataset('openai/gsm8k', 'main', trust_remote_code=True)
dataset = dataset.map(preproc)

README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

## Training

In [6]:
# Auxiliary functions for using inside reward functions
def get_reasoning(content: str) -> Union[str, None]:
    """Get only the reasoning between <reasoning>REASONING</reasoning> tokens"""
    if '<reasoning>' in content and '</reasoning>' in content:
        return content.split('<reasoning>')[1].split('</reasoning>')[0].strip()
    return None


def isnumber(answer: str) -> bool:
    """Check if the string is a number (any: float, int, scientific, etc.)"""
    try:
        float(answer)
        return True
    except ValueError:
        return False


def get_answer(content: str) -> Union[str, None]:
    """Get only the answer between <answer>ANSWER</answer> tokens"""
    if '<answer>' in content and '</answer>' in content:
        return content.split('<answer>')[1].split('</answer>')[0].strip()
    return None


def calc_structure_reward(content: str) -> float:
    """Check the structure correctness of model's answer"""
    reward = 0.0
    if '<reasoning>' in content and '</reasoning>' in content:
        reward += 0.5
    if '<answer>' in content and '</answer>' in content:
        reward += 0.5
    return reward

In [7]:
# Rule-based rewards for GRPO
def target_length_reward_func(completions, target_reasoning, **kwargs) -> List[float]:
    """Give reward for the same reasoning len of model's answer as in target"""
    rewards = []
    contents = [completion[0]['content'] for completion in completions]

    for generated_content, target in zip(contents, target_reasoning):
        generated_reasoning = get_reasoning(generated_content)
        if generated_reasoning is None:
            rewards.append(0.0)
            continue

        generated_len = len(tokenizer(generated_reasoning)['input_ids'])
        target_len = len(tokenizer(target)['input_ids'])

        if target_len == 0:
            rewards.append(1.0 if generated_len == 0 else 0.0)
            continue

        reward = max(0.0, 1.0 - abs(generated_len - target_len) / target_len)
        rewards.append(reward)

    return rewards


def soft_format_reward_func(completions, **kwargs) -> List[float]:
    """Give reward for structure correctness of model's answer"""
    completion_contents = [completion[0]['content'] for completion in completions]
    return [calc_structure_reward(content) for content in completion_contents]


def isnumber_reward_func(completions, **kwargs) -> List[float]:
    """Give reward if the model's answer is a number"""
    completion_contents = [completion[0]['content'] for completion in completions]
    completion_answers = [get_answer(completion) for completion in completion_contents]
    return [0.25 if ans is not None and isnumber(ans) else 0.0 for ans in completion_answers]


def exact_match_reward_func(prompts, completions, answer, **kwargs) -> List[float]:
    """Give reward for exact match with target"""
    completion_contents = [completion[0]['content'] for completion in completions]
    completion_answers = [get_answer(completion) for completion in completion_contents]
    return [1.0 if pred == target else 0.0 for pred, target in zip(completion_answers, answer)]


def strict_format_reward_func(completions, **kwargs) -> List[float]:
    """Give reward for strict following the response format"""
    pattern = r'^\s*<reasoning>.*?</reasoning>\s*<answer>.*?</answer>\s*$'
    completion_contents = [completion[0]['content'] for completion in completions]
    matches = [re.match(pattern, content, re.S) for content in
               completion_contents]  # re.S for allowing '\n' in answer as in prompt
    return [1.0 if match else 0.0 for match in matches]

In [8]:
# L1: LCPO rewards
def lcpo_exact_length_reward_func(completions, target_reasoning, **kwargs) -> List[float]:
    """LCPO exact length reward: reward = -alpha * abs(target_len - generated_len)"""
    rewards = []
    for completion, target in zip(completions, target_reasoning):
        content = completion[0]['content']
        generated_reasoning = get_reasoning(content)
        if generated_reasoning is None or target is None:
            rewards.append(0.0)
            continue

        generated_len = len(tokenizer(generated_reasoning)['input_ids'])
        target_len = len(tokenizer(target)['input_ids'])
        rewards.append(-0.03 * abs(target_len - generated_len))

    return rewards


# reward grows if the model does not exceed the target_len and trying to answer more concise
def lcpo_max_length_reward_func(completions, target_reasoning, **kwargs) -> List[float]:
    """LCPO max: reward = clip(alpha * (target_len - generated_len) + beta, 0, 1) """
    rewards = []
    for completion, target in zip(completions, target_reasoning):
        content = completion[0]['content']
        generated_reasoning = get_reasoning(content)
        if generated_reasoning is None or target is None:
            rewards.append(0.0)
            continue

        generated_len = len(tokenizer(generated_reasoning)['input_ids'])
        target_len = len(tokenizer(target)['input_ids'])

        reward = 0.03 * (target_len - generated_len) + 0.5
        rewards.append(max(0.0, min(1.0, reward)))

    return rewards

In [9]:
# Training params
training_args = GRPOConfig(
    use_vllm=True,
    learning_rate=5e-6,
    max_grad_norm=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    logging_steps=10,
    fp16=True,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_generations=8,
    max_prompt_length=512,
    max_completion_length=512,
    num_train_epochs=1,
    save_steps=934,  # 1 epoch
    report_to='wandb',
    output_dir='outputs',
)

# you can place any rule-based reward function inside reward_funcs=[]
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[
        strict_format_reward_func,
        isnumber_reward_func,
        soft_format_reward_func,
        exact_match_reward_func,
    ],
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    args=training_args,
    processing_class=tokenizer
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 2 to the `num_generations` of 8


In [10]:
# Uncomment if you want to calculate baseline accuracy and format_accuracy
# evaluate_model(model, tokenizer, dataset['test'])

In [11]:
trainer.train()
model.save_lora('lora_after_grpo')

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 935
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 35,192,832 of 529,225,600 (6.65% trained)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdanilka200300[0m ([33mdanilka200300-misis[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,rewards / strict_format_reward_func / mean,rewards / strict_format_reward_func / std,rewards / isnumber_reward_func / mean,rewards / isnumber_reward_func / std,rewards / soft_format_reward_func / mean,rewards / soft_format_reward_func / std,rewards / exact_match_reward_func / mean,rewards / exact_match_reward_func / std
10,0.0,0.004297,0.012153,238.523438,24.5,512.0,0.110937,204.332872,24.5,477.3,1.9e-05,0.0,0.0,0.000391,0.003125,0.003906,0.027518,0.0,0.0
20,0.0,0.000781,0.00221,225.884375,26.4,512.0,0.107813,192.734805,26.4,493.4,2.7e-05,0.0,0.0,0.0,0.0,0.000781,0.00625,0.0,0.0


KeyboardInterrupt: 

In [None]:
# Результаты всех таких запусков трейна в папке logs
# Colab'а не хватает даже на один запуск на полную эпоху

## Evaluation

In [12]:
# Auxiliary functions for evaluation
def check_format(text: str) -> bool:
    """Check the correctness of the model's answer format for calculating format_accuracy"""
    pattern = r'^\s*<reasoning>.*?</reasoning>\s*<answer>.*?</answer>\s*$'
    return bool(re.match(pattern, text, re.S))


def extract_num_from_answer(text: str) -> Union[float, None]:
    """Get last number between <answer>ANSWER</answer> tokens for calculating accuracy"""
    answer = get_answer(text)
    if answer:
        all_digits = re.findall(r'-?\d+(?:\.\d+)?', answer.replace(',', ''))
        if len(all_digits) > 0:
            return float(all_digits[-1])
        return None
    return None

In [13]:
# Evaluation function
def evaluate_model(model, tokenizer, dataset, lora_request=None):
    """
    Args:
        lora_request - controls if it is a baseline model (=None) or GRPO-trained
    """
    ans_correct = 0
    format_correct = 0
    for sample in tqdm(dataset):
        question = sample['question']
        target = float(sample['answer'])

        messages = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': question}
        ]

        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        out = model.fast_generate(
            [text], sampling_params=sampling_params, lora_request=lora_request
        )[0].outputs[0].text

        pred = extract_num_from_answer(out)
        if pred == target: ans_correct += 1
        if check_format(out): format_correct += 1

    accuracy = ans_correct / len(dataset)
    format_accuracy = format_correct / len(dataset)

    model_format = 'after GRPO' if lora_request is not None else 'baseline'
    print(f'{model_format}: Accuracy -> {accuracy}, Right format percent -> {format_accuracy}')

In [None]:
evaluate_model(model, tokenizer, dataset['test'], lora_request=model.load_lora('lora_after_grpo'))