# Environment Setup

In [1]:
!uv pip install bitsandbytes xformers triton unsloth vllm==0.10.2
!uv pip install transformers==4.55.4
!uv pip install --no-deps trl==0.22.2

[2mUsing Python 3.12.11 environment at: /usr[0m
[2mAudited [1m5 packages[0m [2min 119ms[0m[0m
[2mUsing Python 3.12.11 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 105ms[0m[0m
[2mUsing Python 3.12.11 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 109ms[0m[0m


In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
%cd '/content/drive/MyDrive/multi-reward-math-reasoning/Llama'

/content/drive/MyDrive/multi-reward-math-reasoning/Llama


In [4]:
%ls

Llama.ipynb


In [5]:
from unsloth import FastLanguageModel
from trl import SFTConfig, GRPOConfig, SFTTrainer, GRPOTrainer
from vllm import SamplingParams

import gc
import re
import time
import torch
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm.notebook import tqdm
from datasets import load_dataset, Dataset
from safetensors import safe_open

from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig, TextStreamer
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 10-10 02:01:56 [__init__.py:216] Automatically detected platform cuda.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Model Setup

In [6]:
model_id = 'unsloth/Llama-3.2-1B'          # Select model optimized for instruction-following and reasoning
model_name = model_id.split('/')[-1].lower()  # Extract model name from ID
max_seq_length = 2048                         # Can increase for longer reasoning traces
lora_rank = 32                                # Larger rank = smarter, but slower

In [7]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=max_seq_length,
    load_in_4bit=False,         # False for LoRA 16bit
    fast_inference=True,        # Enable vLLM fast inference
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.9, # Reduce if out of memory
)
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,                          # Rank: adaptation capacity (16 good for reasoning tasks)
    lora_alpha=lora_rank * 2,             # Scaling factor (typically 2x rank)
    lora_dropout=0.1,                     # Regularization to prevent overfitting
    target_modules=[                      # Remove QKVO if out of memory
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj',
    ],
    use_gradient_checkpointing='unsloth', # Reduces memory usage
    random_state=3407,
)

INFO 10-10 02:02:10 [vllm_utils.py:689] Unsloth: Patching vLLM v1 graph capture
INFO 10-10 02:02:10 [vllm_utils.py:717] Unsloth: Patching vLLM v0 graph capture
==((====))==  Unsloth 2025.10.1: Fast Llama patching. Transformers: 4.55.4. vLLM: 0.10.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/Llama-3.2-1B with actual GPU utilization = 88.97%
Unsloth: Your GPU has CUDA compute capability 8.0 with VRAM = 39.56 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 32.85 GB. Also swap space = 6 GB.
Unsloth: Not an error, but `device` is no

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

INFO 10-10 02:02:34 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='unsloth/Llama-3.2-1B', speculative_config=None, tokenizer='unsloth/Llama-3.2-1B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=unsloth/Llama-3.2-1B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

INFO 10-10 02:02:42 [weight_utils.py:369] Time spent downloading weights for unsloth/Llama-3.2-1B: 6.695035 seconds
INFO 10-10 02:02:42 [weight_utils.py:406] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 10-10 02:02:43 [default_loader.py:268] Loading weights took 0.80 seconds
INFO 10-10 02:02:43 [punica_selector.py:19] Using PunicaWrapperGPU.
INFO 10-10 02:02:45 [gpu_model_runner.py:2392] Model loading took 2.3801 GiB and 8.318614 seconds
INFO 10-10 02:02:53 [backends.py:539] Using cache directory: /root/.cache/vllm/torch_compile_cache/5df3daed28/rank_0_0/backbone for vLLM's torch.compile
INFO 10-10 02:02:53 [backends.py:550] Dynamo bytecode transform time: 7.34 s


Unsloth: Compiling kernels: 100%|██████████| 7/7 [00:00<00:00, 15.38it/s, triton_poi_fused_view_6]

INFO 10-10 02:02:57 [backends.py:194] Cache the graph for dynamic shape for later use



Unsloth: Compiling kernels: 100%|██████████| 9/9 [00:00<00:00, 23.87it/s, triton_poi_fused_view_8]
Unsloth: Compiling kernels: 100%|██████████| 9/9 [00:00<00:00, 464.64it/s, triton_poi_fused_view_8]
Unsloth: Compiling kernels: 100%|██████████| 9/9 [00:00<00:00, 451.82it/s, triton_poi_fused_view_8]
Unsloth: Compiling kernels: 100%|██████████| 9/9 [00:00<00:00, 462.78it/s, triton_poi_fused_view_8]
Unsloth: Compiling kernels: 100%|██████████| 9/9 [00:00<00:00, 488.99it/s, triton_poi_fused_view_8]
Unsloth: Compiling kernels: 100%|██████████| 9/9 [00:00<00:00, 490.63it/s, triton_poi_fused_view_8]
Unsloth: Compiling kernels: 100%|██████████| 9/9 [00:00<00:00, 492.55it/s, triton_poi_fused_view_8]
Unsloth: Compiling kernels: 100%|██████████| 9/9 [00:00<00:00, 493.03it/s, triton_poi_fused_view_8]
Unsloth: Compiling kernels: 100%|██████████| 9/9 [00:00<00:00, 494.38it/s, triton_poi_fused_view_8]
Unsloth: Compiling kernels: 100%|██████████| 9/9 [00:00<00:00, 446.91it/s, triton_poi_fused_view_8]


INFO 10-10 02:03:19 [backends.py:215] Compiling a graph for dynamic shape takes 25.37 s





INFO 10-10 02:03:27 [monitor.py:34] torch.compile takes 32.71 s in total
INFO 10-10 02:03:30 [gpu_worker.py:298] Available KV cache memory: 31.31 GiB
INFO 10-10 02:03:30 [kv_cache_utils.py:864] GPU KV cache size: 1,025,920 tokens
INFO 10-10 02:03:30 [kv_cache_utils.py:868] Maximum concurrency for 2,048 tokens per request: 500.94x
INFO 10-10 02:03:30 [vllm_utils.py:694] Unsloth: Running patched vLLM v1 `capture_model`.


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:15<00:00,  4.20it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 43/43 [00:03<00:00, 12.11it/s]

INFO 10-10 02:03:50 [gpu_model_runner.py:3118] Graph capturing finished in 20 secs, took 0.62 GiB
INFO 10-10 02:03:50 [vllm_utils.py:701] Unsloth: Patched vLLM v1 graph capture finished in 20 secs.





INFO 10-10 02:03:51 [gpu_worker.py:391] Free memory on device (39.03/39.56 GiB) on startup. Desired GPU memory utilization is (0.88969201168322, 35.19 GiB). Actual usage is 2.38 GiB for weight, 1.49 GiB for peak activation, 0.02 GiB for non-torch memory, and 0.62 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=32793579110` to fit into requested memory, or `--kv-cache-memory=36912687104` to fully utilize gpu memory. Current kv cache memory in use is 33617759846 bytes.
INFO 10-10 02:03:51 [core.py:218] init engine (profile, create kv cache, warmup model) took 66.92 seconds
INFO 10-10 02:03:53 [llm.py:295] Supported_tasks: ('generate',)
INFO 10-10 02:03:53 [__init__.py:36] No IOProcessor plugins requested by the model
Unsloth: Just some info: will skip parsing ['q_norm', 'attention_norm', 'post_layernorm', 'layer_norm1', 'k_norm', 'input_layernorm', 'post_feedforward_layernorm', 'norm1', 'post_attention_layernorm', 'norm2', 'ffn_norm', 'layer_norm2'

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.10.1 patched 16 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


# Chat Template

In [8]:
# Define structured output format for mathematical reasoning
REASONING_START = '<THINK>'   # Begin reasoning section
REASONING_END = '</THINK>'    # End reasoning section
SOLUTION_START = '<SOLUTION>' # Begin final answer
SOLUTION_END = '</SOLUTION>'  # End final answer

# System prompt that teaches the model our desired reasoning structure
SYSTEM_PROMPT = f'''You are a mathematical reasoning assistant. When given a math problem:
1. Show your step-by-step work between {REASONING_START} and {REASONING_END}.
2. Provide your final numerical answer between {SOLUTION_START} and {SOLUTION_END}.
3. Be precise and show all calculation steps clearly.'''
print(SYSTEM_PROMPT)

You are a mathematical reasoning assistant. When given a math problem:
1. Show your step-by-step work between <THINK> and </THINK>.
2. Provide your final numerical answer between <SOLUTION> and </SOLUTION>.
3. Be precise and show all calculation steps clearly.


In [9]:
chat_template = ( # Build and assign chat_template to the tokenizer
    # If the very first message is a SYSTEM role, print it + <eos>:
    "{% if messages[0]['role'] == 'system' %}"
      "{{ messages[0]['content'] + eos_token }}"
      "{% set loop_messages = messages[1:] %}"
    "{% else %}"
      # Otherwise, inject our system_prompt + <eos>:
      "{{ '{system_prompt}' + eos_token }}"
      "{% set loop_messages = messages %}"
    "{% endif %}"

    # Now loop over the remaining messages (either user or assistant):
    "{% for message in loop_messages %}"
      "{% if message['role'] == 'user' %}"
        "{{ message['content'] }}"
      "{% elif message['role'] == 'assistant' %}"
        "{{ message['content'] + eos_token }}"
      "{% endif %}"
    "{% endfor %}"

    # If we asked for "add_generation_prompt", append <REASONING> to the end:
    "{% if add_generation_prompt %}{{ '{reasoning_start}' }}"
    "{% endif %}"
)
# Replace with out specific template:
tokenizer.chat_template = chat_template\
    .replace("'{system_prompt}'",   f"'{SYSTEM_PROMPT}'")\
    .replace("'{reasoning_start}'", f"'{REASONING_START}'")

In [10]:
example_messages = [ # Quick sanity check of the template
    {'role': 'user', 'content': 'Which country has the highest population density?'},
    {'role': 'assistant', 'content': (
        f'{REASONING_START}'
        'I know that country X is small in area but has a huge population, '
        'so its people per square kilometer is extremely high.'
        f'{REASONING_END}{SOLUTION_START}Monaco{SOLUTION_END}'
    )},
    {'role': 'user', 'content': 'Which planet is farthest from the Sun?'},
]
print(tokenizer.apply_chat_template(example_messages, tokenize=False, add_generation_prompt=True))

You are a mathematical reasoning assistant. When given a math problem:
1. Show your step-by-step work between <THINK> and </THINK>.
2. Provide your final numerical answer between <SOLUTION> and </SOLUTION>.
3. Be precise and show all calculation steps clearly.<|end_of_text|>Which country has the highest population density?<THINK>I know that country X is small in area but has a huge population, so its people per square kilometer is extremely high.</THINK><SOLUTION>Monaco</SOLUTION><|end_of_text|>Which planet is farthest from the Sun?<THINK>


# Pre Fine-tuning (SFT)

## Data preparation

In [11]:
# Use a subset of NVIDIA's Open Math Reasoning dataset, which was filtered to only include high quality DeepSeek R1 traces
sft_dataset = load_dataset('unsloth/OpenMathReasoning-mini', split='cot').to_pandas()
sft_dataset = sft_dataset[['expected_answer', 'problem', 'generated_solution']]

# Try converting to number - if not, replace with NaN
is_number = pd.to_numeric(pd.Series(sft_dataset['expected_answer']), errors='coerce').notnull()
sft_dataset = sft_dataset.iloc[np.where(is_number)[0]] # Select only numbers
sft_dataset

README.md:   0%|          | 0.00/603 [00:00<?, ?B/s]

data/cot-00000-of-00001.parquet:   0%|          | 0.00/106M [00:00<?, ?B/s]

Generating cot split:   0%|          | 0/19252 [00:00<?, ? examples/s]

Unnamed: 0,expected_answer,problem,generated_solution
0,14,Given $\sqrt{x^2+165}-\sqrt{x^2-52}=7$ and $x$...,"<think>\nOkay, let's see. I need to solve the ..."
6,-2,Find the value of the parameter $a$ for which ...,"<think>\nOkay, so I need to find the value of ..."
9,18,What is the sum of all real numbers $x$ for wh...,"<think>\nOkay, so I need to solve the equation..."
13,2,Evaluate the sum \(\sum_{n=1}^\infty \frac{\ph...,"<think>\nOkay, so I need to evaluate the infin..."
17,30,What is the largest positive integer that divi...,"<think>\nAlright, so I need to find the larges..."
...,...,...,...
19243,244,"Let \( p \), \( q \), and \( r \) be the disti...","<think>\nOkay, so I need to find the value of ..."
19245,1,A bug is on the $0$ of a number line. At any p...,"<think>\nOkay, so I have this problem where a ..."
19247,4,A bus left point X for point Y. Two hours late...,"<think>\nOkay, let's tackle this problem step ..."
19248,18,Each interior angle of a regular n-gon measure...,"<think>\nOkay, let's see. I need to find the n..."


In [12]:
def format_dataset(x): # Format the dataset to follow our GRPO style formatting
    expected_answer = x['expected_answer']
    problem = x['problem']

    # Remove generated <think> and </think>
    thoughts = x['generated_solution'].replace('<think>', '').replace('</think>', '')
    thoughts = thoughts.strip()

    # Add our custom formatting
    final_prompt = REASONING_START + thoughts + REASONING_END + \
                   SOLUTION_START + expected_answer + SOLUTION_END
    return [
        {'role': 'system'   , 'content': SYSTEM_PROMPT},
        {'role': 'user'     , 'content': problem},
        {'role': 'assistant', 'content': final_prompt},
    ]

sft_dataset['messages'] = sft_dataset.apply(format_dataset, axis=1)
print(tokenizer.apply_chat_template(sft_dataset['messages'][0], tokenize=False))

You are a mathematical reasoning assistant. When given a math problem:
1. Show your step-by-step work between <THINK> and </THINK>.
2. Provide your final numerical answer between <SOLUTION> and </SOLUTION>.
3. Be precise and show all calculation steps clearly.<|end_of_text|>Given $\sqrt{x^2+165}-\sqrt{x^2-52}=7$ and $x$ is positive, find all possible values of $x$.<THINK>Okay, let's see. I need to solve the equation √(x² + 165) - √(x² - 52) = 7, and find all positive values of x. Hmm, radicals can be tricky, but maybe if I can eliminate the square roots by squaring both sides. Let me try that.

First, let me write down the equation again to make sure I have it right:

√(x² + 165) - √(x² - 52) = 7.

Okay, so the idea is to isolate one of the radicals and then square both sides. Let me try moving the second radical to the other side:

√(x² + 165) = 7 + √(x² - 52).

Now, if I square both sides, maybe I can get rid of the square roots. Let's do that:

(√(x² + 165))² = (7 + √(x² - 52))².

S

In [13]:
# Truncate pre fine-tuning sft_dataset to max_seq_length / 2 since we don't want too long reasoning traces
sft_dataset['seq_length'] = sft_dataset['messages'].apply(lambda x: len(tokenizer.apply_chat_template(x)))
print('Token-length percentiles (50/90/99):', np.percentile(sft_dataset['seq_length'], [50, 90, 99]))

threshold = max_seq_length / 2
sft_dataset_filtered = sft_dataset.loc[sft_dataset['seq_length'] <= threshold].copy()
print(f'Remaining for training (<= {threshold} tokens): {len(sft_dataset_filtered)}/{len(sft_dataset)}')

sft_dataset_filtered['text'] = tokenizer.apply_chat_template(sft_dataset_filtered['messages'].values.tolist(), tokenize=False)
sft_dataset_filtered = Dataset.from_pandas(sft_dataset_filtered)
sft_dataset_filtered

Token-length percentiles (50/90/99): [ 3469.   8310.4 14578.9]
Remaining for training (<= 1024.0 tokens): 73/7507


Dataset({
    features: ['expected_answer', 'problem', 'generated_solution', 'messages', 'seq_length', 'text', '__index_level_0__'],
    num_rows: 73
})

## Pre fine-tune to understand custom GRPO formatting

In [14]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=sft_dataset_filtered,
    args=SFTConfig(
        dataset_text_field='text',
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        optim='adamw_8bit',
        weight_decay=0.01,
        learning_rate=2e-4,
        lr_scheduler_type='cosine',
        warmup_steps=5,
        logging_steps=5,
        seed=3407,
        report_to='none', # Use this for WandB
    )
)
trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/73 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 73 | Num Epochs = 3 | Total steps = 219
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 1 x 1) = 1
 "-____-"     Trainable parameters = 22,544,384 of 1,258,358,784 (1.79% trained)


Step,Training Loss
5,1.4335
10,1.1104
15,1.0016
20,1.0123
25,0.9034
30,0.9406
35,0.9358
40,0.7999
45,0.8742
50,0.8486


Unsloth: Will smartly offload gradients to save VRAM!


TrainOutput(global_step=219, training_loss=0.5996594461676192, metrics={'train_runtime': 69.9715, 'train_samples_per_second': 3.13, 'train_steps_per_second': 3.13, 'total_flos': 1180872997208064.0, 'train_loss': 0.5996594461676192, 'epoch': 3.0})

## Check if model has learnt to follow the format

In [15]:
text = tokenizer.apply_chat_template( # Render into a single string and append <REASONING> for generation
    sft_dataset_filtered[1]['messages'][:2],
    tokenize=False, add_generation_prompt=True, # Append the final <REASONING>
)
_ = model.generate(
    **tokenizer(text, return_tensors='pt').to('cuda'),
    temperature=0, max_new_tokens=1024,
    streamer=TextStreamer(tokenizer, skip_prompt=False), # Stream the model's generations (CoT + solution)
)

<|begin_of_text|>You are a mathematical reasoning assistant. When given a math problem:
1. Show your step-by-step work between <THINK> and </THINK>.
2. Provide your final numerical answer between <SOLUTION> and </SOLUTION>.
3. Be precise and show all calculation steps clearly.<|end_of_text|>What is the average book width, in centimeters, of five books with the following widths: $6$, $\frac{1}{2}$, $1$, $2.5$, and $10$?<THINK>Okay, let's see. I need to find the average width of five books. The widths given are 6 cm, 1/2 cm, 1 cm, 2.5 cm, and 10 cm. Hmm, average is when you add up all the numbers and then divide by how many there are. So, first, I need to convert all the numbers to the same format.

Let me look at each number. The ones I know are 6, 1/2, 1, 2.5, and 10. The others are fractions. I can convert the fraction to a decimal by multiplying both numerator and denominator by 100. So, 1/2 would be 0.5, right? And 2.5 is 2.5. Then, adding those up step by step. Let's start with 6 a

In [16]:
del sft_dataset, sft_dataset_filtered
gc.collect()
torch.cuda.empty_cache()

# Post Fine-tuning (RL)

## Data preparation

In [17]:
def process_dataset_sample(example): # Convert GSM8K example to conversation format for GRPO training
    return {
        'prompt': [ # Create conversation with system prompt for structured reasoning
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': example['question']},
        ],
        # Extract numerical answer from GSM8K format ('Explanation... #### 42') as Ground truth for reward functions
        'answer': example['answer'].split('####')[1].strip() if '####' in example['answer'] else None
    }

In [18]:
# train_dataset = load_dataset('openai/gsm8k', 'main', split=['train[:10%]'])
train_dataset = load_dataset('openai/gsm8k', 'main', split='train')
train_dataset = train_dataset.map(process_dataset_sample)

print(f'Training samples: {len(train_dataset):,}\n'
      f"- Sample question: {train_dataset[0]['prompt'][1]['content']}\n"
      f"- Sample answer (ground truth for rewards): {train_dataset[0]['answer']}\n"
      f"- Prompt (system + user):\n{train_dataset[0]['prompt']}")

README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Training samples: 7,473
- Sample question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
- Sample answer (ground truth for rewards): 72
- Prompt (system + user):
[{'content': 'You are a mathematical reasoning assistant. When given a math problem:\n1. Show your step-by-step work between <THINK> and </THINK>.\n2. Provide your final numerical answer between <SOLUTION> and </SOLUTION>.\n3. Be precise and show all calculation steps clearly.', 'role': 'system'}, {'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'role': 'user'}]


In [19]:
# Get the top 90% prompt length so we don't accidentally truncate them, i.e. we'll remove the top 10% long prompts
tokenized_dataset = train_dataset.map(
    lambda x: {'tokens': tokenizer.apply_chat_template(x['prompt'], add_generation_prompt=True, tokenize=True)},
    batched=True,
).map(lambda x: {'length': len(x['tokens'])})
print(tokenizer.decode(tokenized_dataset[0]['tokens']))

thresholds = np.percentile(tokenized_dataset['length'], [50, 90, 99])
max_prompt_length = int(thresholds[1])
print('Token-length percentiles (50/90/99):', thresholds, '=> Choose max_prompt_length =', max_prompt_length)

# Filter only samples smaller than 90% max length
train_dataset = train_dataset.select(np.where(np.array(tokenized_dataset['length']) <= max_prompt_length)[0])
print(f'Remaining for training (<= {max_prompt_length} tokens): {len(train_dataset)}/{len(tokenized_dataset)}')
del tokenized_dataset

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

You are a mathematical reasoning assistant. When given a math problem:
1. Show your step-by-step work between <THINK> and </THINK>.
2. Provide your final numerical answer between <SOLUTION> and </SOLUTION>.
3. Be precise and show all calculation steps clearly.<|end_of_text|>Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<THINK>
Token-length percentiles (50/90/99): [116. 149. 187.] => Choose max_prompt_length = 149
Remaining for training (<= 149 tokens): 6749/7473


## Regex Patterns

In [20]:
# Match the reasoning sections and answers
match_format = re.compile(
    # rf'^[\s]{{0,}}'                                     # Optional whitespace at start
    # rf'{REASONING_START}.+?{REASONING_END}.*?'          # Reasoning section (non-greedy)
    rf'{REASONING_END}.*?'                              # We always prepend REASONING_START
    rf'{SOLUTION_START}(.+?){SOLUTION_END}'             # Solution section with capture group
    rf'[\s]{{0,}}(?:{re.escape(tokenizer.eos_token)})?' # Add optional EOS token matching
    rf'[\s]{{0,}}$',                                    # Optional whitespace at end
    flags=re.MULTILINE | re.DOTALL,                     # Multi-line matching with . matching newlines
)
match_format.findall( # Verify it works
    f'{REASONING_START}Let me think!{REASONING_END}'\
    f'{SOLUTION_START}\n2\n{SOLUTION_END}\n\n',
)

['\n2\n']

In [21]:
# Sometimes it might not be 1 number as the answer, but like a sentence.
# For example: 'The solution is $20' -> we extract 20
# We also remove possible commas for example as in 123,456
match_number = re.compile(
    rf'{SOLUTION_START}.*?[\s]{{0,}}([-]?[\d\.\,]{{1,}})', # Extract numbers from solution section
    flags=re.MULTILINE | re.DOTALL | re.IGNORECASE,        # Flexible pattern matching
)
print(match_number.findall('<SOLUTION>  0.34  </SOLUTION>'))
print(match_number.findall('<SOLUTION>  123,456  </SOLUTION>'))
print(match_number.findall('<SOLUTION>  -0.234  </SOLUTION>'))
print(match_number.findall('<SOLUTION>17</SOLUTION>'))

['0.34']
['123,456']
['-0.234']
['17']


## Multi-reward design

In [22]:
def match_format_strictly(completions, **kwargs) -> list[float]:
    ''' Reward Function 1: Exact Format Compliance
    High reward (3.0) for perfect format adherence
    Ensures model learns the complete structured output pattern
    '''
    return [
        3.0 if match_format.search(completion[0]['content']) else 0.0
        for completion in completions
    ]

In [23]:
# If it fails, reward the model if it at least follows the format partially, by counting each symbol
def match_format_softly(completions, **kwargs) -> list[float]:
    ''' Reward Function 2: Partial Format Credit
    Graduated scoring for format elements
    Encourages learning individual components even if not perfect
    '''
    rewards = []
    for completion in completions:
        reward = 0
        response = completion[0]['content']

        # Count how many keywords are seen - we penalize if too many!
        # Award +0.5 for correct token count, -0.5 for wrong count
        # No need to reward REASONING_START since we always prepend it!
        # reward += 0.5 if response.count(REASONING_START) == 1 else -0.5
        reward += 0.5 if response.count(REASONING_END) == 1 else -0.5
        reward += 0.5 if response.count(SOLUTION_START) == 1 else -0.5
        reward += 0.5 if response.count(SOLUTION_END) == 1 else -0.5
        rewards.append(reward)
    return rewards

In [24]:
# Extract the generated answer, and reward or penalize it
def check_answer_correctness(completions, answer, **kwargs) -> list[float]:
    ''' Reward Function 3: Graduated scoring for mathematical accuracy
    - 5.0: Exact string match gets full points
    - 2.0: Within 10% (close answer)
    - 1.5: Within 20% (reasonable attempt)
    - -2.5: Wrong answer (penalty for incorrect math)
    '''
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [ # Extract answers using format pattern
        guess.group(1) if (guess := match_format.search(r)) else None
        for r in responses
    ]
    rewards = []
    for guess, true_answer in zip(extracted_responses, answer):
        if guess is None: # No extractable answer
            rewards.append(-2.0)
            continue

        if guess == true_answer: rewards.append(5.0)                   # Correct answer gets 5 points!
        elif guess.strip() == true_answer.strip(): rewards.append(3.5) # Match if spaces are seen, but less reward
        else: # Try numerical comparison for partial credit
            try: # We also reward it based on how close the answer is to the true one via ratios
                ratio = float(guess) / float(true_answer)     # If the answer is within some range, reward it!
                if 0.9 <= ratio <= 1.1: rewards.append(2.0)   # Within 10%
                elif 0.8 <= ratio <= 1.2: rewards.append(1.5) # Within 20%
                else: rewards.append(-2.5)                    # Penalize wrong answers
            except (ValueError, ZeroDivisionError):
                rewards.append(-4.5)                          # Invalid numerical format
    return rewards

In [25]:
def check_numbers_extraction(prompts, completions, answer, **kwargs) -> list[float]:
    ''' Reward Function 4: Number Extraction Ability
    Tests the model's ability to extract numerical values from solution sections
    Complementary to exact format matching - focuses on parsing capability
    '''
    question = prompts[0][-1]['content'] # Exclude system prompt
    responses = [completion[0]['content'] for completion in completions]

    extracted_responses = [ # Extract numbers from solution sections using number pattern
        guess.group(1) if (guess := match_number.search(r)) else None
        for r in responses
    ]
    rewards = []

    # Print only every few steps
    check_numbers_extraction.counter = getattr(check_numbers_extraction, 'counter', 0) + 1
    if check_numbers_extraction.counter % 100 == 0:
        print(
            '==' * 100,
            f'\nQuestion: {question}'
            f'\nPrediction: {extracted_responses[0]}, GT Answer: {answer[0]}'
            f'\nResponse:\n{responses[0]}'
        )
    for guess, true_answer in zip(extracted_responses, answer):
        if guess is None: # No extractable number
            rewards.append(-2.5)
            continue

        try: # Simple numerical equality check
            true_val = float(true_answer.strip())             # Convert to numbers
            guess_val = float(guess.strip().replace(',', '')) # Remove commas like in 123,456
            rewards.append(3.5 if guess_val == true_val else -1.5)
        except (ValueError, TypeError):
            rewards.append(0) # Invalid number format
    return rewards

## GRPO training setup

In [26]:
max_prompt_length = 152 + 1 # + 1 just in case!
max_completion_length = max_seq_length - max_prompt_length
vllm_sampling_params = SamplingParams(
    min_p = 0.1,
    top_p = 1.0,
    top_k = -1,
    stop = [tokenizer.eos_token],
    include_stop_str_in_output = True,
)

In [27]:
training_args = GRPOConfig(          # Configure GRPO training parameters for mathematical reasoning
    output_dir=f'/tmp/{model_name}', # Directory for checkpoints and logs
    vllm_sampling_params=vllm_sampling_params,
    # Training speed control
    num_train_epochs=1,              # Total number of training epochs
    per_device_train_batch_size=2,   # Small batch for GPU memory constraints
    gradient_accumulation_steps=8,   # Effective batch size = 2 * 8 = 16
    # Computing the loss: https://huggingface.co/docs/trl/main/grpo_trainer#computing-the-loss
    scale_rewards='batch',           # Calculate mean at local/group level and std at global/batch level enables more robust reward shaping
    loss_type='dr_grpo',             # Fully remove response length bias, dividing by a constant instead of the sequence length
    # Precision & Optimization
    optim='adamw_8bit',              # adamw_torch_fused, adamw_8bit, paged_adamw_8bit
    weight_decay=0.1,                # Regularization
    max_grad_norm=0.1,               # Aggressive gradient clipping for stable training
    gradient_checkpointing=True,
    bf16=torch.cuda.is_available(),  # Enable mixed-precision training if a CUDA GPU is available (faster, less memory)
    # Learning rate scheduling
    learning_rate=1e-5,              # Conservative LR to prevent destabilizing reasoning
    warmup_ratio=0.1,
    lr_scheduler_type='cosine_with_min_lr',
    lr_scheduler_kwargs=dict(min_lr=1e-6),
    # Generation control
    temperature=1.0,
    num_generations=2,                           # Default: 8 generations per step
    max_prompt_length=max_prompt_length,         # Default: 512. Sufficient for complex word problems
    max_completion_length=max_completion_length, # Default: 256. Room for detailed step-by-step reasoning
    # Reporting and saving
    report_to='wandb',
    logging_steps=10,
    logging_strategy='steps',
    save_total_limit=1,
    # max_steps=100,
    # For optional evaluation
    # per_device_eval_batch_size=4,
    # bf16_full_eval=torch.cuda.is_available(),
    # eval_strategy='steps',                       # Evaluate after each epoch
    # load_best_model_at_end=True,                 # Load the best model based on validation loss
)

## Train the model

In [28]:
%%time
trainer = GRPOTrainer(            # Initialize GRPO trainer with multi-reward system
    model=model,                  # LoRA-adapted quantized model
    processing_class=tokenizer,
    train_dataset=train_dataset,  # Processed GSM8K dataset
    args=training_args,           # Training configuration
    reward_funcs=[                # 4 complementary reward functions
        match_format_strictly,    # Perfect structure compliance
        match_format_softly,      # Partial format credit
        check_answer_correctness, # Mathematical accuracy
        check_numbers_extraction, # Number parsing ability
    ]
)
trainer.train()
trainer.save_model(f'./{model_name}_grpo')

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,749 | Num Epochs = 1 | Total steps = 843
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 22,544,384 of 1,258,358,784 (1.79% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mwalteryeyint[0m ([33mwalteryeyint-university-of-technology-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,rewards / match_format_strictly / mean,rewards / match_format_strictly / std,rewards / match_format_softly / mean,rewards / match_format_softly / std,rewards / check_answer_correctness / mean,rewards / check_answer_correctness / std,rewards / check_numbers_extraction / mean,rewards / check_numbers_extraction / std
10,0.4395,-1.478125,4.220768,1500.03125,788.8,1895.0,0.375,1260.993237,788.8,1719.5,0.553382,1.8,1.446595,0.3,1.429855,-1.85,1.496247,-1.728125,0.977385
20,0.6417,-1.459375,4.259568,1480.7125,748.5,1895.0,0.39375,1212.622534,748.5,1663.1,0.539858,1.81875,1.486598,0.33125,1.479948,-1.946875,1.488704,-1.6625,1.07594
30,0.4445,-0.553125,4.128797,1382.85,729.3,1895.0,0.275,1188.33656,729.3,1698.2,0.663552,2.15625,1.322988,0.65,1.32247,-1.765625,1.628388,-1.59375,0.924556
40,0.5189,-1.36875,3.811721,1456.5875,681.5,1895.0,0.33125,1245.795862,681.5,1771.2,0.53137,1.9875,1.437603,0.46875,1.447053,-2.1,1.169889,-1.725,0.798265
50,0.3743,0.125,3.820067,1321.50625,705.0,1872.0,0.1625,1211.185437,705.0,1743.8,0.747573,2.45625,1.109631,0.98125,1.082135,-1.9375,1.434697,-1.375,1.077167
60,0.3459,0.69375,4.608223,1299.55,690.7,1895.0,0.18125,1168.552014,690.7,1737.2,0.643177,2.4,1.207302,0.91875,1.174298,-1.375,2.087229,-1.25,1.43165
70,0.2466,0.859375,3.561284,1181.7125,588.1,1866.2,0.10625,1100.152112,588.1,1748.8,0.587588,2.64375,0.783802,1.15,0.774638,-1.634375,1.837277,-1.3,0.980812
80,0.3714,0.5625,3.657686,1151.7625,614.4,1868.4,0.11875,1049.805127,614.4,1640.7,0.706063,2.60625,0.940126,1.125,0.912656,-1.825,1.747174,-1.34375,1.008009
90,0.3428,0.790625,3.017772,1064.1875,642.6,1778.7,0.06875,1004.116577,642.6,1515.5,2.057492,2.775,0.639103,1.26875,0.641563,-1.884375,1.52509,-1.36875,0.827262
100,0.2972,0.371875,2.676565,1057.85,564.6,1857.5,0.06875,996.202142,564.6,1565.6,0.66452,2.71875,0.821807,1.21875,0.811026,-2.065625,1.155612,-1.5,0.601638


Question: The tallest building in the world is 100 feet tall.  If the second tallest is half that tall, and the third tallest is half as tall as the second, and the fourth is one-fifth as tall as the third, how tall are all 4 buildings put together?
Prediction: None, GT Answer: 180
Response:
Okay, let's see. The problem is about finding the total height of four tall buildings. The tallest building is 100 feet tall. We need to know how tall all four buildings put together.

Right, so the tallest building is 100 feet. Let's call the second tallest building 50 feet tall. So, half of that is 25 feet. The third building would be half of 50, which is 25. The fourth building is half the third, so 2.5.

So, the question is asking for the total height of all four buildings. Hmm, adding 25, 25, 2.5, and 0.5 would be 100 + 50 + 25 + 2.5 + 0.5. Let me compute that.

Starting with 100. Adding 25 gives 125. Then add 25 again to get 150. Next, add 2.5: 150 + 2.5 is 152.5. Finally, add 0.5 more to get

# Evaluation

## Resource usage

In [29]:
# Memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

print(f'GPU = {gpu_stats.name}. Max memory = {max_memory} GB.')
print(f'{start_gpu_memory} GB of memory reserved.')

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
37.469 GB of memory reserved.


In [30]:
# Extract runtime info
last_log = trainer.state.log_history[-1] # Final memory and time stats
train_seconds = last_log['train_runtime']
samples_per_second = last_log.get('train_samples_per_second', None)

# Recompute GPU memory stats
used_memory   = round(torch.cuda.max_memory_reserved() / 1024**3, 2)
used_for_lora = round(used_memory - start_gpu_memory, 2)
used_pct      = round(used_memory / max_memory * 100, 2)
lora_pct      = round(used_for_lora / max_memory * 100, 2)

print(f'Training time: {train_seconds:.1f} seconds ({train_seconds / 60:.2f} minutes)')
if samples_per_second: print(f'Throughput: {samples_per_second:.1f} samples/second')
print(f'Peak VRAM usage: {used_memory} GB ({used_pct}% of max memory)')
print(f'VRAM for training: {used_for_lora} GB ({lora_pct}% of max memory)')

Training time: 10038.8 seconds (167.31 minutes)
Throughput: 0.7 samples/second
Peak VRAM usage: 37.47 GB (94.72% of max memory)
VRAM for training: 0.0 GB (0.0% of max memory)


## Verify LoRA is actually trained

In [31]:
example_text = 'What is the sqrt of 101?'
# example_text = 'Solve (x + 2)^2 = 0'
# example_text = "How many r's are in strawberry?"

sampling_params = SamplingParams(
    temperature=1.0,
    top_k=50,
    max_tokens=max_completion_length,
)
print(model.fast_generate( # Try the model without any GRPO trained
    example_text, sampling_params=sampling_params,
    lora_request=None
)[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

 Is 101 sqrt of 101?
The answer is no, 101 is not the sqrt of 101. But if you try a program in your editor, it might look like the problem should be solvable, but not solve it.
What is sqrt of 101?
101 is not the square root of 101. It is not only not the square root of 101, it isn’t even a close approximation to it — it is 101, which when divided by 2 gives 50.5, which when divided by 5 gives 10.2 which when divided by 2 gives 5.1, which when divided by 3 gives 1.66, which when divided by 10 gives 0.16506, which when divided by 2 gives 0.083 which when divided by 3 gives 0.027 which when divided by 5 gives 0.00537, and which when divided by 10 gives 0.000557, and when divided by 2 gives 0.000275, which when divided by 3 gives 0.0000909, which when divided by 5 gives 0.000016, and when divided by 10 gives 0.0000014. That’s just how close it came.
If you get a chance to calculate the square root of a million with pencil and paper I suggest you do so to prove that it is not mathematicall

In [32]:
tensors = {}
with safe_open(f'./{model_name}_grpo/adapter_model.safetensors', framework='pt') as f:
    for key in f.keys(): # Verify both A and B are non zero
        tensor = f.get_tensor(key)
        n_zeros = (tensor == 0).sum() / tensor.numel()
        assert(n_zeros.item() != tensor.numel())

In [33]:
# Load the LoRA and test without using system prompt
# which should not (or minimal) affect the model's original reasoning ability
text = tokenizer.apply_chat_template(
    [{'role': 'user', 'content': example_text}],
    add_generation_prompt=True, tokenize=False,
)
print(model.fast_generate(
    text, sampling_params=sampling_params,
    lora_request=model.load_lora(f'./{model_name}_grpo'),
)[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Okay, let's take a look at this problem: What is the square root of 101? Hmm, square root is basically finding the value that when you multiply it by itself gives the original number. I need to find the real number that when squared gives 101. That seems straightforward, but there's one thing to note here: when squared, some numbers can't be squared, like negative numbers and zero. Let me think if there's another form of the square that may help. Maybe the square of 100 is 10000, and 10 times 101 is 10100. So 101 would be 10000 plus 10100. Let me check the calculations. 

10000 + 10100 = 110000. Yeah, 101 times 100 is 10000, plus 101 times 1, which is 101. That's correct. So the square root is closer to 11000 than to 10000. So 11000 should be the answer. Is there any other method? Wait, has there been any approximation here? Because when I square 101, it gives 10112202, which is a really long number. So maybe there's actually a math trick hiding here. Let me verify.

Yeah, actually, th

In [34]:
# Test using system prompt
text = tokenizer.apply_chat_template([
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user'  , 'content': example_text},
], add_generation_prompt=True, tokenize=False)

# Compare results with system prompt but without LoRA
print(model.fast_generate(
    text, sampling_params=sampling_params,
    lora_request=None,
)[0].outputs[0].text)

# Reasoning model is much better - it's not always correct, since we only trained it for an hour
# It'll be better if we extend the sequence length and train for longer
print(model.fast_generate(
    text, sampling_params=sampling_params,
    lora_request=model.load_lora(f'./{model_name}_grpo'),
)[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

 13.<SOLUTION>15
1a. <THINK> 2a. 14 <SOLUTION>36
1.<SOLUTION>1.5 <SOLUTION>3:4
2.<SOLUTION>2:<SOLUTION>1
3.<SOLUTION>1:4<THINK>4.<SOLUTION>7
5.<SOLUTION>0.9:<SOLUTION>9:<THINK>2.81:<SOLUTION>0 <SOLUTION>11.9
6.<SOLUTION>3.14:<SOLUTION>0.14
7.<SOLUTION>1.11:<SOLUTION>0.88
2.<SOLUTION>1.11:<SOLUTION>1.1
6.<THINK>35
4.<SOLUTION>0.02:<THINK>25.02
5.<SOLUTION>1.5:<THINK>3.09
6.<THINK>36:7<THINK>0.53:<SOLUTION>1.5<THINK>2.81:<SOLUTION>0.54:<SOLUTION>9.27
11.<SOLUTION> 0.14<THINK>1:<SOLUTION>11.1:14<THINK>0.9:<SOLUTION>13:
SOLUTION>0.3<THINK>26.9:
SOLUTION>0.2:<THINK>14<THINK>2.81:<SOLUTION> 1.5:<THINK>0:<SOLUTION> 3:<SOLUTION>0:
6.<SOLUTION>0.54:<SOLUTION>9.27
7.<SOLUTION>2.84:0:<SOLUTION>0:<SOLUTION>14:<SOLUTION>0.14:18:<SOLUTION> 1:<SOLUTION>22:<SOLUTION>0.2:<SOLUTION>23:<SOLUTION>0.3:<SOLUTION>27:<SOLUTION>1:<SOLUTION>11.1:<SOLUTION>26:<SOLUTION>0.2:<SOLUTION>17:<SOLUTION>0:<SOLUTION> 33.6:<SOLUTION>0:<SOLUTION>0:<SOLUTION>1:37<THINK>36:<SOLUTION>25<THINK>0:<SOLUTION>0:<SOLUTION>1 :44:<SO

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Okay, so I need to find the square root of 101. Let me think. The square root of a number a squared is a times root a times a. Hmm, exponent of 10 here. Let's use the scientific method there. Start by writing the base and the exponent.

I remember that scientific notation involves multiplying by 10^n, except up to ten digits (which is usually ten. But let's see, ten times ten is 100, so a squared is 100, times ten is 1000. So the square root of 1000 is ten, times the square root of 10 is 1, right? So maybe the square root is 10, times 1, which is 10, so the square root is 10. Is that correct?

But wait, does the square root always have to be an integer? Like, there are square roots of fractions in math. For example, the square root of 2 is 1.4142135623730951... is that correct? Because it's a recurring decimal. But for whole numbers, the square root should always be an integer. So maybe I'm making a mistake here. But let me check again to make sure I didn't mix it up.

Original number:

## Performance on Test set

In [35]:
# test_dataset = load_dataset('openai/gsm8k', 'main', split=['test[:10%]'])
test_dataset = load_dataset('openai/gsm8k', 'main', split='test').map(process_dataset_sample)
test_texts = [
    tokenizer.apply_chat_template([
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': sample['prompt'][1]['content']},
    ], add_generation_prompt=True, tokenize=False)
for sample in test_dataset]
print(f'Testing samples:', len(test_dataset))

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

Testing samples: 1319


In [36]:
outputs_with_lora = model.fast_generate(
    test_texts, sampling_params=sampling_params,
    lora_request=model.load_lora(f'./{model_name}_grpo'),
)
outputs_without_lora = model.fast_generate(
    test_texts, sampling_params=sampling_params,
    lora_request=None,
)

Adding requests:   0%|          | 0/1319 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1319 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s…

Adding requests:   0%|          | 0/1319 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1319 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s…

In [37]:
# Compare the correct amount of using and not using LoRA
no_lora_correct_format_cnt = lora_correct_format_cnt = 0
no_lora_correct_answer_cnt = lora_correct_answer_cnt = 0
no_lora_correct_all_cnt = lora_correct_all_cnt = 0
num_test_samples = len(test_dataset)

for output_with_lora, output_without_lora, answer in zip(outputs_with_lora, outputs_without_lora, test_dataset['answer']):
    correct_format = match_format.search(output_with_lora.outputs[0].text)
    correct_answer = (guess := match_number.search(output_with_lora.outputs[0].text)) and guess.group(1) == answer
    correct_all = correct_format and correct_answer
    if correct_format: lora_correct_format_cnt += 1
    if correct_answer: lora_correct_answer_cnt += 1
    if correct_all: lora_correct_all_cnt += 1

    correct_format = match_format.search(output_without_lora.outputs[0].text)
    correct_answer = (guess := match_number.search(output_without_lora.outputs[0].text)) and guess.group(1) == answer
    correct_all = correct_format and correct_answer
    if correct_format: no_lora_correct_format_cnt += 1
    if correct_answer: no_lora_correct_answer_cnt += 1
    if correct_all: no_lora_correct_all_cnt += 1

pd.DataFrame({
    'Without LoRA': {
        'Correct Format': f'{no_lora_correct_format_cnt}/{num_test_samples} ({no_lora_correct_format_cnt / num_test_samples * 100:.2f}%)',
        'Correct Answer': f'{no_lora_correct_answer_cnt}/{num_test_samples} ({no_lora_correct_answer_cnt / num_test_samples * 100:.2f}%)',
        'Correct Both': f'{no_lora_correct_all_cnt}/{num_test_samples} ({no_lora_correct_all_cnt / num_test_samples * 100:.2f}%)',
    },
    'With LoRA': {
        'Correct Format': f'{lora_correct_format_cnt}/{num_test_samples} ({lora_correct_format_cnt / num_test_samples * 100:.2f}%)',
        'Correct Answer': f'{lora_correct_answer_cnt}/{num_test_samples} ({lora_correct_answer_cnt / num_test_samples * 100:.2f}%)',
        'Correct Both': f'{lora_correct_all_cnt}/{num_test_samples} ({lora_correct_all_cnt / num_test_samples * 100:.2f}%)',
    },
    'Improvement': {
        'Correct Format': f'+{lora_correct_format_cnt - no_lora_correct_format_cnt} ({(lora_correct_format_cnt - no_lora_correct_format_cnt) / num_test_samples * 100:.2f}%)',
        'Correct Answer': f'+{lora_correct_answer_cnt - no_lora_correct_answer_cnt} ({(lora_correct_answer_cnt - no_lora_correct_answer_cnt) / num_test_samples * 100:.2f}%)',
        'Correct Both': f'+{lora_correct_all_cnt - no_lora_correct_all_cnt} ({(lora_correct_all_cnt - no_lora_correct_all_cnt) / num_test_samples * 100:.2f}%)',
    }
}).T

Unnamed: 0,Correct Format,Correct Answer,Correct Both
Without LoRA,126/1319 (9.55%),20/1319 (1.52%),2/1319 (0.15%)
With LoRA,1285/1319 (97.42%),45/1319 (3.41%),45/1319 (3.41%)
Improvement,+1159 (87.87%),+25 (1.90%),+43 (3.26%)


# Inference

In [38]:
def generate_with_reasoning(questions, max_completion_length=512):
    conversations = [[                        # Format input using conversation template
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': question},
    ] for question in questions]

    prompts = [tokenizer.apply_chat_template( # Apply chat template and tokenize
        conversation,
        add_generation_prompt=True,         # Add assistant prompt
        tokenize=False,                     # Return string, not tokens
    ) for conversation in conversations]

    # Generate response with reasoning-optimized parameters
    inputs = tokenizer(prompts, return_tensors='pt', padding=True).to(model.device)
    start_time = time.time()
    with torch.no_grad():
        output_ids = model.generate(           # Generate response with reasoning-optimized parameters
            **inputs,
            max_new_tokens=max_completion_length,
            temperature=0.7,                # Balance creativity and consistency
            top_p=0.9,                      # Nucleus sampling for quality
            do_sample=True,                 # Enable sampling for varied reasoning paths
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,         # Reduce repetitive reasoning steps
            length_penalty=1.0,             # Neutral preference for response length
            early_stopping=True,            # Stop at natural completion
            streamer=TextStreamer(tokenizer, skip_prompt=True),
        )
    end_time = time.time()
    inference_duration = end_time - start_time
    num_generated_tokens = output_ids.shape[1] - inputs['input_ids'].shape[1]

    output_ids = output_ids[:, inputs['input_ids'][0].shape[-1]:output_ids.shape[-1]]
    responses = tokenizer.batch_decode(output_ids, skip_special_tokens=True) # Decode and extract only the generated portion
    return responses, inference_duration, num_generated_tokens

In [39]:
test_dataset = load_dataset('openai/gsm8k', 'main', split='test').map(process_dataset_sample)
gsm8k_question = test_dataset[0]['question']
expected_answer = test_dataset[0]['answer']

print('Question:', gsm8k_question, '\nResponse:')
gsm8k_responses, inference_duration, num_generated_tokens = generate_with_reasoning([gsm8k_question], max_completion_length)
gsm8k_response = gsm8k_responses[0]
print('Inference time (secs):', inference_duration)
print('Generated tokens:', num_generated_tokens)

Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? 
Response:
Okay, let's see here. I need to figure out how much money Janet makes each day selling her freshly laid eggs at the farmers’ market. So far, so good.

First, let me recall that one egg is typically worth about $2, right? Because most people buy them fresh from the farmer or have them delivered. So if Janet can sell 16 eggs per day, then multiply that by $2 to get the total amount she earns per day. Let's do that part.

So, 16 eggs * $2 per egg = $32 per day. Then, add the three she eats each morning plus the four she makes into baked goods for friends. The other 9 eggs would be sold at the market. Let's call that $9 per day. Add those two numbers together to find the total revenue she gets

In [40]:
# Validate format compliance
has_solution = SOLUTION_START in gsm8k_response and SOLUTION_END in gsm8k_response
print('Reasoning section:', REASONING_END in gsm8k_response)
print('Solution section:', has_solution)

if has_solution: # Check answer accuracy if solution section exists
    try:
        solution_text = gsm8k_response.split(SOLUTION_START)[1].split(SOLUTION_END)[0].strip()
        extracted_number = ''.join(filter(str.isdigit, solution_text))
        expected_number = ''.join(filter(str.isdigit, expected_answer))
        print('Extracted:', solution_text)
        print('Expected:', expected_answer)
        print('Correct:', extracted_number == expected_number)
    except:
        print('Could not extract solution')

Reasoning section: True
Solution section: True
Extracted: 41
Expected: 18
Correct: False
