In [2]:
!pip install -q trl openai wandb

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h

In [4]:
# %%writefile train_grpo.py

import os

import wandb

# Prefer credentials from the environment; the literal is only a placeholder
# fallback so the cell behaves as before when the variable is not set.
wandb.login(key=os.environ.get("WANDB_API_KEY", "TOKEN"))
# Exported for W&B before any run is created by the trainer.
os.environ["WANDB_RESUME"] = "allow"
from peft import LoraConfig

from huggingface_hub import login

# Same pattern for the Hugging Face Hub token.
login(token=os.environ.get("HF_TOKEN", "TOKEN"))
from datasets import load_dataset
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
)
from trl import GRPOTrainer, GRPOConfig
from peft import LoraConfig


# ======================================================
# 1. Dataset
# ======================================================
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np

# Load the prompt dataset and the tokenizer used to measure prompt length.
ds = load_dataset("AIPlans/Helpsteer2-helpfulness-prompts", split="train")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")

# Token count of every prompt, computed once and reused by the filter below.
lengths = [len(tokenizer(x["prompt"]).input_ids) for x in ds]
cutoff = int(np.percentile(lengths, 90))

# Keep only prompts whose length is at or below the 90th-percentile cutoff.
# `with_indices=True` lets the predicate look up the precomputed length
# instead of tokenizing each prompt a second time.
ds = ds.filter(
    lambda _example, idx: lengths[idx] <= cutoff,
    with_indices=True,
)


print("90% kept:", len(ds))
print("Cutoff tokens:", cutoff)

# ======================================================
# 2. Reward model + reward function
# ======================================================

import os

from openai import OpenAI

# OpenAI-compatible client pointed at the NVIDIA endpoint that serves the
# reward model. The API key is taken from the environment when available;
# the literal is only a placeholder fallback.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ.get("NVIDIA_API_KEY", "TOKEN"),
)

def reward_model_score(prompts, completions, **kwargs):
    """Score each (prompt, completion) pair with the remote reward model.

    One chat-completion request is sent per pair through the module-level
    ``client``; the reply text is parsed as a float reward.

    Args:
        prompts: Sequence of prompt strings.
        completions: Sequence of completion strings, paired with ``prompts``
            positionally via ``zip``.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        A list of float rewards, one per pair, in input order. An empty list
        is returned when ``prompts`` is empty.

    Raises:
        ValueError: If the reply has no content or cannot be parsed as a float.
    """
    if not prompts:
        return []

    scores = []

    for prompt, completion in zip(prompts, completions):
        resp = client.chat.completions.create(
            model="nvidia/llama-3.1-nemotron-70b-reward",
            messages=[
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": completion},
            ],
        )

        content = resp.choices[0].message.content

        # A missing reply cannot be scored; fail with the same descriptive
        # error as any other unparsable reply instead of a bare TypeError.
        if content is None:
            raise ValueError(f"Could not parse reward from content={content!r}")

        # Normalize a list of content parts into a single string.
        if not isinstance(content, str):
            content = "".join(part.get("text", "") for part in content)

        # Parse reward (handles "reward: -12.625" or just "-12.625").
        try:
            score = float(content.split(":", 1)[-1].strip())
        except ValueError as e:
            raise ValueError(f"Could not parse reward from content={content!r}") from e

        scores.append(score)

    return scores

from datasets import load_dataset
from trl import GRPOTrainer, GRPOConfig
from transformers import AutoTokenizer

# LoRA adapter settings passed to the GRPO trainer.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Alternative target set: ["q_proj", "k_proj", "v_proj", "o_proj"]
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
)

from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer

model_id = "Qwen/Qwen3-0.6B"

# Tokenizer for the policy model, configured to pad on the left.
tok = AutoTokenizer.from_pretrained(model_id, padding_side="left")
if tok.pad_token is None:
    # Fall back to the EOS token when the tokenizer defines no pad token.
    tok.pad_token = tok.eos_token

# Instantiate the policy model up front so the object itself is given to the trainer.
model = AutoModelForCausalLM.from_pretrained(model_id)

import os

from transformers.trainer_utils import get_last_checkpoint

# GRPO training configuration.
train_args = GRPOConfig(
    output_dir="QwenModel",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=6,
    num_train_epochs=3,
    max_completion_length=248,  # increase later
    num_generations=4,          # keep small at first
    chat_template_kwargs={"enable_thinking": False},
    report_to="wandb",          # send training logs to Weights & Biases
    # log_completions=True,
    logging_steps=1,
    save_steps=50,
    save_total_limit=2,
)

trainer = GRPOTrainer(
    model=model,                # the already-loaded model object
    args=train_args,
    processing_class=tok,
    reward_funcs=reward_model_score,
    train_dataset=ds,
    peft_config=peft_config,
)

# Resume from the newest checkpoint in the output directory when one exists;
# otherwise start from scratch. This avoids depending on a specific
# checkpoint folder that is absent on a fresh run.
output_dir = train_args.output_dir
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None

trainer.train(resume_from_checkpoint=last_checkpoint)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


90% kept: 6499
Cutoff tokens: 501


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


[34m[1mwandb[0m: Detected [openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss


TrainOutput(global_step=3249, training_loss=0.0, metrics={'train_runtime': 6.506, 'train_samples_per_second': 2996.789, 'train_steps_per_second': 499.388, 'total_flos': 0.0, 'train_loss': 0.0})

In [6]:
from peft import PeftModel

# Hub repository that receives both the merged weights and the tokenizer.
repo_id = "AIPlans/Qwen3-0.6B-GRPO-RM_NVIDIA"

# Fold the LoRA adapter into the base model held by the trainer.
trained_model = trainer.model
merged_model = trained_model.merge_and_unload()

# Upload the merged model first, then the tokenizer (whose result is displayed).
merged_model.push_to_hub(repo_id)

tok.push_to_hub(repo_id)

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/AIPlans/Qwen3-0.6B-GRPO-RM_NVIDIA/commit/82fc6a441d008ac84a60d8e55df407ff2df170c8', commit_message='Upload tokenizer', commit_description='', oid='82fc6a441d008ac84a60d8e55df407ff2df170c8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AIPlans/Qwen3-0.6B-GRPO-RM_NVIDIA', endpoint='https://huggingface.co', repo_type='model', repo_id='AIPlans/Qwen3-0.6B-GRPO-RM_NVIDIA'), pr_revision=None, pr_num=None)

In [1]:
import os

# Show what the training run left in its output directory.
output_dir = "/kaggle/working/QwenModel"
os.listdir(output_dir)

['checkpoint-3249', 'checkpoint-3200', 'README.md']

In [1]:
import os
import shutil
import tempfile

archive_path = "/kaggle/working/working_folder.zip"

# Drop any archive left by a previous run so it is not packed into the new one.
if os.path.exists(archive_path):
    os.remove(archive_path)

# Build the zip outside the directory being archived: writing it inside
# root_dir would make the archive pick up a partial copy of itself.
with tempfile.TemporaryDirectory() as staging_dir:
    staged_archive = shutil.make_archive(
        base_name=os.path.join(staging_dir, "working_folder"),
        format="zip",
        root_dir="/kaggle/working",
    )
    # Move the finished archive to its final location before the staging
    # directory is cleaned up.
    archive_path = shutil.move(staged_archive, archive_path)

archive_path


'/kaggle/working/working_folder.zip'