In [None]:
%pip install trl

In [None]:
# train_grpo.py
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

# Load the "train" split of the trl-lib/tldr dataset; it is passed to the trainer below as train_dataset.
dataset = load_dataset("trl-lib/tldr", split="train")

# Define the reward function, which rewards completions that are close to 20 characters
def reward_len(completions, **kwargs):
    """Score completions by how close their length is to 20.

    Args:
        completions: Sequence of completions; each item must support ``len()``.
        **kwargs: Extra keyword arguments supplied by the caller; ignored.

    Returns:
        A list with one reward per completion: the negated absolute
        difference between 20 and the completion's length, so 0 is the best
        possible score and longer/shorter completions score lower.
    """
    target_length = 20
    rewards = []
    for text in completions:
        distance = abs(target_length - len(text))
        rewards.append(-distance)
    return rewards

# Hyperparameters for this short GRPO run (see the TRL GRPOConfig docs for the
# exact meaning of each field).
# NOTE(review): TRL presumably requires the effective batch size to be
# divisible by num_generations — confirm before changing either value.
training_args = GRPOConfig(
    output_dir="Qwen2-0.5B-GRPO",
    per_device_train_batch_size=2,
    learning_rate=5e-6,
    num_generations=2,
    gradient_accumulation_steps=2,
    logging_steps=5,
    save_steps=50,
    max_steps=75,
)
# Build the trainer: the model is given by name as a string, reward_len is the
# only reward function, and the dataset loaded above is the training set.
trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=reward_len,
    args=training_args,
    train_dataset=dataset,
)
# Start training with the configuration above.
trainer.train()

In [None]:
# Save the trained model to its own "-final" directory, separate from the
# output_dir ("Qwen2-0.5B-GRPO") used during training.
trainer.save_model("Qwen2-0.5B-GRPO-final")

In [None]:
# inference_len.py
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned model and its tokenizer from a local directory.
model_path = "Qwen2-0.5B-GRPO-final"  # same directory name that was passed to trainer.save_model(...)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# The model is moved to the GPU here, so this cell needs a CUDA device.
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

def generate(prompt, max_new_tokens=50):
    """Sample a continuation of ``prompt`` from the loaded model.

    Args:
        prompt: Text to tokenize and feed to the model.
        max_new_tokens: Value forwarded to ``model.generate`` as
            ``max_new_tokens``.

    Returns:
        The first output sequence decoded to a string, with special tokens
        removed.
        NOTE(review): for causal LMs ``generate`` normally returns the prompt
        tokens followed by the new ones, so the prompt text is expected to be
        part of the returned string — confirm.
    """
    # Follow the model's own device instead of hard-coding "cuda", so the
    # inputs always land on the same device as the weights.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Sampling is enabled, so the output can differ between calls.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example: wrap a short story in a prompt and print what generate() returns.
# Note that the story text already starts with its own "Summarize:" instruction.
reddit_story ="Summarize: I went to the store and bought some milk, eggs, and bread.But i kept the extra money so my mom slapped me."

# The story is placed between a leading instruction and a trailing "Output:" cue.
prompt = f"Write something about this:\n\n{reddit_story}\n\nOutput:"
print(generate(prompt))