In [1]:
!pip install datasets



In [None]:
#!pip install trl

In [2]:
!pip install transformers datasets peft trl accelerate bitsandbytes



In [3]:
!pip install unsloth



In [4]:
!pip install vllm

Collecting vllm
  Downloading vllm-0.8.3-cp38-abi3-manylinux1_x86_64.whl.metadata (27 kB)
Collecting blake3 (from vllm)
  Downloading blake3-1.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting transformers>=4.51.0 (from vllm)
  Downloading transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Collecting fastapi>=0.115.0 (from fastapi[standard]>=0.115.0->vllm)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken>=0.6.0 (from vllm)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting lm-format-enforcer<0.11,>=0.10.11 (from vllm)
  Downloading lm_format_enforcer-0.10.11-py3-none-any.whl.metadata (17 kB)
Collecting llguidance<0.8.0,>=0.7.9 (from vllm)
  Downloading llguidance-0.7.13-cp39-abi3-manylinux_2_

In [5]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Make sure to download the required resources
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
import torch
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
import bitsandbytes as bnb
import math
from trl import GRPOConfig, GRPOTrainer
from unsloth import FastLanguageModel, PatchFastRL

INFO 04-11 04:08:42 [__init__.py:239] Automatically detected platform cuda.



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel, PatchFastRL


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Load train and validation datasets directly
train_dataset = load_dataset("trl-lib/tldr")

In [None]:
from datasets import concatenate_datasets

# Combine train + validation + test
full_dataset = concatenate_datasets([
    train_dataset["train"],
    train_dataset["validation"],
    train_dataset["test"],
])

In [None]:

# === Optional: Patch RL logic (used only in RLHF-type training) ===
PatchFastRL("GRPO", FastLanguageModel)

# === Config ===
model_name = "microsoft/phi-2"
max_seq_length = 512
lora_rank = 8
use_4bit = True

# === Load model and tokenizer via Unsloth (replaces AutoModelForCausalLM + BitsAndBytesConfig) ===
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    load_in_4bit = use_4bit,
    fast_inference = True,         # Use vLLM-style speedups
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.5,  # Adjust based on VRAM
)

# === Unsloth handles tokenizer defaults — but you can explicitly set these too: ===
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# === Apply LoRA ===
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    lora_alpha = lora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)


==((====))==  Unsloth 2025.3.19: Fast Phi patching. Transformers: 4.51.1. vLLM: 0.8.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

microsoft/phi-2 does not have a padding token! Will use pad_token = <|endoftext|>.
Unsloth: Making `model.base_model.model.model` require gradients


In [None]:
# 3. Optimized GRPO config
training_args = GRPOConfig(
    learning_rate=5e-4,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    fp16=True,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=4,
    num_generations=6,  # reduced from 6
    max_prompt_length=256,
    max_completion_length=128,
    eval_steps=5,
    max_steps=300,
    save_steps=20,
    max_grad_norm = 0.1,
    logging_steps=5,
    report_to="none",
    output_dir="./phi2-grpo-results",
    remove_unused_columns=False,
    label_names=[]
)

In [None]:
def reward_bleu(prompts, completions, **kwargs):
    """
    Reward function that scores based on BLEU score for the completion
    Args:
        prompts: list of input prompts (not used in BLEU directly)
        completions: list of generated completions
        **kwargs: additional arguments passed by the trainer
    Returns:
        list of reward scores based on BLEU
    """
    # This is just an example target. You'd want to adjust based on your task.
    reference = kwargs.get("reference", ["This is a placeholder reference"])

    rewards = []
    smoothing_function = SmoothingFunction().method4  # Use smoothing to avoid zero BLEU score

    for completion in completions:
        # Tokenize the generated completion
        generated_tokens = nltk.word_tokenize(completion.lower())

        # Compute BLEU score
        bleu_score = sentence_bleu([reference], generated_tokens, smoothing_function=smoothing_function)
        rewards.append(bleu_score)

    return rewards


In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Initialize trainer
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=reward_bleu,
    args=training_args,
    train_dataset=full_dataset
)

# Train the model
trainer.train()

# Save the final model
trainer.save_model("./phi2-grpo-final")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 129,722 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 6 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (6 x 4 x 1) = 24
 "-____-"     Trainable parameters = 3,932,160/1,525,324,800 (0.26% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
5,0.0039
10,0.0048
15,0.0035
20,0.0042
25,0.004
30,0.0036
35,0.0034
40,0.0037
45,0.0035
50,0.0039


In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the quantization config (make sure to have the `bnb_config` set up)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",  # Load Phi-2 model from Hugging Face # Use the appropriate quantization config (like QLoRA)
    device_map="auto"  # Automatically map model to the available device (e.g., GPU/CPU)
)

phi2_tokenizer =AutoTokenizer.from_pretrained("microsoft/phi-2")  # Adjust the tokenizer name if needed

# Load the GRPO fine-tuned model from the checkpoint folder
grpo_model = AutoModelForCausalLM.from_pretrained("/content/drive/MyDrive/Checkpoint_300")
grpo_tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Checkpoint_300")

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
def generate_response(model, tokenizer, prompt, max_length=150, device='cuda'):
    # Ensure model is on the correct device
    model.to(device)

    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=max_length)

    # Move input tensors to the same device as the model
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.get('attention_mask', None)
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    # Generate the output
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=200,
        num_return_sequences=1,
        do_sample=True,
        top_p=0.95,
        temperature=0.9
    )

    # Decode the output tokens back to text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Ensure both models are on the same device (e.g., 'cuda' or 'cpu')
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Get responses from both models, making sure they're on the same device
phi2_responses = [generate_response(model, phi2_tokenizer, prompt, device=device) for prompt in prompts]
grpo_responses = [generate_response(grpo_model, grpo_tokenizer, prompt, device=device) for prompt in prompts]


In [14]:
# Display the responses
for prompt, phi2_response, grpo_response in zip(prompts, phi2_responses, grpo_responses):
    print(f"Prompt: {prompt}")
    print(f"Phi-2 Response: {phi2_response}")
    print(f"GRPO + QLoRA Response: {grpo_response}")
    print("="*80)

Prompt: When I (f/22) went through my first real breakup 2 years ago because he needed space after a year of dating roand it effected me more than I thought. It was a horrible time in my life due to living with my mother and finally having the chance to cut her out of my life. I can admit because of it was an emotional wreck and this guy was stable and didn't know how to deal with me. We ended by him avoiding for a month or so after going to a festival with my friends. When I think back I wish he just ended. So after he ended it added my depression I suffered but my friends helped me through it and I got rid of everything from him along with cutting contact.
Phi-2 Response: When I (f/22) went through my first real breakup 2 years ago because he needed space after a year of dating roand it effected me more than I thought. It was a horrible time in my life due to living with my mother and finally having the chance to cut her out of my life. I can admit because of it was an emotional wrec