In [None]:
!pip install datasets

In [None]:
#!pip install trl

In [None]:
!pip install transformers datasets peft trl accelerate bitsandbytes

In [None]:
!pip install unsloth

In [None]:
!pip install vllm

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# nltk.word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look it up
# as "punkt_tab", older ones as "punkt", so fetch both to work with either.
for _punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(_punkt_resource)

In [None]:
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
import torch
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
import bitsandbytes as bnb
import math
from trl import GRPOConfig, GRPOTrainer
from unsloth import FastLanguageModel, PatchFastRL

In [None]:
# Fetch the TL;DR dataset from the Hub. Despite its name, `train_dataset` holds
# every split (it is indexed by "train", "validation" and "test" further down).
train_dataset = load_dataset(path="trl-lib/tldr")

In [None]:
from datasets import concatenate_datasets

# Combine train + validation + test into one training set
full_dataset = concatenate_datasets([
    train_dataset["train"],
    train_dataset["validation"],
    train_dataset["test"],
])

# GRPOTrainer forwards extra dataset columns to reward functions as keyword
# arguments, but it does not forward the `prompt` / `completion` columns.
# Copy the human-written summary into a `reference` column so a reward function
# can read it from its kwargs and score generations against the real target.
full_dataset = full_dataset.map(lambda example: {"reference": example["completion"]})

In [None]:

# === Patch TRL's GRPO trainer with Unsloth's implementation ===
PatchFastRL("GRPO", FastLanguageModel)

# GRPOConfig / GRPOTrainer were imported before the patch above ran, so those
# names may still point at the unpatched TRL classes. Re-import them here so the
# trainer built later is the patched one (Unsloth's examples patch first, then import).
from trl import GRPOConfig, GRPOTrainer

# === Config ===
model_name = "microsoft/phi-2"
max_seq_length = 512
lora_rank = 8
use_4bit = True

# === Load model and tokenizer via Unsloth (replaces AutoModelForCausalLM + BitsAndBytesConfig) ===
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    load_in_4bit = use_4bit,
    fast_inference = True,         # Use vLLM-style speedups
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.5,  # Adjust based on VRAM
)

# === Make the padding setup explicit: reuse EOS as the pad token, pad on the right ===
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# === Apply LoRA ===
# NOTE(review): the target names below are Llama-style projection names. Confirm
# they match the module names of the loaded phi-2 architecture; names that do not
# exist in the model would not receive an adapter.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    lora_alpha = lora_rank,   # alpha == rank, i.e. a LoRA scaling factor of 1
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,      # fixed seed for reproducible adapter initialisation
)


In [None]:
# 3. GRPO training configuration, grouped by concern
training_args = GRPOConfig(
    # --- optimizer and schedule ---
    optim="adamw_8bit",
    learning_rate=5e-4,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    max_grad_norm = 0.1,          # gradient-norm clipping threshold
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # --- precision and batching ---
    fp16=True,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=4,
    # --- GRPO sampling ---
    num_generations=6,            # completions sampled per prompt
    max_prompt_length=256,
    max_completion_length=128,
    # --- run length, logging, checkpoints ---
    max_steps=300,
    logging_steps=5,
    save_steps=20,
    # NOTE(review): no eval dataset or eval strategy is configured in the visible
    # cells, so eval_steps may have no effect -- confirm.
    eval_steps=5,
    report_to="none",
    output_dir="./phi2-grpo-results",
    # --- dataset handling ---
    remove_unused_columns=False,  # keep extra dataset columns instead of dropping them
    label_names=[],
)

In [None]:
def reward_bleu(prompts, completions, **kwargs):
    """
    Reward function that scores each completion by its sentence-level BLEU
    against a reference text.

    Args:
        prompts: list of input prompts (not used for scoring).
        completions: list of generated completions. Each item is either a plain
            string or a conversational completion (a list of message dicts whose
            first entry carries the text under "content").
        **kwargs: additional arguments passed by the trainer. ``reference`` is
            read from here: either one reference string per completion, or a
            single reference (string or one-element list) shared by all
            completions. A placeholder reference is used when it is missing.
    Returns:
        list of float BLEU scores, one per completion (0.0 when the completion
        or its reference has no tokens).
    Raises:
        ValueError: if several references are given but their number differs
            from the number of completions.
    """
    references = kwargs.get("reference")
    if not references:
        # No usable reference supplied; keep the placeholder fallback.
        references = ["This is a placeholder reference"]
    elif isinstance(references, str):
        references = [references]
    else:
        references = list(references)

    if len(references) == 1:
        # A single reference is shared by every completion.
        references = references * len(completions)
    elif len(references) != len(completions):
        raise ValueError(
            f"Got {len(references)} references for {len(completions)} completions; "
            "expected one reference per completion or a single shared reference."
        )

    # Smoothing keeps short texts with missing higher-order n-grams from scoring exactly zero.
    smoothing_function = SmoothingFunction().method4

    rewards = []
    for completion, reference in zip(completions, references):
        # Conversational completions arrive as [{"role": ..., "content": ...}].
        text = completion if isinstance(completion, str) else completion[0]["content"]

        # Tokenize and lowercase both sides the same way so n-grams are comparable;
        # sentence_bleu expects token lists, not raw strings.
        generated_tokens = nltk.word_tokenize(text.lower())
        reference_tokens = nltk.word_tokenize(reference.lower())

        if not generated_tokens or not reference_tokens:
            rewards.append(0.0)
            continue

        bleu_score = sentence_bleu(
            [reference_tokens],
            generated_tokens,
            smoothing_function=smoothing_function,
        )
        rewards.append(float(bleu_score))

    return rewards


In [None]:
# Repeats the nltk import and the "punkt" download already issued in the NLTK
# setup cell near the top of the notebook.
import nltk
nltk.download('punkt')

In [None]:
# Fetch the NLTK data packages one after another, in a fixed order.
for _nltk_resource in ('punkt_tab', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

In [None]:
# Build the GRPO trainer from the LoRA-wrapped model, its tokenizer, the BLEU
# reward function and the combined dataset.
trainer = GRPOTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=full_dataset,
    reward_funcs=reward_bleu,
)

# Run the optimisation loop, then write the final model to disk.
trainer.train()
trainer.save_model("./phi2-grpo-final")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Baseline for the comparison: the stock Phi-2 checkpoint from the Hugging Face
# Hub, loaded without a quantization config. This rebinds `model`.
BASE_MODEL_ID = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",  # let the loader choose device placement (GPU/CPU)
)
phi2_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# GRPO fine-tuned model and its tokenizer, read from the checkpoint folder.
GRPO_CHECKPOINT_DIR = "/content/drive/MyDrive/Checkpoint_300"
grpo_model = AutoModelForCausalLM.from_pretrained(GRPO_CHECKPOINT_DIR)
grpo_tokenizer = AutoTokenizer.from_pretrained(GRPO_CHECKPOINT_DIR)

In [13]:
def generate_response(model, tokenizer, prompt, max_length=150, device='cuda'):
    """
    Sample one response from `model` for `prompt` and return it as text.

    Args:
        model: causal language model exposing `.to()` and `.generate()`.
        tokenizer: tokenizer matching `model`. If it has no pad token, its EOS
            token is installed as the pad token (the tokenizer is modified).
        prompt: input text to continue.
        max_length: maximum number of prompt tokens kept when tokenizing;
            longer prompts are truncated. This does not control the output length.
        device: device the model and the inputs are moved to.
    Returns:
        The decoded first generated sequence with special tokens removed. The
        whole sequence is decoded, so the text starts with the prompt.
        Generation uses `max_length=200`, which counts prompt and new tokens together.
    """
    # Ensure model is on the correct device
    model.to(device)

    # Tokenizing with padding=True fails for tokenizers that ship without a pad
    # token, so fall back to EOS before tokenizing.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=max_length)

    # Move input tensors to the same device as the model
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.get('attention_mask', None)
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    # Generate the output with nucleus sampling
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=200,
        num_return_sequences=1,
        do_sample=True,
        top_p=0.95,
        temperature=0.9,
        pad_token_id=tokenizer.pad_token_id,  # explicit, so generate() need not guess it
    )

    # Decode the output tokens back to text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Pick the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def _respond_to_all(lm, lm_tokenizer):
    """Generate one response per entry of `prompts` with the given model/tokenizer pair."""
    return [generate_response(lm, lm_tokenizer, prompt, device=device) for prompt in prompts]


# NOTE(review): `prompts` is not defined in any cell visible here; it has to exist
# in the kernel before this cell runs.
# The base model answers every prompt first, then the GRPO model does the same.
phi2_responses = _respond_to_all(model, phi2_tokenizer)
grpo_responses = _respond_to_all(grpo_model, grpo_tokenizer)


In [None]:
# Print each prompt with both models' answers, followed by a separator rule
for prompt, phi2_response, grpo_response in zip(prompts, phi2_responses, grpo_responses):
    print(
        f"Prompt: {prompt}",
        f"Phi-2 Response: {phi2_response}",
        f"GRPO + QLoRA Response: {grpo_response}",
        "="*80,
        sep="\n",
    )