In [None]:
# Uninstall and reinstall bitsandbytes to ensure the latest version
!pip uninstall bitsandbytes -y
!pip install bitsandbytes --extra-index-url https://download.pytorch.org/whl/cu124

# Verify bitsandbytes version
!pip show bitsandbytes

# Install other dependencies for Colab environment
if "COLAB_" in "".join(os.environ.keys()):
    !pip install unsloth==2025.6.5 transformers datasets trl torch
else:
    !pip install unsloth==2025.6.5 transformers datasets trl torch

# IMPORTANT: Restart the runtime after installing bitsandbytes to ensure the new version is used.
# In Colab, go to Runtime > Restart runtime, then rerun the script.

In [None]:


import os
import re
import torch
from unsloth import FastLanguageModel  # Moved to top for Unsloth optimization
from datasets import load_dataset
from transformers import AutoTokenizer
from trl import GRPOConfig, GRPOTrainer

# Load the quantized model and tokenizer
model_name = "devatar/quantized_Llama-3.1-8B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    load_in_4bit=True,  # Use 4-bit quantization
    device_map="auto",
)

# Add LoRA adapters for fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,  # No dropout for efficiency
    bias="none",
    use_gradient_checkpointing="unsloth",  # Use Unsloth's gradient checkpointing
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Set padding token explicitly: fall back to EOS only when no pad token is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# NOTE: FastLanguageModel.for_inference(model) is intentionally NOT called here.
# It is meant for generation-only use, and this model goes straight into
# a GRPO training run further down in this cell. Call it only after training,
# right before running standalone generation.
# NOTE(review): confirm against the pinned Unsloth release that no explicit
# for_training() call is needed after get_peft_model().

# Load the dataset
dataset = load_dataset("Anthropic/hh-rlhf", split="train")

# Preprocess dataset: Handle raw string format of hh-rlhf
def format_hh_rlhf(example):
    """Convert one raw hh-rlhf record into prompt/response fields.

    Each record carries two transcripts, "chosen" and "rejected". The text
    after the last "Assistant:" marker of each transcript is taken as that
    transcript's final response. Everything before the last marker of the
    *chosen* transcript is used as the conversation history for the prompt
    (assumes both transcripts share the same history -- TODO confirm).

    The history is split into its "Human:" / "Assistant:" turns and handed to
    the chat template as separate user / assistant messages, instead of one
    user message that still contains the literal speaker markers.

    Args:
        example: Mapping with string values under "chosen" and "rejected".

    Returns:
        dict with keys:
            "prompt":   history rendered by the tokenizer's chat template,
                        with the generation prompt appended.
            "tokens":   input ids of the rendered prompt.
            "chosen":   final response of the chosen transcript.
            "rejected": final response of the rejected transcript.
    """
    chosen = example["chosen"]
    rejected = example["rejected"]

    # Split once from the right so only the final assistant reply is cut off.
    chosen_parts = chosen.rsplit("Assistant:", 1)
    rejected_parts = rejected.rsplit("Assistant:", 1)
    history = chosen_parts[0] if len(chosen_parts) > 1 else ""
    chosen_response = chosen_parts[-1].strip() if len(chosen_parts) > 1 else chosen
    rejected_response = rejected_parts[-1].strip() if len(rejected_parts) > 1 else rejected

    # re.split with a capturing group yields
    # [preamble, speaker_1, text_1, speaker_2, text_2, ...].
    # Markers only count at the very start or after a blank line, so a
    # "Human:" appearing mid-sentence is not treated as a new turn.
    pieces = re.split(r"(?:^|\n\n)(Human|Assistant):", history)

    messages = []
    preamble = pieces[0].strip()
    if preamble:
        # Text with no recognisable speaker marker is kept as a user message,
        # so nothing from the history is silently dropped.
        messages.append({"role": "user", "content": preamble})
    for speaker, text in zip(pieces[1::2], pieces[2::2]):
        text = text.strip()
        if not text:
            continue  # skip empty turns
        role = "user" if speaker == "Human" else "assistant"
        if messages and messages[-1]["role"] == role:
            # Merge consecutive turns by the same speaker (can happen once an
            # empty turn between them has been skipped).
            messages[-1]["content"] += "\n\n" + text
        else:
            messages.append({"role": role, "content": text})

    # Format prompt using tokenizer's chat template
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return {
        "prompt": formatted_prompt,
        "tokens": tokenizer(formatted_prompt).input_ids,
        "chosen": chosen_response,
        "rejected": rejected_response
    }

# Use 1000 samples for testing: take the first 1000 rows, then format them one example at a time
dataset = dataset.select(range(1000))
dataset = dataset.map(format_hh_rlhf, batched=False)

# Longest tokenized prompt in the subset, plus one token of headroom
max_prompt_length = 1 + max(len(row["tokens"]) for row in dataset)

# Define reward functions
def match_format_exactly(prompts, completions, completion_ids=None, **kwargs):
    """Give each completion a pass/fail reward from a minimal length check.

    Args:
        prompts: Unused.
        completions: List of completion strings.
        completion_ids: Unused.
        **kwargs: Ignored.

    Returns:
        One float per completion: 1.0 when the completion contains
        non-whitespace text and its raw length exceeds 10 characters,
        otherwise -1.0.
    """
    def _passes(text):
        # Must hold something other than whitespace, and the raw
        # (unstripped) text must be more than 10 characters long.
        return bool(text.strip()) and len(text) > 10

    return [1.0 if _passes(text) else -1.0 for text in completions]

def conversational_quality(prompts, completions, completion_ids=None, chosen=None, rejected=None, **kwargs):
    """Reward completions whose length is closer to the chosen answer than to the rejected one.

    Closeness is the absolute difference in character count. The earlier
    capped ratio ``len(response) / (len(answer) + 1)`` did not measure
    closeness: below the cap it only reflected whether the chosen answer was
    shorter than the rejected one, and any response longer than both answers
    tied at 1.0 and was always penalised.

    Args:
        prompts: Unused.
        completions: List of completion strings.
        completion_ids: Unused.
        chosen: List of preferred reference answers, aligned with ``completions``.
        rejected: List of dispreferred reference answers, aligned with ``completions``.
        **kwargs: Ignored.

    Returns:
        One float per completion: 1.5 when the completion's length is strictly
        closer to the chosen answer's length, otherwise -0.5 (ties included).
    """
    scores = []
    for response, chosen_answer, rejected_answer in zip(completions, chosen, rejected):
        chosen_gap = abs(len(response) - len(chosen_answer))
        rejected_gap = abs(len(response) - len(rejected_answer))
        # Strict comparison: a tie is not evidence of preferring the chosen answer.
        scores.append(1.5 if chosen_gap < rejected_gap else -0.5)
    return scores

# Configure GRPO trainer
training_args = GRPOConfig(
    learning_rate=5e-6,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    logging_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,  # Increased for memory efficiency
    num_generations=2,  # Minimum required for GRPO
    # Use the prompt limit measured from the dataset above; it was computed but
    # never passed in, so the trainer fell back to its own default limit.
    max_prompt_length=max_prompt_length,
    max_completion_length=256,  # Reduced to save memory
    max_steps=500,
    save_steps=250,
    max_grad_norm=1.0,
    report_to="none",
    output_dir="./outputs",
)

# Initialize and run the trainer.
# Both reward functions are applied to every completion.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        match_format_exactly,
        conversational_quality,
    ],
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Save the full fine-tuned model
model.save_pretrained_merged("./finetuned_llama31_8b", tokenizer, save_method="merged_16bit")

In [None]:
import os
import re
import torch
import difflib
from unsloth import FastLanguageModel  # Moved to top for Unsloth optimization
from datasets import load_dataset
from transformers import AutoTokenizer
from trl import GRPOConfig, GRPOTrainer

# Uninstall and reinstall bitsandbytes to ensure a compatible version
!pip uninstall bitsandbytes -y
!pip install bitsandbytes>=0.43.3 --extra-index-url https://download.pytorch.org/whl/cu124

# Verify bitsandbytes version
!pip show bitsandbytes

# Install other dependencies for Colab environment
if "COLAB_" in "".join(os.environ.keys()):
    !pip install unsloth==2025.6.5 transformers datasets trl torch
else:
    !pip install unsloth==2025.6.5 transformers datasets trl torch

# IMPORTANT: Restart the runtime after installing bitsandbytes to ensure the new version is used.
# In Colab, go to Runtime > Restart runtime, then rerun the script.

# Load the quantized model and tokenizer
model_name = "devatar/quantized_Llama-3.1-8B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    load_in_4bit=True,  # Use 4-bit quantization
    device_map="auto",
)

# Add LoRA adapters for fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,  # No dropout for efficiency
    bias="none",
    use_gradient_checkpointing="unsloth",  # Use Unsloth's gradient checkpointing
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Set padding token explicitly: fall back to EOS only when no pad token is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# NOTE: FastLanguageModel.for_inference(model) is intentionally NOT called here.
# It is meant for generation-only use, and this model goes straight into
# a GRPO training run further down in this cell. Call it only after training,
# right before running standalone generation.
# NOTE(review): confirm against the pinned Unsloth release that no explicit
# for_training() call is needed after get_peft_model().

# Load the dataset
dataset = load_dataset("Anthropic/hh-rlhf", split="train")

# Preprocess dataset: Handle raw string format of hh-rlhf
def format_hh_rlhf(example):
    """Convert one raw hh-rlhf record into prompt/response fields.

    Each record carries two transcripts, "chosen" and "rejected". The text
    after the last "Assistant:" marker of each transcript is taken as that
    transcript's final response. Everything before the last marker of the
    *chosen* transcript is used as the conversation history for the prompt
    (assumes both transcripts share the same history -- TODO confirm).

    The history is split into its "Human:" / "Assistant:" turns and handed to
    the chat template as separate user / assistant messages, instead of one
    user message that still contains the literal speaker markers.

    Args:
        example: Mapping with string values under "chosen" and "rejected".

    Returns:
        dict with keys:
            "prompt":   history rendered by the tokenizer's chat template,
                        with the generation prompt appended.
            "tokens":   input ids of the rendered prompt.
            "chosen":   final response of the chosen transcript.
            "rejected": final response of the rejected transcript.
    """
    chosen = example["chosen"]
    rejected = example["rejected"]

    # Split once from the right so only the final assistant reply is cut off.
    chosen_parts = chosen.rsplit("Assistant:", 1)
    rejected_parts = rejected.rsplit("Assistant:", 1)
    history = chosen_parts[0] if len(chosen_parts) > 1 else ""
    chosen_response = chosen_parts[-1].strip() if len(chosen_parts) > 1 else chosen
    rejected_response = rejected_parts[-1].strip() if len(rejected_parts) > 1 else rejected

    # re.split with a capturing group yields
    # [preamble, speaker_1, text_1, speaker_2, text_2, ...].
    # Markers only count at the very start or after a blank line, so a
    # "Human:" appearing mid-sentence is not treated as a new turn.
    pieces = re.split(r"(?:^|\n\n)(Human|Assistant):", history)

    messages = []
    preamble = pieces[0].strip()
    if preamble:
        # Text with no recognisable speaker marker is kept as a user message,
        # so nothing from the history is silently dropped.
        messages.append({"role": "user", "content": preamble})
    for speaker, text in zip(pieces[1::2], pieces[2::2]):
        text = text.strip()
        if not text:
            continue  # skip empty turns
        role = "user" if speaker == "Human" else "assistant"
        if messages and messages[-1]["role"] == role:
            # Merge consecutive turns by the same speaker (can happen once an
            # empty turn between them has been skipped).
            messages[-1]["content"] += "\n\n" + text
        else:
            messages.append({"role": role, "content": text})

    # Format prompt using tokenizer's chat template
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return {
        "prompt": formatted_prompt,
        "tokens": tokenizer(formatted_prompt).input_ids,
        "chosen": chosen_response,
        "rejected": rejected_response
    }

# Use 1000 samples for testing: take the first 1000 rows, then format them one example at a time
dataset = dataset.select(range(1000))
dataset = dataset.map(format_hh_rlhf, batched=False)

# Longest tokenized prompt in the subset, plus one token of headroom
max_prompt_length = 1 + max(len(row["tokens"]) for row in dataset)

# Define reward functions
def match_format_exactly(prompts, completions, completion_ids=None, **kwargs):
    """Score completions with a simple length-based pass/fail check.

    Args:
        prompts: Unused.
        completions: List of completion strings.
        completion_ids: Unused.
        **kwargs: Ignored.

    Returns:
        A list with one float per completion: 1.0 if the completion has
        non-whitespace content and more than 10 raw characters, else -1.0.
    """
    reward_for = {True: 1.0, False: -1.0}
    return [
        # Whitespace-only text fails; length is measured on the unstripped text.
        reward_for[bool(text.strip()) and len(text) > 10]
        for text in completions
    ]

def conversational_quality(prompts, completions, completion_ids=None, chosen=None, rejected=None, **kwargs):
    """Reward completions that resemble the chosen answer more than the rejected one.

    Resemblance is the ``difflib.SequenceMatcher`` ratio between the
    completion and each reference answer.

    Args:
        prompts: Unused.
        completions: List of completion strings.
        completion_ids: Unused.
        chosen: List of preferred reference answers, aligned with ``completions``.
        rejected: List of dispreferred reference answers, aligned with ``completions``.
        **kwargs: Ignored.

    Returns:
        One float per completion: 1.5 when the ratio against the chosen answer
        is strictly greater than the ratio against the rejected answer,
        otherwise -0.5.
    """
    def _ratio(text, reference):
        return difflib.SequenceMatcher(None, text, reference).ratio()

    return [
        1.5 if _ratio(text, preferred) > _ratio(text, dispreferred) else -0.5
        for text, preferred, dispreferred in zip(completions, chosen, rejected)
    ]

# Configure GRPO trainer
training_args = GRPOConfig(
    learning_rate=5e-6,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    logging_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,  # Increased for memory efficiency
    num_generations=2,  # Minimum required for GRPO
    # Use the prompt limit measured from the dataset above; it was computed but
    # never passed in, so the trainer fell back to its own default limit.
    max_prompt_length=max_prompt_length,
    max_completion_length=512,  # Increased to reduce clipping
    max_steps=50,  # Reduced for quick training
    save_steps=25,  # Save twice during training
    max_grad_norm=1.0,
    report_to="none",
    output_dir="./outputs",
)

# Initialize and run the trainer.
# Both reward functions are applied to every completion.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        match_format_exactly,
        conversational_quality,
    ],
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Save the full fine-tuned model
model.save_pretrained_merged("./finetuned_llama31_8b", tokenizer, save_method="merged_16bit")