In [13]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import sys
import math
import torch
from tqdm import tqdm
tqdm.pandas()
from dataclasses import dataclass, field
from typing import Optional
import transformers
import datasets
from datasets import load_dataset
from torch.optim import Adam
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    HfArgumentParser,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    default_data_collator,
    set_seed,
)
from peft import LoraConfig, get_peft_model

from trl import (
    AutoModelForCausalLMWithValueHead,
    PPOConfig,
    PPOTrainer,
    create_reference_model,
    set_seed,
)
from trl.core import LengthSampler
import evaluate
import matplotlib.pyplot as plt

In [17]:
# -----------------------------
# 1. Define Script Arguments
# -----------------------------
@dataclass
class ScriptArguments:
    """
    GPT-J + LoRA + PPO for detox (or other) fine-tuning
    """
    model_name: Optional[str] = field(
        default="EleutherAI/gpt-j-6B",
        metadata={"help": "The base model name or path, e.g., EleutherAI/gpt-j-6B"}
    )
    log_with: Optional[str] = field(
        default=None,
        metadata={"help": "Use 'wandb' or None for logging"}
    )
    learning_rate: Optional[float] = field(
        default=1e-5,
        metadata={"help": "Learning rate for PPO/LoRA training"}
    )
    mini_batch_size: Optional[int] = field(
        default=2,
        metadata={"help": "Minibatch size for PPO updates"}
    )
    batch_size: Optional[int] = field(
        default=8,
        metadata={"help": "Batch size for sampling in PPOTrainer"}
    )
    gradient_accumulation_steps: Optional[int] = field(
        default=1,
        metadata={"help": "Number of gradient accumulation steps"}
    )
    ppo_epochs: Optional[int] = field(
        default=3,
        metadata={"help": "Number of PPO training epochs"}
    )
    model_save_path: Optional[str] = field(
        default="./gptj-lora-ppo-detox",
        metadata={"help": "Directory to save the final model"}
    )
    seed: Optional[int] = field(
        default=42,
        metadata={"help": "Random seed for reproducibility"}
    )

# Parse the script arguments
parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses()[0]

# Configure PPO training parameters
ppo_config = PPOConfig(
    model_name=script_args.model_name,
    learning_rate=script_args.learning_rate,
    log_with=script_args.log_with,
    ppo_epochs=script_args.ppo_epochs,
    mini_batch_size=script_args.mini_batch_size,
    batch_size=script_args.batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
)

TypeError: PPOConfig.__init__() got an unexpected keyword argument 'model_name'

In [None]:
# -----------------------------
# 2. Build/Load Dataset
#    Using allenai/real-toxicity-prompts
# -----------------------------
def build_dataset(
    config,
    dataset_name="allenai/real-toxicity-prompts",
    input_min_text_length=5,
    input_max_text_length=10
):
    """
    Load and preprocess the real-toxicity-prompts dataset.
    Filters samples with toxicity > 0.3 and truncates to [5, 10] token lengths.
    You can customize this function to use a different dataset or preprocessing logic.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    # GPT-J likely already has an eos_token; set pad_token to eos_token
    tokenizer.pad_token = tokenizer.eos_token

    # Load the dataset
    ds = load_dataset(dataset_name, split="train")

    # Filter out samples with toxicity <= 0.3 or missing toxicity scores
    def filter_fn(sample):
        toxicity = sample["prompt"]["toxicity"]
        return (toxicity is not None) and (toxicity > 0.3)

    ds = ds.filter(filter_fn, batched=False)

    # Sample input sizes between min and max lengths
    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    # Tokenize and truncate the samples
    def tokenize(sample):
        prompt = sample["prompt"]["text"]
        continuation = sample["continuation"]["text"]
        tokens = tokenizer.encode(prompt + continuation)
        tokens = tokens[: input_size()]  # Truncate
        sample["input_ids"] = tokens
        sample["query"] = tokenizer.decode(tokens)
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")

    # Split the dataset into training and testing; here we take the training portion
    ds = ds.train_test_split(test_size=0.1, shuffle=True)["train"]
    return ds

# Collate function for PPOTrainer
def collator(data):
    return {key: [d[key] for d in data] for key in data[0]}

# We retrieve the dataloader by calling the `build_dataset` function.
min_input_length = 30
max_input_length = 40
dataset = build_dataset(ppo_config, input_min_text_length=min_input_length, input_max_text_length=max_input_length)

In [None]:
# -----------------------------
# 3. Set Random Seed
# -----------------------------
set_seed(script_args.seed)

In [None]:
# -----------------------------
# 4. Load GPT-J Base Model + Apply LoRA
# -----------------------------
# Load the base GPT-J model with float16 precision to save memory
print(">>> Loading GPT-J base model:", script_args.model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    script_args.model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Apply LoRA to the base model
print(">>> Applying LoRA ...")
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # GPT-J's attention projection layer names
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    fan_in_fan_out=False,
)
base_model = get_peft_model(base_model, lora_config)
base_model.print_trainable_parameters()

In [None]:
# -----------------------------
# 5. Convert to Model with Value Head
# -----------------------------
print(">>> Converting to AutoModelForCausalLMWithValueHead ...")
model = AutoModelForCausalLMWithValueHead.from_pretrained(base_model)

In [None]:
# -----------------------------
# 6. Create Reference Model & Optimizer
# -----------------------------
# PPO requires a reference model to compute KL divergence for policy updates
# Here, we do not share any layers (num_shared_layers=0) to keep models independent
ref_model = create_reference_model(model, num_shared_layers=0)

# Initialize the optimizer with parameters that require gradients
optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=ppo_config.learning_rate)

In [None]:
# -----------------------------
# 7. Initialize Tokenizer and PPOTrainer
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(script_args.model_name)
tokenizer.pad_token = tokenizer.eos_token

ppo_trainer = PPOTrainer(
    ppo_config,
    model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
    optimizer=optimizer,
)

In [None]:
# -----------------------------
# 8. Prepare Reward Model: Roberta Hate-Speech
#    (facebook/roberta-hate-speech-dynabench-r4-target)
#    You can replace this with your own reward model, e.g., for toxicity/sentiment
# -----------------------------
toxicity_model_id = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = RobertaTokenizer.from_pretrained(toxicity_model_id)
toxicity_model = RobertaForSequenceClassification.from_pretrained(
    toxicity_model_id,
    torch_dtype=torch.float16
).to(ppo_trainer.accelerator.device)

In [None]:
# -----------------------------
# 9. Define Generation Parameters & Training Loop
# -----------------------------
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

# Sample output lengths between 5 and 15 tokens
output_length_sampler = LengthSampler(5, 15)
model_save_path = script_args.model_save_path

print(">>> Starting PPO training ...")

for step, batch in tqdm(enumerate(ppo_trainer.dataloader), total=len(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    # 1) Generate responses using the policy model
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len

        # ppo_trainer.generate() wraps model.generate()
        response = ppo_trainer.generate(query, **generation_kwargs)
        # Take only the last gen_len tokens as the response
        response_tensors.append(response.squeeze()[-gen_len:])

    # Decode responses to text
    batch["response"] = [tokenizer.decode(r) for r in response_tensors]

    # 2) Compute rewards using the reward model
    texts = batch["response"]
    toxicity_inputs = toxicity_tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt"
    ).to(ppo_trainer.accelerator.device)

    logits = toxicity_model(**toxicity_inputs).logits.float()
    # Assuming logits[:, 0] represents the hate-speech/toxicity score
    # In practice, you might need to apply sigmoid or softmax and map scores appropriately
    toxicity_labels = (logits[:, 0]).tolist()
    rewards = [torch.tensor(score) for score in toxicity_labels]

    # 3) Perform a PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    # 4) Save the model checkpoint periodically
    if step % 100 == 0:
        if ppo_trainer.accelerator.is_main_process:
            print(f">>> Saving model checkpoint at step {step} ...")
            ppo_trainer.save_pretrained(model_save_path)

print(">>> PPO training done. Saving final model ...")
if ppo_trainer.accelerator.is_main_process:
    ppo_trainer.save_pretrained(model_save_path)

print("All finished!")