## ⬇️ Imports

In [82]:
import numpy as np
from tqdm import tqdm

from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    RobertaForSequenceClassification,   # reward model
    RobertaTokenizer,                   # reward model tokenizer
)
from datasets import load_dataset

import torch
from torch.optim import Adam
from trl import (
    AutoModelForCausalLMWithValueHead, 
    PPOConfig, 
    PPOTrainer,
    create_reference_model,
    set_seed,
)
from trl.core import LengthSampler

## 🤖 Model

### Configs

In [59]:
# model_name = 'ybelkada/gpt-j-6b-sharded-bf16'
model_name = 'EleutherAI/gpt-neo-125m'      # smallest one
log_with = None                     # or 'wandb'
learning_rate = (1.47e-5) * 2       # default one (?)
mini_batch_size = 4                 # PPO minibatch size
batch_size = 14                     # batchsize (for the model?)
ppo_epochs = 100
gradient_accumulation_steps = 1     # [TODO]: check what is this!
model_save_path = './checkpoints/'


In [60]:
config = PPOConfig(
    model_name = model_name,
    learning_rate = learning_rate,
    log_with = log_with,
    ppo_epochs = ppo_epochs,
    mini_batch_size = mini_batch_size,
    batch_size = batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
)

### Dataset
Method that loads the dataset, customize it to train on different data!

In [27]:
def build_dataset(
    config, 
    dataset_name="allenai/real-toxicity-prompts", 
    input_min_text_length=10, 
    input_max_text_length=15,
):
    """
    Build dataset for training. This builds the dataset from `load_dataset` and returns 
    a Dataloader class (from torch.utils.data)

    Args:
        dataset_name (`str`):
            The name of the dataset to be loaded. (from load_dataset)
    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """
    # from the PPO trainer
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    ds = load_dataset(dataset_name, split = "train")

    # function to filter only prompts that lead to a toxicity score > 0.3
    # in order to increase the chances for the model to generate toxicity prompts
    def filter_fn(sample):
        toxicity = sample["prompt"]["toxicity"]
        return toxicity is not None and toxicity > 0.3

    ds = ds.filter(filter_fn, batched=False)
    
    # only prompts in between input_min_text_length : input_max_text_length
    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        prompt = sample["prompt"]["text"]
        continuation = sample["continuation"]["text"]

        sample["input_ids"] = tokenizer.encode(prompt + continuation)[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # tokenize
    ds = ds.map(tokenize, batched = False)
    # set as torch.utils.data.DataLoader
    ds.set_format(type="torch")

    ds = ds.train_test_split(test_size = 0.2, shuffle = False)["train"]

    return ds

# Getting dataset
min_input_length = 30
max_input_length = 40
dataset = build_dataset(
    config, 
    input_min_text_length = min_input_length, 
    input_max_text_length = max_input_length
)

Found cached dataset json (/Users/daniel/.cache/huggingface/datasets/allenai___json/allenai--real-toxicity-prompts-eb8779dd2693db47/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
Loading cached processed dataset at /Users/daniel/.cache/huggingface/datasets/allenai___json/allenai--real-toxicity-prompts-eb8779dd2693db47/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-e598224e6b946c4a.arrow


Map:   0%|          | 0/35109 [00:00<?, ? examples/s]

In [28]:
# other stuff

# set seed before initializing value head for deterministic eval
set_seed(config.seed)

# dunno
def collator(data):
    return dict(
        (key, [d[key] for d in data]) for key in data[0]
    )


### Reference model
We load the model in bfloat16 to save memory

In [79]:
# model loaded in float16 to reduce memory usage
model = AutoModelForCausalLM.from_pretrained(config.model_name, torch_dtype = torch.bfloat16)
model = AutoModelForCausalLMWithValueHead.from_pretrained(model)

# GPT-2 / GPT-J tokenizer has a pad token, but it is not eos_token by default. We need to set it to eos_token.
# only for this model.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
## so ONLY FOR GPT (?)
tokenizer.pad_token = tokenizer.eos_token

# reference model sharing 20 layers
# reference models are frozen copies of the model that is trained (in eval mode) [from doc]
ref_model = create_reference_model(
    model, 
    num_shared_layers = 20,
    pattern = 'transformer.h.{layer}'
)

# We make sure to use `Adam` optimizer on the model parameters that require gradients.
optimizer = Adam(
    filter(lambda p: p.requires_grad, model.parameters()), 
    lr = config.learning_rate,
)

ValueError: optimizer got an empty parameter list

#### debug session

In [76]:
model.pretrained_model.transformer.h[20]

ModuleList(
  (0-11): 12 x GPTNeoBlock(
    (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (attn): GPTNeoAttention(
      (attention): GPTNeoSelfAttention(
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=False)
        (q_proj): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
    )
    (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (mlp): GPTNeoMLP(
      (c_fc): Linear(in_features=768, out_features=3072, bias=True)
      (c_proj): Linear(in_features=3072, out_features=768, bias=True)
      (act): NewGELUActivation()
      (dropout): Dropout(p=0.0, inplace=False)
    )
  )
)

In [69]:
create_reference_model(model.pretrained_model.transformer.h, num_shared_layers = 20)

ValueError: Layer pattern could not be matched.

In [71]:
pattern = 'transformer.h.{layer}'
pattern.format(layer = 20)

'transformer.h.20'

### PPO trainer

In [80]:
ppo_trainer = PPOTrainer(
    config,
    model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
    optimizer=optimizer,
)

NameError: name 'optimizer' is not defined

### Reward model

In [84]:
toxicity_model_id = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = RobertaTokenizer.from_pretrained(toxicity_model_id)
# Also load the toxicity model in fp16 to save memory
toxicity_model = RobertaForSequenceClassification.from_pretrained(
    toxicity_model_id, 
    torch_dtype=torch.float16
).to(ppo_trainer.accelerator.device)

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

NameError: name 'ppo_trainer' is not defined

In [86]:
# We then define the arguments to pass to the `generate` function. These arguments
# are passed to the `generate` function of the PPOTrainer, which is a wrapper around
# the `generate` function of the trained model.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}
output_min_length = 20
output_max_length = 30
output_length_sampler = LengthSampler(output_min_length, output_max_length)


### Train

In [None]:
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    # Get response from the policy model
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        response_tensors.append(response.squeeze()[-gen_len:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    # Compute sentiment score # noqa
    texts = batch["response"]
    toxicity_inputs = toxicity_tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(
        ppo_trainer.accelerator.device
    )
    logits = toxicity_model(**toxicity_inputs).logits.float()
    toxicity_labels = (logits[:, 0]).tolist()

    rewards = [torch.tensor(output) for output in toxicity_labels]

    # Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    # Save model every 100 epochs
    if epoch % 100 == 0:
        if ppo_trainer.accelerator.is_main_process:
            ppo_trainer.save_pretrained(model_save_path)