#### colab requirements

In [1]:
## ONLY on colab
# !pip install transformers
# !pip install datasets
# !pip install trl

## ⬇️ Imports

In [2]:
import numpy as np
from tqdm import tqdm

from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    RobertaForSequenceClassification,   # reward model
    RobertaTokenizer,                   # reward model tokenizer
)
from datasets import load_dataset

import torch
from torch.optim import Adam
from trl import (
    AutoModelForCausalLMWithValueHead, 
    PPOConfig, 
    PPOTrainer,
    create_reference_model,
    set_seed,
)
from trl.core import LengthSampler


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /Users/daniel/Documents/Work/RewardLM/.venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
CUDA SETUP: Loading binary /Users/daniel/Documents/Work/RewardLM/.venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so...
dlopen(/Users/daniel/Documents/Work/RewardLM/.venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so, 0x0006): tried: '/Users/daniel/Documents/Work/RewardLM/.venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so' (not a mach-o file), '/System/Volumes/Preboot/Cryptexes/OS/Users/daniel/Documents/Work/RewardLM/.venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so' (no such file), '/Users/daniel/Documents/Work/RewardLM/.venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so' (not a mach-o f

  warn("The installed version of bitsandbytes was compiled without GPU support. "


## 🤖 Model

### Configs

In [3]:
# Hyperparameters for the PPO run. Everything except `model_save_path` is forwarded to `PPOConfig`.
# model_name = 'ybelkada/gpt-j-6b-sharded-bf16'       # has 28 layers
model_name = 'MBZUAI/LaMini-GPT-124M'              # smallest one, having 12 layers
log_with = None                     # experiment tracker name (e.g. 'wandb'); None disables tracking
learning_rate = (1.47e-5) * 2       # twice 1.47e-5 -- NOTE(review): confirm 1.47e-5 is the intended base rate
mini_batch_size = 4                 # PPO minibatch size
batch_size = 12                     # prompts per ppo_trainer.dataloader batch (one reward is computed per prompt)
ppo_epochs = 100                    # NOTE(review): presumably optimisation passes per batch; 100 looks high -- confirm
gradient_accumulation_steps = 1     # TODO: presumably minibatches accumulated before each optimizer step -- confirm in the TRL docs
model_save_path = './checkpoints/'  # directory given to ppo_trainer.save_pretrained during training


In [4]:
# Bundle the hyperparameters defined in the previous cell into the TRL PPO configuration.
config = PPOConfig(
    model_name = model_name,
    learning_rate = learning_rate,
    log_with = log_with,
    ppo_epochs = ppo_epochs,
    mini_batch_size = mini_batch_size,
    batch_size = batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    # NOTE(review): presumably forwarded to the Accelerator to force CPU execution -- confirm
    accelerator_kwargs = {'cpu': True}
)

### Dataset
The function below builds the training dataset; customize it to train on different data.

In [6]:
# def build_dataset(
#     config, 
#     dataset_name="allenai/real-toxicity-prompts", 
#     input_min_text_length = 10, 
#     input_max_text_length = 15,
# ):
#     """
#     Build dataset for training. This builds the dataset from `load_dataset` and returns 
#     a Dataloader class (from torch.utils.data)

#     Args:
#         dataset_name (`str`):
#             The name of the dataset to be loaded. (from load_dataset)
#     Returns:
#         dataloader (`torch.utils.data.DataLoader`):
#             The dataloader for the dataset.
#     """
#     # from the PPO trainer
#     tokenizer = AutoTokenizer.from_pretrained(config.model_name)
#     tokenizer.pad_token = tokenizer.eos_token

#     ds = load_dataset(dataset_name, split = "train")

#     # function to filter only prompts that lead to a toxicity score > 0.3
#     # in order to increase the chances for the model to generate toxicity prompts
#     def filter_fn(sample):
#         toxicity = sample["prompt"]["toxicity"]
#         return toxicity is not None and toxicity > 0.3

#     ds = ds.filter(filter_fn, batched=False)
    
#     # only prompts in between input_min_text_length : input_max_text_length
#     input_size = LengthSampler(input_min_text_length, input_max_text_length)

#     def tokenize(sample):
#         prompt = sample["prompt"]["text"]
#         continuation = sample["continuation"]["text"]

#         sample["input_ids"] = tokenizer.encode(prompt + continuation)[: input_size()]
#         sample["query"] = tokenizer.decode(sample["input_ids"])
#         return sample

#     # tokenize
#     ds = ds.map(tokenize, batched = False)
#     # set as torch.utils.data.DataLoader
#     ds.set_format(type="torch")

#     ds = ds.train_test_split(test_size = 0.2, shuffle = False)["train"]

#     return ds

# # Getting dataset
# min_input_length = 30
# max_input_length = 40
# dataset = build_dataset(
#     config, 
#     input_min_text_length = min_input_length, 
#     input_max_text_length = max_input_length
# )
import datasets
def generate_dataset(
        text: list[str], 
        max_len: int = 256, 
        custom_prompt: str = '{prompt}'
    ) -> torch.utils.data.Dataset | datasets.Dataset:
    """Build the tokenized prompt dataset used for training.

    Each prompt is formatted with `custom_prompt`, tokenized with the tokenizer
    of `config.model_name` (the module-level PPO config, which must already be
    defined when this function is called) and truncated to `max_len` tokens.

    Args:
        text (list[str]): List of prompts (str)
        max_len (int, optional): maximum number of tokens kept per prompt;
            longer prompts are truncated. Defaults to 256.
        custom_prompt (str, optional): format string containing '{prompt}' to
            modify the original prompt. Defaults to '{prompt}'.

    Raises:
        ValueError: if `text` is empty (no train split can be built).

    Returns:
        datasets.Dataset: the first 85% of the samples (no shuffling), in torch
        format, with columns 'text', 'input_ids' and 'query'.
    """
    if len(text) == 0:
        raise ValueError('`text` must contain at least one prompt')

    formatted_prompts = [custom_prompt.format(prompt = s) for s in text]

    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    # reuse EOS as the padding token
    tokenizer.pad_token = tokenizer.eos_token
    ds = datasets.Dataset.from_dict({'text': formatted_prompts})

    def tokenize(sample):
        # Honour `max_len`: previously the argument was accepted but never applied,
        # so arbitrarily long prompts reached the model untruncated.
        sample['input_ids'] = tokenizer.encode(
            sample['text'],
            truncation = True,
            max_length = max_len,
        )
        # `query` is the text decoded back from the (possibly truncated) token ids
        sample['query'] = tokenizer.decode(sample['input_ids'])
        return sample

    ds = ds.map(tokenize, batched = False)
    ds.set_format(type = 'torch')
    # keep the leading 85% of the rows as the training split (deterministic, no shuffle)
    return ds.train_test_split(test_size = .15, shuffle = False)['train']


from rewardlm.data.data_utils import get_DIALOCONAN_prepro

# Load the prompts and turn them into the tokenized training dataset.
# NOTE(review): presumably these are the preprocessed DIALOCONAN dialogues with the last
# assistant response removed -- confirm in rewardlm.data.data_utils.
data = get_DIALOCONAN_prepro(delete_last_assistant_response = True)
dataset = generate_dataset(text = data)

Map:   0%|          | 0/8312 [00:00<?, ? examples/s]

### Reference model
We load the model in bfloat16 to save memory

In [7]:
# Base model weights are loaded in bfloat16 to reduce memory usage.
model_stock = AutoModelForCausalLM.from_pretrained(
    config.model_name, 
    torch_dtype = torch.bfloat16
)
# Wrap the base model in the TRL value-head model that PPO optimizes.
model = AutoModelForCausalLMWithValueHead.from_pretrained(model_stock)

# Reuse the EOS token as the padding token.
# NOTE(review): the original note scoped this to GPT-2 / GPT-J style tokenizers --
# confirm whether it is needed (or harmful) for other model families.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token

# Reference model sharing 5 layers with the trained model (the configured model has 12 layers,
# per the config cell). NOTE(review): the `pattern` must match the loaded architecture's layer names.
# reference models are frozen copies of the model that is trained (in eval mode) [from doc]
ref_model = create_reference_model(
    model, 
    num_shared_layers = 5,
    pattern = 'transformer.h.{layer}'
)

# We make sure to use `Adam` optimizer on the model parameters that require gradients.
optimizer = Adam(
    filter(lambda p: p.requires_grad, model.parameters()), 
    lr = config.learning_rate
)

![shared layers](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/trl-shared-layers.png)

### PPO trainer

In [9]:
def collator(data):
    """Turn a list of per-sample dicts into one dict of per-key lists.

    The keys (and their order) are taken from the first sample; every sample
    must provide all of those keys.
    """
    batch = {}
    for key in data[0]:
        batch[key] = [sample[key] for sample in data]
    return batch

# Assemble the PPO trainer from the config, policy / reference models, tokenizer,
# dataset, collator and optimizer created in the cells above.
ppo_trainer = PPOTrainer(
    config,
    model,                          # model to be optimized w/ value head
    ref_model = ref_model,          # reference model used for KL penalty
    tokenizer = tokenizer,          # used for encoding
    dataset = dataset,              # batches are later read from ppo_trainer.dataloader
    data_collator = collator,       # groups samples into a dict of lists
    optimizer = optimizer,
    # lr_scheduler = ,              # <optional> lr scheduler
)

In [10]:
# Display the training split summary (feature names and row count).
dataset

Dataset({
    features: ['text', 'input_ids', 'query'],
    num_rows: 7065
})

### Reward model

In [11]:
# Reward model: a RoBERTa hate-speech classifier and its tokenizer.
toxicity_model_id = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = RobertaTokenizer.from_pretrained(toxicity_model_id)
# The classifier is loaded in float32 (full precision) and moved to the trainer's device.
toxicity_model = RobertaForSequenceClassification.from_pretrained(
    toxicity_model_id, 
    torch_dtype=torch.float32
).to(ppo_trainer.accelerator.device)

In [12]:
# Arguments forwarded to `ppo_trainer.generate` for every query in the training loop
# (`max_new_tokens` is added there, per query).
# NOTE(review): top_k = 0.0 and top_p = 1.0 presumably disable top-k / nucleus filtering,
# i.e. plain sampling from the full distribution -- confirm against the `generate` docs.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,     # pad with EOS, matching tokenizer.pad_token
}
# Bounds for the number of new tokens generated per query; the sampler is called
# once per query to draw a length from them.
output_min_length = 20
output_max_length = 30
output_length_sampler = LengthSampler(output_min_length, output_max_length)

### Train

In [13]:
# One PPO optimisation step per dataloader batch.
# NOTE: `epoch` counts batches (PPO steps), not passes over the dataset.
# tqdm wraps the dataloader itself rather than the `enumerate` generator, so it can
# read the number of batches and display a real progress bar instead of "0it".
for epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    # Get response from the policy model
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        # Keep the trailing `gen_len` tokens as the response.
        # NOTE(review): if generation stops before `gen_len` new tokens, this slice
        # also includes prompt tokens -- confirm this is acceptable.
        response_tensors.append(response.squeeze()[-gen_len:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]
    print('Response obtained')

    # Compute toxicity score of each generated response
    texts = batch["response"]
    print('Asking toxicity')
    toxicity_inputs = toxicity_tokenizer(
        texts, 
        padding=True, 
        truncation=True, 
        return_tensors="pt"
    ).to(ppo_trainer.accelerator.device)
    # The reward model is only used for inference: skip autograd bookkeeping.
    with torch.no_grad():
        logits = toxicity_model(**toxicity_inputs).logits.float()
    # Reward = logit of class index 0.
    # NOTE(review): presumably the "not hate" class, so higher means less toxic --
    # confirm against the model card.
    toxicity_labels = (logits[:, 0]).tolist()
    print('Got toxicity')

    rewards = [torch.tensor(output) for output in toxicity_labels]
    print('Reward computed, running PPO')

    # Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)
    print('PPO updated')

    # Save model every 100 batches (this includes the very first one, since 0 % 100 == 0)
    if epoch % 100 == 0:
        if ppo_trainer.accelerator.is_main_process:
            ppo_trainer.save_pretrained(model_save_path)

0it [00:25, ?it/s]


KeyboardInterrupt: 