In [3]:
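# ## Dependencies for Colab (uncomment the lines below to run there).
# NOTE: never commit a real GitHub or Hugging Face token in this cell.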
# !git clone https://__TOKEN_GIT__:@github.com/DanielSc4/RewardLM.git
# %cd RewardLM/
# !pip install -r requirements.txt
# from huggingface_hub import login
# login(token = '')  # https://huggingface.co/settings/tokens

In [1]:
import os

# Runtime switches picked up by third-party libraries through the environment.
os.environ.update({
    'BITSANDBYTES_NOWELCOME': '1',      # disable the bitsandbytes welcome message
    'TOKENIZERS_PARALLELISM': 'true',   # explicitly opt in to tokenizers parallelism
})

# 🥞 Reinforcement Learning with Automatic Feedback (RLAF)

The generation config used below is adapted from the [Open-Assistant PPO config](https://github.com/LAION-AI/Open-Assistant/blob/main/model/model_training/configs/ppo_config.yaml), which seems to work for this setup.

In [48]:
import torch
from rewardlm.core.RL.RLModel import RLModel
from rewardlm.data.data_utils import get_DIALOCONAN_prepro
from transformers import GenerationConfig

# Generator to be tuned with PPO; the reward signal comes from the
# hate-speech classifier passed as `reward_model_id`.
model_id = 'MBZUAI/LaMini-GPT-124M'

# Query CUDA once; everything device-related below derives from this flag.
use_cuda = torch.cuda.is_available()

rlmanager = RLModel(
    model_id = model_id,
    reward_model_id = 'facebook/roberta-hate-speech-dynabench-r4-target',
    optimized = True,   # use 8-bit PEFT
    bs = 4,
    mini_bs = 4,
    # force the use of CPU on Apple Silicon devices (mps not supported):
    accelerator_kwargs = {
        'cpu': not use_cuda,
    },
    generation_config = GenerationConfig(
        max_new_tokens = 512,
        min_new_tokens = 5,
        # num_beams = 5,
        # early_stopping = "never",
        # crashes while using batchsize > 1 only on mps device if not set.
        # NOTE(review): id 0 is presumably a regular vocabulary token for this
        # GPT-2 style tokenizer -- confirm it is safe to use as padding.
        pad_token_id = 0,
        # pure nucleus sampling: temperature left neutral, top-k disabled
        temperature = 1,
        top_p = .7,
        top_k = 0,
        do_sample = True,
        # diversity_penalty = .1, # should use num_beam_groups > 1
    ),
)

# Load the preprocessed dialogues and keep only the first 8 items (small run).
data = get_DIALOCONAN_prepro(delete_last_assistant_response = True)
data = data[:8]
dataset = rlmanager.generate_dataset(text = data)

stats = rlmanager.train_PPO(dataset = dataset)

# save trainer (model, tokenizer & config) to the hub
# `[-1]` keeps only the model name (drops the organisation prefix), matching
# how the results file name is derived later in this notebook.
repo_id = f'DanielSc4/{model_id.split("/")[-1]}-RL-LoRA-test0'

rlmanager.push_generator_to_hub(repo_id = repo_id)

trainable params: 589824 || all params 125030400 || trainable(%): 0.47
None


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

loader len: 1


100%|██████████| 1/1 [00:00<00:00,  8.81it/s]
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
1it [00:08,  8.84s/it]


https://huggingface.co/DanielSc4/LaMini-GPT-124M-RL-LoRA-test0


# 👨🏼‍🏫 Model fine-tune

In [4]:
import torch
from rewardlm.data.data_utils import get_DIALOCONAN_prepro
from rewardlm.core.GenerativeModel import GenerativeModel
from rewardlm.data.CustomDatasets import PromptDataset_CLM


# select model
# model_id = 'facebook/opt-350m'
model_id = 'MBZUAI/LaMini-GPT-124M'

# Query CUDA once: 8-bit loading and the optimized training path are only
# requested when a CUDA device is available, otherwise fall back to fp32 on CPU.
use_cuda = torch.cuda.is_available()

generator_manager = GenerativeModel(
    model_id,
    load_dtype = '8-bit' if use_cuda else 'fp32',
    # force the use of CPU on Apple Silicon devices (mps not supported):
    accelerator_kwargs = {
        'cpu': not use_cuda,
    },
)

# download dataset
data = get_DIALOCONAN_prepro()

# wrap the raw text in a torch dataset using the generator's own tokenizer
dataset = PromptDataset_CLM(
    tokenizer = generator_manager.tokenizer,
    text = data,
)

generator_manager.fine_tune(
    torch_dataset = dataset,
    optimized = use_cuda,
)

trainable params: 0 || all params 124440576 || trainable(%): 0.00


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdanielsc4[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/200 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 6.1768, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}
{'loss': 6.3035, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}
{'loss': 6.53, 'learning_rate': 6e-06, 'epoch': 0.01}
{'loss': 6.5738, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.01}
{'loss': 6.0773, 'learning_rate': 1e-05, 'epoch': 0.01}
{'loss': 6.4389, 'learning_rate': 1.2e-05, 'epoch': 0.01}
{'loss': 6.2466, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.01}
{'loss': 6.4019, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.02}
{'loss': 6.4409, 'learning_rate': 1.8e-05, 'epoch': 0.02}
{'loss': 6.3608, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 6.1762, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.02}
{'loss': 6.5259, 'learning_rate': 2.4e-05, 'epoch': 0.02}
{'loss': 6.4536, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.03}
{'loss': 6.2756, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.03}
{'loss': 6.321, 'learning_rate': 3e-05, 'epoch': 0.03}
{'loss': 5.9043, 'learnin

KeyboardInterrupt: 

# ⚖️ ToxicityMeter

In [None]:
from transformers import GenerationConfig
from rewardlm.ToxicityMeter import ToxicityMeter
from rewardlm.utils.general_utils import device_selector

# Generator under evaluation and the model passed to ToxicityMeter as
# `toxicity_model_id`.
model_id = 'bigscience/bloomz-3b'
reward_model_id = 'facebook/roberta-hate-speech-dynabench-r4-target'

# Run-level settings (also reused by the following cells).
batchsize = 16
load_dtype = 'bf16' # can be ['8-bit', 'bf16', 'fp32']

# NOTE(review): `temperature` / `top_p` are sampling parameters but `do_sample`
# is not set here, so generation presumably runs as plain beam search and these
# two values may have no effect -- confirm the intended decoding strategy.
generation_config = GenerationConfig(
    max_new_tokens=25,
    num_beams=5,
    early_stopping=True,
    # crashes while using batchsize > 1 on mps device if not set:
    pad_token_id=0,
    temperature=0.8,
    top_p=0.8,
    # diversity_penalty = .1, # should use num_beam_groups > 1
)

toxicity_meter = ToxicityMeter(
    model_id=model_id,
    load_dtype=load_dtype,
    toxicity_model_id=reward_model_id,
    device=device_selector(),
    generation_config=generation_config,
)

In [None]:
from rewardlm.data.data_utils import get_real_toxicity_prompts

# Template wrapped around every prompt. RedPajama gets the chat format
# required by its original paper; every other model uses the generic
# statement/response template.
custom_prompt = (
    '<human>: "{prompt}"\n<bot>: '
    if model_id == 'togethercomputer/RedPajama-INCITE-Chat-3B-v1'
    else 'Statement: "{prompt}".\nResponse: '
)

# Plain list of prompt strings taken from the 'text' column.
prompts = get_real_toxicity_prompts()['text'].to_list()

toxicity_df = toxicity_meter.measure_toxicity(
    text_prompt=prompts,
    custom_prompt=custom_prompt,
    generation_config=generation_config,
    batch_size=batchsize,
)

In [None]:
import os

# save csv in tmp folder
fld = './result analysis/tmp'

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(fld, exist_ok = True)

# File name encodes the model name (organisation prefix dropped) and the
# dtype the model was loaded with.
toxicity_df.to_csv(
    os.path.join(
        fld,
        f'measured_tox_instruct_{model_id.split("/")[-1]}_{load_dtype}.csv',
    )
)