In [3]:
# ## Dependencies for Colab (uncomment the lines below when running there)
# !git clone https://__TOKEN_GIT__:@github.com/DanielSc4/RewardLM.git
# %cd RewardLM/
# !pip install -r requirements.txt
# from huggingface_hub import login
# login(token = '')  # https://huggingface.co/settings/tokens

In [8]:
# Suppress the bitsandbytes welcome message.
import os

os.environ.update({'BITSANDBYTES_NOWELCOME': '1'})
# Optional toggles, currently disabled:
# os.environ['TOKENIZERS_PARALLELISM'] = 'true'
# os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

# 🥞 Reinforcement Learning with Automatic Feedback (RLAF)

The generation config from [Open-Assistant's PPO configuration](https://github.com/LAION-AI/Open-Assistant/blob/main/model/model_training/configs/ppo_config.yaml) seems to work.

In [None]:
# Quick check: load the tokenizer published under the 'AlekseyKorshuk/vicuna-7b' id.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path = 'AlekseyKorshuk/vicuna-7b',
)

In [6]:
import torch
from rewardlm.core.RL.RLModel import RLModel
from rewardlm.data.data_utils import get_DIALOCONAN_prepro
from transformers import GenerationConfig

# Generator to optimise with PPO; the alternatives are kept for quick switching.
# model_id = 'MBZUAI/LaMini-GPT-124M'
model_id = 'MBZUAI/LaMini-GPT-774M'
# model_id = 'AlekseyKorshuk/vicuna-7b'
# model_id = 'togethercomputer/RedPajama-INCITE-Chat-3B-v1'

rlmanager = RLModel(
    model_id = model_id,
    reward_model_id = 'facebook/roberta-hate-speech-dynabench-r4-target',
    optimized = True,   # use LoRA
    bs = 128,       # batch size used for each optimization step
    mini_bs = 4,    # mini batch size (<= batch size) for each model forward pass
    # force the use of CPU when CUDA is unavailable
    # (e.g. Apple Silicon devices: mps not supported):
    accelerator_kwargs = {
        'cpu': not torch.cuda.is_available(),
    },
    generation_config = GenerationConfig(
        max_new_tokens = 512,
        min_new_tokens = 5,
        # num_beams = 5,
        # early_stopping = "never",
        pad_token_id = 0,       # crashes while using batchsize > 1 only on mps device if not set
        temperature = 1,
        top_p = .7,
        top_k = 0,
        do_sample = True
        # diversity_penalty = .1, # should use num_beam_groups > 1
    )
)


# Build the PPO dataset from the preprocessed DIALOCONAN data.
data = get_DIALOCONAN_prepro(delete_last_assistant_response = True)
dataset = rlmanager.generate_dataset(text = data)

stats = rlmanager.train_PPO(dataset = dataset)

# save trainer (model, tokenizer & config) to the hub
# Take the LAST path component so ids without a namespace (e.g. 'gpt2') do not
# raise IndexError; for 'org/name' ids the result is unchanged.
repo_id = 'DanielSc4/' + model_id.split('/')[-1] + '-RL-LoRA-test0'

rlmanager.push_generator_to_hub(repo_id = repo_id)

bin /Users/daniel/Documents/Work/RewardLM/.venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so


  warn("The installed version of bitsandbytes was compiled without GPU support. "


Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]



trainable params: 589824 || all params 125030400 || trainable(%): 0.47


Map:   0%|          | 0/8312 [00:00<?, ? examples/s]

loader len: 27


0it [00:12, ?it/s]


KeyboardInterrupt: 

In [7]:
# Show which device the PPO trainer reports as its current one.
ppo_device = rlmanager.ppo_trainer.current_device
print(ppo_device)

cpu


# 👨🏼‍🏫 Model fine-tune

In [9]:
import torch
from rewardlm.data.data_utils import get_DIALOCONAN_prepro
from rewardlm.core.GenerativeModel import GenerativeModel
from rewardlm.data.CustomDatasets import PromptDataset_CLM

# select model
model_id = 'facebook/opt-350m'
# # model_id = 'MBZUAI/LaMini-GPT-124M'
# model_id = 'MBZUAI/LaMini-GPT-774M'

generator_manager = GenerativeModel(
    model_id,
    # 8-bit loading is only requested when CUDA is available; otherwise full precision
    load_dtype = '8-bit' if torch.cuda.is_available() else 'fp32',
    # force the use of CPU when CUDA is unavailable
    # (e.g. Apple Silicon devices: mps not supported):
    accelerator_kwargs = {
        'cpu': not torch.cuda.is_available(),
    },
)

# download dataset
data = get_DIALOCONAN_prepro()
dataset = PromptDataset_CLM(
    tokenizer = generator_manager.tokenizer,
    text = data,
)

generator_manager.fine_tune(
    torch_dataset = dataset,
    # always on; a CUDA-only variant (`torch.cuda.is_available()`) was considered
    optimized = True,
)

# save model to the hub
# Take the LAST path component so ids without a namespace (e.g. 'gpt2') do not
# raise IndexError; for 'org/name' ids the result is unchanged.
repo_id = 'DanielSc4/' + model_id.split('/')[-1] + '-FT-LoRA-test0'
generator_manager.push_to_hub(repo_id = repo_id)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


trainable params: 1572864 || all params 332769280 || trainable(%): 0.47


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

# ⚖️ ToxicityMeter

In [None]:
from transformers import GenerationConfig
from rewardlm.ToxicityMeter import ToxicityMeter
from rewardlm.utils.general_utils import device_selector

# Generator under evaluation and the model passed to the meter as `toxicity_model_id`.
model_id = 'bigscience/bloomz-3b'
reward_model_id = 'facebook/roberta-hate-speech-dynabench-r4-target'

batchsize = 16
load_dtype = 'bf16'     # can be ['8-bit', 'bf16', 'fp32']

# NOTE(review): `do_sample` is not set here; in transformers, `temperature` and
# `top_p` normally only apply when sampling -- confirm pure beam search is intended.
generation_config = GenerationConfig(
    max_new_tokens=25,
    num_beams=5,
    early_stopping=True,
    pad_token_id=0,     # crashes while using batchsize > 1 on mps device if not set
    temperature=0.8,
    top_p=.8,
    # diversity_penalty=.1,  # should use num_beam_groups > 1
)

toxicity_meter = ToxicityMeter(
    model_id=model_id,
    load_dtype=load_dtype,
    toxicity_model_id=reward_model_id,
    device=device_selector(),
    generation_config=generation_config,
)

In [None]:
from rewardlm.data.data_utils import get_real_toxicity_prompts

# Prompt template wrapped around every input text. RedPajama gets its own
# <human>/<bot> format (required by its original paper); every other model
# uses the generic Statement/Response template.
custom_prompt = (
    '<human>: "{prompt}"\n<bot>: '
    if model_id == 'togethercomputer/RedPajama-INCITE-Chat-3B-v1'
    else 'Statement: "{prompt}".\nResponse: '
)

# Texts from the 'text' column of the RealToxicityPrompts loader.
prompts = get_real_toxicity_prompts()['text'].to_list()

toxicity_df = toxicity_meter.measure_toxicity(
    text_prompt=prompts,
    custom_prompt=custom_prompt,
    generation_config=generation_config,
    batch_size=batchsize,
)

In [None]:
# Save the toxicity measurements as a CSV file in the tmp results folder.
# The file name encodes the model name (last component of `model_id`) and the
# load dtype, so runs with different settings do not overwrite each other.
import os

fld = './result analysis/tmp'
# `to_csv` does not create missing parent directories: make sure the folder
# exists so the results of a long measurement run are not lost on save.
os.makedirs(fld, exist_ok = True)
toxicity_df.to_csv(
    fld + f'/measured_tox_instruct_{model_id.split("/")[-1]}_{load_dtype}.csv'
)