In [5]:
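# ## Dependencies for Colab (uncomment the lines below when running there)
# NOTE: never commit a real access token in the clone URL; inject it from a secret / environment variable.
# !git clone https://__TOKEN_GIT__:@github.com/DanielSc4/RL-on-LM.git
# %cd RL-on-LM/
# !pip install -r requirements.txt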

## 0. 🗽 Imports

In [31]:
import gc
from pathlib import Path

import torch
from transformers import GenerationConfig

from rewardlm.data.data_utils import download_DIALOCONAN
from rewardlm.ToxicityMeter import ToxicityMeter
from rewardlm.utils.general_utils import device_selector

# from rewardlm.data.data_utils import gen_benchmark_data

#### Choose model

In [38]:
# Single-model selection. Other candidate checkpoints (swap in as needed):
#   'EleutherAI/gpt-neo-125m'
#   'EleutherAI/pythia-2.8b'
#   'MBZUAI/LaMini-GPT-124M'
#   'MBZUAI/LaMini-GPT-774M'
#   'MBZUAI/LaMini-GPT-1.5B'
#   'togethercomputer/RedPajama-INCITE-Instruct-3B-v1'
model_id = 'EleutherAI/pythia-70m'      # prototype model

# Every model evaluated by the generation loop, grouped by family.
model_ids = [
    *(f'MBZUAI/LaMini-GPT-{size}' for size in ('124M', '774M', '1.5B')),
    *(f'togethercomputer/RedPajama-INCITE-{variant}-3B-v1' for variant in ('Instruct', 'Chat')),
    # bloomz sizes: 1.7 billion, 3 billion and 7.1 billion parameters
    *(f'bigscience/bloomz-{size}' for size in ('1b7', '3b', '7b1')),
]

# Classifier used to score the toxicity of the generated text.
reward_model_id = 'facebook/roberta-hate-speech-dynabench-r4-target'

## Dataset Selection

In [9]:
###
# Temporary management of the dataset:
# We decided to focus only on the first 2 turns of the dialogue.
# The prompt will be the toxic text reported in the dataset, and the model
# should produce a non-toxic response;
# eventually, the model will be instructed through RL
# to debunk the initial statement.

# selecting only the first text column

In [39]:
# Load the DIALOCONAN data through the project helper, then build the row
# selector for the entries whose 'turn_id' equals 0 (applied later as
# `dataset[query]`).
dataset = download_DIALOCONAN()
query = dataset['turn_id'].eq(0)

## Generate


In [41]:
# Measure the toxicity of every model in `model_ids` on the same prompts and
# write one CSV of results per model.

load_dtype = 'bf16' # can be ['8-bit', 'bf16',]

# Cap on the number of prompts, just for testing purposes.
# TODO: remove the cap (slice below) to evaluate on every selected prompt.
n_test_prompts = 30

# Prompt template applied to each statement; some models need their own format.
default_prompt = 'Statement: "{prompt}".\nResponse: '
# custom prompts required by papers
prompt_overrides = {
    'togethercomputer/RedPajama-INCITE-Chat-3B-v1': '<human>: "{prompt}"\n<bot>: ',
}

# Create the output folder up front: `to_csv` does not create missing
# directories, and failing here is cheaper than failing after generation.
output_dir = Path('./result analysis/tmp')
output_dir.mkdir(parents = True, exist_ok = True)

toxicity_meter = None
for model_id in model_ids:
    print(f'Testing {model_id}')

    # Drop the previous meter *before* building the next one; otherwise the old
    # object stays referenced until the new one has been constructed.
    # (Presumably the meter holds the loaded model weights — confirm in ToxicityMeter.)
    toxicity_meter = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    toxicity_meter = ToxicityMeter(
        model_id = model_id,
        load_dtype = load_dtype, # can be ['8-bit', 'bf16',]
        toxicity_model_id = reward_model_id,
        device = device_selector(),
    )

    custom_prompt = prompt_overrides.get(model_id, default_prompt)

    # Reference setting noted by the author (the values below differ):
    # samples: 25, top-p: 0.9, temperature = 1, max_len = 20
    # NOTE(review): `do_sample` is not set, so this presumably runs plain beam
    # search and `temperature` / `top_p` have no effect — confirm whether
    # sampling was intended.
    generation_config = GenerationConfig(
        max_new_tokens = 25,
        num_beams = 5,
        early_stopping = True,
        pad_token_id = 0,       # crashes while using batchsize > 1 only on mps device if not set
        temperature = 0.8,
        top_p = .8,
        # diversity_penalty = .1, # should use num_beam_groups > 1
    )

    toxicity_df = toxicity_meter.measure_toxicity(
        text_prompt = dataset[query]['text'].to_list()[:n_test_prompts],
        custom_prompt = custom_prompt,
        generation_config = generation_config,
        batch_size = 16,
    )

    # save csv in tmp folder, named after the short model id and the load dtype
    toxicity_df.to_csv(output_dir / f'measured_tox_instruct_{model_id.split("/")[-1]}_{load_dtype}.csv')

100%|██████████| 4/4 [00:08<00:00,  2.24s/it]
100%|██████████| 1/1 [00:01<00:00,  1.82s/it]
100%|██████████| 4/4 [00:08<00:00,  2.14s/it]
100%|██████████| 1/1 [00:01<00:00,  1.90s/it]


In [4]:
# Save the current `toxicity_df` one directory up, tagging the file with the
# short model name (the text after the last '/') and the load dtype.
# NOTE(review): this cell depends on `toxicity_df`, `model_id` and `load_dtype`
# being left over from the generation loop, so it only reflects the last model run.
model_short_name = model_id.rsplit("/", 1)[-1]
csv_path = '../measured_toxicity_{}_{}_(2)_normal_prompt.csv'.format(model_short_name, load_dtype)
toxicity_df.to_csv(csv_path)