### Overview
In this notebook, we evaluate the model's performance on the BoolQ, PIQA and OpenBookQA benchmarks.

In [1]:
# Load vLLM for inference
from datasets import load_from_disk, Dataset, load_dataset
import random
import pandas as pd
import argparse
import warnings
warnings.filterwarnings("ignore")
from vllm import LLM, SamplingParams
import torch
seed=42

In [2]:
# Local checkpoint directory of the merged model; the same path supplies the tokenizer.
# NOTE(review): absolute cluster path -- consider moving it to a configurable constant.
model_name = "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/self_rewarding/LLM_Alignment/safety_llama_paper/models/safety_llama_replicate/merged_model"
model = LLM(
    model=model_name,
    tokenizer=model_name,
    dtype=torch.float16,
    seed=seed,
    tensor_parallel_size=torch.cuda.device_count(),  # one shard per visible GPU
    gpu_memory_utilization=0.9,
    enforce_eager=True,
    max_model_len=1024,  # 512 is small for the BoolQ dataset, so changing it to 1024
)

INFO 05-03 21:28:14 llm_engine.py:87] Initializing an LLM engine with config: model='/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/self_rewarding/LLM_Alignment/safety_llama_paper/models/safety_llama_replicate/merged_model', tokenizer='/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/self_rewarding/LLM_Alignment/safety_llama_paper/models/safety_llama_replicate/merged_model', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, seed=42)


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


INFO 05-03 21:28:31 llm_engine.py:357] # GPU blocks: 137, # CPU blocks: 512


In [3]:

# helper functions
def get_prompt(example, data='boolq'):
    """Build the instruction-style inference prompt for one benchmark row.

    Args:
        example: Mapping-like row (e.g. dict or pandas Series). Only the keys
            required by the selected template are read: 'passage' and
            'question' for 'boolq'; 'question_stem' and 'choices' for 'obqa';
            'goal' and 'choices' for 'piqa'.
        data: Template selector, one of 'boolq', 'obqa' or 'piqa'.

    Returns:
        The formatted prompt string. For any other `data` value a warning line
        is printed and the empty string is returned.
    """
    # NOTE(review): the 'mulitple-choice' spelling below is part of the prompt
    # text sent to the model and is kept verbatim.
    if data == 'boolq':
        return (
            "### Instruction: Answer the following question (True/False) based on the passage.\n\n"
            f"### Passage:\n{example['passage']}\n\n"
            f"### Question: {example['question']}\n\n"
            "### Answer: "
        )

    if data == 'obqa':
        return (
            "### Instruction: Answer the following mulitple-choice question (A/B/C/D).\n\n"
            f"### Question:\n{example['question_stem']}\n\n"
            f"### Choices: \n{example['choices']}\n"
            "### Answer: "
        )

    if data == 'piqa':
        return (
            "### Instruction: Answer the following mulitple-choice question (A/B).\n\n"
            f"### Question:\n{example['goal']}\n\n"
            f"### Choices: \n{example['choices']}\n\n"
            "### Answer: "
        )

    # Unknown dataset name: report it and hand back an empty prompt.
    print("############# Issue with the prompt")
    return ""


def get_obqa_label(example):
    """Render an OpenBookQA choices record as "<label>) <text>" lines.

    Args:
        example: Mapping with index-aligned sequences under 'label' and 'text'.

    Returns:
        One newline-terminated line per entry of example['text'], concatenated
        into a single string ("" when there are no entries).
    """
    option_count = len(example['text'])
    return "".join(
        f"{example['label'][pos]}) {example['text'][pos]}\n"
        for pos in range(option_count)
    )

def preprocess_response_true_or_false(text):
    """Parse a generated BoolQ answer into True, False or 'Other'.

    The first stand-alone occurrence of "true" or "false" (case-insensitive,
    not embedded in a longer alphabetic word) decides the result. Compared with
    plain substring checks this avoids reading "untrue" as True and avoids
    letting a later "true" override an earlier "false".

    Args:
        text: Raw generation from the model.

    Returns:
        True or False when a stand-alone token is found; the string 'Other'
        when none is found or when `text` is not a string.
    """
    import re  # local import keeps this helper self-contained in its cell

    if not isinstance(text, str):
        return 'Other'

    # Letter-only boundaries: punctuation, digits and whitespace may touch the token.
    found = re.search(r'(?<![a-z])(true|false)(?![a-z])', text.lower())
    if found is None:
        return 'Other'
    return found.group(1) == 'true'

def preprocess_response_choices(text):
    """Parse a generated multiple-choice answer into 'A', 'B', 'C', 'D' or 'Others'.

    The first stand-alone letter a-d (case-insensitive, not part of a longer
    alphabetic word) is taken as the answer. Testing for the mere presence of
    each letter anywhere in the text misreads ordinary words: "Answer: B"
    contains an 'a' and was rejected, while "Dog" was accepted as 'D'.

    Args:
        text: Raw generation from the model.

    Returns:
        The upper-case choice letter, or the string 'Others' when no
        stand-alone choice letter is present or `text` is not a string.
    """
    import re  # local import keeps this helper self-contained in its cell

    if not isinstance(text, str):
        return 'Others'

    # Letter-only boundaries so forms like "C)", "(c)" and "B." are still accepted.
    found = re.search(r'(?<![a-z])([a-d])(?![a-z])', text.lower())
    if found is None:
        return 'Others'
    return found.group(1).upper()

def preprocess_piqa_response(text):
    """Parse a generated PIQA answer into the label 0 (choice A), 1 (choice B) or 'Other'.

    The first stand-alone letter a/b (case-insensitive, not part of a longer
    alphabetic word) decides the label. Testing for the mere presence of the
    letters misreads ordinary words: "Answer: B" was rejected because of the
    'a' in "Answer", and "Cat" was accepted as choice A.

    Args:
        text: Raw generation from the model.

    Returns:
        0 for A, 1 for B, or the string 'Other' when no stand-alone choice
        letter is present or `text` is not a string.
    """
    import re  # local import keeps this helper self-contained in its cell

    if not isinstance(text, str):
        return 'Other'

    found = re.search(r'(?<![a-z])([ab])(?![a-z])', text.lower())
    if found is None:
        return 'Other'
    return 0 if found.group(1) == 'a' else 1

### BoolQ

In [5]:
# BoolQ dataset: pool the train and validation splits into a single frame.
bool_q = load_dataset("google/boolq")
boolq = pd.concat(
    [bool_q['train'].to_pandas(), bool_q['validation'].to_pandas()],
    ignore_index=True,  # fresh 0..n-1 labels; otherwise each split's labels repeat
)
print(boolq.shape)
boolq.head(2)

(12697, 3)


Unnamed: 0,question,answer,passage
0,do iran and afghanistan speak the same language,True,"Persian (/ˈpɜːrʒən, -ʃən/), also known by its ..."
1,do good samaritan laws protect those who help ...,True,Good Samaritan laws offer legal protection to ...


In [6]:
# Add the response prompt: one formatted BoolQ prompt per row.
boolq['inference_prompt'] = boolq.apply(lambda row: get_prompt(row, 'boolq'), axis=1)

In [8]:
# Print the rendered prompt of the row at position 2 to eyeball the template.
print(boolq['inference_prompt'].iloc[2])

### Instruction: Answer the following question (True/False) based on the passage.

### Passage:
Windows Movie Maker (formerly known as Windows Live Movie Maker in Windows 7) is a discontinued video editing software by Microsoft. It is a part of Windows Essentials software suite and offers the ability to create and edit videos as well as to publish them on OneDrive, Facebook, Vimeo, YouTube, and Flickr.

### Question: is windows movie maker part of windows essentials

### Answer: 


In [10]:
# Decoding settings for BoolQ: a single completion per prompt, at most 3 new tokens.
# temperature=0.0 -- presumably greedy decoding; confirm against the vLLM docs.
boolq_sampling = SamplingParams(
    temperature=0.0,
    top_p=0.9,
    max_tokens=3,
    n=1,
)

In [17]:
# Smoke test: send only the first BoolQ prompt through the model.
first_boolq_prompt = boolq['inference_prompt'].iloc[0]
output = model.generate(first_boolq_prompt, boolq_sampling)
torch.cuda.empty_cache()

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  9.54it/s]


In [18]:
# Raw text of the first completion returned for the first prompt.
output[0].outputs[0].text

'\nFalse.'

In [19]:
# Run every BoolQ prompt through the model in one batched call.
boolq_prompts = boolq['inference_prompt'].tolist()
output = model.generate(boolq_prompts, boolq_sampling)
torch.cuda.empty_cache()

Processed prompts:   2%|▏         | 193/12697 [00:10<11:29, 18.15it/s]



Processed prompts:  99%|█████████▉| 12561/12697 [11:55<00:07, 17.85it/s]



Processed prompts: 100%|██████████| 12697/12697 [12:02<00:00, 17.59it/s]


In [None]:
# Collect the first completion's text per request and parse it into a label.
# Assumes `output` is in the same order as the prompts -- TODO confirm.
boolq_results = [ele.outputs[0].text for ele in output]
boolq['generated_response'] = boolq_results
# Parsed labels are stored as strings ('True' / 'False' / 'Other').
boolq['predicted_label'] = boolq['generated_response'].apply(
    lambda raw: str(preprocess_response_true_or_false(raw))
)

In [20]:
# Count of each parsed prediction value.
boolq['predicted_label'].value_counts()

predicted_label
True     8520
False    3584
Other     593
Name: count, dtype: int64

In [23]:
# Gold-label proportions, for reference against the accuracy figure.
boolq['answer'].value_counts(normalize=True)

answer
True     0.622746
False    0.377254
Name: proportion, dtype: float64

In [25]:
# Accuracy in percent: rows whose stringified gold answer equals the parsed prediction.
boolq_hits = boolq['answer'].apply(str) == boolq['predicted_label']
boolq[boolq_hits].shape[0] * 100.0 / boolq.shape[0]

71.08765850200835

In [49]:
# Save the full BoolQ frame (without the index) to CSV.
# NOTE(review): assumes the ./results directory already exists.
boolq.to_csv("./results/boolq_no_safety.csv", index=False)

### OpenBookQA

In [27]:
# OpenBookQA: pool the train and validation splits into a single frame.
ob_qa = load_dataset("allenai/openbookqa")
obqa = pd.concat(
    [ob_qa['train'].to_pandas(), ob_qa['validation'].to_pandas()],
    ignore_index=True,  # fresh 0..n-1 labels; otherwise each split's labels repeat
)
print(obqa.shape)
obqa.head(2)

(5457, 4)


Unnamed: 0,id,question_stem,choices,answerKey
0,7-980,The sun is responsible for,"{'text': ['puppies learning new tricks', 'chil...",D
1,7-584,When standing miles away from Mount Rushmore,"{'text': ['the mountains seem very close', 'th...",D


In [28]:
# Flatten each choices record into "A) ...\nB) ...\n" text, then build the prompts.
# Values that are already strings are left alone so the cell can be re-run
# without get_obqa_label failing on its own earlier output.
obqa['choices'] = obqa['choices'].apply(
    lambda choice: choice if isinstance(choice, str) else get_obqa_label(choice)
)
obqa['inference_prompt'] = obqa.apply(lambda x: get_prompt(x, 'obqa'), axis=1)

In [29]:
# Decoding settings for OpenBookQA: a single completion per prompt, at most 2 new tokens.
obqa_sampling = SamplingParams(
    temperature=0.0,
    top_p=0.9,
    max_tokens=2,
    n=1,
)

In [31]:
# Spot-check five random OpenBookQA rows: print the raw generation next to the gold key.
# `random` is already imported in the notebook's first cell.
rng = random.Random(seed)  # seeded so the same rows are drawn on every run
# Materialise the columns once instead of rebuilding the lists on every iteration.
obqa_prompts = obqa['inference_prompt'].tolist()
obqa_gold = obqa['answerKey'].tolist()
for _ in range(5):
    index = rng.randrange(len(obqa_prompts))
    output = model.generate(obqa_prompts[index], obqa_sampling)
    torch.cuda.empty_cache()
    print(output[0].outputs[0].text, obqa_gold[index])

Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 12.09it/s]



C C


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 12.05it/s]



B A


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 15.61it/s]



C D


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 13.20it/s]



C B


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 13.12it/s]



B B


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 12.96it/s]



C C


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 13.37it/s]



C C


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 12.78it/s]



C A


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 12.95it/s]



C C


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 12.84it/s]


C A





In [32]:
# Run every OpenBookQA prompt through the model in one batched call.
obqa_prompts = obqa['inference_prompt'].tolist()
output = model.generate(obqa_prompts, obqa_sampling)
torch.cuda.empty_cache()

Processed prompts: 100%|██████████| 5457/5457 [02:05<00:00, 43.66it/s]


In [33]:
# Collect the first completion's text per request, strip newlines, and parse the choice.
# Assumes `output` is in the same order as the prompts -- TODO confirm.
all_results = [ele.outputs[0].text for ele in output]
obqa['generated_response'] = all_results
obqa['predicted_label'] = obqa['generated_response'].apply(
    lambda raw: preprocess_response_choices(raw.replace("\n", ""))
)
obqa.predicted_label.value_counts()

predicted_label
C         4654
B          479
D          256
A           65
Others       3
Name: count, dtype: int64

In [34]:
# Gold answer-key proportions, for reference against the prediction distribution.
obqa['answerKey'].value_counts(normalize=True)

answerKey
A    0.275609
B    0.247389
D    0.246839
C    0.230163
Name: proportion, dtype: float64

In [35]:
# Accuracy in percent: rows whose gold answer key equals the parsed prediction.
obqa_hits = obqa['answerKey'] == obqa['predicted_label']
obqa[obqa_hits].shape[0] * 100.0 / obqa.shape[0]

30.071467839472238

In [50]:
# Save the full OpenBookQA frame (without the index) to CSV.
# NOTE(review): assumes the ./results directory already exists.
obqa.to_csv("./results/obqa_no_safety.csv", index=False)

### PIQA

In [4]:
# PIQA dataset: pool all three splits, then keep only rows with a usable label.
piqa_data = load_dataset("piqa")
piqa_all = pd.concat(
    [piqa_data['train'].to_pandas(), piqa_data['validation'].to_pandas(), piqa_data['test'].to_pandas()],
    ignore_index=True,  # fresh 0..n-1 labels; otherwise each split's labels repeat
)
# Drop rows labelled -1 (presumably the unlabeled test split -- confirm).
# .copy() gives an independent frame so later column assignments do not
# write into a slice of piqa_all.
piqa = piqa_all[piqa_all['label'] != -1].copy()
print(piqa.shape)
piqa.head(2)

(17951, 4)


Unnamed: 0,goal,sol1,sol2,label
0,"When boiling butter, when it's ready, you can",Pour it onto a plate,Pour it into a jar,1
1,"To permanently attach metal legs to a chair, y...",Weld the metal together to get it to stay firm...,Nail the metal together to get it to stay firm...,0


In [5]:
# Count of each gold label after the -1 rows were filtered out.
piqa.label.value_counts()

label
1    8988
0    8963
Name: count, dtype: int64

In [15]:
# Present the two candidate solutions as choices A and B, then build the prompts.
piqa['choices'] = piqa.apply(
    lambda row: "A) {}\nB) {}".format(row['sol1'], row['sol2']),
    axis=1,
)
piqa['inference_prompt'] = piqa.apply(lambda row: get_prompt(row, 'piqa'), axis=1)

In [16]:
# Decoding settings for PIQA: a single completion per prompt, at most 2 new tokens.
piqa_sampling = SamplingParams(
    temperature=0.0,
    top_p=0.9,
    max_tokens=2,
    n=1,
)

In [17]:
# Spot-check five random PIQA rows: print the raw generation next to the gold label.
# `random` is already imported in the notebook's first cell.
rng = random.Random(seed)  # seeded so the same rows are drawn on every run
# Materialise the columns once instead of rebuilding the lists on every iteration.
piqa_prompts = piqa['inference_prompt'].tolist()
piqa_gold = piqa['label'].tolist()
for _ in range(5):
    index = rng.randrange(len(piqa_prompts))
    output = model.generate(piqa_prompts[index], piqa_sampling)
    torch.cuda.empty_cache()
    print(output[0].outputs[0].text, piqa_gold[index])

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 14.75it/s]



A 0


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 12.61it/s]



A 0


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 14.79it/s]



A 1


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 13.33it/s]



B 1


Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 13.14it/s]


A 0





In [18]:
# Run every PIQA prompt through the model in one batched call.
piqa_prompts = piqa['inference_prompt'].tolist()
output = model.generate(piqa_prompts, piqa_sampling)
torch.cuda.empty_cache()

Processed prompts:  31%|███       | 5505/17951 [03:08<06:48, 30.45it/s]



Processed prompts:  74%|███████▎  | 13208/17951 [07:27<02:08, 36.99it/s]



Processed prompts: 100%|██████████| 17951/17951 [10:10<00:00, 29.40it/s]


In [19]:
# Collect the first completion's text per request and parse it into a label.
# Assumes `output` is in the same order as the prompts -- TODO confirm.
all_results = [ele.outputs[0].text for ele in output]
piqa['generated_response'] = all_results
# Parsed labels are stored as strings ('0' / '1' / 'Other').
piqa['predicted_label'] = piqa['generated_response'].apply(
    lambda raw: str(preprocess_piqa_response(raw))
)
piqa.predicted_label.value_counts()

predicted_label
1        10985
0         6964
Other        2
Name: count, dtype: int64

In [20]:
# Gold-label proportions, for reference against the accuracy figure.
piqa['label'].value_counts(normalize=True)

label
1    0.500696
0    0.499304
Name: proportion, dtype: float64

In [21]:
# Accuracy in percent: rows whose stringified gold label equals the parsed prediction.
piqa_hits = piqa['label'].apply(str) == piqa['predicted_label']
piqa[piqa_hits].shape[0] * 100.0 / piqa.shape[0]

60.38103726811877

In [22]:
# Save the full PIQA frame (without the index) to CSV.
# NOTE(review): assumes the ./results directory already exists.
piqa.to_csv("./results/piqa_no_safety.csv", index=False)

### Helpfulness Reward Model

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the helpfulness reward model and its tokenizer from the same checkpoint id.
rm_name = 'Ray2333/gpt2-large-helpful-reward_model'
rm_tokenizer = AutoTokenizer.from_pretrained(rm_name)
reward_model = AutoModelForSequenceClassification.from_pretrained(
    rm_name,
    num_labels=1,
    torch_dtype=torch.bfloat16,
    device_map=0,
)

# Sanity check: score one (prompt, response) pair on device 0.
# NOTE(review): `reward` is computed but never displayed by this cell.
q = "\n\nHuman: I just came out of from jail, any suggestion of my future? \n\nAssistant:"
a = "Sorry, I don't understand."
inputs = rm_tokenizer(q, a, return_tensors='pt', truncation=True)
with torch.no_grad():
    scores = reward_model(**(inputs.to(0))).logits
    reward = scores[0].cpu().detach().item()




In [6]:
from datasets import load_dataset

# Load only the 'helpful-base' subset of Anthropic/hh-rlhf (selected via data_dir).
dataset = load_dataset("Anthropic/hh-rlhf", data_dir='helpful-base')

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [11]:
# Keep only the test split, as a pandas DataFrame.
# The guard makes the cell safe to re-run: the loaded object is a dict of
# splits, but after the first conversion `dataset` is a DataFrame and
# indexing it with 'test' would raise a KeyError.
if isinstance(dataset, dict):
    dataset = dataset['test'].to_pandas()
dataset.head()

Unnamed: 0,chosen,rejected
0,\n\nHuman: Do you know why turkeys became the ...,\n\nHuman: Do you know why turkeys became the ...
1,\n\nHuman: How can I find out what types of bu...,\n\nHuman: How can I find out what types of bu...
2,\n\nHuman: Can you tell me about wild dogs? No...,\n\nHuman: Can you tell me about wild dogs? No...
3,\n\nHuman: what do i do if i stepped in dog po...,\n\nHuman: what do i do if i stepped in dog po...
4,\n\nHuman: Search for best ac/dc songs.\n\nAss...,\n\nHuman: Search for best ac/dc songs.\n\nAss...


In [18]:
# Text after the last "Assistant:" marker of each chosen dialogue (the whole string if the marker is absent).
dataset['chosen'].map(lambda dialogue: dialogue.rpartition("Assistant:")[2])

0        To be honest, I don’t know anything about tha...
1        About 150 species of butterflies live in Oreg...
2        Sure! Here’s some fun facts: A wolf is a larg...
3        That’s reasonable, but even that is probably ...
4        You probably already know a lot about AC/DC. ...
                              ...                        
2349     Here’s a list of a few of my favorite artists...
2350     Oh yes, I meant, free in that people can get ...
2351                                      Just follow me.
2352     Well, first you’ll need to fill out the forms...
2353     Why not give him a banana or an apple or a ha...
Name: chosen, Length: 2354, dtype: object