# Load model

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
from utils import retrieve_dataset, display_results

In [None]:
# Display name handed to the evaluation/reporting helpers
model_name = 'GPT-J'

# Load the GPT-J 6B tokenizer and weights, then move the model onto the GPU
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6b")
model = model.cuda()

# Inference function

In [4]:
def format_prompt(sample, category, fs_all, cot_prompt, num_shot=0, cot=False):
    '''
    Build the yes/no classification prompt for a single dataset sample.

    The prompting technique is chosen in this order: few-shot when num_shot
    is non-zero, otherwise chain of thought when cot is True, otherwise
    zero-shot (the bare question).

    Inputs:
        sample (dict): a single sample from a dataset; its 'text' and 'label' entries are read
        category (str): the category of harmful content named in the question
        fs_all (list): list of few shot prompts of the dataset
        cot_prompt (str): the chain of thought prompt of the dataset
        num_shot (int): number of few shot prompts to prepend, default is 0
        cot (bool): whether it is chain of thought, default is False (ignored when num_shot is non-zero)

    Returns:
        prompt (str): the prompt to be given to a LLM
        label (str): 'YES' for label 1, 'NO' for label 0

    '''
    passage = sample['text']
    gold = sample['label']
    answer_names = {0: 'NO', 1: 'YES'}

    # the question line is identical for every prompting technique
    question = f'Question: Is the following text <{passage}> considered as {category}?'

    if num_shot:
        # few-shot: the first num_shot worked examples, separated by blank lines
        shots = '\n\n'.join(fs_all[:num_shot])
        lead_in = f'Below are a set questions along with answers.\n{shots} '
    elif cot:
        # chain of thought: one worked example that includes its rationale
        lead_in = f'Below is a question, along with the answer to the question and the rationale.\n{cot_prompt}'
    else:
        # zero-shot: nothing but the question
        return f'{question}\nAnswer:', answer_names[gold]

    prompt = f'{lead_in}\n\nNow answer the following question.\n{question}  \nAnswer:'
    return prompt, answer_names[gold]

In [5]:
def _extract_answer(response):
    '''
    Reduce a free-form model response to 'YES' or 'NO'.

    Precedence: a leading yes/no (case-insensitive) wins; otherwise the
    phrase 'is considered' means 'YES'; anything else, including
    'is not considered' and unparseable text, falls back to 'NO'.
    '''
    leading = re.match(r'^(yes|no)', response, re.IGNORECASE)
    if leading:
        return leading.group(1).upper()
    if 'is considered' in response:
        return 'YES'
    return 'NO'


def llm_inf(model_name, dataset_name, category, num_shot=0, cot=False, num_examples=None):
    '''
    Function to inference and evaluate a LLM

    Uses the module-level `tokenizer` and `model`. Every prompt is generated
    greedily (do_sample=False) with at most 150 new tokens, the response is
    reduced to 'YES'/'NO', and the collected labels are handed to
    display_results.

    Inputs:
        model_name (str): name of the language model (passed on to display_results)
        dataset_name (str): name of the dataset
        category (str): the category of harmful content
        num_shot (int): the number of shots, default is 0
        cot (bool): whether it is chain of thought, default is False
        num_examples (int): the number of examples to pass to the LLM, default is None
            (use the whole test set); values larger than the test set are clamped to its size

    Returns:
        None

    '''
    # retrieve the dataset details
    ds, fs_all, cot_prompt = retrieve_dataset(dataset_name, category, instruct=False)
    print('Length of', dataset_name, 'test set:', len(ds))

    # default to the whole test set, and never index past its end
    if num_examples is None:
        num_examples = len(ds)
    else:
        num_examples = min(num_examples, len(ds))

    true_labels = []
    pred_labels = []

    # go through each example
    for i in range(num_examples):
        # format the prompt for the LLM
        prompt, label = format_prompt(ds[i], category, fs_all, cot_prompt, num_shot, cot)

        # tokenize on the model's own device and keep the whole encoding so that
        # generate() also receives the attention mask
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_len = inputs.input_ids.shape[1]
        outputs = model.generate(**inputs,
                                 max_new_tokens=150,
                                 pad_token_id=tokenizer.eos_token_id,
                                 do_sample=False,
                                 temperature=None,
                                 top_p=None,
                                 top_k=None)

        # decode only the newly generated tokens; special tokens (e.g. a trailing
        # EOS) are skipped instead of blindly cutting the last token, which would
        # lose a real token whenever generation stops at max_new_tokens
        response = tokenizer.decode(outputs[0, prompt_len:], skip_special_tokens=True).strip()
        print('Q', i+1)

        true_labels.append(label)
        pred_labels.append(_extract_answer(response))

    # display the metric scores when all samples have been run
    print()
    display_results(true_labels, pred_labels, ['YES', 'NO'], model_name, dataset_name, num_shot, cot)

# HateXplain

In [6]:
# Dataset and harm category used by the HateXplain evaluation cells below
dataset_name, category = 'HateXplain', 'hate speech'

In [None]:
# Evaluate with 0 (zero-shot), 1 (one-shot) and 2 (two-shot) examples in turn
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
# Chain-of-thought evaluation: no few-shot examples, CoT prompt enabled
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# Toraman hate speech

In [7]:
# Dataset and harm category used by the Toraman hate speech evaluation cells below
dataset_name, category = 'Toraman hate speech', 'hate speech'

In [None]:
# Evaluate with 0 (zero-shot), 1 (one-shot) and 2 (two-shot) examples in turn
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
# Chain-of-thought evaluation: no few-shot examples, CoT prompt enabled
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# OLID

In [6]:
# Dataset and harm category used by the OLID evaluation cells below
dataset_name, category = 'OLID', 'offensive'

In [None]:
# Evaluate with 0 (zero-shot), 1 (one-shot) and 2 (two-shot) examples in turn
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
# Chain-of-thought evaluation: no few-shot examples, CoT prompt enabled
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# OffensEval-TR 2020

In [9]:
# Dataset and harm category used by the OffensEval-TR 2020 evaluation cells below
dataset_name, category = 'OffensEval-TR 2020', 'offensive'

In [None]:
# Evaluate with 0 (zero-shot), 1 (one-shot) and 2 (two-shot) examples in turn
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
# Chain-of-thought evaluation: no few-shot examples, CoT prompt enabled
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# Toxigen

In [None]:
# Dataset and harm category used by the Toxigen evaluation cells below
dataset_name, category = 'Toxigen', 'toxic'

In [None]:
# Evaluate with 0 (zero-shot), 1 (one-shot) and 2 (two-shot) examples in turn
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
# Chain-of-thought evaluation: no few-shot examples, CoT prompt enabled
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# LLM-JP Toxicity

In [6]:
# Dataset and harm category used by the LLM-JP Toxicity evaluation cells below
dataset_name, category = 'LLM-JP Toxicity', 'toxic'

In [None]:
# Evaluate with 0 (zero-shot), 1 (one-shot) and 2 (two-shot) examples in turn
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
# Chain-of-thought evaluation: no few-shot examples, CoT prompt enabled
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# Ejaz cyberbullying

In [6]:
# Dataset and harm category used by the Ejaz cyberbullying evaluation cells below
dataset_name, category = 'Ejaz cyberbullying', 'cyberbullying'

In [None]:
# Evaluate with 0 (zero-shot), 1 (one-shot) and 2 (two-shot) examples in turn
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
# Chain-of-thought evaluation: no few-shot examples, CoT prompt enabled
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# SOSNet cyberbullying

In [9]:
# Dataset and harm category used by the SOSNet cyberbullying evaluation cells below
dataset_name, category = 'SOSNet cyberbullying', 'cyberbullying'

In [None]:
# Evaluate with 0 (zero-shot), 1 (one-shot) and 2 (two-shot) examples in turn
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
# Chain-of-thought evaluation: no few-shot examples, CoT prompt enabled
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)