In [2]:
import os
import re

import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils import retrieve_dataset, display_results

# Load model

In [None]:
# Log in to huggingface for access to Llama3
# The access token is read from the environment (HUGGING_FACE_TOKEN, then
# HF_TOKEN) so the secret is never stored in the notebook itself. If neither
# variable is set, hf_token is None and login() asks for the token interactively.
hf_token = os.environ.get('HUGGING_FACE_TOKEN') or os.environ.get('HF_TOKEN')
login(token = hf_token)

In [None]:
# Load the tokenizer and the model weights for the Llama 3 instruct checkpoint
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)

# Token ids handed to generate() as end-of-sequence markers:
# the tokenizer's own EOS token plus the "<|eot_id|>" token
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# NOTE: model_name is deliberately rebound here; from this point on it holds the
# short name that is passed to llm_inf, not the checkpoint id used for loading
model_name = 'Llama-3'

# Inference function

In [7]:
def format_prompt(sample, category, fs_all, cot_prompt, num_shot=0, cot=False):
    '''
    Function to format the prompt to be given to a LLM

    Inputs:
        sample (dict): a dictionary containing a single sample from a dataset;
            its 'text' and 'label' entries are read
        category (str): the category of harmful content
        fs_all (list): list of few shot prompts of the dataset
        cot_prompt (str): the chain of thought prompt of the dataset
        num_shot (int): number of shots, default is 0
        cot (bool): whether it is chain of thought, default is False

    Returns:
        prompt (str): the prompt to be given to a LLM
        label (str): the label of the text ('NO' for label 0, 'YES' for label 1)

    Raises:
        KeyError: if the sample has no 'text'/'label' entry or the label is not 0 or 1
    '''
    text = '###TEXT: ' + sample['text']
    label = sample['label']
    label_dict = {0: 'NO', 1: 'YES'}

    # define the instruction
    if cot:
        instruction = (
            f'###INSTRUCTION: Given the following text, classify it as either {category} (YES) or not (NO). '
            'Provide a rationale to your answer then give the answer as YES or NO. You must give a YES or NO answer.'
        )
    else:
        instruction = (
            f'###INSTRUCTION: Given the following text, classify it as either {category} (YES) or not (NO). '
            f'If it is {category}, answer YES. If it is not {category}, answer NO. You must answer with only YES or NO.'
        )

    # define the prompt based on the prompting technique
    # (few-shot takes precedence: when num_shot is non-zero the few-shot layout
    # is used even if cot is True)
    if num_shot:
        fs_prompt = 'Below are a set of instructions with a question, along with answers.\n'
        fs_prompt += '\n\n'.join(fs_all[0:num_shot])
        # the whitespace-only filler after the examples is kept exactly as it
        # was so that the prompt text stays identical to earlier runs
        prompt = (
            f'{fs_prompt} \n'
            '        \n'
            'Now answer the following question.\n'
            f'{instruction}\n'
            f'{text}\n'
            '###ANSWER:'
        )

    elif cot:
        # the instruction appears twice: once for the worked example
        # (cot_prompt) and once for the text to be classified
        prompt = (
            'Below is an instruction with a question, along with the answer to the question and the rationale.\n'
            f'{instruction}\n'
            f'{cot_prompt}\n'
            '\n'
            'Now answer the following question by giving a rationale and answer.\n'
            f'{instruction}\n'
            f'{text}'
        )

    else:
        prompt = (
            f'{instruction}\n'
            f'{text}\n'
            '###ANSWER:'
        )

    return prompt, label_dict[label]

In [8]:
def _extract_answer(response):
    '''Return the YES/NO verdict found in a model response ('NO' when nothing matches).'''
    # 1) an explicit "###Answer: yes/no" marker has the highest priority
    explicit = re.search(r'###Answer: (yes|no)', response, re.IGNORECASE)
    if explicit:
        return explicit.group(1).upper()

    # 2) otherwise take the first standalone yes/no word in the response
    # NOTE(review): for chain-of-thought responses the rationale comes first, so
    # a "no" inside the rationale would win here - confirm this is intended
    bare = re.search(r'\b(yes|no)\b', response, re.IGNORECASE)
    if bare:
        return bare.group(1).upper()

    # 3) a refusal ("cannot moderate" / "cannot classify") is counted as YES
    if re.search(r'cannot (moderate|classify)', response):
        return 'YES'

    return 'NO'


def llm_inf(model_name, dataset_name, category, num_shot=0, cot=False, num_examples=None):
    '''
    Function to inference and evaluate a LLM

    Uses the notebook-level `tokenizer`, `model` and `terminators` objects for
    generation; `model_name` is only forwarded to `display_results`.

    Inputs:
        model_name (str): name of the language model
        dataset_name (str): name of the dataset
        category (str): the category of harmful content
        num_shot (int): the number of shots, default is 0
        cot (bool): whether it is chain of thought, default is False
        num_examples (int): the number of examples to pass to the LLM, default is None
            (use the whole test set); values larger than the test set are capped
            to its size

    Returns:
        None

    '''
    # retrieve the dataset details
    ds, fs_all, cot_prompt = retrieve_dataset(dataset_name, category)
    print('Length of', dataset_name, 'test set:', len(ds))

    # if no number of examples are provided, use the size of the test set;
    # never index past the end of the test set
    if num_examples is None:
        num_examples = len(ds)
    else:
        num_examples = min(num_examples, len(ds))

    true_labels = []
    pred_labels = []

    # go through each example
    for i in range(num_examples):
        # format the prompt for the LLM
        prompt, label = format_prompt(ds[i], category, fs_all, cot_prompt, num_shot, cot)

        # setting up the inputs and inference the model
        messages = [
            {"role": "system", "content": f"You are a content moderator that detects whether a piece of text is considered {category} (YES) or not (NO)"},
            {"role": "user", "content": prompt},
        ]

        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # do_sample=False with the sampling parameters explicitly unset
        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None,
            pad_token_id=tokenizer.eos_token_id
        )

        # extract the response of the model: drop the prompt tokens, which
        # come first in the generated sequence
        response = outputs[0][input_ids.shape[-1]:]
        response = tokenizer.decode(response, skip_special_tokens=True)

        print('Q', i+1)

        # use regular expressions to extract the exact answer from the model
        llm_answer = _extract_answer(response)

        true_labels.append(label)
        pred_labels.append(llm_answer)

    # display the metric scores when all samples have been run
    print()
    display_results(true_labels, pred_labels, ['YES', 'NO'], model_name, dataset_name, num_shot, cot)

# HateXplain

In [28]:
# Dataset and harm category used by the evaluation cells below
dataset_name, category = 'HateXplain', 'hate speech'

In [None]:
## Zero-shot, one-shot and two-shot evaluation, run back to back
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
## Chain of thought evaluation (no few-shot examples)
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# Toraman hate speech

In [30]:
# Dataset and harm category used by the evaluation cells below
dataset_name, category = 'Toraman hate speech', 'hate speech'

In [None]:
## Zero-shot, one-shot and two-shot evaluation, run back to back
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
## Chain of thought evaluation (no few-shot examples)
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# OLID

In [32]:
# Dataset and harm category used by the evaluation cells below
dataset_name, category = 'OLID', 'offensive'

In [None]:
## Zero-shot, one-shot and two-shot evaluation, run back to back
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
## Chain of thought evaluation (no few-shot examples)
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# OffensEval-TR 2020

In [7]:
# Dataset and harm category used by the evaluation cells below
dataset_name, category = 'OffensEval-TR 2020', 'offensive'

In [None]:
## Zero-shot, one-shot and two-shot evaluation, run back to back
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
## Chain of thought evaluation (no few-shot examples)
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# Toxigen

In [7]:
# Dataset and harm category used by the evaluation cells below
dataset_name, category = 'Toxigen', 'toxic'

In [None]:
## Zero-shot, one-shot and two-shot evaluation, run back to back
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
## Chain of thought evaluation (no few-shot examples)
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# LLM-JP Toxicity

In [9]:
# Dataset and harm category used by the evaluation cells below
dataset_name, category = 'LLM-JP Toxicity', 'toxic'

In [None]:
## Zero-shot, one-shot and two-shot evaluation, run back to back
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
## Chain of thought evaluation (no few-shot examples)
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# Ejaz cyberbullying

In [13]:
# Dataset and harm category used by the evaluation cells below
dataset_name, category = 'Ejaz cyberbullying', 'cyberbullying'

In [None]:
## Zero-shot, one-shot and two-shot evaluation, run back to back
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
## Chain of thought evaluation (no few-shot examples)
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)

# SOSNet cyberbullying

In [16]:
# Dataset and harm category used by the evaluation cells below
dataset_name, category = 'SOSNet cyberbullying', 'cyberbullying'

In [None]:
## Zero-shot, one-shot and two-shot evaluation, run back to back
for i in (0, 1, 2):
    num_shot = i
    llm_inf(model_name, dataset_name, category, num_shot=num_shot)

In [None]:
## Chain of thought evaluation (no few-shot examples)
llm_inf(model_name, dataset_name, category, num_shot=0, cot=True)