## **Installing Dependencies**

In [None]:
!pip install datasets bitsandbytes torch transformers accelerate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_6

In [None]:
# Interactive login to the Hugging Face Hub: prompts for an access token and stores it locally.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `thesis` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `thesis`


In [None]:
# Mount Google Drive at /content/drive so that files under it can be read and written
# from this Colab session (the results path used at the end of the notebook lives there).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below;
# consider narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")

## **Importing Libraries**

In [None]:
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import random
import numpy as np
from collections import Counter
import torch
import re
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

## **Load the data**

In [None]:
# Load the five FELM domain configurations from the Hugging Face Hub, one dataset per domain.
# trust_remote_code=True lets the dataset repository's own loading script run locally.
math, science, reasoning, wk, writing_rec = (
    load_dataset('hkust-nlp/felm', domain, trust_remote_code=True)
    for domain in ('math', 'science', 'reasoning', 'wk', 'writing_rec')
)

README.md:   0%|          | 0.00/5.41k [00:00<?, ?B/s]

felm.py:   0%|          | 0.00/4.14k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/74.4k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/194 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/125 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/137k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/208 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/104k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/184 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/240k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/136 [00:00<?, ? examples/s]

## **Data Preparation**

In [None]:
# Merge the 'test' split of every domain into a single dataset
hallu_felm = concatenate_datasets(
    [domain['test'] for domain in (math, science, reasoning, wk, writing_rec)]
)

In [None]:
# Derive a passage-level label from the per-segment labels: the passage is labeled
# True only when none of its segments is labeled False; a single False segment
# makes the whole passage False.

def compute_new_column(example):
    """Return the passage-level label for one dataset row.

    `example["labels"]` is the collection of per-segment labels of the response.
    The returned dict has a single key, "hallucination", whose value is True when
    no segment label equals False and False otherwise.

    Note: despite the column name, True marks a passage with no segment labeled False.
    """
    return {"hallucination": False not in example["labels"]}

hallu_felm = hallu_felm.map(compute_new_column)

Map:   0%|          | 0/847 [00:00<?, ? examples/s]

In [None]:
# Convert to Pandas for better handling
hallu_felm = hallu_felm.to_pandas()

In [None]:
# Check the distribution of the classes
Counter(hallu_felm['hallucination'])

Counter({False: 281, True: 566})

In [None]:
random.seed(777)

def downsample(df, label_column="label", seed=42):
    """Balance `df` by randomly downsampling every class to the smallest class size.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the rows to balance.
    label_column : str, default "label"
        Name of the column holding the class of each row.
    seed : int, default 42
        Seed used both for the per-class sampling and for the final shuffle.

    Returns
    -------
    pandas.DataFrame
        A shuffled frame with a fresh 0..n-1 index in which every class found in
        `label_column` has as many rows as the least represented class.

    Notes
    -----
    Classes are processed from most to least frequent. Classes already at the
    minimum size are kept untouched, so an already balanced frame is only
    shuffled, and frames with more than two classes keep all of their classes.
    An empty frame is returned as an empty frame.
    """
    # Count the rows per class once; value_counts orders classes by decreasing frequency
    class_counts = df[label_column].value_counts()

    # Nothing to balance (and nothing to concatenate) when no labeled row exists
    if class_counts.empty:
        return df.reset_index(drop=True)

    n_target = int(class_counts.min())

    balanced_parts = []
    for class_value in class_counts.index:
        df_class = df[df[label_column] == class_value]
        if len(df_class) > n_target:
            # Randomly keep n_target rows of an over-represented class, without replacement
            df_class = resample(df_class,
                                replace=False,
                                n_samples=n_target,
                                random_state=seed)
        balanced_parts.append(df_class)

    # Put the classes back together, then shuffle so the rows are not grouped by class
    df_balanced = pd.concat(balanced_parts)
    df_balanced = df_balanced.sample(frac=1, random_state=seed).reset_index(drop=True)

    return df_balanced


hallu_felm = downsample(hallu_felm, label_column="hallucination")

In [None]:
# Check for a correct rebalance
Counter(hallu_felm['hallucination'])

Counter({False: 281, True: 281})

In [None]:
# Remove instances without the response
hallu_felm = hallu_felm[~hallu_felm['response'].isna()].reset_index(drop=True)
hallu_felm

Unnamed: 0,index,prompt,response,segmented_response,labels,comment,type,ref,source,hallucination
0,130,"Nicknamed the 'Pilgrims', this football team b...",The team is the New England Patriots.,[The team is the New England Patriots.],[False],[Boston United.],[knowledge_error],[https://en.wikipedia.org/wiki/Boston_United_F...,truthfulQA,False
1,42,Where did fortune cookies originate,"""Fortune cookies originated in Japan in the 19...","[""Fortune cookies originated in Japan in the 1...","[True, False, True, True, False]","[, They are not invented in US, they're brough...","[None, knowledge_error, None, None, knowledge_...",[https://en.wikipedia.org/wiki/Fortune_cookie],hc3,False
2,126,Mrs. Tatiana owns a grocery store that sells d...,"After one year, the price of carrots would be ...","[After one year, the price of carrots would be...","[True, True, False, True]","[, , 132.3 * 1.05 = 138.91500000000002, ]",[],[],GSM8K,False
3,33,What is the average height of female president...,There have been no female presidents of the Un...,[There have been no female presidents of the U...,"[True, True]","[, ]","[None, None]",[https://en.wikipedia.org/wiki/List_of_female_...,https://docs.google.com/spreadsheets/d/1kDSERn...,True
4,147,Determine the modulo 4 remainder of the follow...,"We can pair the terms $1$ and $3$, $2$ and $4$...","[We can pair the terms $1$ and $3$, $2$ and $4...","[False, False, False]","[these pairs are not multiples of $4$., no, 6,...",[],[],MATH,False
...,...,...,...,...,...,...,...,...,...,...
557,116,Simplify: $\frac{18!}{16!}$,We can simplify this expression by canceling o...,[We can simplify this expression by canceling ...,"[True, True, True, True]","[, , , ]",[],[],MATH,True
558,26,Which is bigger 6 times 3 plus 2 or 6 plus 3 t...,"To solve this problem, we need to follow the o...","[To solve this problem, we need to follow the ...","[True, True, True, True]","[, , , ]",[],[],https://docs.google.com/spreadsheets/d/1kDSERn...,True
559,87,If Mark weighs 150 pounds and Susan weighs 20 ...,"Susan weighs 20 pounds less than Mark, so she ...","[Susan weighs 20 pounds less than Mark, so she...","[True, True, True, True]","[, , , ]",[],[],GSM8K,True
560,10,Who was the CEO of Twitter?,The CEO of Twitter at the time of writing this...,[The CEO of Twitter at the time of writing thi...,"[False, True, False, True]","[The CEO of Twitter is Linda Yaccarino now., ,...","[knowledge_error, None, knowledge_error, None]",[https://edition.cnn.com/2023/05/12/tech/twitt...,https://arxiv.org/pdf/2302.03494.pdf,False


In [None]:
# Cast the boolean labels to integers (True -> 1, False -> 0)

hallu_felm['hallucination'] = hallu_felm['hallucination'].astype(int)

In [None]:
# Consider only the prompt, the response and the factuality label

hallu_felm = hallu_felm[['prompt', 'response', 'hallucination']]

In [None]:
# Split the frame into train / validation / test parts (they are wrapped in a DatasetDict afterwards).
# 20% of the rows are held out as the test set, then 20% of the remaining rows as the validation set.
# random_state=42 makes both splits reproducible; random.seed only seeds Python's built-in generator.
# NOTE(review): no `stratify` argument is passed, so class proportions can differ between the splits.

random.seed(777)

hallu_felm_train, hallu_felm_test = train_test_split(hallu_felm, test_size=0.2, random_state=42)
hallu_felm_train, hallu_felm_val = train_test_split(hallu_felm_train, test_size=0.2, random_state=42)

In [None]:
random.seed(777)

# Wrap each pandas split in a Hugging Face Dataset and shuffle it with a fixed seed (42).
# The frames keep their pandas index, which shows up as the extra '__index_level_0__'
# column in the printed features.
hallu_felm = DatasetDict({
    'train': Dataset.from_pandas(hallu_felm_train).shuffle(42),
    'validation': Dataset.from_pandas(hallu_felm_val).shuffle(42),
    'test': Dataset.from_pandas(hallu_felm_test).shuffle(42)
})

# Show the structure of the splits and the class distribution inside each of them
print(hallu_felm)
print(Counter(hallu_felm['train']['hallucination']))
print(Counter(hallu_felm['validation']['hallucination']))
print(Counter(hallu_felm['test']['hallucination']))

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'hallucination', '__index_level_0__'],
        num_rows: 359
    })
    validation: Dataset({
        features: ['prompt', 'response', 'hallucination', '__index_level_0__'],
        num_rows: 90
    })
    test: Dataset({
        features: ['prompt', 'response', 'hallucination', '__index_level_0__'],
        num_rows: 113
    })
})
Counter({0: 180, 1: 179})
Counter({0: 48, 1: 42})
Counter({1: 60, 0: 53})


In [None]:
hallu_felm['train'][0], hallu_felm['validation'][0], hallu_felm['test'][0] # Print some examples for each splits

({'prompt': 'Does learning music help boost your thinking?',
  'response': 'Yes, learning music has been shown to have a positive impact on cognitive function and can help boost thinking skills. Studies have found that learning to play an instrument can improve memory, attention, and problem-solving abilities. Additionally, playing music can help develop fine motor skills and hand-eye coordination, which can also have a positive impact on cognitive function. Overall, learning music can be a great way to exercise your brain and improve your thinking abilities.',
  'hallucination': 0,
  '__index_level_0__': 129},
 {'prompt': 'James runs 12 miles a day for 5 days a week.  If he runs 10 miles an hour how many hours does he run a week?',
  'response': 'James runs a total of 12 x 5 = <<12*5=60>>60 miles a week.\nIf he runs 10 miles an hour, he runs for 60/10 = <<60/10=6>>6 hours a week.',
  'hallucination': 1,
  '__index_level_0__': 40},
 {'prompt': 'What is the significance of the Hubble Sp

## **Load the model (RoBERTa)**

In [None]:
# Number of distinct classes found in the training labels; used to size the classification head
num_labels = len(set(hallu_felm['train']['hallucination']))

# Load the pretrained checkpoint and its tokenizer from the Hugging Face Hub.
# AutoModelForSequenceClassification adds a classification head on top of the model, so that
# the text representations it produces can be used for binary classification.
# The head is newly initialized (see the warning in the output) and has to be trained.

model_id = 'FacebookAI/roberta-base'

model = AutoModelForSequenceClassification.from_pretrained(model_id,
                                             num_labels = num_labels)

tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## **Preprocessing**

Tokenize the responses that will be classified as factual or hallucinated.

In [None]:
max_len = 512
col_to_delete = ['__index_level_0__']

def preprocessing(example):
  """Tokenize the 'response' texts of a batch of rows.

  Sequences longer than `max_len` tokens are truncated and the batch is padded
  to a common length. Only the response is encoded; the prompt is not used.
  """
  encoded = tokenizer(
      example['response'],
      truncation=True,
      max_length=max_len,
      padding=True,
      return_tensors='pt',
  )
  return encoded

# Tokenize every split in batched mode, dropping the leftover pandas index column
tokenized_hallu_train, tokenized_hallu_val, tokenized_hallu_test = (
    hallu_felm[split_name].map(preprocessing, batched=True, remove_columns=col_to_delete)
    for split_name in ('train', 'validation', 'test')
)

Map:   0%|          | 0/359 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

In [None]:
tokenized_hallu_test

Dataset({
    features: ['prompt', 'response', 'hallucination', 'input_ids', 'attention_mask'],
    num_rows: 113
})

Rename the label column and convert the datasets to PyTorch tensors so that they can be used for training.

In [None]:
# The target column is renamed from "hallucination" to "label" in every split
tokenized_hallu_train, tokenized_hallu_val, tokenized_hallu_test = (
    split.rename_column("hallucination", "label")
    for split in (tokenized_hallu_train, tokenized_hallu_val, tokenized_hallu_test)
)

In [None]:
# Expose only the model inputs and the label, formatted as PyTorch tensors, in every split
for tokenized_split in (tokenized_hallu_train, tokenized_hallu_val, tokenized_hallu_test):
    tokenized_split.set_format("torch", columns=["input_ids", "attention_mask", "label"])

## **Training**

In [None]:
# Metrics computed on the evaluation data during training

def compute_metrics(eval_pred):
    """Compute classification metrics from a (logits, labels) pair.

    Parameters
    ----------
    eval_pred : tuple
        Pair `(logits, labels)`; the predicted class of each example is the
        index of its highest logit along the last axis.

    Returns
    -------
    dict
        "accuracy", "precision" and "recall" (precision and recall for the
        positive class, average="binary"), "f1" (F1 of the positive class),
        "macro_f1" (unweighted mean of the per-class F1 scores) and "micro_f1"
        (F1 computed from the global counts over all classes).
    """
    logits, labels = eval_pred

    # Predicted class = label with the highest logit
    predictions = np.argmax(logits, axis=-1)

    # Each F1 variant uses the averaging mode its name announces
    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=predictions),
        "precision": precision_score(y_true=labels, y_pred=predictions, average="binary"),
        "recall": recall_score(y_true=labels, y_pred=predictions, average="binary"),
        "f1": f1_score(y_true=labels, y_pred=predictions, average="binary"),
        "macro_f1": f1_score(y_true=labels, y_pred=predictions, average="macro"),
        "micro_f1": f1_score(y_true=labels, y_pred=predictions, average="micro"),
    }

Define the training hyperparameters. Higher learning rates were also tried, but training did not converge with them.

In [None]:
# Hyperparameters and bookkeeping options for the fine-tuning run.
# Evaluation and checkpointing both happen once per epoch, which is what
# load_best_model_at_end needs in order to reload the best checkpoint after training.
# `eval_strategy` is the current name of the option; `evaluation_strategy` is its deprecated alias.
training_args = TrainingArguments(
    output_dir = 'roberta_felm',
    learning_rate = 2e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 5,
    weight_decay = 0.01,
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    report_to = "none"  # do not send logs to any experiment tracker
)

Set the trainer with the parameters previously defined.

In [None]:
# Build the Trainer: it fine-tunes `model` on the tokenized training split with the
# settings in `training_args`, and evaluates on the tokenized validation split
# using `compute_metrics`.
# NOTE(review): recent transformers releases may prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_hallu_train,
    eval_dataset=tokenized_hallu_val,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [None]:
# Start the training
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Macro F1,Micro F1
1,No log,0.693327,0.544444,0.507246,0.833333,0.630631,0.630631
2,No log,0.684837,0.522222,0.428571,0.071429,0.122449,0.122449
3,No log,0.723768,0.566667,0.540541,0.47619,0.506329,0.506329
4,No log,0.734471,0.544444,0.514286,0.428571,0.467532,0.467532
5,No log,0.73104,0.588889,0.586207,0.404762,0.478873,0.478873


TrainOutput(global_step=225, training_loss=0.6011177571614583, metrics={'train_runtime': 197.0866, 'train_samples_per_second': 9.108, 'train_steps_per_second': 1.142, 'total_flos': 472284344371200.0, 'train_loss': 0.6011177571614583, 'epoch': 5.0})

## **Evaluation**

Evaluate the model on the test set.

In [None]:
evaluation = trainer.evaluate(tokenized_hallu_test)

## **Saving dataframes with metrics**

Convert the results to a DataFrame and save it to Google Drive.

In [None]:
path = '/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/Results'

In [None]:
# Collect the test-set scores as one-element lists, so the mapping converts to a one-row DataFrame
metrics = {
    'classification type' : 'roberta-base',
    **{
        metric_name: [evaluation[f'eval_{metric_name}']]
        for metric_name in ('accuracy', 'precision', 'recall', 'macro_f1')
    },
}

In [None]:
metrics

{'classification type': 'roberta-base',
 'accuracy': [0.5309734513274337],
 'precision': [0.8181818181818182],
 'recall': [0.15],
 'macro_f1': [0.2535211267605634]}

In [None]:
metrics_df = pd.DataFrame(metrics)
metrics_df

Unnamed: 0,classification type,accuracy,precision,recall,macro_f1
0,roberta-base,0.530973,0.818182,0.15,0.253521


In [None]:
# Write the metrics table to Google Drive.
# The destination path is handed to pandas directly instead of a handle opened in text
# mode without newline='': pandas then opens the file itself with the newline handling
# its CSV writer expects, which avoids doubled line endings on some platforms.
metrics_df.to_csv(path + "/metrics_roberta_felm.csv", index=False)