## **Installing Dependencies**

In [None]:
!pip install datasets bitsandbytes torch transformers accelerate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_6

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `thesis` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `thesis`


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
warnings.filterwarnings("ignore")

## **Importing Libraries**

In [None]:
import os
import random
import re
from collections import Counter

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

## **Load the data**

In [None]:
hallu_factbench = pd.read_json('/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/Factbench.jsonl', lines=True)

## **Data Preparation**

In [None]:
# Keep only the rows that have both a response and a factuality label ('NA' marks a missing label)
has_response = hallu_factbench['response'].notna()
has_label = hallu_factbench['response_label'] != 'NA'
hallu_factbench = hallu_factbench[has_response & has_label].reset_index(drop=True)

# Cast the boolean labels True / False to the integers 1 / 0
hallu_factbench['response_label'] = hallu_factbench['response_label'].map(int)

In [None]:
hallu_factbench

Unnamed: 0,prompt,response,response_label,claims,claim_labels,ability_to_test,source,hallucination_spans
0,Which country or city has the maximum number o...,The United States has the highest number of nu...,0,[The United States has the highest number of n...,"[True, False, True, True, True, True]",knowledge,factool-qa,
1,Who is the CEO of Twitter?,The CEO of Twitter at the time of writing this...,0,"[Jack Dorsey is the CEO of Twitter, Jack Dorse...","[False, True, False, True, True, True, True]",knowledge,factool-qa,
2,Is Jupiter more dense than Saturn?,"No, Jupiter is less dense than Saturn. Jupiter...",0,"[Jupiter is less dense than Saturn, Jupiter ha...","[False, True, False, True, False]",knowledge,factool-qa,
3,How many sons had eise eisinga in total?,Eise Eisinga was a Dutch mathematician and ast...,0,"[Eise Eisinga was a Dutch mathematician, Eise ...","[True, True, True, False, False, True]",knowledge,factool-qa,
4,How many times did Argentina win the FIFA worl...,Argentina have won the FIFA World Cup two time...,0,[Argentina have won the FIFA World Cup two tim...,"[False, True, True, True, True, False, False, ...",knowledge,factool-qa,
...,...,...,...,...,...,...,...,...
4827,Brainstorm a list of 10 possible headlines to ...,"1. ""China's cutting-edge facial recognition so...",0,[],[],knowledge,halueval-dolly15k,"[2. ""Civil liberties advocates voice concerns ..."
4828,Generate an example input to an instruction th...,"""Hey assistant, please write a report on the c...",1,[],[],knowledge,halueval-dolly15k,[]
4829,Do the following equation: 8 + 4 / 2,"The answer is 10. \n\nTo solve the equation, y...",1,[],[],knowledge,halueval-dolly15k,[]
4830,Predict the 2021 wildfire season in California.,"As an AI language model, I do not have the cap...",1,[],[],knowledge,halueval-dolly15k,[]


In [None]:
random.seed(777)

def downsample(df, label_column="label", seed=42):
    """Balance ``df`` by randomly downsampling every over-represented class.

    Each class that has more rows than the least represented class is reduced,
    without replacement, to the size of that class; classes that already have
    that size are kept as they are. The balanced rows are then shuffled and
    re-indexed from 0.

    Args:
        df: DataFrame containing the examples.
        label_column: Name of the column holding the class labels.
        seed: Seed used for both the downsampling and the final shuffle.

    Returns:
        A new DataFrame in which every class has as many rows as the least
        represented class. A frame without any labelled row is returned
        unchanged (only re-indexed).
    """
    # value_counts() sorts the classes from the most to the least represented
    class_counts = df[label_column].value_counts()

    # Nothing to balance (idxmax / idxmin would raise on an empty count table)
    if class_counts.empty:
        return df.reset_index(drop=True)

    minority_size = int(class_counts.min())

    # Comparing sizes (instead of comparing idxmax / idxmin labels) keeps already
    # balanced data intact and does not drop intermediate classes.
    balanced_parts = []
    for label, count in class_counts.items():
        df_class = df[df[label_column] == label]
        if count > minority_size:
            df_class = resample(df_class,
                                replace=False,  # No replacement
                                n_samples=minority_size,  # Same number as the least represented class
                                random_state=seed)  # Seed for consistency and reproducibility
        balanced_parts.append(df_class)

    # Concatenate the per-class frames, most represented class first
    df_balanced = pd.concat(balanced_parts)

    # Final shuffle
    return df_balanced.sample(frac=1, random_state=seed).reset_index(drop=True)


hallu_factbench = downsample(hallu_factbench, label_column="response_label")

In [None]:
# Check for a correct rebalance
Counter(hallu_factbench['response_label'])

Counter({1: 995, 0: 995})

In [None]:
# Consider only the prompt, the response and the factuality label
hallu_factbench = hallu_factbench[['prompt', 'response', 'response_label']]

In [None]:
# Build reproducible train / validation / test splits (fixed random_state):
# 20% of the rows form the test set, then 20% of the remaining rows form the validation set.

random.seed(777)

hallu_factbench_trainval, hallu_factbench_test = train_test_split(
    hallu_factbench,
    test_size=0.2,
    random_state=42,
)
hallu_factbench_train, hallu_factbench_val = train_test_split(
    hallu_factbench_trainval,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap the three pandas splits into a single Hugging Face DatasetDict

split_frames = {
    'train': hallu_factbench_train,
    'validation': hallu_factbench_val,
    'test': hallu_factbench_test,
}

hallu_factbench_data = DatasetDict({
    split_name: Dataset.from_pandas(split_df)
    for split_name, split_df in split_frames.items()
})

In [None]:
hallu_factbench_data

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'response_label', '__index_level_0__'],
        num_rows: 1273
    })
    validation: Dataset({
        features: ['prompt', 'response', 'response_label', '__index_level_0__'],
        num_rows: 319
    })
    test: Dataset({
        features: ['prompt', 'response', 'response_label', '__index_level_0__'],
        num_rows: 398
    })
})

In [None]:
hallu_factbench_data['train'][0], hallu_factbench_data['validation'][0], hallu_factbench_data['test'][0] # Print some examples for each splits

({'prompt': 'Generate a 10-line poem that talks about the beauty of autumn',
  'response': "The leaves are changing, oh what a sight,\nColors so warm, a pure delight,\nThe breeze is crisp, yet calm and nice,\nAutumn's here, a paradise.\n\nThe trees put on a colorful show,\nA spectacle to see, before the snow,\nThe air smells of apples and spice,\nAutumn's here, a paradise.\n\nThe days are shorter, nights are clear,\nThe sky a canvass, painted with cheer,\nFireside talks, surrounded by light,\nAutumn's here, a paradise.\n\nSo let's embrace this season with glee,\nIt won't last long, let's enjoy it fully,\nAutumn's here, a time to be wise,\nFor soon we'll enter winter's surprise.",
  'response_label': 0,
  '__index_level_0__': 1806},
 {'prompt': 'Compute the area of a triangle with the vertices (3,4), (5,6), (7,8)',
  'response': 'We can use the formula for the area of a triangle given its vertices:\n$$\\text{Area} = \\frac{1}{2}\\left|\\begin{matrix}x_1 & y_1 & 1 \\\\ x_2 & y_2 & 1 \\\\

## **Load the model (RoBERTa)**

In [None]:
model_id = 'FacebookAI/roberta-base'

# The number of classes is the number of distinct labels found in the training split
train_labels = hallu_factbench_data['train']['response_label']
num_labels = len(set(train_labels))

# Tokenizer matching the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

# AutoModelForSequenceClassification loads the pretrained model and puts a classification head
# with `num_labels` outputs on top of it, so that the textual representations produced by the
# model can be used for binary classification
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_labels)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## **Preprocessing**

Tokenize each response, i.e. the text that has to be classified as factual or hallucinated. Only the response is tokenized; the prompt is not part of the model input.

In [None]:
max_len = 512  # maximum number of tokens kept per response
col_to_delete = ['__index_level_0__']  # leftover pandas index column added by Dataset.from_pandas

def preprocessing(example):
  """Tokenize the `response` texts of a batch, truncating each one to `max_len` tokens.

  No padding is applied here: with `batched=True` the tokenizer receives large chunks of rows,
  so `padding=True` would pad every row to the longest response of its chunk. Padding is left
  to the Trainer, which pads each training/evaluation batch on its own when it is given the
  tokenizer. `return_tensors` is not needed either, because `map` stores plain lists.
  """
  return tokenizer(example['response'], truncation=True, max_length=max_len)

tokenized_hallu_train = hallu_factbench_data['train'].map(preprocessing, batched=True, remove_columns=col_to_delete)
tokenized_hallu_val = hallu_factbench_data['validation'].map(preprocessing, batched=True, remove_columns=col_to_delete)
tokenized_hallu_test = hallu_factbench_data['test'].map(preprocessing, batched=True, remove_columns=col_to_delete)

Map:   0%|          | 0/1273 [00:00<?, ? examples/s]

Map:   0%|          | 0/319 [00:00<?, ? examples/s]

Map:   0%|          | 0/398 [00:00<?, ? examples/s]

In [None]:
tokenized_hallu_test

Dataset({
    features: ['prompt', 'response', 'response_label', 'input_ids', 'attention_mask'],
    num_rows: 398
})

Rename the label column to `label`, the name the `Trainer` looks for, and expose the model inputs and the labels as PyTorch tensors.

In [None]:
# rename_column returns a new dataset, so each split is re-bound to its renamed copy
tokenized_hallu_train, tokenized_hallu_val, tokenized_hallu_test = (
    split.rename_column("response_label", "label")
    for split in (tokenized_hallu_train, tokenized_hallu_val, tokenized_hallu_test)
)

In [None]:
# Only these columns are returned (as torch tensors) when the datasets are indexed
model_columns = ["input_ids", "attention_mask", "label"]

for tokenized_split in (tokenized_hallu_train, tokenized_hallu_val, tokenized_hallu_test):
    tokenized_split.set_format("torch", columns=model_columns)

## **Training**

In [None]:
# Build the function with the metrics to be computed during training

def compute_metrics(eval_pred):
    """Compute the classification metrics reported at every evaluation.

    Args:
        eval_pred: Pair ``(logits, labels)``. For each example, the class with
            the highest logit along the last axis is taken as the prediction.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``,
        ``macro_f1`` and ``micro_f1``. Precision and recall use
        ``average="binary"`` (they refer to the positive class), while the two
        F1 scores use the averaging named in their key.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=predictions),
        "precision": precision_score(y_true=labels, y_pred=predictions, average="binary"),
        "recall": recall_score(y_true=labels, y_pred=predictions, average="binary"),
        # Each F1 key uses its own averaging (both previously used "binary" and were therefore identical)
        "macro_f1": f1_score(y_true=labels, y_pred=predictions, average="macro"),
        "micro_f1": f1_score(y_true=labels, y_pred=predictions, average="micro"),
    }

Define the training hyperparameters. Higher learning rates were also tried, but training did not converge with them.

In [None]:
training_args = TrainingArguments(
    output_dir = 'roberta_factbench',
    learning_rate = 2e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 5,
    weight_decay = 0.01,
    eval_strategy = 'epoch',        # current name of the deprecated `evaluation_strategy` argument
    logging_strategy = 'epoch',     # log the training loss every epoch (the default of every 500 steps shows "No log" for the first epochs)
    save_strategy = 'epoch',        # checkpoints are saved on the same schedule as the evaluations
    load_best_model_at_end = True,  # with no metric_for_best_model set, the best checkpoint is chosen by validation loss
    report_to = "none"
)

Create the `Trainer` with the training arguments defined above.

In [None]:
# Bundle the model, the training configuration, the data splits and the metric function
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_hallu_train,
    eval_dataset=tokenized_hallu_val,  # validation split used for the evaluations run during training
)

In [None]:
# Start the training
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Macro F1,Micro F1
1,No log,0.625666,0.617555,0.639078,0.615634,0.599905,0.617555
2,No log,0.61091,0.689655,0.689716,0.689716,0.689655,0.689655
3,No log,0.677951,0.642633,0.64286,0.642366,0.642208,0.642633
4,0.606400,0.74002,0.673981,0.67819,0.67466,0.672562,0.673981
5,0.606400,0.879793,0.658307,0.661882,0.658955,0.656959,0.658307


TrainOutput(global_step=800, training_loss=0.5239970302581787, metrics={'train_runtime': 659.4055, 'train_samples_per_second': 9.653, 'train_steps_per_second': 1.213, 'total_flos': 1674647909044500.0, 'train_loss': 0.5239970302581787, 'epoch': 5.0})

## **Evaluation**

Evaluate the model on the held-out test set.

In [None]:
evaluation = trainer.evaluate(tokenized_hallu_test)

## **Saving dataframes with metrics**

Convert the results into a DataFrame and save it to Google Drive.

In [None]:
path = '/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/Results'

In [None]:
# Test-set metrics to report; each value is wrapped in a list so the dict maps to a one-row DataFrame
reported_metrics = ('accuracy', 'precision', 'recall', 'macro_f1')

metrics = {'classification type': 'roberta-base'}
metrics.update({name: [evaluation[f'eval_{name}']] for name in reported_metrics})

In [None]:
metrics

{'classification type': 'roberta-base',
 'accuracy': [0.6381909547738693],
 'precision': [0.6414567249085737],
 'recall': [0.6412558328261311],
 'macro_f1': [0.6381818181818182]}

In [None]:
metrics_df = pd.DataFrame(metrics)
metrics_df

Unnamed: 0,classification type,accuracy,precision,recall,macro_f1
0,roberta-base,0.638191,0.641457,0.641256,0.638182


In [None]:
# Create the results folder if it does not exist yet, then let pandas open the file itself
# (to_csv expects text file objects to be opened with newline='', which passing the path avoids).
os.makedirs(path, exist_ok=True)
metrics_df.to_csv(os.path.join(path, "metrics_roberta_factbench.csv"), index=False)