## **Installing Dependencies**

In [None]:
!pip install datasets bitsandbytes torch transformers accelerate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_6

In [None]:
# Authenticate with the Hugging Face Hub; the CLI asks for an access token interactively.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `thesis` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `thesis`


In [None]:
# Mount Google Drive at /content/drive (Colab only) so files can be written there later.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below;
# consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings("ignore")

## **Importing libraries**

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import random
import numpy as np
from collections import Counter
import torch
import re
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

## **Load the data**

In [None]:
# Download the FactAlign dataset (train and test splits) from the Hugging Face Hub.
# `trust_remote_code` is deliberately not passed: the repository is served as plain
# Parquet files, so no repository-side Python has to be executed to load it.
hallu_factalign = load_dataset('chaoweihuang/factalign-gemma2-f1_0.75')

README.md:   0%|          | 0.00/839 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/738k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2177 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/385 [00:00<?, ? examples/s]

## **Data Preparation**

In [None]:
# Pull the plain-text prompt and response out of the nested message structure

def get_question_answer(example):
    """Flatten one record to the text of its first prompt and first completion message.

    `example["prompt"]` and `example["completion"]` are indexed as sequences of
    mappings; the "content" entry of element 0 of each one is returned under
    the keys "prompt" and "completion".
    """
    first_prompt_message = example["prompt"][0]
    first_completion_message = example["completion"][0]
    return dict(
        prompt=first_prompt_message["content"],
        completion=first_completion_message["content"],
    )

# Apply the extraction to every split; the returned "prompt"/"completion" strings are stored under those column names.
hallu_factalign = hallu_factalign.map(get_question_answer)

Map:   0%|          | 0/2177 [00:00<?, ? examples/s]

Map:   0%|          | 0/385 [00:00<?, ? examples/s]

In [None]:
# Switch to pandas DataFrames, which are easier to filter and reshape

hallu_factalign_train = hallu_factalign['train'].to_pandas()
hallu_factalign_test = hallu_factalign['test'].to_pandas()

In [None]:
# Cast the boolean factuality labels (True / False) to integers

for _frame in (hallu_factalign_train, hallu_factalign_test):
    _frame['label'] = _frame['label'].map(int)

In [None]:
# Remove the instances without response.
# Each split is filtered with a mask built from ITS OWN `completion` column: a mask taken
# from the other split would be aligned on index labels and drop unrelated rows.

hallu_factalign_train = hallu_factalign_train[hallu_factalign_train['completion'].notna()].reset_index(drop=True)
hallu_factalign_test = hallu_factalign_test[hallu_factalign_test['completion'].notna()].reset_index(drop=True)

In [None]:
# Remove the suffix "Provide as many specific details and examples as possible (such as names of people, numbers, events, locations, dates, times, etc.)"

def extract_question(prompt):
    """Return `prompt` cut right after its last question mark.

    Everything following the final "?" (the appended instruction suffix) is
    dropped. Cutting at the last rather than the first "?" keeps prompts that
    contain several question marks intact. A prompt without any "?" is
    returned unchanged.
    """
    question, mark, _suffix = prompt.rpartition("?")
    # `mark` is empty when the prompt contains no "?" at all.
    return question + mark if mark else prompt

# Strip the instruction suffix from the prompts of both splits
for _frame in (hallu_factalign_train, hallu_factalign_test):
    _frame['prompt'] = _frame['prompt'].map(extract_question)

In [None]:
# Keep just the three columns used downstream: question, answer and factuality label

_kept_columns = ['prompt', 'completion', 'label']
hallu_factalign_train = hallu_factalign_train[_kept_columns]
hallu_factalign_test = hallu_factalign_test[_kept_columns]

In [None]:
# Pool the original train and test rows into a single frame with a fresh 0..n-1 index

hallu_factalign = pd.concat([hallu_factalign_train, hallu_factalign_test], ignore_index=True)

# Re-split into train / test and then train / validation; both splits are made with a fixed `random_state`

random.seed(777)

hallu_factalign_train, hallu_factalign_test = train_test_split(
    hallu_factalign, test_size=0.2, random_state=42
)
hallu_factalign_train, hallu_factalign_val = train_test_split(
    hallu_factalign_train, test_size=0.2, random_state=42
)

In [None]:
# Wrap the three DataFrames back into a DatasetDict, shuffling every split with a fixed seed

random.seed(777)

_frames_by_split = {
    'train': hallu_factalign_train,
    'validation': hallu_factalign_val,
    'test': hallu_factalign_test,
}
hallu_factalign = DatasetDict({
    split_name: Dataset.from_pandas(frame).shuffle(seed=42)
    for split_name, frame in _frames_by_split.items()
})

hallu_factalign

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'label', '__index_level_0__'],
        num_rows: 1639
    })
    validation: Dataset({
        features: ['prompt', 'completion', 'label', '__index_level_0__'],
        num_rows: 410
    })
    test: Dataset({
        features: ['prompt', 'completion', 'label', '__index_level_0__'],
        num_rows: 513
    })
})

In [None]:
# Show the first example of each split
tuple(hallu_factalign[split_name][0] for split_name in ('train', 'validation', 'test'))

({'prompt': 'What is the Red Bull Stratos Jump?',
  'completion': "The Red Bull Stratos Jump was a successful human-powered spaceflight that took place on June 22, 2012, at the New Mexico Test Site in the United States. The mission was led by Felix Baumgartner, a professional skydiver and Red Bull athlete, and was designed to break the sound barrier and reach sub-orbital space.\n\nHere are some specific details and examples related to the Red Bull Stratos Jump:\n\n* Felix Baumgartner, a professional skydiver and Red Bull athlete, was the pilot of the Red Bull Stratos Jump. Baumgartner had previously achieved the highest skydive in the world, which was a tandem skydive from a height of 39,000 feet.\n* The Red Bull Stratos Jump was a human-powered spaceflight, which means that Baumgartner was the sole occupant of the balloon and capsule. The balloon was designed to carry Baumgartner to the edge of space and to allow him to freefall towards the Earth.\n* The Red Bull Stratos Jump was a co

## **Load the model (RoBERTa)**

In [None]:
# Number of distinct classes found in the training labels
num_labels = len(set(hallu_factalign['train']['label']))

# Load the pretrained checkpoint through AutoModelForSequenceClassification, which puts a
# classification head on top of the encoder so that its text representations can be used
# for classification, and load the tokenizer that belongs to the same checkpoint

model_id = 'FacebookAI/roberta-base'

model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_labels)

tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## **Preprocessing**

Preprocessing of the response (`completion`) that has to be classified as factual or hallucinated: the text is tokenized, truncated to a maximum length and padded.

In [None]:
max_len = 512
col_to_delete = ['__index_level_0__']

def preprocessing(example):
  """Tokenize the `completion` texts of a batch, truncated to `max_len` tokens and padded."""
  encoded = tokenizer(
      example['completion'],
      max_length=max_len,
      truncation=True,
      padding=True,
      return_tensors='pt',
  )
  return encoded

# Tokenize the three splits in the order train, validation, test; the helper index column is dropped
tokenized_hallu_train, tokenized_hallu_val, tokenized_hallu_test = (
    hallu_factalign[split_name].map(preprocessing, batched=True, remove_columns=col_to_delete)
    for split_name in ('train', 'validation', 'test')
)

Map:   0%|          | 0/1639 [00:00<?, ? examples/s]

Map:   0%|          | 0/410 [00:00<?, ? examples/s]

Map:   0%|          | 0/513 [00:00<?, ? examples/s]

In [None]:
# Inspect the tokenized test split (its columns and number of rows)
tokenized_hallu_test

Dataset({
    features: ['prompt', 'completion', 'label', 'input_ids', 'attention_mask'],
    num_rows: 513
})

In [None]:
# Expose the model inputs and the label as torch tensors for training

_tensor_columns = ("input_ids", "attention_mask", "label")
for _split in (tokenized_hallu_train, tokenized_hallu_val, tokenized_hallu_test):
    _split.set_format("torch", columns=list(_tensor_columns))

## **Training**

In [None]:
# Build the function with the metrics to be computed during training

def compute_metrics(eval_pred):
    """Compute classification metrics from a `(logits, labels)` evaluation pair.

    Args:
        eval_pred: pair unpacked as `(logits, labels)`; the predicted class is
            the argmax of `logits` over the last axis.

    Returns:
        dict with "accuracy", "precision" and "recall" (the latter two computed
        with binary averaging), plus "macro_f1" and "micro_f1", the F1 score
        averaged the way each name states.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # class with the highest logit

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=predictions),
        "precision": precision_score(y_true=labels, y_pred=predictions, average="binary"),
        "recall": recall_score(y_true=labels, y_pred=predictions, average="binary"),
        # Each F1 entry uses the averaging mode it is named after.
        "macro_f1": f1_score(y_true=labels, y_pred=predictions, average="macro"),
        "micro_f1": f1_score(y_true=labels, y_pred=predictions, average="micro"),
    }

Definition of the training hyperparameters. Higher learning rates were also tried, but training did not converge with them.

In [None]:
# Fine-tuning configuration: evaluation and checkpointing both happen once per epoch,
# and `load_best_model_at_end` is enabled
training_args = TrainingArguments(
    output_dir='roberta_factalign',
    report_to="none",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluation / checkpointing
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

Set up the trainer with the previously defined parameters.

In [None]:
# Wire the model, the training arguments, the train/validation splits and the metric function together.
# NOTE(review): recent transformers releases may expect `processing_class` instead of `tokenizer` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_hallu_train,
    eval_dataset=tokenized_hallu_val,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [None]:
# Start the training (5 epochs; the metrics are computed on the validation split after each epoch)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Macro F1,Micro F1
1,No log,0.494471,0.770732,0.741935,0.859813,0.796537,0.796537
2,No log,0.513099,0.746341,0.823529,0.654206,0.729167,0.729167
3,0.501100,0.502833,0.787805,0.784753,0.817757,0.800915,0.800915
4,0.501100,0.770001,0.797561,0.769547,0.873832,0.818381,0.818381
5,0.319000,0.807095,0.807317,0.80543,0.831776,0.818391,0.818391


TrainOutput(global_step=1025, training_loss=0.4079786123880526, metrics={'train_runtime': 841.4167, 'train_samples_per_second': 9.74, 'train_steps_per_second': 1.218, 'total_flos': 2156195098675200.0, 'train_loss': 0.4079786123880526, 'epoch': 5.0})

## **Evaluation**

Evaluate the model on the test set.

In [None]:
# Score the trained model on the held-out test split; the result is a dict of `eval_*` metrics
evaluation = trainer.evaluate(eval_dataset=tokenized_hallu_test)

## **Saving dataframes with metrics**

Convert the results into a DataFrame and save it to Google Drive.

In [None]:
# Folder on the mounted Google Drive where the result tables are written
path = '/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/Results'

In [None]:
# Collect the test metrics as single-element columns, labelled with the model name
metrics = {'classification type': 'roberta-base'}
for _metric_name in ('accuracy', 'precision', 'recall', 'macro_f1'):
    metrics[_metric_name] = [evaluation['eval_' + _metric_name]]

In [None]:
# Display the collected test-set metrics
metrics

{'classification type': 'roberta-base',
 'accuracy': [0.7504873294346979],
 'precision': [0.6868686868686869],
 'recall': [0.8535564853556485],
 'macro_f1': [0.7611940298507462]}

In [None]:
# Build a table from the metrics dict (a scalar model name next to single-element lists) and display it
metrics_df = pd.DataFrame(metrics)
metrics_df

Unnamed: 0,classification type,accuracy,precision,recall,macro_f1
0,roberta-base,0.750487,0.686869,0.853556,0.761194


In [None]:
# Write the metrics table to Google Drive. Passing the path lets pandas open the file
# itself (UTF-8 encoding, correct newline handling) instead of depending on the
# platform defaults of a text-mode file handle.
metrics_df.to_csv(path + "/metrics_roberta_factalign.csv", index=False)