In [None]:
import gc
import os
import sys
import random
import logging
import numpy as np
import pandas as pd
from pathlib import Path
from huggingface_hub import login

# Defaults for a local run: work relative to the current directory and take the
# Hugging Face token from the environment when it is set (None otherwise).
base_path = Path(".")
HF_TOKEN = os.environ.get("HF_TOKEN")

if "google.colab" in sys.modules:
    from google.colab import drive
    from google.colab import userdata

    drive.mount("/content/drive")

    # if we're in colab, change the base path to google drive mount
    base_path = Path("/content/drive/MyDrive/Datasets/HalluDetect")
    HF_TOKEN = userdata.get('HF_TOKEN')

# When no token was found, HF_TOKEN is None and huggingface_hub decides how to
# obtain one (see its `login` documentation).
login(token = HF_TOKEN)

# Fix if you want to use a different path
output_path = base_path / "output"
data_path = base_path / "HaluEval"

# exist_ok=True already makes the call a no-op for existing directories, so no
# separate existence check is needed; parents=True creates missing ancestors.
for directory in (output_path, data_path):
    directory.mkdir(parents=True, exist_ok=True)

Mounted at /content/drive


## Deep Learning Installations

In [None]:
%%capture
%pip install loguru
%pip install datasets
%pip install evaluate
%pip install rouge_score
%pip install unsloth
%pip install -U accelerate peft bitsandbytes transformers trl

In [None]:
import evaluate
from loguru import logger

# Setting Device to use the GPU

We use the T4 GPU in Colab because the heaviest computation in this notebook is inference with the LLM-Evaluator, so the T4 is a good fit.

In [None]:
import torch
import torch.nn.functional as F

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)
# get_device_name() raises on a CPU-only runtime, so only query it for CUDA.
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available; running on CPU")

cuda
Tesla T4


## Generic LLMModel class to reuse the functionality of extracting the features.


In [None]:
class LLMModel:
    """Wrapper around a causal language model (the LLM-Evaluator) and its tokenizer.

    Subclasses load a concrete model/tokenizer pair and override ``generate``;
    this base class provides the token-probability feature extraction.
    """

    def __init__(self, model_name, model, tokenizer):
        self.model_name = model_name
        self.model = model
        self.tokenizer = tokenizer

    def getName(self) -> str:
        """Return the model identifier exactly as it was given."""
        return self.model_name

    def getSanitizedName(self) -> str:
        """Return the model identifier with every "/" replaced by "__"."""
        return self.model_name.replace("/", "__")

    def generate(self, inpt):
        """Placeholder overridden by subclasses; the base implementation returns None."""
        pass

    # Move in future commits this method to an utils.py
    def truncate_string_by_len(self, s, truncate_len):
        """Drop the last ``truncate_len`` whitespace-separated words of ``s``.

        The remaining words are re-joined with single spaces, so whitespace is
        normalised even when ``truncate_len`` is zero or negative (nothing dropped).
        """
        words = s.split()
        truncated_words = words[:-truncate_len] if truncate_len > 0 else words
        return " ".join(truncated_words)

    # Method to get the vocabulary probabilities of the LLM for a given token on the generated text from LLM-Generator
    def getVocabProbsAtPos(self, pos, token_probs):
        """Return row ``pos`` of ``token_probs`` sorted from highest to lowest probability."""
        return torch.sort(token_probs[pos, :], descending=True).values

    def getMaxLength(self):
        """Return the model's ``max_position_embeddings`` configuration value."""
        return self.model.config.max_position_embeddings

    def _modelDevice(self):
        """Return the device the wrapped model reports, or the CPU if it reports none."""
        model_device = getattr(self.model, "device", None)
        return model_device if model_device is not None else torch.device("cpu")

    def _encode(self, knowledge, conditionted_text, generated_text):
        """Tokenize the non-empty segments as one text, separated by single spaces."""
        # Without a separator the last context word and the first generated word
        # would be glued together into a single, different word.
        segments = (knowledge, conditionted_text, generated_text)
        text = " ".join(segment for segment in segments if segment)
        return self.tokenizer([text], return_tensors="pt")

    def extractFeatures(
        self,
        knowledge="",
        conditionted_text="",
        generated_text="",
        features_to_extract=None,
    ):
        """Compute token-probability features of ``generated_text`` given its context.

        Args:
            knowledge: optional extra context placed first (empty by default).
            conditionted_text: the text the generation was conditioned on.
            generated_text: the text whose tokens are scored.
            features_to_extract: mapping from feature name ("MTP", "AVGTP",
                "MDVTP", "MMDVP") to a flag; ``None`` is treated as an empty
                mapping.

        Returns:
            A dict holding only the features whose flag is truthy:
            MTP   - minimum probability assigned to a generated token,
            AVGTP - average probability assigned to the generated tokens,
            MDVTP - maximum over positions of (highest vocabulary probability
                    minus the generated token's probability),
            MMDVP - minimum over positions of (highest minus lowest vocabulary
                    probability).
            MDVTP and MMDVP keep the placeholder values -1 and 100000000000
            when neither of them is requested.

        Raises:
            ValueError: if no token of ``generated_text`` can be scored, for
                example because ``generated_text`` is empty.
        """
        requested = features_to_extract or {}
        self.model.eval()
        max_length = self.getMaxLength()

        # A zero-length truncation only normalises whitespace in the context.
        knowledge = self.truncate_string_by_len(knowledge, 0)
        conditionted_text = self.truncate_string_by_len(conditionted_text, 0)
        inputs = self._encode(knowledge, conditionted_text, generated_text)

        # Shorten the context (never the generated text) when the sequence is too
        # long. The excess is measured in tokens and removed in whole words; the
        # knowledge takes at most half of it and the conditioned text the rest.
        excess = inputs["input_ids"].shape[1] - max_length
        if excess > 0:
            knowledge_cut = min(excess // 2, len(knowledge.split()))
            knowledge = self.truncate_string_by_len(knowledge, knowledge_cut)
            conditionted_text = self.truncate_string_by_len(
                conditionted_text, excess - knowledge_cut
            )
            inputs = self._encode(knowledge, conditionted_text, generated_text)

        # Safety net: if the sequence is still too long keep its END, because the
        # generated tokens being scored sit at the end of the sequence.
        target_device = self._modelDevice()
        model_inputs = {
            key: value[:, -max_length:].to(target_device)
            for key, value in inputs.items()
        }

        input_ids = model_inputs["input_ids"][0]
        seq_len = input_ids.shape[0]
        tokens_generated_length = min(
            len(self.tokenizer.tokenize(generated_text)), seq_len
        )
        # The logits at position t describe the token at position t + 1, so a
        # token at position 0 has no prediction and cannot be scored.
        first_scored = max(seq_len - tokens_generated_length, 1)
        token_ids_generated = input_ids[first_scored:]
        if token_ids_generated.numel() == 0:
            raise ValueError("generated_text produced no tokens that can be scored")

        with torch.no_grad():
            logits = self.model(**model_inputs).logits

        # Softmax only the rows that are needed, in float32 so that very small
        # probabilities do not underflow in half precision.
        predicting_logits = logits[0, first_scored - 1 : seq_len - 1]
        conditional_probs = F.softmax(predicting_logits.float(), dim=-1)
        token_ids_generated = token_ids_generated.to(conditional_probs.device)
        token_probs = conditional_probs.gather(
            1, token_ids_generated.unsqueeze(1)
        ).squeeze(1)

        token_probs_generated = token_probs.tolist()
        minimum_token_prob = min(token_probs_generated)
        average_token_prob = sum(token_probs_generated) / len(token_probs_generated)

        maximum_diff_with_vocab = -1
        minimum_vocab_extreme_diff = 100000000000

        if requested.get("MDVTP") or requested.get("MMDVP"):
            # Only the extremes of each distribution are needed, so take max/min
            # directly instead of sorting the whole vocabulary at every position.
            highest_probs = conditional_probs.max(dim=-1).values
            lowest_probs = conditional_probs.min(dim=-1).values
            maximum_diff_with_vocab = (highest_probs - token_probs).max().item()
            minimum_vocab_extreme_diff = (highest_probs - lowest_probs).min().item()

        allFeatures = {
            "MTP": minimum_token_prob,
            "AVGTP": average_token_prob,
            "MDVTP": maximum_diff_with_vocab,
            "MMDVP": minimum_vocab_extreme_diff,
        }

        return {
            key: allFeatures[key] for key, enabled in requested.items() if enabled
        }

    def getDiffVocab(self, vocabProbs, tprob):
        """Return ``vocabProbs[0] - tprob`` as a Python number."""
        return (vocabProbs[0] - tprob).item()

    def getDiffMaximumWithMinimum(self, vocabProbs):
        """Return ``vocabProbs[0] - vocabProbs[-1]`` as a Python number."""
        return (vocabProbs[0] - vocabProbs[-1]).item()

## Definition of the specific Models

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

class LLama(LLMModel):
    """Llama-2 7B chat model loaded through Hugging Face ``transformers``."""

    def __init__(self):
        model_name = "meta-llama/Llama-2-7b-chat-hf"
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
        )
        # Without a device_map the weights are loaded on the CPU; move them to
        # the GPU when one is available so inference does not run on the CPU.
        model = model.to("cuda" if torch.cuda.is_available() else "cpu")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        super().__init__(model_name, model, tokenizer)


    def generate(self, inpt):
        """Generate a continuation of ``inpt`` and return the decoded sequence.

        The prompt is truncated to 1024 tokens. Special tokens are removed from
        the decoded text.
        """
        inputs = self.tokenizer([inpt], max_length=1024, return_tensors="pt", truncation=True)
        # The inputs must live on the same device as the model weights.
        inputs = inputs.to(self.model.device)
        # Pass the whole encoding so the attention mask is forwarded as well.
        summary_ids = self.model.generate(**inputs)
        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary


In [None]:
from unsloth import FastLanguageModel

class UnslothLLaMA(LLMModel):
    """Llama-3.2 1B Instruct (4-bit) loaded through Unsloth's ``FastLanguageModel``."""

    def __init__(self):
        model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = model_name,
            max_seq_length = 2048,
            load_in_4bit = True,
            dtype = None
        )
        super().__init__(model_name, model, tokenizer)


    def generate(self, inpt):
        """Generate up to 2048 new tokens for ``inpt`` and return the decoded text.

        Special tokens are removed from the returned string.
        """
        # put print statements in the LLaMA model generate function, to compare if LLaMA and Unsloth
        # LLaMA are giving interoperable inputs and outputs. tokenized input's won't make much sense
        # so rather print the summary on both the models for some given input, and compare if those
        # are semantically equivalent.

        # Send the inputs to wherever the model weights live instead of relying
        # on a notebook-level device variable.
        model_inputs = self.tokenizer(
            inpt, return_tensors='pt', padding=True, truncation=True
        ).to(self.model.device)
        model_outputs = self.model.generate(**model_inputs, max_new_tokens=2048, num_return_sequences=1)

        # Only the first returned sequence is needed, so decode just that one
        # rather than batch-decoding every sequence and discarding the rest.
        # The keyword is `skip_special_tokens` (plural); the misspelt singular
        # form left the special tokens in the returned text.
        summary = self.tokenizer.decode(model_outputs[0], skip_special_tokens=True)

        return summary


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


### The dictionary `features_to_extract` defines which features will be used in this experiment.

#### Features Meaning:

- `MTP` : Take the minimum of the probabilities that the LLM_E gives to the tokens on the generated-text.
- `AVGTP` : Take the average of the probabilities that the LLM_E
gives to the tokens on the generated-text.
- `MDVTP` : Take the maximum from all the differences
between the token with the highest probability
according to LLM_E at position i and the
assigned probability from LLM_E to the token at position i in the generated_text.
- `MMDVP` : Take the minimum from all the differences between the token with the highest probability according to $LLM_E$ at position $i$ ($v^*$) and the token with the lowest probability according to $LLM_E$ at position $i$ ($v^-$).


In [None]:
# Use 'all' to enable every feature, or a single feature name to enable only that one.
feature_to_extract = 'all'

available_features_to_extract = ["MTP", "AVGTP", "MDVTP", "MMDVP"]

# Flag each known feature: True when everything is requested or when it is the
# one feature that was named, False otherwise.
features_to_extract = {
    name: feature_to_extract == 'all' or name == feature_to_extract
    for name in available_features_to_extract
}

features_to_extract

{'MTP': True, 'AVGTP': True, 'MDVTP': True, 'MMDVP': True}

## Cleaning Cache on GPU to save memory

In [None]:
# Run Python's garbage collector first, then ask PyTorch to release the CUDA
# memory it is caching, before the model is loaded in the next cell.
gc.collect()
torch.cuda.empty_cache()

## This cell is to instantiate the model you intend to use for the experiment

In [None]:
# Evaluator used by the feature-extraction cells below; replace with LLama()
# to use the Llama-2 wrapper defined in this notebook instead.
model = UnslothLLaMA()

==((====))==  Unsloth 2025.3.14: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

### The cells below build the dataset and split it into `80%` for training and `20%` for testing (`train_size = 0.8`), depending on what task you are addressing. The following explanation describes what happens when summarization is the task, but the same explanation applies to all tasks, and you can also pass as a parameter how many data points you want to include in training.

#### Example: The 1000 source documents yield 2000 data points (1000 documents paired with the right summary and the same 1000 documents paired with the hallucinated summary). Of these, 1600 are used for training and the remaining 400 are used for testing.

In [None]:
from datasets import Dataset
from datasets import DatasetDict
from datasets import load_dataset

## As a recommendation, keep these names unchanged if you do not want to change many things
# Hugging Face account and repository of the source dataset, joined as "<account>/<repository>".
ORIGINAL_DATASET_ACNT = 'AdityaMayukhSom'
ORIGINAL_DATASET_REPO = 'MixSub-With-Hallucinated-Highlights'
ORIGINAL_DATASET_NAME = f"{ORIGINAL_DATASET_ACNT}/{ORIGINAL_DATASET_REPO}"

def loadDataset():
    """Load the ``train`` split of ``ORIGINAL_DATASET_NAME`` as a pandas DataFrame."""
    train_split = load_dataset(ORIGINAL_DATASET_NAME)['train']
    # train_split = train_split.select(range(5))  # enable to work on a handful of rows
    return train_split.to_pandas()

# Load the source dataset and preview its first rows.
data = loadDataset()
data.head()

README.md:   0%|          | 0.00/857 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Unnamed: 0,Filename,Abstract,Highlight,Hallucination
0,S0001457518304810,Recent field data analyses have shown that lum...,Risk factors of lumbar spine fractures in fron...,Lumbar spine fractures more common in late mod...
1,S0001457518307395,Though U.S. motor vehicle crashes as a whole h...,Deaths from pedestrian crashes and drug overdo...,Pedestrian deaths in Georgia increased by 40 b...
2,S0001457518308108,Mindful organizing is a team level construct t...,Development of a Spanish version of the Mindfu...,Mindful organizing is a key factor in achievin...
3,S0001457518308145,Numerous studies have previously used a variet...,A tobit model coupled with correlated random p...,Crash rates directly modeled as a continuous v...
4,S000145751830887X,Advance guide signs for exit ramps along urban...,Better design alternatives of advance guide si...,Advance guide signs improve traffic flow by re...


In [None]:
from sklearn.model_selection import train_test_split

def refactorDataset(data: pd.DataFrame, train_size: float = 0.8, random_state=None):
    """Turn each source row into one correct and one hallucinated example.

    Every row of ``data`` yields two rows: the abstract paired with its
    highlight (label False) and the same abstract paired with its hallucinated
    highlight (label True). The combined rows are shuffled and re-indexed from
    zero. ``data`` itself is not modified.

    Args:
        data: frame with at least the columns ``Abstract``, ``Highlight`` and
            ``Hallucination``; any other column (e.g. ``Filename``) is kept.
        train_size: accepted for backward compatibility but not used here; the
            train/test split is performed separately after feature extraction.
        random_state: optional seed for the shuffle. The default ``None`` gives
            a different order on every call; pass an int for a reproducible one.

    Returns:
        ``(X, Y)`` where ``X`` holds every column except the label, with
        ``Abstract`` renamed to ``ConditionedText`` and the summary column
        renamed to ``GeneratedText``, and ``Y`` is the boolean
        ``IsHallucinated`` series sharing ``X``'s index.
    """
    right_data = (
        data.drop(columns='Hallucination')
        .rename(columns={'Abstract': 'ConditionedText', 'Highlight': 'GeneratedText'})
        .assign(IsHallucinated=False)
    )
    hallu_data = (
        data.drop(columns='Highlight')
        .rename(columns={'Abstract': 'ConditionedText', 'Hallucination': 'GeneratedText'})
        .assign(IsHallucinated=True)
    )

    combined = pd.concat([right_data, hallu_data], axis=0)

    # Randomly shuffle the data
    combined = combined.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Select by name rather than by position; drop() returns a new frame, so
    # columns can later be added to X without touching `combined`.
    X = combined.drop(columns='IsHallucinated')
    Y = combined['IsHallucinated']

    return X, Y

In [None]:
X, Y = refactorDataset(data, 0.8)

In [None]:
X

Unnamed: 0,Filename,ConditionedText,GeneratedText
0,S0001457520315621,Driving under the influence is illegal in the ...,DUI crashes in rural areas are associated with...
1,S0001457520301482,Driving anger increases risk taking in traffic...,Driving anger was successfully induced in a dr...
2,S0003347219303781,Sexual conflict generates a reproductive arms ...,Rates of male attention to females correlate w...
3,S001448352030049X,Basement membranes are highly specialized extr...,Major components of the EBM are collagens lami...
4,S0009279720309820,Kinetic modeling of the behavior of complex ch...,The proposed kinetic model describes the dynam...
...,...,...,...
1995,S0014483519304415,Childhood glaucoma is an important cause of bl...,Eleven genes responsible for childhood forms o...
1996,S0001457520316213,Connected Vehicles technology has been used to...,Propose an advanced curve speed warning system...
1997,S0001457520315931,Road accidents are one of the leading causes o...,We examined the effect of a school based road ...
1998,S0005789419300619,Severe irritability is a common and clinically...,This three wave longitudinal study explored me...


## Extracting the features for the Training Data

In [None]:
from tqdm import tqdm

def extract_features_from_dataset(X: pd.DataFrame):
    """Add one column per enabled feature to ``X`` and return ``X``.

    Each row's ``ConditionedText`` and ``GeneratedText`` are scored with the
    notebook-level ``model``, using the notebook-level ``features_to_extract``
    flags. ``X`` is modified in place and also returned.
    """
    # For summarization task, knowledge string is empty
    knowledge = ""

    # Only enabled features are returned by extractFeatures, so only those can
    # become columns; reading a disabled one would raise KeyError.
    enabled_features = [name for name, enabled in features_to_extract.items() if enabled]

    # Read the two text columns by name so extra or reordered columns are harmless.
    text_pairs = zip(X["ConditionedText"], X["GeneratedText"])

    records = []
    for conditioned_text, generated_text in tqdm(text_pairs, total=len(X), desc="Processing"):
        records.append(
            model.extractFeatures(knowledge, conditioned_text, generated_text, features_to_extract)
        )

        torch.cuda.empty_cache()  # Clean cache in every step for memory saving.

    # Build the feature table once, aligned to X's index, then attach its columns.
    feature_frame = pd.DataFrame(records, index=X.index, columns=enabled_features)
    for name in enabled_features:
        X[name] = feature_frame[name]

    return X

In [None]:
# Score every row with the evaluator model; this is the slow step of the notebook.
X = extract_features_from_dataset(X)

Processing: 2000it [04:36,  7.22it/s]


In [None]:
# Hold out 20% of the rows for testing; the split itself is seeded.
# NOTE(review): the correct and hallucinated versions of one abstract are separate
# rows, so they can land on different sides of the split — confirm this is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size = 0.8, random_state = 69, shuffle = True
)

In [None]:
# Sanity check: features and labels must have matching lengths in each split;
# then show one test example together with its label.
print(len(X_train), len(Y_train))
print(len(X_test), len(Y_test))  # verify the sizes look right
print(X_test.iloc[0])
print(Y_test.iloc[0])

1600 1600
400 400
Filename                                           S0001457519308735
ConditionedText    Automated vehicles represent an opportunity to...
GeneratedText      Automated vehicles AVs can reduce crashes by r...
MTP                                                              0.0
AVGTP                                                       0.000034
MDVTP                                                       0.997559
MMDVP                                                       0.120178
Name: 1338, dtype: object
False


In [None]:
# Put the label column back next to the features of each split (column-wise concat).
train_df = pd.concat([X_train, Y_train], axis=1)
test_df = pd.concat([X_test, Y_test], axis=1)

In [None]:
# Hugging Face account and repository the feature dataset is published to,
# joined as "<account>/<repository>".
GENERATED_DATASET_ACNT = 'AdityaMayukhSom'
GENERATED_DATASET_REPO = 'MixSub-Hallucinated-Highlight-Features'
GENERATED_DATASET_NAME = f"{GENERATED_DATASET_ACNT}/{GENERATED_DATASET_REPO}"

In [None]:
# Keep a CSV copy of both splits under output_path (the DataFrame index is not written).
train_df.to_csv(output_path / (GENERATED_DATASET_REPO + '_TRAIN.csv'), index=False)
test_df.to_csv(output_path / (GENERATED_DATASET_REPO + '_TEST.csv'), index=False)

In [None]:
# Convert both splits to Hugging Face datasets without carrying over the
# pandas index, then group them under the "train" and "test" keys.
train_ds = Dataset.from_pandas(train_df, preserve_index = False)
test_ds = Dataset.from_pandas(test_df, preserve_index = False)

dataset_dict = DatasetDict({
    "train": train_ds,
    "test": test_ds
})

In [None]:
# Publish both splits to the Hugging Face Hub repository named above.
logger.info("pushing dataset to huggingface")
dataset_dict.push_to_hub(GENERATED_DATASET_NAME)
logger.success(f"dataset pushed to huggingface at {GENERATED_DATASET_NAME}")

[32m2025-03-14 23:25:21.742[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m1[0m - [1mpushing dataset to huggingface[0m


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/627 [00:00<?, ?B/s]

[32m2025-03-14 23:25:25.203[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m3[0m - [32m[1mdataset pushed to huggingface at AdityaMayukhSom/MixSub-Hallucinated-Highlight-Features[0m
