In [None]:
import os
import sys

# Put the parent of the current working directory on the import path so that
# packages located there can be imported from this notebook
# (presumably the project's `src` package lives there — verify against the repo layout).
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

In [None]:
%%capture
# Install the notebook's third-party dependencies into the kernel's environment.
# `%%capture` (which must stay on the first line of the cell) suppresses pip's output.
# NOTE(review): no versions are pinned, so the installed set can change between
# runs — consider pinning once a working combination is known.
%pip install loguru
%pip install datasets
%pip install evaluate
%pip install rouge_score
%pip install unsloth
# `-U` upgrades these packages even if a previous line already installed them.
%pip install -U accelerate peft bitsandbytes transformers trl

In [None]:
import os
import pathlib
import sys

import pandas as pd
from huggingface_hub import login
from loguru import logger
from tqdm import tqdm

from src.extractor import UnslothLLaMA, LLMModel
from src.memory import empty_all_memory

# Defaults for a local run: work relative to the current directory and pass no
# explicit Hugging Face token to `login`.
base_path = pathlib.Path(".")
HF_TOKEN = None

if "google.colab" in sys.modules:
    # Running inside Colab: mount Google Drive, keep everything under the mounted
    # folder and read the token from Colab's user data.
    from google.colab import drive, userdata

    drive.mount("/content/drive")
    base_path = pathlib.Path("/content/drive/MyDrive/Datasets/HalluDetect")
    HF_TOKEN = userdata.get("HF_TOKEN")

login(token=HF_TOKEN)

# Change these two if the outputs / raw data should live somewhere else.
output_path = base_path / "output"
data_path = base_path / "HaluEval"

# Create both working directories (and any missing parents) on first use.
if not output_path.exists():
    output_path.mkdir(parents=True, exist_ok=True)

if not data_path.exists():
    data_path.mkdir(parents=True, exist_ok=True)

##### Features Meaning:

- `MTP` : The minimum of the probabilities that $LLM_E$ assigns to the tokens of the generated text.
- `AVGTP` : The average of the probabilities that $LLM_E$ assigns to the tokens of the generated text.
- `MDVTP` : The maximum, over all positions $i$, of the difference between the highest probability that $LLM_E$ assigns to any token at position $i$ and the probability that $LLM_E$ assigns to the token actually found at position $i$ of the generated text.
- `MMDVP` : The maximum, over all positions $i$, of the difference between the probability of the token with the highest probability according to $LLM_E$ at position $i$ ($v^*$) and the probability of the token with the lowest probability according to $LLM_E$ at position $i$ ($v^-$).


This cell creates the dataset separation of `10%` for training and `90%` for testing, depending on which task you are addressing. The following explanation describes what happens when summarization is the task, but the same explanation applies to all tasks, and you can also pass as a parameter how many data points you want to include in training. (Note: the split performed in the code below is controlled by the `train_size` argument, which is currently set to `0.8`, i.e. 80% training and 20% testing.)

Example: With the `10%`/`90%` separation, 2000 data points are used for training (1000 documents with the right summary and the same 1000 documents with the hallucinated summary). The rest, which is 18000 data points, is used for testing.


In [None]:
from datasets import Dataset
from datasets import DatasetDict
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Hugging Face Hub coordinates of the source dataset: account, repository and
# the combined "<account>/<repository>" identifier.
ORIGINAL_DATASET_ACNT = "AdityaMayukhSom"
ORIGINAL_DATASET_REPO = "MixSub-With-Hallucinated-Highlights"
ORIGINAL_DATASET_NAME = "/".join((ORIGINAL_DATASET_ACNT, ORIGINAL_DATASET_REPO))


def loadDataset():
    """
    Load the `train` split of `ORIGINAL_DATASET_NAME` and return it as a
    pandas DataFrame.
    """
    train_split = load_dataset(ORIGINAL_DATASET_NAME)["train"]
    # For a quick smoke test, restrict the split first, e.g. `train_split.select(range(5))`.
    return train_split.to_pandas()


def refactorDataset(data: pd.DataFrame, train_size: float = 0.8, random_state=None):
    """
    Turn paired (highlight, hallucination) rows into a labelled dataset.

    Every input row produces two output rows that share the same conditioned
    text (the `Abstract` column): one whose generated text is the `Highlight`
    (label `False`, not hallucinated) and one whose generated text is the
    `Hallucination` (label `True`, hallucinated). The combined rows are then
    shuffled and the index is reset.

    Args:
        data: Frame containing at least the `Abstract`, `Highlight` and
            `Hallucination` columns. Any other column is carried through
            unchanged. The frame itself is not modified.
        train_size: Currently unused; kept so existing callers keep working.
            The train/test split is done by the caller.
        random_state: Optional seed forwarded to `DataFrame.sample` so the
            shuffle can be made reproducible. The default (`None`) keeps the
            previous, unseeded behaviour.

    Returns:
        A tuple `(X, Y)` where `X` holds every column except the label
        (with `Abstract` renamed to `ConditionedText` and the summary renamed
        to `GeneratedText`) and `Y` is the boolean `IsHallucinated` Series.

    Raises:
        KeyError: If one of the required columns is missing from `data`.
    """
    required_columns = ("Abstract", "Highlight", "Hallucination")
    missing_columns = [name for name in required_columns if name not in data.columns]
    if missing_columns:
        raise KeyError(f"refactorDataset: missing required column(s): {missing_columns}")

    # Build both halves without `inplace=True` so the caller's frame is untouched.
    right_data = (
        data.drop(columns="Hallucination")
        .rename(columns={"Abstract": "ConditionedText", "Highlight": "GeneratedText"})
        .assign(IsHallucinated=False)
    )
    hallu_data = (
        data.drop(columns="Highlight")
        .rename(columns={"Abstract": "ConditionedText", "Hallucination": "GeneratedText"})
        .assign(IsHallucinated=True)
    )

    combined = pd.concat([right_data, hallu_data], axis=0)

    # Shuffle so that correct and hallucinated rows are interleaved.
    combined = combined.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Split by column name rather than by position so the result does not
    # depend on where the label column happens to sit.
    X = combined.drop(columns="IsHallucinated")
    Y = combined["IsHallucinated"]

    return X, Y


def extract_features_from_dataset(model: LLMModel, X: pd.DataFrame):
    """
    Add the four token-probability features to `X`, one value per row.

    For every row, `model.extract_features` is called with an empty knowledge
    string, the row's `ConditionedText` and its `GeneratedText`; the `MTP`,
    `AVGTP`, `MDVTP` and `MMDVP` attributes of the result are collected and
    stored in columns of the same names.

    Args:
        model: Object exposing `extract_features(knowledge, conditioned_text,
            generated_text)` whose result has `MTP`, `AVGTP`, `MDVTP` and
            `MMDVP` attributes.
        X: Frame with `ConditionedText` and `GeneratedText` columns. Other
            columns (e.g. `Filename`) are ignored, so their presence, absence
            or order does not matter.

    Returns:
        The same `X` object, modified in place with the four feature columns
        added (or overwritten if they already exist).

    Raises:
        KeyError: If `ConditionedText` or `GeneratedText` is missing.
    """
    # For summarization task, knowledge string is empty
    knowledge = ""

    feature_names = ("MTP", "AVGTP", "MDVTP", "MMDVP")
    collected = {name: [] for name in feature_names}

    # Select the two text columns by name instead of unpacking whole rows, so the
    # loop does not depend on the frame having exactly three columns in a fixed order.
    text_pairs = zip(X["ConditionedText"], X["GeneratedText"])

    # `total` is passed explicitly because a plain iterator has no length for
    # tqdm to read, which would otherwise hide the progress percentage / ETA.
    for conditioned_text, generated_text in tqdm(
        text_pairs, total=len(X), desc="Processing"
    ):
        features = model.extract_features(
            knowledge,
            conditioned_text,
            generated_text,
        )
        for name in feature_names:
            collected[name].append(getattr(features, name))

    for name in feature_names:
        X[name] = collected[name]

    return X


In [None]:
# Clear memory before loading the model
# (presumably frees cached accelerator/host memory — see `src.memory`).
empty_all_memory()

data = loadDataset()
model = UnslothLLaMA()

# Expand every source row into a (correct, hallucinated) pair of labelled rows,
# then compute the token-probability features for each of them.
X, Y = refactorDataset(data)
# The model must be passed as the first argument; calling the function with
# only `X` raises a TypeError.
X = extract_features_from_dataset(model, X)

# Seeded 80% / 20% train / test split.
_split = train_test_split(X, Y, train_size=0.8, random_state=42, shuffle=True)
X_train, X_test, Y_train, Y_test = _split

# Quick sanity check of the split sizes and of one test example.
print(len(X_train), len(Y_train))
print(len(X_test), len(Y_test))
print(X_test.iloc[0])
print(Y_test.iloc[0])

# Hugging Face Hub coordinates under which the feature dataset is published.
GENERATED_DATASET_ACNT = "AdityaMayukhSom"
GENERATED_DATASET_REPO = "MixSub-Hallucinated-Highlight-Features"
GENERATED_DATASET_NAME = f"{GENERATED_DATASET_ACNT}/{GENERATED_DATASET_REPO}"

# Re-attach the label column to the features of each split.
train_df = pd.concat([X_train, Y_train], axis=1)
test_df = pd.concat([X_test, Y_test], axis=1)

# Keep a CSV copy of both splits in the output directory.
train_df.to_csv(output_path / (GENERATED_DATASET_REPO + "_TRAIN.csv"), index=False)
test_df.to_csv(output_path / (GENERATED_DATASET_REPO + "_TEST.csv"), index=False)

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

dataset_dict = DatasetDict(
    {
        "train": train_ds,
        "test": test_ds,
    }
)

logger.info("pushing dataset to huggingface")
dataset_dict.push_to_hub(GENERATED_DATASET_NAME)
logger.success(f"dataset pushed to huggingface at {GENERATED_DATASET_NAME}")

# Drop the model reference and clear memory again now that the features are stored.
del model
empty_all_memory()