### Setup Commands

These commands need to be run in every Colab session. They set up Google Drive for dataset access, configure the logger format, and define the constants used throughout the notebook.

#### Setup Logger For Logging

In [1]:
import logging

# Each record is rendered as "<timestamp> - <severity> - <message>".
log_format = "%(asctime)s - %(levelname)s - %(message)s"

# `force=True` removes handlers already attached to the root logger, so this
# configuration also takes effect when the cell is re-run.
logging.basicConfig(
    format=log_format,
    force=True,
    level=logging.INFO,
)

# Root logger used by the cells further down the notebook.
logger = logging.getLogger()

#### Setup Constants

In [2]:
# Hugging Face Hub identifier of the source dataset.
dataset_name = "TRnlp/MixSub"

# How many leading training rows are kept in the reduced dataset.
num_rows_in_dup = 1_000

In [3]:
# Root folder for every dataset artefact written by this notebook.
# When running on Colab with Google Drive mounted, use "/content/drive/MyDrive".
BASE_DATASET_DIR = "/kaggle/working"

# All MixSub variants live side by side under one common folder.
_mixsub_root = f"{BASE_DATASET_DIR}/Datasets/MixSub"
ori_dataset_dir = f"{_mixsub_root}/Original"
dup_dataset_dir = f"{_mixsub_root}/Duplicate"
hal_dataset_dir = f"{_mixsub_root}/Hallucinated"

# File names of the CSV exports stored inside the folders above.
dup_csv_file_name = "duplicate_mixsub.csv"
hal_csv_file_name = "hallucinated_mixsub.csv"

### Generate Smaller Dataset For Easier Processing

Loads the original dataset from Hugging Face and generates a smaller, sliced version that is easier to process.

In [4]:
%%capture
# Installs the Hugging Face `datasets` package used by the cells below;
# `%%capture` keeps pip's output out of the notebook.
!pip install datasets

In [None]:
import os
from pathlib import Path
from datasets import load_dataset, load_from_disk, Dataset

If the `split` argument is not passed to `load_dataset`, three separate folders are downloaded for `train`, `test` and `validation`. In that case, the returned `dataset` object can be indexed by split name, and each index returns the dataset that contains the actual rows. Passing a `split` to `load_dataset` instead returns that dataset directly, which can then be indexed by attribute (column) name.

In [None]:
# Re-use the copy saved under `ori_dataset_dir` when there is one; otherwise
# fetch the dataset (no `split` given, so all splits) and save it there.
if os.path.exists(ori_dataset_dir):
    dataset = load_from_disk(ori_dataset_dir)
else:
    dataset = load_dataset(dataset_name)
    dataset.save_to_disk(ori_dataset_dir)

In [7]:
# Pull the three splits out of the loaded dataset by name, in one statement.
train_ds, test_ds, validation_ds = (
    dataset[split_name] for split_name in ("train", "test", "validation")
)

In [None]:
# Build the reduced dataset from the first `num_rows_in_dup` training rows.
leading_rows = range(num_rows_in_dup)
small_ds = train_ds.select(leading_rows)

# Hugging Face `Dataset` -> pandas `DataFrame`.
small_pd_df = small_ds.to_pandas()

# Preview the first few rows as the cell output.
small_pd_df.head()

Checks whether the directory for the small dataset exists and, if not, creates the directory in which the generated file will be saved. The small dataset is then written in CSV format, which differs from the original Apache Arrow format.

In [9]:
# Make sure the target folder (and any missing parents) is there.
dup_dir = Path(dup_dataset_dir)
if not dup_dir.exists():
    dup_dir.mkdir(parents=True, exist_ok=True)

# Write the reduced dataset as CSV, without the DataFrame index column.
dup_ds_path = os.path.join(dup_dataset_dir, dup_csv_file_name)
small_pd_df.to_csv(dup_ds_path, index=False)

### Generate Hallucinated Dataset

#### Unsloth Installation Command Generator

In [None]:
# Works out which Unsloth "extra" matches the installed torch / CUDA / GPU
# combination and prints the corresponding pip command. The variables
# `xformers` and `x` are left in the namespace for the install cell below.
from packaging.version import Version as V

try:
    import torch
    from torch.version import cuda
except ImportError as err:
    # Only a genuinely missing torch gets the "install torch" hint; any other
    # failure while importing torch propagates unchanged.
    raise ImportError('Install torch via `pip install torch`') from err

v = V(torch.__version__)
# Compute capability major version >= 8 selects the "-ampere" variant.
is_ampere = torch.cuda.get_device_capability()[0] >= 8
# Torch releases older than 2.4.0 get a pinned xformers build.
xformers = "xformers==0.0.27" if v < V("2.4.0") else "xformers"

if cuda != "12.1" and cuda != "11.8" and cuda != "12.4":
    raise RuntimeError(f"CUDA = {cuda} not supported!")

# Map the torch version onto the matching extra-name template.
if   v <= V('2.1.0'):
    raise RuntimeError(f"Torch = {v} too old!")
elif v <= V('2.1.1'):
    x = 'cu{}{}-torch211'
elif v <= V('2.1.2'):
    x = 'cu{}{}-torch212'
elif v  < V('2.3.0'):
    x = 'cu{}{}-torch220'
elif v  < V('2.4.0'):
    x = 'cu{}{}-torch230'
elif v  < V('2.5.0'):
    x = 'cu{}{}-torch240'
elif v  < V('2.6.0'):
    x = 'cu{}{}-torch250'
else:
    raise RuntimeError(f"Torch = {v} too new!")

# Fill in the CUDA version without the dot (e.g. "124") and the optional
# "-ampere" marker.
x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"')

In [11]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install --upgrade pip
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
!pip install "unsloth[cu124-torch250] @ git+https://github.com/unslothai/unsloth.git"

#### Create Unsloth Model

In [12]:
import os
import torch
import pandas as pd
from typing import Literal
from pathlib import Path
from unsloth import FastLanguageModel

Unsloth: Patching Xformers to fix some performance issues.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [13]:
# Checkpoint to load and the settings handed to the model loader.
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
max_seq_length = 2_048
load_in_4bit = True
dtype = None  # presumably lets the loader choose the dtype -- TODO confirm

# Run on the GPU when one is visible to torch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Gather the loader settings from the configuration cell, then load the model
# and its tokenizer in a single call.
loader_kwargs = {
    "model_name": model_name,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**loader_kwargs)

#### Setup Model For Inference

In [None]:
# Prepares the loaded model for text generation (presumably Unsloth's
# inference mode -- confirm against the Unsloth docs). Being the last
# expression of the cell, the call's return value is shown as cell output.
FastLanguageModel.for_inference(model)

#### Hallucinated Highlight Generator

##### Dataset Loader For Hallucinated Dataset

In [16]:
# Read the reduced CSV back from disk and wrap it as a Hugging Face dataset.
# Every column of the CSV is kept, not just the abstracts.
ds_path = Path(dup_dataset_dir) / dup_csv_file_name
custom_df = pd.read_csv(ds_path)
abstract_ds = Dataset.from_pandas(custom_df)
# To keep only the abstract column instead, use:
# abstract_ds = Dataset.from_pandas(custom_df['Abstract'].to_frame())

In [17]:
# System prompt sent along with every abstract. It asks the model for 3-5
# short, plausible-sounding but incorrect highlight points in plain text,
# with no headings or introductory lines around them.
instruction = """
You're instructed to generate scientifically inaccurate or hallucinated highlights of the provided passage
only without additional sentences like headings, introductions, or text before or after the generated output
as the output will be directly used as highlight in a custom dataset. The highlight should sound plausible
but contain incorrect information.Generate 3-5 concise highlight points from the provided research paper abstract,
covering key contributions, methods, and outcomes. Each point should contain 10 to 15 words only. Return the
points in plain text format without bullets.

No Additional Commentary: Exclude lines like "Here are 3-5 concise highlight points".
"""

def generate_hallucinated_highlights(dataset_row) -> dict[Literal['Hallucination'], str]:
    """Generate a hallucinated highlight for one dataset row.

    The row's abstract is sent to the model as the user message, with the
    module-level `instruction` as the system message, and the model's reply is
    returned as a single line of text.

    Args:
        dataset_row: Mapping for one row; only its 'Abstract' entry is read.

    Returns:
        A dict with one key, 'Hallucination', holding the generated text with
        surrounding whitespace stripped and newlines replaced by spaces.
    """
    abstract = dataset_row['Abstract']

    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": abstract}
    ]

    model_prompt = tokenizer.apply_chat_template(row_json, tokenize=False, add_generation_prompt=True)

    # The rendered chat template already contains the special tokens, so the
    # tokenizer must not add them a second time (no duplicated BOS token).
    model_inputs = tokenizer(
        model_prompt,
        return_tensors='pt',
        padding=True,
        truncation=True,
        add_special_tokens=False,
    ).to(device)
    prompt_length = model_inputs["input_ids"].shape[-1]

    model_outputs = model.generate(**model_inputs, max_new_tokens=150, num_return_sequences=1)

    # The output sequence is the prompt followed by the newly generated tokens.
    # Keeping only the tokens after the prompt isolates the reply without
    # searching the decoded text for header / end-of-turn markers, which fails
    # when truncation removes the generation prompt or when the abstract
    # itself contains such a marker.
    generated_ids = model_outputs[0][prompt_length:]
    hallucination = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Trim surrounding whitespace and flatten the reply onto a single line.
    hallucination = hallucination.strip().replace("\n", " ")

    return {
        'Hallucination': hallucination,
    }

In [None]:
# `map` runs the generator on every row of `abstract_ds`; the dictionary each
# call returns is appended to that row. The result comes back as a new
# dataset -- `abstract_ds` itself is left unmodified.
logger.info("hallucinated dataset generation started")
hal_ds = abstract_ds.map(function=generate_hallucinated_highlights)
logger.info("hallucinated dataset generation finished")

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

hal_ds_hf_name = "AdityaMayukhSom/MixSub-With-Hallucinated-Highlights"

# Write the generated dataset to disk BEFORE touching the network, so that a
# failed login or upload cannot cost the result of the generation run.
# `exist_ok=True` already covers the "directory exists" case.
Path(hal_dataset_dir).mkdir(parents=True, exist_ok=True)
hal_ds_path = os.path.join(hal_dataset_dir, hal_csv_file_name)

logger.info("hallucinated dataset saving started")
hal_ds.to_csv(hal_ds_path, index=False)
logger.info("hallucinated dataset saved as csv to %s", hal_ds_path)

# Authenticate this notebook session on the Hugging Face server so that the
# dataset can be published. The token is read from the Kaggle secret named
# "HF_TOKEN". Alternatives are `login()` without arguments or
# `notebook_login()`, which prompt for a token.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")
login(token=hf_token)

hal_ds.push_to_hub(hal_ds_hf_name)
logger.info("hallucinated dataset saved to huggingface as hf dataset")