### Setup Commands

These commands need to be run in every Colab session. They set up Google Drive for dataset access, configure the logger format and generate constants used throughout the notebook.

#### Mount Google Drive

In [None]:
from huggingface_hub import login
from google.colab import drive, userdata

# Mount Google Drive into the Colab filesystem at /content/drive/.
drive.mount('/content/drive/')
# Read the Hugging Face access token stored under the name 'HF_TOKEN'
# in Colab's user data (secrets).
HF_TOKEN = userdata.get('HF_TOKEN')

# Authenticate this notebook session with the Hugging Face server using the
# token read above, after which we can publish the dataset. An alternative
# to `login()` is `notebook_login()`, which prompts for a token instead.
login(token = HF_TOKEN)

#### Install Libraries

In [2]:
%%capture
%pip install -U loguru
%pip install -U unsloth
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U trl
%pip install -U xformers

#### Setup Imports And Logger

In [3]:
# import logging
# log_format = "%(asctime)s - %(levelname)s - %(message)s"
# logging.basicConfig(level=logging.INFO, format=log_format, force=True)
# logger = logging.getLogger()

import gc
import os

import torch
import psutil
import ctypes
import pandas as pd
from pathlib import Path
from loguru import logger
from datasets import load_dataset, load_from_disk, Dataset
from unsloth import FastLanguageModel, is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


### Generate Smaller Dataset For Easier Processing

Loads the original dataset from Hugging Face to generate a sliced, smaller version which is easier to process.

If we do not pass the `split` argument to the `load_dataset` function, three separate folders will be downloaded for `train`, `test` and `validation`. In that case, the returned `dataset` object can be indexed by the three split names, each of which returns the dataset that contains the actual rows. Specifying which split to load in `load_dataset` returns the actual dataset directly, which we can index with the attribute names.

In [4]:
def generate_small_dataset(
    *,
    ori_dataset_dir: str | None = None,
    dup_dataset_dir: str | None = None,
    dup_csv_file_name: str = "duplicate_mixsub.csv",
    num_rows_in_dup: int = 10
) -> pd.DataFrame:
    """Return the leading rows of the MixSub training split as a DataFrame.

    Loads ``TRnlp/MixSub`` with ``load_dataset``, keeps the first
    ``num_rows_in_dup`` rows of its ``train`` split and converts them to a
    pandas DataFrame.

    Args:
        ori_dataset_dir: Currently ignored. Reserved for an on-disk cache of
            the original dataset, which is disabled at the moment.
        dup_dataset_dir: Currently ignored. Reserved for the directory the
            small dataset would be exported to; the export is disabled.
        dup_csv_file_name: Currently ignored. Reserved for the CSV file name
            of the disabled export.
        num_rows_in_dup: Number of leading training rows to keep. A value
            larger than the training split is clamped to the split length
            instead of failing on out-of-range indices.

    Returns:
        A pandas DataFrame holding the selected training rows.
    """
    # Without a `split` argument, `load_dataset` returns a mapping of split
    # names to datasets; only the training split is needed here.
    train_ds = load_dataset("TRnlp/MixSub")['train']

    # Never ask for more rows than the split actually contains.
    row_count = min(num_rows_in_dup, len(train_ds))

    # Select the first `row_count` entries, then convert the Hugging Face
    # dataset into a pandas DataFrame.
    small_ds = train_ds.select(range(row_count))
    return small_ds.to_pandas()

Checks whether the path to save the small dataset exists; if not, creates the directory in which it will save the generated file. It writes the small dataset in CSV format, which is different from the original Apache Arrow format.

### Generate Hallucinated Dataset

#### Create and setup unsloth model for inference

In [5]:
def get_inference_model():
    """Load the fine-tuned MixSub model and its tokenizer for inference.

    The checkpoint is loaded in 4-bit with a maximum sequence length of 2048
    and then handed to ``FastLanguageModel.for_inference``.

    Returns:
        A ``(model, tokenizer)`` tuple as produced by
        ``FastLanguageModel.from_pretrained``.
    """
    # Use bfloat16 when the runtime reports support for it, float16 otherwise.
    compute_dtype = torch.float16
    if is_bfloat16_supported():
        compute_dtype = torch.bfloat16

    llm, tok = FastLanguageModel.from_pretrained(
        model_name="AdityaMayukhSom/Llama-3.2-1B-Instruct-bnb-4bit-MixSub",
        max_seq_length=2048,
        load_in_4bit=True,
        dtype=compute_dtype,
    )

    FastLanguageModel.for_inference(llm)
    return llm, tok

#### Hallucinated Highlight Generator

##### Dataset Loader For Hallucinated Dataset

In [6]:
def get_dataset(
    *,
    dup_dataset_dir: str | None = None,
    dup_csv_file_name: str = "duplicate_mixsub.csv",
    get_small_dataset = True
):
    """Return the ``train`` split of the ``TRnlp/MixSub`` dataset.

    All three keyword arguments are accepted but currently ignored: the
    branch that read a small CSV copy from ``dup_dataset_dir`` is disabled,
    so the dataset is always loaded through ``load_dataset``.

    Args:
        dup_dataset_dir: Ignored; directory of the small CSV copy.
        dup_csv_file_name: Ignored; file name of the small CSV copy.
        get_small_dataset: Ignored; would select the small CSV copy.

    Returns:
        The object stored under ``'train'`` in the loaded dataset dict.
    """
    # Load the dataset dict and hand back only its training split.
    return load_dataset("TRnlp/MixSub")['train']

In [7]:
def generate_eval_highlights(dataset_row, model, tokenizer):
    """Generate one highlight string for every abstract in a batch.

    Written to be used as the function of a batched ``Dataset.map`` call:
    it receives a batch, builds a system + user chat prompt per abstract,
    runs generation once for the whole batch and returns the new column.

    Args:
        dataset_row: Batch mapping whose ``'Abstract'`` entry is an iterable
            of abstract texts.
        model: Object exposing ``generate(**inputs, ...)`` that returns a 2-D
            tensor of token ids (prompt tokens followed by generated ones).
        tokenizer: Object exposing ``apply_chat_template``, ``batch_decode``
            and ``eos_token_id``.

    Returns:
        ``{'GeneratedHighlight': [...]}`` with one cleaned highlight string
        per abstract, in input order. An empty batch yields an empty list.
    """
    abstracts = list(dataset_row['Abstract'])
    if not abstracts:
        # Nothing to generate; skip tokenization and generation entirely.
        return {
            'GeneratedHighlight': [],
        }

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    instruction = """
    You are instructed to generate a scientifically accurate highlight of the provided passage without additional
    sentences such as headings or introductions before or after the generated text as it will be used as summary
    in a custom dataset. The highlight should sound plausible and should not contain incorrect information. Generate
    3-5 concise highlight points from the provided research paper abstract, covering key contributions, methods and
    outcomes. Each point should contain 10 to 15 words only. Return the points in plain text format without bullets.

    No Additional Commentary: Exclude lines like "Here are 3-5 concise highlight points".
    """

    # Marker that terminates the assistant turn in the decoded text.
    term_identifier = "<|eot_id|>"

    batch_data = [
        [
            {"role": "system", "content": instruction},
            {"role": "user", "content": abstract}
        ]
        for abstract in abstracts
    ]

    inputs = tokenizer.apply_chat_template(
        batch_data, add_generation_prompt=True, tokenize = True,
        padding = True, truncation = True, return_tensors = 'pt',
        return_dict = True
    ).to(device)

    outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True, pad_token_id = tokenizer.eos_token_id)

    # `outputs` holds the prompt tokens followed by the newly generated ones,
    # so drop the first `prompt_length` columns to decode only the new text.
    # Special tokens are kept so the terminator can still be found below.
    prompt_length = inputs.input_ids.shape[1]
    decoded_texts = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=False)

    # Keep only the text before the first terminator, strip the surrounding
    # whitespace and flatten newlines into spaces so that each highlight is
    # a single line.
    highlights = [
        decoded_text.split(term_identifier)[0].strip().replace("\n", " ")
        for decoded_text in decoded_texts
    ]

    return {
        'GeneratedHighlight': highlights,
    }


In [8]:
def print_mem_stats():
    """Print the runtime's RAM figures as reported by ``psutil``.

    Emits the free, used, available and total memory (each divided by 1e9
    and labelled as gigabytes), followed by the usage percentage.
    """
    mem = psutil.virtual_memory()

    # (label, byte count) pairs, in the order they are reported.
    readings = (
        ("free", mem.free),
        ("used", mem.used),
        ("available", mem.available),
        ("total", mem.total),
    )
    for label, byte_count in readings:
        print(f"Your runtime has {byte_count / 1e9:.1f} gigabytes of {label} RAM")

    print(f"Your runtime has {mem.percent:.1f}% usage of RAM")

In [None]:
def generate_finetuned_model_highlights_dataset():
    """Generate highlights for the dataset and publish the result.

    Loads the abstracts via ``get_dataset``, loads the fine-tuned model via
    ``get_inference_model``, adds a ``GeneratedHighlight`` column by mapping
    ``generate_eval_highlights`` over the dataset in batches of 16, and
    pushes the resulting dataset to the Hugging Face Hub.

    The result is only pushed to the Hub; it is not written to Google Drive.
    """
    dup_dataset_dir = "/content/drive/MyDrive/Datasets/MixSub/Duplicate"
    out_ds_hf_name = "AdityaMayukhSom/MixSub-LLaMA-3.2-FineTuned-Outputs-Large"

    abstract_ds = get_dataset(dup_dataset_dir = dup_dataset_dir)
    model, tokenizer = get_inference_model()

    # The dictionary returned by the mapped function is added to the dataset
    # as new column(s). `map` returns a new dataset and does not modify the
    # dataset it is called on.
    logger.info("finetuned model dataset generation started")
    out_ds = abstract_ds.map(
        lambda row: generate_eval_highlights(row, model, tokenizer),
        batched = True,
        batch_size = 16,
    )
    logger.success("finetuned model dataset generation finished")

    logger.info("started pushing finetuned model dataet to huggingface")
    out_ds.push_to_hub(out_ds_hf_name)
    logger.success("finetuned model dataset saved to huggingface as hf dataset")

# Script entry point: report memory, trim the C heap, report memory again,
# then run the full highlight-generation pipeline.
if __name__ == '__main__':
    print_mem_stats()
    # Load the C library by its Linux/glibc shared-object name and call
    # `malloc_trim(0)`, which asks the allocator to release free heap memory
    # back to the operating system.
    libc = ctypes.CDLL("libc.so.6")
    libc.malloc_trim(0)
    # Print the figures again so the effect of the trim can be compared.
    print_mem_stats()
    # generate_small_dataset()
    generate_finetuned_model_highlights_dataset()