In [None]:
from google.colab import userdata
from huggingface_hub import login

# Fetch the value stored under 'HF_TOKEN' via google.colab's `userdata` and pass it
# to huggingface_hub's `login`, so the token is never hard-coded in the notebook.
hf_token = userdata.get('HF_TOKEN')
login(token = hf_token)

In [None]:
# Install the third-party packages this notebook imports below: loguru and datasets.
!pip install loguru
!pip install datasets



In [None]:
import gc
import json
import ctypes
import psutil
import pandas as pd
from loguru import logger
from tabulate import tabulate
from datasets import load_dataset, DatasetDict

# Show every float pandas displays with exactly three decimal places.
pd.set_option(
    'display.float_format',
    lambda value: '%.3f' % value,
)

In [None]:
def print_mem_stats():
    """Print free, used, available and total RAM in gigabytes, then percent usage.

    All figures come from one ``psutil.virtual_memory()`` snapshot; byte counts
    are divided by 1e9 (decimal gigabytes) and shown with one decimal place.
    """
    stats = psutil.virtual_memory()
    bytes_per_gb = 1e9

    # Convert all four byte counts up front, then report them in a fixed order.
    free_gb, used_gb, avlb_gb, ram_gb = (
        amount / bytes_per_gb
        for amount in (stats.free, stats.used, stats.available, stats.total)
    )

    print(f"Your runtime has {free_gb:.1f} gigabytes of free RAM")
    print(f"Your runtime has {used_gb:.1f} gigabytes of used RAM")
    print(f"Your runtime has {avlb_gb:.1f} gigabytes of available RAM")
    print(f"Your runtime has {ram_gb:.1f} gigabytes of total RAM")
    print(f"Your runtime has {stats.percent:.1f}% usage of RAM")

In [None]:
def grab_dataset(
    *,
    dataset_name: str,
    return_small: bool = False,
    small_rows_num: int | None = None
):
    dataset_full = load_dataset(dataset_name)

    if isinstance(dataset_full, DatasetDict):
        dataset_full = dataset_full['train']

    if small_rows_num is not None and return_small:
        dataset = dataset_full.select(range(small_rows_num))
    else:
        dataset = dataset_full

    return dataset

In [None]:
class Ent:
    """A named entity that is compared and hashed by its text only.

    ``label`` is stored but deliberately left out of ``__eq__`` and
    ``__hash__``, so sets of ``Ent`` de-duplicate and intersect on text alone.
    To make matching label-aware, hash and compare ``(text, label)`` instead.

    Attributes:
        text: The entity's surface text.
        label: The entity's type label.
    """

    def __init__(self, text: str, label: str):
        self.text = text
        self.label = label

    def __repr__(self):
        return f"Ent(text={self.text!r}, label={self.label!r})"

    def __hash__(self):
        # Must agree with __eq__: equal entities need equal hashes.
        return hash(self.text)

    def __eq__(self, other):
        # Defer to the other operand for foreign types instead of failing on
        # a missing `.text` attribute.
        if not isinstance(other, Ent):
            return NotImplemented
        return self.text == other.text

Let $h$, $s$ and $t$ be the sets of named entities found in the generated (hypothesis) summary, the source text and the ground-truth (target) summary respectively, and let $N(\cdot)$ denote the number of entities in a set.

**Precision-source**, defined as $prec_s = N(h \cap s) / N(h)$, is a metric used to determine the intensity of hallucination in relation to the source. Note that $prec_s$ represents the percentage of entities mentioned in the generated summary that can be retrieved from the source. A low $prec_s$ indicates that hallucination is possibly present in the generated text. However, $prec_s$ does not capture the generated summary's entity-level correctness in relation to the ground-truth summary.

Entity-level accuracy of the generated summary is calculated using the **precision-target**, $prec_t = N(h \cap t) / N(h)$; the **recall-target**, $recall_t = N(h \cap t) / N(t)$; and the **F1 score**,

$$F1_t = \frac{2 \cdot recall_t \cdot prec_t}{recall_t + prec_t}.$$

Here, $N(h \cap t)$ represents the number of matched named entities in the generated summary and the ground-truth summary.

In [None]:
def calculate_metrices(row, idx):
    """Compute entity-overlap scores for one dataset row.

    The three entity columns are JSON-encoded lists of ``{"ent": ..., "type": ...}``
    objects. Entity text is lower-cased and stripped before being wrapped in
    ``Ent``, so set membership follows ``Ent``'s own equality rules.

    Args:
        row: Mapping with the JSON string columns ``AbstractEntities`` (source),
            ``HighlightEntities`` (target) and ``GeneratedHighlightEntities``
            (hypothesis).
        idx: Row index supplied by ``Dataset.map(with_indices=True)``; unused.

    Returns:
        Dict with float values for ``prec_s``, ``prec_t``, ``recall_t`` and
        ``F1_t``. Every value is a float, including the edge cases, so the
        resulting columns keep one numeric type no matter which rows come first.

    Edge cases:
        * No hypothesis entities: ``prec_s`` is 1.0 if the source is also
          empty, otherwise 0.0.
        * Hypothesis and target both empty: ``prec_t`` and ``recall_t`` are 1.0.
        * Exactly one of them empty: the corresponding ratio is 0.0.
    """
    def entity_set(column):
        # Normalise case and surrounding whitespace before matching.
        return {
            Ent(el['ent'].lower().strip(), el['type'])
            for el in json.loads(row[column])
        }

    s = entity_set('AbstractEntities')
    t = entity_set("HighlightEntities")
    h = entity_set("GeneratedHighlightEntities")

    n_h = len(h)
    n_t = len(t)

    if n_h == 0:
        # Nothing generated: perfect only when the source has nothing either.
        prec_s = 1.0 if len(s) == 0 else 0.0
    else:
        prec_s = len(h & s) / n_h

    if n_h == 0 and n_t == 0:
        # Nothing expected and nothing produced counts as a perfect match.
        prec_t = 1.0
        recall_t = 1.0
    else:
        matched = len(h & t)
        prec_t = matched / n_h if n_h else 0.0
        recall_t = matched / n_t if n_t else 0.0

    # Harmonic mean; defined as 0.0 when both components are zero.
    score_sum = recall_t + prec_t
    F1_t = (2 * recall_t * prec_t) / score_sum if score_sum != 0 else 0.0

    return {
        "prec_s": prec_s,
        "prec_t": prec_t,
        "recall_t": recall_t,
        "F1_t": F1_t
    }

def generate_metrices_dataset(root_dataset_name: str, generated_dataset_name: str):
    """Score every row of a dataset and push the scored copy to the Hub.

    Loads ``root_dataset_name`` through ``grab_dataset``, maps
    ``calculate_metrices`` over it one row at a time, and pushes the result
    under ``generated_dataset_name``.

    Any exception raised while mapping or pushing is logged with its traceback
    and NOT re-raised, so the caller cannot tell from the return value whether
    the push happened. A failure inside ``grab_dataset`` itself propagates.

    Args:
        root_dataset_name: Name of the dataset to read.
        generated_dataset_name: Name to push the scored dataset to.
    """
    source_ds = grab_dataset(dataset_name = root_dataset_name)

    logger.info("finetuned entities dataset generation started")

    try:
        # Row-wise map; with_indices passes the row index as the second argument.
        scored_ds = source_ds.map(
            function = calculate_metrices,
            with_indices = True,
            batched = False,
        )
        logger.success("finetuned entities dataset generation finished")

        logger.info("started pushing finetuned entitites dataet to huggingface")
        scored_ds.push_to_hub(generated_dataset_name)
        logger.success("finetuned entitites dataset saved to huggingface as hf dataset")

        del scored_ds
    except Exception as err:
        logger.exception(str(err))
    finally:
        del source_ds

def generated_dataset_stats(generated_dataset_name: str):
    """Print the dataset name followed by a summary table of its score columns.

    Loads the dataset through ``grab_dataset``, converts it to a pandas frame,
    keeps the ``Filename`` and score columns, and prints ``describe()`` of that
    frame formatted by ``tabulate`` in ``psql`` style.

    Args:
        generated_dataset_name: Name of the scored dataset to summarise.
    """
    wanted_columns = ['Filename', 'prec_s', 'prec_t', 'recall_t', 'F1_t']

    scored_ds = grab_dataset(dataset_name = generated_dataset_name)
    scores = scored_ds.to_pandas()[wanted_columns]

    print(generated_dataset_name)
    print(tabulate(scores.describe(), headers = 'keys', tablefmt = 'psql'))

    # Drop the local references before returning.
    del scores
    del scored_ds

def main():
    """Score the fine-tuned entities dataset, print its stats, and report memory.

    Runs a garbage collection and a libc ``malloc_trim(0)``, prints memory
    stats, generates and pushes the scored dataset, prints its summary table,
    then trims and reports memory again before a final garbage collection.
    """
    def release_heap():
        # Load libc and call malloc_trim(0) (the original "clearing cache" step).
        libc = ctypes.CDLL("libc.so.6")
        libc.malloc_trim(0)

    source_repo = "AdityaMayukhSom/MixSub-LLaMA-3.2-FineTuned-CPU-Entities"
    scores_repo = "AdityaMayukhSom/MixSub-LLaMA-3.2-Text-Only-Overlap-CPU-Score"

    gc.collect()
    release_heap()
    print_mem_stats()

    generate_metrices_dataset(source_repo, scores_repo)
    generated_dataset_stats(scores_repo)

    release_heap()
    print_mem_stats()
    gc.collect()

In [None]:
# Run the pipeline; log any escaping error with its traceback instead of
# letting the cell fail.
try:
    main()
except Exception as err:
    logger.exception(err)

Your runtime has 8.7 gigabytes of free RAM
Your runtime has 1.2 gigabytes of used RAM
Your runtime has 12.1 gigabytes of available RAM
Your runtime has 13.6 gigabytes of total RAM
Your runtime has 11.3% usage of RAM


[32m2025-03-28 04:50:45.135[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_metrices_dataset[0m:[36m51[0m - [1mfinetuned entities dataset generation started[0m


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

[32m2025-03-28 04:50:46.019[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mgenerate_metrices_dataset[0m:[36m60[0m - [32m[1mfinetuned entities dataset generation finished[0m
[32m2025-03-28 04:50:46.026[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_metrices_dataset[0m:[36m62[0m - [1mstarted pushing finetuned entitites dataet to huggingface[0m


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

[32m2025-03-28 04:50:50.335[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mgenerate_metrices_dataset[0m:[36m64[0m - [32m[1mfinetuned entitites dataset saved to huggingface as hf dataset[0m


README.md:   0%|          | 0.00/692 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4000 [00:00<?, ? examples/s]

AdityaMayukhSom/MixSub-LLaMA-3.2-Text-Only-Overlap-CPU-Score
+-------+-------------+-------------+-------------+-------------+
|       |      prec_s |      prec_t |    recall_t |        F1_t |
|-------+-------------+-------------+-------------+-------------|
| count | 4000        | 4000        | 4000        | 4000        |
| mean  |    0.601796 |    0.314932 |    0.278493 |    0.275135 |
| std   |    0.449666 |    0.410647 |    0.380268 |    0.36423  |
| min   |    0        |    0        |    0        |    0        |
| 25%   |    0        |    0        |    0        |    0        |
| 50%   |    1        |    0        |    0        |    0        |
| 75%   |    1        |    0.666667 |    0.5      |    0.5      |
| max   |    1        |    1        |    1        |    1        |
+-------+-------------+-------------+-------------+-------------+
Your runtime has 8.7 gigabytes of free RAM
Your runtime has 1.2 gigabytes of used RAM
Your runtime has 12.1 gigabytes of available RAM
Your runtime