In [None]:
from google.colab import userdata
from huggingface_hub import login

# Fetch the Hugging Face token from Colab's user data (so it is not
# hard-coded in this notebook) and use it to log in to the Hub.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token)

In [None]:
# !pip install nltk
!pip install -U pip setuptools wheel
!pip install -U 'spacy[cuda12x]'
!pip install loguru
!pip install datasets
!python -m spacy download en_core_web_trf

In [None]:
import gc
import json
import spacy
import torch
import ctypes
import psutil
import requests
import en_core_web_trf
import pandas as pd
from loguru import logger
from datasets import load_dataset, DatasetDict

In [None]:
def print_mem_stats():
    """Log a snapshot of the runtime's RAM usage.

    Takes a single `psutil.virtual_memory()` reading and logs its free, used,
    available and total amounts in gigabytes (bytes / 1e9), followed by the
    overall usage percentage.
    """
    stats = psutil.virtual_memory()
    bytes_per_gb = 1e9

    # Convert every figure first so the logging below reads as a plain report.
    free_gb = stats.free / bytes_per_gb
    used_gb = stats.used / bytes_per_gb
    avlb_gb = stats.available / bytes_per_gb
    ram_gb = stats.total / bytes_per_gb

    logger.info(f"Your runtime has {free_gb:.1f} gigabytes of free RAM")
    logger.info(f"Your runtime has {used_gb:.1f} gigabytes of used RAM")
    logger.info(f"Your runtime has {avlb_gb:.1f} gigabytes of available RAM")
    logger.info(f"Your runtime has {ram_gb:.1f} gigabytes of total RAM")
    logger.info(f"Your runtime has {stats.percent:.1f}% usage of RAM")

In [None]:
def grab_dataset(
    *,
    dataset_name: str,
    return_small: bool = False,
    small_rows_num: int | None = None
):
    dataset_full = load_dataset(dataset_name)

    if isinstance(dataset_full, DatasetDict):
        dataset_full = dataset_full['train']

    if small_rows_num is not None and return_small:
        dataset = dataset_full.select(range(small_rows_num))
    else:
        dataset = dataset_full

    return dataset

In [None]:
def _entities_json_per_text(nlp, texts):
    """Run `nlp` over `texts` and return one compact JSON string per text.

    Each JSON string encodes the list of entities found in the corresponding
    text: surface form, character offsets, label and lemma.
    """
    # https://spacy.io/usage/processing-pipelines#processing
    # `nlp.pipe` is consumed lazily: every Doc is reduced to plain dicts as
    # soon as it is yielded instead of keeping all Docs of the batch in a list.
    return [
        json.dumps(
            [
                {
                    "ent": ent.text,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "type": ent.label_,
                    "lemma": ent.lemma_,
                }
                for ent in doc.ents
            ],
            separators=(',', ':'),
        )
        for doc in nlp.pipe(texts)
    ]


def generate_batched_entities(nlp, dataset_rows):
    """Extract entities for the three text columns of a batch of rows.

    Args:
        nlp: Pipeline object exposing `pipe(texts)` that yields documents
            with an `ents` attribute (a loaded spaCy pipeline in this file).
        dataset_rows: Batch mapping with the columns "Abstract", "Highlight"
            and "GeneratedHighlight"; each is presumably a list of strings
            as produced by a batched `Dataset.map` (confirm against the
            dataset schema).

    Returns:
        A dict with the keys "AbstractEntities", "HighlightEntities" and
        "GeneratedHighlightEntities". Each value is a list holding, per row,
        a JSON string of that row's entities.
    """
    # Read all three columns first so a missing column fails before any
    # pipeline work is spent on the others.
    source = dataset_rows['Abstract']
    target = dataset_rows["Highlight"]
    hypothesis = dataset_rows["GeneratedHighlight"]

    return {
        "AbstractEntities": _entities_json_per_text(nlp, source),
        "HighlightEntities": _entities_json_per_text(nlp, target),
        "GeneratedHighlightEntities": _entities_json_per_text(nlp, hypothesis),
    }


In [None]:
def generate_entities_dataset():
    """Build the entities dataset for the fine-tuned outputs and publish it.

    Loads the source dataset through `grab_dataset`, runs the
    `en_core_web_trf` spaCy pipeline over it in batches via
    `generate_batched_entities`, and pushes the result to the Hugging Face
    Hub. Any exception raised while mapping or uploading is logged and not
    re-raised.
    """
    hf_root_ds_name = "AdityaMayukhSom/MixSub-LLaMA-3.2-FineTuned-Outputs"
    entities_ds_hf_name = "AdityaMayukhSom/MixSub-LLaMA-3.2-FineTuned-GPU-Entities"

    # spaCy expects the GPU preference to be set before a pipeline is loaded.
    # The call reports whether a GPU was activated; make a CPU fallback visible.
    if not spacy.prefer_gpu():
        logger.warning("spaCy could not activate a GPU; the pipeline will run on CPU")

    # Load the pipeline exactly once; a second load only repeats the costly
    # model initialisation and discards the first instance.
    nlp = spacy.load("en_core_web_trf")

    dataset = grab_dataset(
        dataset_name = hf_root_ds_name,
        # For a quick trial run on a few rows, additionally pass:
        # return_small = True,
        # small_rows_num = 20
    )

    logger.info("finetuned entities dataset generation started")

    try:
        def process_batched_rows(rows, idxs):
            print_every = 100

            # Announce a batch when it starts on, or spans across, a multiple
            # of `print_every` rows.
            starts_on_mark = idxs[0] % print_every == 0
            crosses_mark = (idxs[-1] // print_every) > (idxs[0] // print_every)
            if starts_on_mark or crosses_mark:
                print(f'Row {idxs[0]} to Row {idxs[-1]} starting...')

            return generate_batched_entities(nlp, rows)

        entities_ds = dataset.map(
            function = process_batched_rows,
            with_indices = True,
            batched = True,
            batch_size = 1024,
        )

        del process_batched_rows

        logger.success("finetuned entities dataset generation finished")
        logger.info("started pushing finetuned entitites dataet to huggingface")
        entities_ds.push_to_hub(entities_ds_hf_name)
        logger.success("finetuned entitites dataset saved to huggingface as hf dataset")

        del entities_ds
    except Exception as e:
        logger.exception(str(e))
    finally:
        # Explicitly drop the local references to the dataset and the pipeline.
        del dataset
        del nlp

In [None]:
def _trim_native_heap():
    """Ask the C library to hand freed heap memory back to the operating system.

    Best effort: when `libc.so.6` cannot be loaded or has no `malloc_trim`,
    a warning is logged and nothing else happens.
    """
    try:
        libc = ctypes.CDLL("libc.so.6")
        libc.malloc_trim(0)
    except (OSError, AttributeError):
        logger.warning("malloc_trim is unavailable on this platform; skipping heap trim")


def main():
    """Run the entities dataset generation, reporting memory around it.

    Before the run: collect garbage, empty the CUDA cache, then log memory
    stats on either side of a heap trim. After the run: log memory stats,
    collect garbage, empty the CUDA cache, trim the heap and log the stats
    again.
    """
    gc.collect()
    torch.cuda.empty_cache()
    print_mem_stats()
    _trim_native_heap()
    print_mem_stats()

    generate_entities_dataset()

    print_mem_stats()
    # Collect garbage and empty the CUDA cache *before* trimming, so memory
    # freed by the collector can be returned to the OS and shows up in the
    # final stats.
    gc.collect()
    torch.cuda.empty_cache()
    _trim_native_heap()
    print_mem_stats()

In [None]:
# Entry point of the notebook: run the whole job and log, rather than
# propagate, any exception that escapes `main`.
try:
    main()
except Exception as exc:
    logger.exception(exc)