In [None]:
from google.colab import userdata
from huggingface_hub import login

# Fetch the value stored under the 'HF_TOKEN' key via google.colab's userdata
# helper and use it to authenticate this session with the Hugging Face Hub.
# Keeping the token out of the notebook source avoids committing a credential.
hf_token = userdata.get('HF_TOKEN')
login(token = hf_token)

In [None]:
# Install the third-party packages for this notebook and download the spaCy
# model that is later loaded with spacy.load("en_core_web_lg").
# NOTE(review): nltk is not imported in any visible cell - confirm it is still needed.
!pip install nltk
!pip install spacy
!pip install loguru
!pip install datasets
!python -m spacy download en_core_web_lg

Collecting loguru
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Downloading loguru-0.7.3-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.7.3
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━

In [None]:
import gc
import json
import spacy
import torch
import ctypes
import psutil
import requests
import pandas as pd
from loguru import logger
from datasets import load_dataset, DatasetDict

In [None]:
def print_mem_stats():
    """Print a snapshot of the runtime's RAM usage.

    Reads ``psutil.virtual_memory()`` once and prints, in this order, the
    free, used, available and total amounts (bytes divided by 1e9, shown
    with one decimal) followed by the usage percentage.
    """
    memory = psutil.virtual_memory()
    bytes_per_gigabyte = 1e9

    # The attribute name doubles as the word shown in the message.
    for field in ("free", "used", "available", "total"):
        gigabytes = getattr(memory, field) / bytes_per_gigabyte
        print(f"Your runtime has {gigabytes:.1f} gigabytes of {field} RAM")

    print(f"Your runtime has {memory.percent:.1f}% usage of RAM")

In [None]:
def grab_dataset(
    *,
    dataset_name: str,
    return_small: bool = False,
    small_rows_num: int | None = None
):
    dataset_full = load_dataset(dataset_name)

    if isinstance(dataset_full, DatasetDict):
        dataset_full = dataset_full['train']

    if small_rows_num is not None and return_small:
        dataset = dataset_full.select(range(small_rows_num))
    else:
        dataset = dataset_full

    return dataset

In [None]:
def generate_batched_entities(nlp, dataset_rows):
    """Extract named entities for one batch of rows.

    Args:
        nlp: Object exposing ``pipe(texts)`` that yields one document per
            text, each with an ``ents`` iterable (a spaCy pipeline in this
            notebook).
        dataset_rows: Batch mapping with the columns ``'Abstract'``,
            ``'Highlight'`` and ``'GeneratedHighlight'``, each a sequence of
            texts.

    Returns:
        Dict with ``AbstractEntities``, ``HighlightEntities`` and
        ``GeneratedHighlightEntities``. Each value is a list with one compact
        JSON string per input row, encoding that row's entities as a list of
        ``{"ent", "start", "end", "type", "lemma"}`` objects.
    """

    def _entities_as_json(texts):
        # Batch processing per https://spacy.io/usage/processing-pipelines#processing
        encoded = []
        for doc in nlp.pipe(texts):
            records = []
            for span in doc.ents:
                records.append(
                    {
                        "ent": span.text,
                        "start": span.start_char,
                        "end": span.end_char,
                        "type": span.label_,
                        "lemma": span.lemma_,
                    }
                )
            # Compact separators keep the stored strings small.
            encoded.append(json.dumps(records, separators=(',', ':')))
        return encoded

    # Look up every input column first so a missing column fails before any
    # text is processed.
    columns = [
        ("AbstractEntities", dataset_rows['Abstract']),
        ("HighlightEntities", dataset_rows["Highlight"]),
        ("GeneratedHighlightEntities", dataset_rows["GeneratedHighlight"]),
    ]

    return {output_name: _entities_as_json(texts) for output_name, texts in columns}


In [None]:
def generate_entities_dataset(
    hf_root_ds_name: str = "AdityaMayukhSom/MixSub-LLaMA-3.2-FineTuned-Outputs",
    entities_ds_hf_name: str = "AdityaMayukhSom/MixSub-LLaMA-3.2-FineTuned-CPU-Entities",
    *,
    spacy_model: str = "en_core_web_lg",
    batch_size: int = 1024,
    print_every: int = 100,
    small_rows_num: int | None = None,
):
    """Annotate a Hub dataset with named entities and push the result back.

    Loads ``hf_root_ds_name`` through ``grab_dataset``, runs
    ``generate_batched_entities`` over it in batches, and uploads the mapped
    dataset to ``entities_ds_hf_name``. Called with no arguments it behaves
    exactly as the original hard-coded version.

    Args:
        hf_root_ds_name: Source dataset repository on the Hub.
        entities_ds_hf_name: Destination dataset repository on the Hub.
        spacy_model: Name of the spaCy pipeline passed to ``spacy.load``.
        batch_size: Number of rows handed to each mapping call.
        print_every: Progress granularity in rows; must be positive. A batch
            is announced when it starts on, or spans, a multiple of this.
        small_rows_num: When given, only this many leading rows are processed
            (handy for a quick trial run); ``None`` processes everything.

    Returns:
        None. Exceptions raised while mapping or uploading are logged through
        ``logger.exception`` and not re-raised; failures while loading the
        spaCy model or the source dataset propagate to the caller.
    """
    nlp = spacy.load(spacy_model)

    dataset = grab_dataset(
        dataset_name = hf_root_ds_name,
        return_small = small_rows_num is not None,
        small_rows_num = small_rows_num,
    )

    logger.info("finetuned entities dataset generation started")

    try:
        def process_batched_rows(rows, idxs):
            first_idx, last_idx = idxs[0], idxs[-1]

            # Announce the batch if it begins on a multiple of print_every or
            # crosses one somewhere between its first and last row.
            starts_on_mark = first_idx % print_every == 0
            crosses_mark = (last_idx // print_every) > (first_idx // print_every)
            if starts_on_mark or crosses_mark:
                print(f'Row {first_idx} to Row {last_idx} starting...')

            return generate_batched_entities(nlp, rows)

        entities_ds = dataset.map(
            function = process_batched_rows,
            with_indices = True,
            batched = True,
            batch_size = batch_size,
        )

        del process_batched_rows

        logger.success("finetuned entities dataset generation finished")
        logger.info("started pushing finetuned entitites dataet to huggingface")
        entities_ds.push_to_hub(entities_ds_hf_name)
        logger.success("finetuned entitites dataset saved to huggingface as hf dataset")

        del entities_ds
    except Exception as e:
        # Best-effort: record the failure and fall through to cleanup.
        logger.exception(str(e))
    finally:
        # Drop the local references to the dataset and the pipeline before returning.
        del dataset
        del nlp

In [None]:
def main():
    """Run the entity-generation job, reporting RAM usage around it.

    Before the job: garbage-collect, empty the CUDA cache, print memory
    stats, call libc's ``malloc_trim(0)``, print memory stats again.
    After the job: print memory stats, call ``malloc_trim(0)``, print memory
    stats again, then garbage-collect and empty the CUDA cache.
    """

    def _trim_heap():
        # clearing cache
        ctypes.CDLL("libc.so.6").malloc_trim(0)

    gc.collect()
    torch.cuda.empty_cache()

    print_mem_stats()
    _trim_heap()
    print_mem_stats()

    generate_entities_dataset()

    print_mem_stats()
    _trim_heap()
    print_mem_stats()

    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Notebook entry point: run the full pipeline. Any exception that escapes
# main() is passed to logger.exception instead of surfacing as a raw cell error.
try:
    main()
except Exception as e:
    logger.exception(e)

Your runtime has 3.5 gigabytes of free RAM
Your runtime has 1.1 gigabytes of used RAM
Your runtime has 12.1 gigabytes of available RAM
Your runtime has 13.6 gigabytes of total RAM
Your runtime has 10.8% usage of RAM
Your runtime has 3.5 gigabytes of free RAM
Your runtime has 1.1 gigabytes of used RAM
Your runtime has 12.2 gigabytes of available RAM
Your runtime has 13.6 gigabytes of total RAM
Your runtime has 10.6% usage of RAM


README.md:   0%|          | 0.00/400 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.55M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4000 [00:00<?, ? examples/s]

[32m2025-03-27 17:45:10.824[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_entities_dataset[0m:[36m13[0m - [1mfinetuned entities dataset generation started[0m


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Row 0 to Row 1023 starting...
Row 1024 to Row 2047 starting...
Row 2048 to Row 3071 starting...
Row 3072 to Row 3999 starting...


[32m2025-03-27 17:47:46.874[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mgenerate_entities_dataset[0m:[36m33[0m - [32m[1mfinetuned entities dataset generation finished[0m
[32m2025-03-27 17:47:46.875[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_entities_dataset[0m:[36m34[0m - [1mstarted pushing finetuned entitites dataet to huggingface[0m


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/501 [00:00<?, ?B/s]

[32m2025-03-27 17:47:49.277[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mgenerate_entities_dataset[0m:[36m36[0m - [32m[1mfinetuned entitites dataset saved to huggingface as hf dataset[0m


Your runtime has 2.4 gigabytes of free RAM
Your runtime has 2.1 gigabytes of used RAM
Your runtime has 11.2 gigabytes of available RAM
Your runtime has 13.6 gigabytes of total RAM
Your runtime has 17.8% usage of RAM
Your runtime has 2.5 gigabytes of free RAM
Your runtime has 2.1 gigabytes of used RAM
Your runtime has 11.2 gigabytes of available RAM
Your runtime has 13.6 gigabytes of total RAM
Your runtime has 17.6% usage of RAM
