In [None]:
from google.colab import userdata
from huggingface_hub import login

# Fetch the value stored under 'HF_TOKEN' via Colab's `userdata` helper and use
# it to log this session in to the Hugging Face Hub. Keeping the token out of
# the notebook source avoids committing a credential.
hf_token = userdata.get('HF_TOKEN')
login(token = hf_token)

In [None]:
# Show which GPU (if any) is attached to this runtime before installing the
# CUDA build of spaCy.
!nvidia-smi

Thu Mar 27 20:29:17 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Environment setup. `%pip` installs into the environment of the running
# kernel, which a bare `!pip` shell call does not guarantee.
# NOTE(review): versions are unpinned; pin them once a known-good set is
# established so the notebook stays reproducible.
%pip install -U pip setuptools wheel
%pip install -U 'spacy[cuda12x]'
%pip install loguru datasets
# Download the transformer-based English pipeline loaded later in the notebook.
# spaCy may ask for a runtime restart afterwards so the new package is importable.
!python -m spacy download en_core_web_trf

[0mCollecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import gc
import json
import spacy
import torch
import ctypes
import psutil
import requests
import en_core_web_trf
import pandas as pd
from loguru import logger
from datasets import load_dataset, DatasetDict

In [None]:
def print_mem_stats():
    """Log a snapshot of the runtime's RAM as reported by ``psutil.virtual_memory()``.

    Emits one INFO line each for the free, used, available and total amounts
    (bytes divided by 1e9, one decimal place) followed by one line with the
    overall usage percentage. Returns nothing.
    """
    vm = psutil.virtual_memory()

    # Same order as the fields are reported: free, used, available, total.
    for field in ("free", "used", "available", "total"):
        gigabytes = getattr(vm, field) / 1e9
        logger.info(f"Your runtime has {gigabytes:.1f} gigabytes of {field} RAM")

    logger.info(f"Your runtime has {vm.percent:.1f}% usage of RAM")

In [None]:
def grab_dataset(
    *,
    dataset_name: str,
    return_small: bool = False,
    small_rows_num: int | None = None
):
    dataset_full = load_dataset(dataset_name)

    if isinstance(dataset_full, DatasetDict):
        dataset_full = dataset_full['train']

    if small_rows_num is not None and return_small:
        dataset = dataset_full.select(range(small_rows_num))
    else:
        dataset = dataset_full

    return dataset

In [None]:
def generate_batched_entities(nlp, dataset_rows):
    """Extract named entities for one batch of dataset rows.

    Args:
        nlp: An object exposing ``pipe(texts)`` that yields one document per
            text, each with an ``ents`` iterable (a loaded spaCy pipeline).
        dataset_rows: Mapping with the columns ``'Abstract'``, ``'Highlight'``
            and ``'GeneratedHighlight'``, each an iterable of texts.

    Returns:
        A dict with the keys ``'AbstractEntities'``, ``'HighlightEntities'``
        and ``'GeneratedHighlightEntities'``. Each value is a list with one
        compact JSON string per input text; the JSON is a list of objects with
        the keys ``ent``, ``start``, ``end``, ``type`` and ``lemma``.

    Raises:
        KeyError: If one of the three expected columns is missing.
    """
    # Read every column up front so a missing column fails before any NLP work.
    source = dataset_rows['Abstract']
    target = dataset_rows["Highlight"]
    hypothesis = dataset_rows["GeneratedHighlight"]

    def entities_list_generator_from_rows(rows):
        # rows should be one column from dataset_rows
        # https://spacy.io/usage/processing-pipelines#processing

        # A missing value (None) is treated as empty text so one bad cell does
        # not abort the whole batch; it simply yields an empty entity list.
        texts = ("" if text is None else text for text in rows)

        # Serialise each document as it comes out of the pipeline instead of
        # first collecting every document of the batch in a list.
        return [
            json.dumps(
                [
                    {
                        "ent": ent.text,
                        "start": ent.start_char,
                        "end": ent.end_char,
                        "type": ent.label_,
                        "lemma": ent.lemma_,
                    }
                    for ent in doc.ents
                ],
                separators=(',', ':'),
            )
            for doc in nlp.pipe(texts)
        ]

    return {
        "AbstractEntities": entities_list_generator_from_rows(source),
        "HighlightEntities": entities_list_generator_from_rows(target),
        "GeneratedHighlightEntities": entities_list_generator_from_rows(hypothesis)
    }


In [None]:
def generate_entities_dataset():
    """Build the entities dataset from the fine-tuned outputs and push it to the Hub.

    Steps: ask spaCy to use a GPU, load the ``en_core_web_trf`` pipeline, load
    the source dataset via ``grab_dataset``, add the entity columns produced
    by ``generate_batched_entities`` in batches of 1024 rows, then push the
    result to the Hugging Face Hub.

    Any exception raised while mapping or pushing is logged with its traceback
    and not re-raised. Returns nothing.
    """
    hf_root_ds_name = "AdityaMayukhSom/MixSub-LLaMA-3.2-FineTuned-Outputs"
    entities_ds_hf_name = "AdityaMayukhSom/MixSub-LLaMA-3.2-FineTuned-GPU-Entities"
    print_every = 100

    # prefer_gpu() must run before the pipeline is loaded and reports whether a
    # GPU was activated; make a silent CPU fallback visible in the logs.
    if not spacy.prefer_gpu():
        logger.warning("spaCy could not activate a GPU; the pipeline will run on CPU")

    # Load the pipeline a single time; loading it twice only costs extra
    # start-up time and memory.
    nlp = en_core_web_trf.load()

    dataset = grab_dataset(
        dataset_name = hf_root_ds_name,
        # return_small = True,
        # small_rows_num = 20
    )

    def process_batched_rows(rows, idxs):
        first_idx, last_idx = idxs[0], idxs[-1]

        # Announce the batch when it starts on a multiple of print_every or
        # spans at least one such multiple.
        if first_idx % print_every == 0 or (last_idx // print_every) > (first_idx // print_every):
            print(f'Row {first_idx} to Row {last_idx} starting...')

        return generate_batched_entities(nlp, rows)

    logger.info("finetuned entities dataset generation started")

    try:
        entities_ds = dataset.map(
            function = process_batched_rows,
            with_indices = True,
            batched = True,
            batch_size = 1024,
        )

        logger.success("finetuned entities dataset generation finished")
        logger.info("started pushing finetuned entitites dataet to huggingface")
        entities_ds.push_to_hub(entities_ds_hf_name)
        logger.success("finetuned entitites dataset saved to huggingface as hf dataset")
    except Exception as e:
        # Best effort: record the failure and let the caller carry on.
        logger.exception(str(e))

In [None]:
def main():
    """Run the entity-generation job, reclaiming and reporting memory around it.

    Before and after ``generate_entities_dataset()`` the function logs RAM
    statistics, asks glibc to hand unused heap memory back to the OS, and
    runs the Python garbage collector plus ``torch.cuda.empty_cache()``.
    Loads ``libc.so.6`` by name. Returns nothing.
    """
    def collect_garbage():
        # Drop unreachable Python objects, then empty torch's CUDA cache.
        gc.collect()
        torch.cuda.empty_cache()

    def trim_heap():
        # malloc_trim(0) asks glibc to return free heap memory to the OS.
        ctypes.CDLL("libc.so.6").malloc_trim(0)

    collect_garbage()
    print_mem_stats()
    trim_heap()
    print_mem_stats()

    generate_entities_dataset()

    print_mem_stats()
    trim_heap()
    print_mem_stats()
    collect_garbage()

In [None]:
# Notebook entry point: run the whole job. Anything that escapes main() is
# logged with its traceback instead of being re-raised by the cell.
try:
    main()
except Exception as err:
    logger.exception(err)

[32m2025-03-27 20:31:45.027[0m | [1mINFO    [0m | [36m__main__[0m:[36mprint_mem_stats[0m:[36m4[0m - [1mYour runtime has 3.4 gigabytes of free RAM[0m
[32m2025-03-27 20:31:45.027[0m | [1mINFO    [0m | [36m__main__[0m:[36mprint_mem_stats[0m:[36m6[0m - [1mYour runtime has 4.4 gigabytes of used RAM[0m
[32m2025-03-27 20:31:45.030[0m | [1mINFO    [0m | [36m__main__[0m:[36mprint_mem_stats[0m:[36m8[0m - [1mYour runtime has 8.9 gigabytes of available RAM[0m
[32m2025-03-27 20:31:45.030[0m | [1mINFO    [0m | [36m__main__[0m:[36mprint_mem_stats[0m:[36m10[0m - [1mYour runtime has 13.6 gigabytes of total RAM[0m
[32m2025-03-27 20:31:45.031[0m | [1mINFO    [0m | [36m__main__[0m:[36mprint_mem_stats[0m:[36m11[0m - [1mYour runtime has 34.8% usage of RAM[0m
[32m2025-03-27 20:31:45.097[0m | [1mINFO    [0m | [36m__main__[0m:[36mprint_mem_stats[0m:[36m4[0m - [1mYour runtime has 4.3 gigabytes of free RAM[0m
[32m2025-03-27 20:31:45.098[0m

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Row 0 to Row 1023 starting...
Row 1024 to Row 2047 starting...
Row 2048 to Row 3071 starting...
Row 3072 to Row 3999 starting...


[32m2025-03-27 20:35:40.679[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mgenerate_entities_dataset[0m:[36m36[0m - [32m[1mfinetuned entities dataset generation finished[0m
[32m2025-03-27 20:35:40.680[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_entities_dataset[0m:[36m37[0m - [1mstarted pushing finetuned entitites dataet to huggingface[0m


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/538 [00:00<?, ?B/s]

[32m2025-03-27 20:35:46.105[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mgenerate_entities_dataset[0m:[36m39[0m - [32m[1mfinetuned entitites dataset saved to huggingface as hf dataset[0m
[32m2025-03-27 20:35:46.110[0m | [1mINFO    [0m | [36m__main__[0m:[36mprint_mem_stats[0m:[36m4[0m - [1mYour runtime has 3.2 gigabytes of free RAM[0m
[32m2025-03-27 20:35:46.111[0m | [1mINFO    [0m | [36m__main__[0m:[36mprint_mem_stats[0m:[36m6[0m - [1mYour runtime has 4.4 gigabytes of used RAM[0m
[32m2025-03-27 20:35:46.112[0m | [1mINFO    [0m | [36m__main__[0m:[36mprint_mem_stats[0m:[36m8[0m - [1mYour runtime has 8.8 gigabytes of available RAM[0m
[32m2025-03-27 20:35:46.113[0m | [1mINFO    [0m | [36m__main__[0m:[36mprint_mem_stats[0m:[36m10[0m - [1mYour runtime has 13.6 gigabytes of total RAM[0m
[32m2025-03-27 20:35:46.114[0m | [1mINFO    [0m | [36m__main__[0m:[36mprint_mem_stats[0m:[36m11[0m - [1mYour runtime has 35.1% usage of