# Setting Up and Installing Dependencies

In [None]:
!apt-get install python3.12

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libpython3.12-stdlib
Suggested packages:
  python3.12-venv
The following NEW packages will be installed:
  libpython3.12-stdlib python3.12
0 upgraded, 2 newly installed, 0 to remove and 29 not upgraded.
Need to get 5,418 kB of archives.
After this operation, 22.6 MB of additional disk space will be used.
Get:1 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 libpython3.12-stdlib amd64 3.12.9-1+jammy1 [2,892 kB]
Get:2 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 python3.12 amd64 3.12.9-1+jammy1 [2,526 kB]
Fetched 5,418 kB in 3s (2,074 kB/s)
Selecting previously unselected package libpython3.12-stdlib:amd64.
(Reading database ... 125044 files and directories currently installed.)
Preparing to unpack .../libpython3.12-stdlib_3.12.9-1+jammy1_amd64.deb ...
Unpacking libpython3.12-stdlib:amd64

In [None]:
!python3.12 --version

Python 3.12.9


In [None]:
!pip install transformers datasets seqeval scikit-learn accelerate torch

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127

# Loading the Dataset

In [None]:
import json
import pandas as pd

In [None]:
dataset_path1 = ["/content/academic_report.json","/content/corporate_report.json","/content/thesis_report.json","/content/custom.json"]
data = []
for path in dataset_path1:
  with open(path,"r") as f:
    temp = json.load(f)
    data.extend(temp)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,tokens,ner_tags
0,"[Prepared, by:, Alice, Johnson]","[0, 0, 1, 2]"
1,"[Roll, No:, S-33769]","[0, 0, 3]"
2,"[Affiliation:, Oxford, University]","[0, 4, 5]"
3,"[Guided, by:, Dr., Samuel, Clark]","[0, 0, 6, 7, 7]"
4,"[Submitted, on:, 2022-11-05]","[0, 0, 8]"


# Convert Data to Hugging Face Dataset Format

In [None]:
label_list = ["O","B-AUTHOR", "I-AUTHOR", "B-ROLL_NUM",
              "B-ORG", "I-ORG","B-SUPERVISOR", "I-SUPERVISOR", "B-DATE", "I-DATE"]

label_map = {label: idx for idx, label in enumerate(label_list)}


# New Section

In [None]:
from datasets import Dataset

# Convert JSON format to Hugging Face Dataset
def convert_to_hf_format(data):
    tokenized_data = []
    for entry in data:
        tokenized_data.append({
            "tokens": entry["tokens"],
            "ner_tags": [label_map[label_list[idx]] for idx in entry["ner_tags"]]
        })
    return Dataset.from_list(tokenized_data)

# Convert dataset
hf_dataset = convert_to_hf_format(data)

# Split into train/test
hf_dataset = hf_dataset.train_test_split(test_size=0.1)

# Show an example
hf_dataset["train"][0]


{'tokens': ['Supervised', 'by:', 'Dr.', 'Linda', 'Scott'],
 'ner_tags': [0, 0, 6, 7, 7]}

# Loading BERT and Data Preprocessing

In [None]:
from transformers import AutoTokenizer

# Load BERT tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Check first few samples to confirm structure
for i in range(5):
    print(hf_dataset["train"][i])


{'tokens': ['Supervised', 'by:', 'Dr.', 'Linda', 'Scott'], 'ner_tags': [0, 0, 6, 7, 7]}
{'tokens': ['Affiliation:', 'Green', 'Energy', 'Ltd.'], 'ner_tags': [0, 4, 5, 5]}
{'tokens': ['Submission', 'Date:', '2024-05-15'], 'ner_tags': [0, 0, 8]}
{'tokens': ['Mentor:', 'Dr.', 'Linda', 'Scott'], 'ner_tags': [0, 6, 7, 7]}
{'tokens': ['Submission', 'Date:', 'April', '2022'], 'ner_tags': [0, 0, 8, 9]}


In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=128)

    all_labels = []

    for i in range(len(examples["tokens"])):  # Process each example in batch
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get word IDs per sentence
        previous_word_idx = None
        labels = []

        for word_idx in word_ids:
            if word_idx is None:
                labels.append(-100)  # Padding tokens get -100
            elif word_idx != previous_word_idx:
                labels.append(examples["ner_tags"][i][word_idx])  # Assign correct label
            else:
                labels.append(examples["ner_tags"][i][word_idx])  # Extend label to subwords

            previous_word_idx = word_idx

        all_labels.append(labels)

    tokenized_inputs["labels"] = all_labels  # Ensure consistent list format
    return tokenized_inputs

# Apply the function with batching
tokenized_datasets = hf_dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/942 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets["train"][0]

{'tokens': ['Supervised', 'by:', 'Dr.', 'Linda', 'Scott'],
 'ner_tags': [0, 0, 6, 7, 7],
 'input_ids': [101,
  13588,
  2011,
  1024,
  2852,
  1012,
  8507,
  3660,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

# Model Training BERT

In [None]:
from transformers import AutoModelForTokenClassification

# Load pre-trained BERT model
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label_list)
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert_ner_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs"
)


In [None]:
from transformers import DataCollatorForTokenClassification, Trainer

# Define Data Collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator
)


In [None]:
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdipeshghimire-dg[0m ([33mdipeshghimire-dg-amrit-campus[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.006272
2,No log,0.003251
3,No log,0.006242
4,No log,0.000614
5,0.054900,0.000558


TrainOutput(global_step=590, training_loss=0.046814435164807204, metrics={'train_runtime': 368.6825, 'train_samples_per_second': 12.775, 'train_steps_per_second': 1.6, 'total_flos': 307699184563200.0, 'train_loss': 0.046814435164807204, 'epoch': 5.0})

In [None]:
from google.colab import drive
drive.mount('/content/drive')
model.save_pretrained("/content/drive/MyDrive/fine_tuned_bert_ner")
tokenizer.save_pretrained("/content/drive/MyDrive/fine_tuned_bert_ner")

Mounted at /content/drive


NameError: name 'model' is not defined

# Load fine tuned BERT and Test for short Text

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Define model path (adjust if stored elsewhere)
model_path = "/content/drive/MyDrive/fine_tuned_bert_ner"

# Load fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Create NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)


Device set to use cpu


In [None]:
text = "Under the Supervision of Mr. Nabaraj Bahadur Negi Lecturer Submitted by: Dipesh Ghimire (199), Rabin Pant (200), Prabin Raj Amatya (201) Submitted To: Tribhuvan University February 2025"
results = ner_pipeline(text)

print(results)

[{'entity': 'LABEL_0', 'score': np.float32(0.9997441), 'index': 1, 'word': 'under', 'start': 0, 'end': 5}, {'entity': 'LABEL_0', 'score': np.float32(0.9997465), 'index': 2, 'word': 'the', 'start': 6, 'end': 9}, {'entity': 'LABEL_0', 'score': np.float32(0.99977), 'index': 3, 'word': 'supervision', 'start': 10, 'end': 21}, {'entity': 'LABEL_0', 'score': np.float32(0.99977297), 'index': 4, 'word': 'of', 'start': 22, 'end': 24}, {'entity': 'LABEL_6', 'score': np.float32(0.9988147), 'index': 5, 'word': 'mr', 'start': 25, 'end': 27}, {'entity': 'LABEL_6', 'score': np.float32(0.9984413), 'index': 6, 'word': '.', 'start': 27, 'end': 28}, {'entity': 'LABEL_2', 'score': np.float32(0.6755718), 'index': 7, 'word': 'na', 'start': 29, 'end': 31}, {'entity': 'LABEL_2', 'score': np.float32(0.6989972), 'index': 8, 'word': '##bara', 'start': 31, 'end': 35}, {'entity': 'LABEL_7', 'score': np.float32(0.83717036), 'index': 9, 'word': '##j', 'start': 35, 'end': 36}, {'entity': 'LABEL_7', 'score': np.float32

In [None]:
label_list = ["O","B-AUTHOR", "I-AUTHOR", "B-ROLL_NUM",
              "B-ORG", "I-ORG","B-SUPERVISOR", "I-SUPERVISOR", "B-DATE", "I-DATE"]
label_map = {f"LABEL_{i}": label for i, label in enumerate(label_list)}

In [None]:
def merge_word_pieces(ner_results):
    """
    Merges subword tokens correctly (fixes WordPiece issues).
    Example: ['Tri', '##bh', '##u', '##van'] → ['Tribhuvan']
    """
    merged_results = []
    current_word = ""
    current_label = None
    current_start = None

    for entity in ner_results:
        word = entity["word"]
        label = label_map[entity["entity"]]  # Map LABEL_X to actual label
        start, end = entity["start"], entity["end"]

        if word.startswith("##"):  # Subword token detected
            current_word += word[2:]  # Remove "##" and append
        else:
            if current_word:  # Store previous word
                merged_results.append({"word": current_word, "entity": current_label, "start": current_start, "end": end})
            current_word = word
            current_label = label
            current_start = start

    if current_word:  # Append last word
        merged_results.append({"word": current_word, "entity": current_label, "start": current_start, "end": end})

    return merged_results


In [None]:
import re

def clean_roll_number(text):
    """ Remove unwanted characters like '(', ')', ',' from roll numbers. """
    return re.sub(r"[^\d]", "", text)

def convert_ner_results(ner_results):
    """
    Converts processed NER results into structured metadata.
    Fixes multi-token supervisor extraction.
    """
    extracted_metadata = {
        "authors": set(),
        "roll_numbers": set(),
        "organization": "",
        "supervisor": "",
        "submission_date": ""
    }

    merged_results = merge_word_pieces(ner_results)  # Ensure WordPiece tokens are merged correctly

    current_entity = None
    entity_text = ""

    for entity in merged_results:
        label = entity["entity"]
        word = entity["word"]

        if label.startswith("B-"):  # Beginning of an entity
            if current_entity:
                if current_entity == "author":
                    extracted_metadata["authors"].add(entity_text.strip())
                elif current_entity == "roll_num":
                    cleaned_roll_num = clean_roll_number(entity_text.strip())
                    if cleaned_roll_num:
                        extracted_metadata["roll_numbers"].add(cleaned_roll_num)
                elif current_entity == "supervisor":
                    extracted_metadata["supervisor"] = entity_text.strip()  # Store supervisor name
                else:
                    extracted_metadata[current_entity] = entity_text.strip()

            current_entity = label[2:].lower()  # Remove "B-" prefix
            entity_text = word
        elif label.startswith("I-") and current_entity:  # Continuation of entity
            entity_text += " " + word
        else:
            current_entity = None  # Reset entity

    # Store last entity
    if current_entity:
        if current_entity == "author":
            extracted_metadata["authors"].add(entity_text.strip())
        elif current_entity == "roll_num":
            cleaned_roll_num = clean_roll_number(entity_text.strip())
            if cleaned_roll_num:
                extracted_metadata["roll_numbers"].add(cleaned_roll_num)
        elif current_entity == "supervisor":
            extracted_metadata["supervisor"] = entity_text.strip()  # Ensure full supervisor name is stored
        else:
            extracted_metadata[current_entity] = entity_text.strip()

    # Merge organization fields
    if "org" in extracted_metadata and extracted_metadata["org"]:
        extracted_metadata["organization"] = extracted_metadata.pop("org")

    # Merge date fields
    if "date" in extracted_metadata and extracted_metadata["date"]:
        extracted_metadata["submission_date"] = extracted_metadata.pop("date")

    return extracted_metadata


In [None]:
# Convert raw NER results to structured format
structured_metadata = convert_ner_results(results)

print("Final Extracted Metadata:\n", structured_metadata)


Final Extracted Metadata:
 {'authors': {'rabin pant', 'prabin raj amatya', 'dipesh ghimire'}, 'roll_numbers': {'199', '201', '200'}, 'organization': 'tribhuvan university', 'supervisor': 'mr', 'submission_date': 'february 2025'}


In [None]:
structured_metadata = convert_ner_results(results)

print("Final Extracted Metadata:\n", structured_metadata)


Final Extracted Metadata:
 {'authors': {'rabin pant', 'prabin raj amatya', 'dipesh ghimire'}, 'roll_numbers': {'199', '201', '200'}, 'organization': 'tribhuvan university', 'supervisor': 'mr', 'submission_date': 'february 2025'}


In [None]:
text2 = "Name of Supervisor: Mr Nabaraj Bahadur Negi"
results2 = ner_pipeline(text2)
# Convert raw NER results to structured format
structured_metadata2 = convert_ner_results(results2)

print("Final Extracted Metadata:\n", structured_metadata2)

Final Extracted Metadata:
 {'authors': set(), 'roll_numbers': set(), 'organization': '', 'supervisor': 'mr nabaraj bahadur negi', 'submission_date': ''}


# Extract Text from PDF

In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.4


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import fitz  # PyMuPDF

def extract_first_page_text(pdf_path):
    doc = fitz.open(pdf_path)  # Open PDF
    first_page_text = doc[0].get_text("text")  # Extract text from first page
    return first_page_text.strip()

# Example Usage
pdf_path = "/content/drive/MyDrive/toxicMeter.pdf"  # Replace with your PDF path
pdf_text = extract_first_page_text(pdf_path)

print("Extracted Text from First Page:\n", pdf_text)

Extracted Text from First Page:
 TRIBHUVAN UNIVERSITY 
INSTITUTE OF SCIENCE AND TECHNOLOGY 
 
Project Report On 
TOXIC COMMENT MODERATION SYSTEM 
In the partial fulfilment of the requirements for the Bachelor’s Degree in 
Information Technology 
 
Under the supervision of 
Mr. Nabaraj Bahadur Negi 
Lecturer 
Department of Information Technology 
Amrit Campus 
Lainchaur, Kathmandu 
 
Submitted by 
Dipesh Ghimire (199/077) 
Rajesh Adhikari (212/077)  
Sijan B.K. (223/077) 
Department of Information Technology 
Amrit Campus 
Lainchaur, Kathmandu 
 
Submitted to 
Tribhuvan University 
Institute of Science and Technology 
February 2025


In [None]:
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize full text
tokens = tokenizer(pdf_text, return_tensors="pt", truncation=False)

# Convert token IDs back to words (to check length)
tokenized_text = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])

print(f"Total Tokens: {len(tokenized_text)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Total Tokens: 127


In [None]:
def split_tokens_into_sliding_windows(tokenized_text, max_tokens=50, overlap=20):
    """
    Splits tokenized text into overlapping sliding windows of max_tokens.
    Uses overlap to maintain entity continuity.
    """
    chunks = []
    i = 0

    while i < len(tokenized_text):
        chunk = tokenized_text[i:i + max_tokens]  # Take max token length
        chunks.append(chunk)
        i += max_tokens - overlap  # Move window by max_tokens - overlap

    return chunks

# Apply sliding window technique
token_chunks = split_tokens_into_sliding_windows(tokenized_text)

# Convert tokens back to text for processing
text_chunks = [tokenizer.convert_tokens_to_string(chunk) for chunk in token_chunks]

print("First Tokenized Chunk:\n", text_chunks[0])


First Tokenized Chunk:
 [CLS] tribhuvan university institute of science and technology project report on toxic comment moderation system in the partial fulfilment of the requirements for the bachelor ’ s degree in information technology under the supervision of mr. nabaraj bahadur negi


In [None]:
print(" Tokenized Chunk:\n", text_chunks[4])

 Tokenized Chunk:
 science and technology february 2025 [SEP]


In [None]:
results = []
for chunk in text_chunks:
    chunk_results = ner_pipeline(chunk)
    results.extend(chunk_results)  # Store results from all chunks


In [None]:
print(results)

[{'entity': 'LABEL_0', 'score': np.float32(0.9678183), 'index': 1, 'word': '[CLS]', 'start': 0, 'end': 5}, {'entity': 'LABEL_4', 'score': np.float32(0.9952089), 'index': 2, 'word': 'tri', 'start': 6, 'end': 9}, {'entity': 'LABEL_4', 'score': np.float32(0.9933429), 'index': 3, 'word': '##bh', 'start': 9, 'end': 11}, {'entity': 'LABEL_4', 'score': np.float32(0.9962812), 'index': 4, 'word': '##u', 'start': 11, 'end': 12}, {'entity': 'LABEL_4', 'score': np.float32(0.9977374), 'index': 5, 'word': '##van', 'start': 12, 'end': 15}, {'entity': 'LABEL_5', 'score': np.float32(0.99610066), 'index': 6, 'word': 'university', 'start': 16, 'end': 26}, {'entity': 'LABEL_5', 'score': np.float32(0.9964089), 'index': 7, 'word': 'institute', 'start': 27, 'end': 36}, {'entity': 'LABEL_5', 'score': np.float32(0.9962604), 'index': 8, 'word': 'of', 'start': 37, 'end': 39}, {'entity': 'LABEL_5', 'score': np.float32(0.995103), 'index': 9, 'word': 'science', 'start': 40, 'end': 47}, {'entity': 'LABEL_5', 'score'

In [None]:
structured_metadata = convert_ner_results(results)

print("Final Extracted Metadata:\n", structured_metadata)


Final Extracted Metadata:
 {'authors': {'dipesh ghimire', 'sijan b', 'rajesh adhikari'}, 'roll_numbers': {'223', '212', '199', '077'}, 'organization': 'science and technology', 'supervisor': '.', 'submission_date': 'february 2025'}


In [None]:
import re

# Define Label Mapping (Adjust according to your model)
label_list = ["O", "B-AUTHOR", "I-AUTHOR", "B-ROLL_NUM",
              "B-ORG", "I-ORG", "B-SUPERVISOR", "I-SUPERVISOR", "B-DATE", "I-DATE"]
label_map = {f"LABEL_{i}": label for i, label in enumerate(label_list)}

def merge_word_pieces(ner_results):
    """
    Merges subword tokens correctly (fixes WordPiece issues).
    Example: ['na', '##bara', '##j'] → ['Nabaraj']
    """
    merged_results = []
    current_word = ""
    current_label = None
    current_start = None

    for entity in ner_results:
        word = entity["word"]
        label = label_map[entity["entity"]]  # Convert LABEL_X to actual label
        start, end = entity["start"], entity["end"]

        if word.startswith("##"):  # Subword token detected
            current_word += word[2:]  # Remove "##" and append
        else:
            if current_word:  # Store previous word
                merged_results.append({"word": current_word, "entity": current_label, "start": current_start, "end": end})
            current_word = word
            current_label = label
            current_start = start

    if current_word:  # Append last word
        merged_results.append({"word": current_word, "entity": current_label, "start": current_start, "end": end})

    return merged_results
def clean_roll_number(text):
    """ Remove unwanted characters like '(', ')', ',' from roll numbers. """
    return re.sub(r"[^\d]", "", text)

def convert_ner_results(ner_results):
    """
    Converts processed NER results into structured metadata.
    Fixes multi-token merging for each label.
    """
    extracted_metadata = {
        "authors": set(),
        "roll_numbers": set(),
        "organization": "",
        "supervisor": "",
        "submission_date": ""
    }

    merged_results = merge_word_pieces(ner_results)  # Ensure WordPiece tokens are merged correctly

    # Temporary variables to store multi-token entities
    author_text, roll_text, org_text, supervisor_text, date_text = "", "", "", "", ""
    in_author, in_roll, in_org, in_supervisor, in_date = False, False, False, False, False

    for entity in merged_results:
        label = entity["entity"]
        word = entity["word"]

        # Handle authors
        if label == "B-AUTHOR":
            if in_author:
                extracted_metadata["authors"].add(author_text.strip())  # Store previous
            author_text = word
            in_author = True
        elif label == "I-AUTHOR" and in_author:
            author_text += " " + word

        # Handle roll numbers
        elif label == "B-ROLL_NUM":
            if in_roll:
                cleaned_roll_num = clean_roll_number(roll_text.strip())
                if cleaned_roll_num:
                    extracted_metadata["roll_numbers"].add(cleaned_roll_num)
            roll_text = word
            in_roll = True
        elif label == "I-ROLL_NUM" and in_roll:
            roll_text += " " + word

        # Handle organization
        elif label == "B-ORG":
            if in_org:
                extracted_metadata["organization"] = org_text.strip()
            org_text = word
            in_org = True
        elif label == "I-ORG" and in_org:
            org_text += " " + word

        # Handle supervisor
        elif label == "B-SUPERVISOR":
            if in_supervisor:
                extracted_metadata["supervisor"] = supervisor_text.strip()
            supervisor_text = word
            in_supervisor = True
        elif label == "I-SUPERVISOR" and in_supervisor:
            supervisor_text += " " + word

        # Handle date
        elif label == "B-DATE":
            if in_date:
                extracted_metadata["submission_date"] = date_text.strip()
            date_text = word
            in_date = True
        elif label == "I-DATE" and in_date:
            date_text += " " + word

    # Store the last entity values
    if author_text:
        extracted_metadata["authors"].add(author_text.strip())
    if roll_text:
        cleaned_roll_num = clean_roll_number(roll_text.strip())
        if cleaned_roll_num:
            extracted_metadata["roll_numbers"].add(cleaned_roll_num)
    if org_text:
        extracted_metadata["organization"] = org_text.strip()
    if supervisor_text:
        extracted_metadata["supervisor"] = supervisor_text.strip()
    if date_text:
        extracted_metadata["submission_date"] = date_text.strip()

    return extracted_metadata
