In [1]:
import json

def merge_consecutive_entities(data):
    """Merge runs of neighbouring same-label entity spans into single spans.

    The list is walked in the order given. An entity is folded into the span
    currently being built when it has the same ``"label"`` and the two spans
    either touch or are separated by exactly one character, in either order.
    The merged span keeps every other key (e.g. ``"id"``) of the first entity
    of the run.

    Args:
        data: list of dicts, each with ``"label"``, ``"start_offset"`` and
            ``"end_offset"`` keys.

    Returns:
        A new list of merged entity dicts. The input dicts are copied, not
        modified. An empty input yields an empty list.
    """
    if not data:
        # The original indexed data[0] unconditionally and raised IndexError.
        return []

    def _follows(left, right):
        """True when `right` starts at, or one character after, the end of `left`."""
        return right["start_offset"] - left["end_offset"] in (0, 1)

    merged_entities = []
    # Work on copies so the caller's entity dicts are left untouched.
    current_entity = dict(data[0])
    for next_entity in data[1:]:
        # The reversed-order, one-character-gap case used to be written as
        # `current.start + 1 == next.end`, which has the sign the wrong way
        # round; checking both directions with the same helper keeps the test
        # symmetric.
        mergeable = current_entity["label"] == next_entity["label"] and (
            _follows(current_entity, next_entity)
            or _follows(next_entity, current_entity)
        )
        if mergeable:
            current_entity["start_offset"] = min(
                current_entity["start_offset"], next_entity["start_offset"]
            )
            current_entity["end_offset"] = max(
                current_entity["end_offset"], next_entity["end_offset"]
            )
        else:
            merged_entities.append(current_entity)
            current_entity = dict(next_entity)

    merged_entities.append(current_entity)
    return merged_entities


input_file_path = "all.jsonl"

merged_data = []

# Entity offsets are character offsets, so the file must be decoded the same
# way on every platform; pin UTF-8 instead of relying on the locale default.
with open(input_file_path, "r", encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        data = json.loads(line)
        data.pop("relations", None)  # Remove "relations" key if present
        data.pop("Comments", None)
        if not data.get("entities"):
            # Samples without any annotated entity are dropped.
            continue
        # Merge in annotation-id order.
        data["entities"].sort(key=lambda x: x["id"])
        data["entities"] = merge_consecutive_entities(data["entities"])
        merged_data.append(data)


In [2]:
# Every distinct entity label present in the merged samples.
unique_labels = set(entity["label"] for sample in merged_data for entity in sample["entities"])

# Enumerate the labels in sorted order: iterating the raw set would hand out
# ids in an order that can change from one interpreter run to the next.
label2id = {label: idx for idx, label in enumerate(sorted(unique_labels))}

# NOTE: `label2id` is rebound by the tokenization step below, which shifts the
# ids by one to reserve 0, so the `label_id` written here does not match the
# ids used for training.
for sample in merged_data:
    for entity in sample["entities"]:
        entity["label_id"] = label2id.get(entity["label"], -1)  # -1 for unknown labels



In [3]:
from transformers import AutoTokenizer

def tokenize_and_add_labels(data, tokenizer=None, model_name="microsoft/deberta-v2-xlarge",
                            max_length=512, special_label_id=0):
    """Tokenize every sample and attach one label id per token.

    Entity spans are projected onto tokens through the tokenizer's character
    offset mapping: a token receives an entity's label id when its character
    range overlaps ``[start_offset, end_offset)``. This replaces the previous
    approach of re-encoding text prefixes, which skipped entities starting at
    offset 0, ignored the leading special token when indexing, and labelled
    clamped positions for entities beyond a 510-character cut. The text is now
    truncated by the tokenizer to ``max_length`` tokens only.

    Args:
        data: list of sample dicts with a ``"text"`` string and an
            ``"entities"`` list of dicts carrying ``"label"``,
            ``"start_offset"`` and ``"end_offset"``.
            # assumes end_offset is exclusive, as the merge step treats
            # end == next start as adjacent — TODO confirm against the export.
        tokenizer: optional pre-loaded tokenizer. It must support
            ``return_offsets_mapping`` (a "fast" tokenizer). When omitted,
            ``model_name`` is loaded with ``AutoTokenizer``.
        model_name: checkpoint to load when ``tokenizer`` is not given.
        max_length: padded/truncated sequence length in tokens.
        special_label_id: label id written on special and padding positions.
            The default 0 keeps the historical behaviour; pass -100 to have
            those positions skipped by code that ignores -100.

    Returns:
        ``(tokenized_data, label2id)``. ``tokenized_data`` is a list of dicts
        with ``"input_ids"``, ``"labels"``, ``"attention_mask"`` and
        ``"split_tokens"``, each of length ``max_length``. ``label2id`` maps
        ``"[PAD]"`` to 0 and every entity label to 1..N in sorted order, so
        ids are reproducible and the key order follows the id order.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    unique_labels = {
        entity["label"] for sample in data for entity in sample.get("entities", [])
    }

    # Id 0 is reserved for "[PAD]", which also serves as the no-entity class.
    label2id = {"[PAD]": 0}
    for idx, label in enumerate(sorted(unique_labels - {"[PAD]"}, key=str), start=1):
        label2id[label] = idx

    tokenized_data = []

    for sample in data:
        text = sample.get("text", "")
        entities = sample.get("entities", [])

        encoding = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
        )

        input_ids = list(encoding["input_ids"])
        attention_mask = list(encoding["attention_mask"])
        split_tokens = tokenizer.convert_ids_to_tokens(input_ids)

        labels = []
        content_tokens = []  # (token index, char start, char end) of real text tokens
        token_info = zip(
            encoding["offset_mapping"], encoding["special_tokens_mask"], attention_mask
        )
        for idx, ((tok_start, tok_end), is_special, attended) in enumerate(token_info):
            if is_special or not attended:
                labels.append(special_label_id)
                continue
            labels.append(0)
            if tok_end > tok_start:
                content_tokens.append((idx, tok_start, tok_end))

        for entity in entities:
            start_offset = entity.get("start_offset", 0)
            end_offset = entity.get("end_offset", 0)
            label_id = label2id.get(entity.get("label", ""), 0)

            # Entities lying entirely past the truncation point overlap no
            # token and are simply left out.
            for idx, tok_start, tok_end in content_tokens:
                if tok_start < end_offset and tok_end > start_offset:
                    labels[idx] = label_id

        tokenized_data.append({
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
            "split_tokens": split_tokens,
        })

    return tokenized_data, label2id


# Build the model-ready samples together with the label vocabulary.
tokenized_data, label2id = tokenize_and_add_labels(merged_data)

# Reverse lookup: integer label id -> label name.
id2label = {label_id: label_name for label_name, label_id in label2id.items()}


In [4]:
# Label ids that actually occur in the tokenized samples.
unique_labels = set(label for sample in tokenized_data for label in sample["labels"])
id2label = {v: k for k, v in label2id.items()}

# `label_list[i]` must be the name of label id `i`, because the metric code
# indexes it with predicted/reference ids. Taking `label2id.keys()` as-is does
# not guarantee that: it follows insertion order, not id order.
label_list = [id2label[i] for i in range(len(id2label))]



In [5]:
from torch import cuda
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


In [6]:
# Hugging Face checkpoint identifiers, keyed by a short alias.
models = {
    "ROBERTA": "roberta-base",
    "DISTILBERT_U": "distilbert-base-uncased",
    "DISTILBERT_C": "distilbert-base-cased",
    "DEBERTA_V2_XL": "microsoft/deberta-v2-xlarge",
    "DEBERTA_V2_XXL": "microsoft/deberta-v2-xxlarge",
}

In [7]:
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch
# Split the data into training and validation sets (80 % / 20 %).
# A fixed random_state makes the split identical on every run, so validation
# scores are comparable between runs. The literal 42 mirrors RANDOM_SEED, which
# is only defined in a later configuration cell and cannot be referenced here.
train_data, valid_data = train_test_split(tokenized_data, test_size=0.2, random_state=42)

num_labels = len(label2id)
train_loader = DataLoader(train_data, batch_size=4, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=4, shuffle=False)


In [8]:
# Sanity check: report every sample whose label sequence is not 512 long.
# Silence means all samples have the expected length.
for sample in tokenized_data:
    label_count = len(sample["labels"])
    if label_count == 512:
        continue
    print(label_count)

In [9]:
import os, re, math, random, json, string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import wandb

import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import TrainerCallback, AdamW, get_cosine_schedule_with_warmup
from transformers import DataCollatorForTokenClassification, PreTrainedModel, RobertaTokenizerFast

from datasets import load_dataset, ClassLabel, Sequence, load_metric

from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [10]:
# Authenticate this session with Weights & Biases before any run is started.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mstagarwal[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [11]:
# Date stamp (DD-MM-YYYY) appended to the W&B run name.
from datetime import date
today = date.today()
log_date = f"{today:%d-%m-%Y}"

In [12]:
# Load or train the model: 1 trains the weights, 0 loads saved weights.
# NOTE(review): TRAIN is not read anywhere in the visible notebook.
TRAIN = 1 # 1 to TRAIN WEIGHTS or 0 to LOAD WEIGHTS

# Intended train/validation split.
# NOTE(review): not used by the visible split cell, which hard-codes test_size=0.2.
TRAIN_SPLIT = 0.90

# Random seed for reproducibility (passed to TrainingArguments as `seed`).
RANDOM_SEED = 42

# Per-device batch size for training and evaluation; it also scales the
# maximum learning rate. Reduce it on out-of-memory errors.
BATCH_SIZES = 1

# Number of training epochs.
EPOCHS = 10

# Which pre-trained transformer checkpoint to fine-tune.
MODEL_CHECKPOINT = models['DEBERTA_V2_XL']

# SPECIFY THE WEIGHTS AND BIASES PROJECT NAME
%env WANDB_PROJECT = 'P2D-NER-2021' 

# DETERMINE WHETHER TO SAVE THE MODEL IN THE 100GB OF FREE W&B STORAGE
%env WANDB_LOG_MODEL = false 

env: WANDB_PROJECT='P2D-NER-2021'
env: WANDB_LOG_MODEL=false


In [13]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Token-level accuracy over all positions that carry a real label.

    NOTE: a later cell defines another `compute_metrics`, which rebinds this
    name before the Trainer is created.

    Args:
        eval_pred: ``(logits, labels)`` pair; the arg-max over the last axis
            of ``logits`` is taken as the prediction.
            # assumes both are numpy arrays — TODO confirm.

    Returns:
        The dict produced by the ``accuracy`` metric.
    """
    logits, labels = eval_pred
    predictions_flat = np.argmax(logits, axis=-1).flatten()
    labels_flat = labels.flatten()
    # Positions labelled -100 are ignored, matching the filtering done by the
    # other metric code in this notebook; without any -100 this is a no-op.
    keep = labels_flat != -100
    return metric.compute(predictions=predictions_flat[keep], references=labels_flat[keep])


In [14]:

# File and directory names used by the notebook.
# NOTE(review): FEATURE_CLASS_LABELS and DATA_FILE are not read in the visible
# cells; the data is loaded from "all.jsonl" at the top of the notebook.
FEATURE_CLASS_LABELS = "feature_class_labels.json"
DATA_FILE = 'all.json'
# Checkpoint directory written by the Trainer during training.
TEMP_MODEL_OUTPUT_DIR = 'temp_model_output_dir'
# Where the final model is saved; the checkpoint id is appended as-is.
SAVED_MODEL = "p2d-NER-Fine-Tune-Transformer-{}".format(MODEL_CHECKPOINT) # Change for notebook version

In [15]:
#Optimizer
# Size the classification head from the label vocabulary rather than a
# hard-coded count, and store the id <-> name mapping in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
learning_rate = 0.0000075
lr_max = learning_rate * BATCH_SIZES  # peak learning rate scales with batch size
weight_decay = 0.05

optimizer = AdamW(
    model.parameters(),
    lr=lr_max,
    weight_decay=weight_decay)

print("The maximum learning rate is: ",lr_max)

# Learning Rate Schedule
num_train_samples = len(train_data)
warmup_ratio = 0.2 # Fraction of total steps spent going from zero to max learning rate
num_cycles=0.8 # Number of cosine waves over the decay phase

# Step counts must be whole numbers: count the batches per epoch (the last one
# may be partial) and multiply by the number of epochs.
# NOTE(review): assumes one device and no gradient accumulation — confirm.
steps_per_epoch = math.ceil(num_train_samples / BATCH_SIZES)
num_training_steps = steps_per_epoch * EPOCHS
num_warmup_steps = int(num_training_steps * warmup_ratio)

lr_sched = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                           num_warmup_steps=num_warmup_steps,
                                           num_training_steps = num_training_steps,
                                           num_cycles=num_cycles)

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v2-xlarge and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The maximum learning rate is:  7.5e-06




In [16]:
# Hyper-parameters and bookkeeping options handed to the Trainer.
args = TrainingArguments(
    output_dir=TEMP_MODEL_OUTPUT_DIR,
    # Evaluate, log and checkpoint once per epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    learning_rate=lr_max,
    weight_decay=weight_decay,
    lr_scheduler_type="cosine",
    warmup_ratio=warmup_ratio,
    per_device_train_batch_size=BATCH_SIZES,
    per_device_eval_batch_size=BATCH_SIZES,
    num_train_epochs=EPOCHS,
    seed=RANDOM_SEED,
    # Weights & Biases logging; run_name is the name of the W&B run (optional).
    report_to="wandb",
    run_name=MODEL_CHECKPOINT + "-" + log_date,
    # Reload the best checkpoint when training ends, ranked by "f1".
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

In [17]:
# Tokenizer matching the checkpoint being fine-tuned, and the token-classification
# collator built on it; both are handed to the Trainer below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
data_collator = DataCollatorForTokenClassification(tokenizer)

In [18]:
def compute_metrics(p):
    """Compute precision, recall, F1 and accuracy for an evaluation pass.

    Args:
        p: ``(predictions, labels)`` pair. The arg-max over axis 2 of
            ``predictions`` gives the predicted label id of each token.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``.

    Label ids are turned into names through the module-level ``label_list``;
    positions whose reference label is -100 are left out of both sequences.
    """
    raw_scores, gold_ids = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real reference label.
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    return {
        "precision": precision_score(true_labels, true_predictions, zero_division=1),
        "recall": recall_score(true_labels, true_predictions, zero_division=1),
        "f1": f1_score(true_labels, true_predictions, zero_division=1),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }

In [19]:
# Build the Trainer from the model, arguments, data splits and metric function.
# The optimizer and schedule created earlier are passed in explicitly.
# NOTE(review): every sample also carries a list-of-strings "split_tokens"
# field; confirm the collator tolerates it.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_sched),
)

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate on the validation set that was passed to the Trainer as eval_dataset.
trainer.evaluate()

In [None]:
# Close the Weights & Biases run.
wandb.finish() 

In [None]:
# Save the trained model under SAVED_MODEL; the prediction cell reloads it from there.
trainer.save_model(SAVED_MODEL)

In [None]:
# Reload the saved model and wrap it in a Trainer that is used for prediction only.
loaded_model = AutoModelForTokenClassification.from_pretrained(SAVED_MODEL)

pred_trainer = Trainer(
    model=loaded_model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Predict on the validation split and turn the scores into label ids.
predictions, labels, _ = pred_trainer.predict(valid_data)
predictions = np.argmax(predictions, axis=2)

# Map ids to label names, dropping positions whose reference label is -100.
true_predictions = [
    [
        label_list[pred_id]
        for pred_id, gold_id in zip(pred_row, gold_row)
        if gold_id != -100
    ]
    for pred_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [
        label_list[gold_id]
        for pred_id, gold_id in zip(pred_row, gold_row)
        if gold_id != -100
    ]
    for pred_row, gold_row in zip(predictions, labels)
]

# Per-label precision / recall / F1 report.
results = classification_report(true_labels, true_predictions, zero_division=1)
print(results)

In [None]:
# Pick one validation sample and compare the lengths of its token list,
# its predicted labels and its reference labels (one length per line).
check = 3

print(
    len(valid_data[check]['split_tokens']),
    len(true_predictions[check]),
    len(true_labels[check]),
    sep="\n",
)

In [None]:
# Print (token, predicted label) pairs for the chosen sample, skipping 'O'.
# NOTE(review): the label vocabulary built in this notebook uses "[PAD]" for
# id 0; confirm that 'O' is the label meant to be filtered out here.
check_pred = zip(valid_data[check]['split_tokens'], true_predictions[check])
for token, predicted in check_pred:
    if predicted != 'O':
        print((token, predicted))