# Installation of Required Libraries

In [1]:
!pip install --upgrade pip



In [2]:
! pip install transformers datasets --upgrade



In [3]:
!pip install IProgress



In [4]:
!pip install transformers datasets evaluate seqeval --upgrade



# Load dataset

In [5]:
from datasets import load_dataset

# Load the MultiNERD dataset ("Babelscape/multinerd"). The result is indexed by split name
# ("train", "validation", "test") further down in the notebook.
multinerd_dataset = load_dataset("Babelscape/multinerd")

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]



# Data Preprocessing

### System A Data

- Filtering out non-English samples.
- Creating the label list, the label-to-id mapping and the id-to-label mapping.

In [6]:
# Full System A tag set in id order: position in this list == label id.
label_list_systemA = [
    "O",
    "B-PER", "I-PER",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC",
    "B-ANIM", "I-ANIM",
    "B-BIO", "I-BIO",
    "B-CEL", "I-CEL",
    "B-DIS", "I-DIS",
    "B-EVE", "I-EVE",
    "B-FOOD", "I-FOOD",
    "B-INST", "I-INST",
    "B-MEDIA", "I-MEDIA",
    "B-MYTH", "I-MYTH",
    "B-PLANT", "I-PLANT",
    "B-TIME", "I-TIME",
    "B-VEHI", "I-VEHI",
]
# label -> id and id -> label lookups derived from the single ordered list above.
label2id_systemA = {label: label_id for label_id, label in enumerate(label_list_systemA)}
id2label_systemA = dict(enumerate(label_list_systemA))

In [7]:
# System A uses every example whose "lang" field is "en", with the full tag set left untouched.
system_a_dataset = multinerd_dataset.filter(
    lambda example: example["lang"] == "en"
)

### System B Data

- Setting every tag that is not "Person" (label ids 1, 2), "Organization" (label ids 3, 4), "Location" (label ids 5, 6), "Animal" (label ids 7, 8) or "Disease" (label ids 13, 14) to the "Other" label `O` (label id 0).
- Mapping the remaining label ids to the consecutive indices 0-10, so that the argmax over the model's output layer directly gives the predicted label.
- Creating the label list, the label-to-id mapping and the id-to-label mapping.

In [8]:
# Tag ids from the original dataset that System B keeps; every other id is folded into 0 ("O").
system_b_valid_ner_tags = [0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 14]
# Renumber the kept ids consecutively (0..10) so the model can be trained on a dense label space.
map_actual_new_ner_tags = {
    original_id: new_id
    for new_id, original_id in enumerate(system_b_valid_ner_tags)
}
map_actual_new_ner_tags

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 13: 9, 14: 10}

In [9]:
# Re-key the System A id -> label table with the compacted System B ids.
id2label_systemB = {
    new_id: id2label_systemA[original_id]
    for original_id, new_id in map_actual_new_ner_tags.items()
}
# Inverse lookup (label -> new id) and the ordered label list used when decoding predictions.
label2id_systemB = {label: new_id for new_id, label in id2label_systemB.items()}
label_list_systemB = list(label2id_systemB)

In [10]:
# Show the System B label -> id mapping.
label2id_systemB

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-ANIM': 7,
 'I-ANIM': 8,
 'B-DIS': 9,
 'I-DIS': 10}

In [11]:
def filter_labels_for_systemB(data):
    """Rewrite one example's ``ner_tags`` into the System B label space.

    Tag ids that are keys of ``map_actual_new_ner_tags`` are replaced by their
    new consecutive id; every other tag id becomes 0.

    Args:
        data: A single example (mapping) with a ``"ner_tags"`` list of tag ids.

    Returns:
        The same ``data`` object, with ``"ner_tags"`` replaced in place.
    """
    # One dict lookup with a default replaces the list membership test plus the lookup.
    data["ner_tags"] = [map_actual_new_ner_tags.get(tag, 0) for tag in data["ner_tags"]]
    return data

In [12]:
# System B starts from the English-only System A data and only rewrites the `ner_tags` column.
system_b_dataset = system_a_dataset.map(filter_labels_for_systemB)

# Training

## Tokenization

Creating a tokenizer with the Hugging Face API to turn the words into token ids that can be passed to the DistilBERT model.

In [13]:
from transformers import AutoTokenizer

# Tokenizer for the same "distilbert-base-uncased" checkpoint that both models are loaded from below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

This function tokenizes the words for the DistilBERT model and realigns the labels with the resulting tokens. This step is needed because the DistilBERT tokenizer can split a word into multiple tokens and also adds special tokens such as [CLS] and [SEP], which mark the beginning and end of the sentence. The label list must therefore be adjusted to match the tokens produced by the tokenizer.

- When a word is split into multiple tokens, the label is assigned to the first token and the remaining tokens are given the label value -100, a special value which indicates that these positions must be ignored when the loss is computed.
- Special tokens are also given the label -100 so that they are ignored during loss calculation.

In [14]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the word-level tags to the tokens.

    Args:
        examples: A batch with ``"tokens"`` (one list of words per sentence) and
            ``"ner_tags"`` (one list of tag ids per sentence, one id per word).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry: one
        list per sentence, holding the word's tag id on the first token of each
        word and -100 on every other position.
    """
    ignore_index = -100  # label value marking positions that must not be scored

    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for batch_index, word_labels in enumerate(examples["ner_tags"]):
        # word_ids maps every produced token back to the index of its source word
        # (None for tokens that belong to no word, i.e. special tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the word just labelled.
                label_ids.append(ignore_index)
            else:
                # First token of a new word carries that word's tag.
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

Creating tokens for system A and system B dataset.

In [15]:
# Tokenize both variants in batched mode; System A is processed first, then System B.
system_a_token, system_b_token = (
    dataset.map(tokenize_and_align_labels, batched=True)
    for dataset in (system_a_dataset, system_b_dataset)
)

Map:   0%|          | 0/32820 [00:00<?, ? examples/s]

## Evaluation Metric Creation

In [16]:
import evaluate

# Load the "seqeval" metric; the metric functions below call it via
# `seqeval.compute(predictions=..., references=...)`.
seqeval = evaluate.load("seqeval")

Creating the functions that compute the evaluation metrics for system A and system B. They evaluate token classification using precision, recall, F1 score and accuracy.

In [17]:
import numpy as np
import itertools
from sklearn import metrics
import matplotlib.pyplot as plt

def _compute_seqeval_metrics(eval_prediction, label_names):
    """Score token-classification output with seqeval for a given label vocabulary.

    Args:
        eval_prediction: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-token class scores (argmax is taken over axis 2); ``labels``
            holds the reference label ids, with -100 on positions to skip.
        label_names: Sequence mapping a label id to its tag string.

    Returns:
        dict with the overall ``precision``, ``recall``, ``f1`` and ``accuracy``
        reported by seqeval.
    """
    scores, label_ids = eval_prediction
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        # zip() stops at the shorter row; positions labelled -100 are not scored.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, label_row) if gold != -100]
        true_predictions.append([label_names[pred] for pred, _ in kept])
        true_labels.append([label_names[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


def compute_metrics_systemA(p):
    """Compute seqeval precision/recall/F1/accuracy using the System A label list.

    Args:
        p: ``(predictions, labels)`` pair as described in ``_compute_seqeval_metrics``.
    """
    return _compute_seqeval_metrics(p, label_list_systemA)


def compute_metrics_systemB(p):
    """Compute seqeval precision/recall/F1/accuracy using the System B label list.

    Args:
        p: ``(predictions, labels)`` pair as described in ``_compute_seqeval_metrics``.
    """
    return _compute_seqeval_metrics(p, label_list_systemB)

## Model Creation

Creating optimizer and setting hyperparameters for training the models

In [37]:
from transformers import create_optimizer

# Hyperparameters shared by both systems (batch_size and num_train_epochs are reused by
# the System B optimizer, the tf.data set creation and the fit() calls).
batch_size = 16
num_train_epochs = 3
# Total optimizer steps: full batches per epoch (integer division drops a partial batch)
# times the number of epochs.
num_train_stepsA = (len(system_a_token["train"]) // batch_size) * num_train_epochs
# create_optimizer returns the optimizer together with its learning-rate schedule.
optimizer_systemA, lr_schedule_systemA = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_stepsA,
    weight_decay_rate=0.01,
    num_warmup_steps=0,  # no warm-up phase
)

In [19]:

# Same schedule computation as for System A, with a separate optimizer instance for the
# System B model. Relies on `batch_size` and `num_train_epochs` from the previous cell.
num_train_stepsB = (len(system_b_token["train"]) // batch_size) * num_train_epochs
optimizer_systemB, lr_schedule_systemB = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_stepsB,
    weight_decay_rate=0.01,
    num_warmup_steps=0,  # no warm-up phase
)

Loading the pretrained models for training on the system A and system B datasets.

In [20]:
from transformers import TFAutoModelForTokenClassification

# System A: token-classification head sized to the full 31-label tag set.
model_system_a = TFAutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label2id_systemA), id2label=id2label_systemA, label2id=label2id_systemA
)

# System B: same checkpoint, but the head is sized to the reduced, renumbered label set.
model_system_b = TFAutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(id2label_systemB), id2label=id2label_systemB, label2id=label2id_systemB
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

## Train, Validation, Test Dataset Creation

Creating a data collator object that builds the batches of inputs used during training. The data collator also pads the inputs so that all inputs in a batch have the size required by the models (truncation was already applied during tokenization).

In [21]:
from transformers import DataCollatorForTokenClassification

# One collator shared by every tf.data set below; return_tensors="tf" makes it emit TensorFlow tensors.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

Creating the train, validation and test datasets for System A using the data collator.

In [38]:
# Build the three System A tf.data sets in split order; only the training split is shuffled.
tf_train_set_systemA, tf_validation_set_systemA, tf_test_set_systemA = (
    model_system_a.prepare_tf_dataset(
        system_a_token[split_name],
        shuffle=(split_name == "train"),
        batch_size=batch_size,
        collate_fn=data_collator,
    )
    for split_name in ("train", "validation", "test")
)

Creating the train, validation and test datasets for System B using the data collator.

In [23]:
# Build the three System B tf.data sets in split order; only the training split is shuffled.
tf_train_set_systemB, tf_validation_set_systemB, tf_test_set_systemB = (
    model_system_b.prepare_tf_dataset(
        system_b_token[split_name],
        shuffle=(split_name == "train"),
        batch_size=batch_size,
        collate_fn=data_collator,
    )
    for split_name in ("train", "validation", "test")
)

## Model Training

Compiling and creating metric callbacks for system A model and system B model

In [39]:
import tensorflow as tf

# Compile BOTH models. The System B line was commented out even though
# `model_system_b.fit(...)` is called further down, so a fresh Restart & Run All
# would reach that fit() with an uncompiled model.
# No `loss` is passed here; presumably each model falls back to its built-in
# task loss (see the Transformers TF model documentation) — confirm if this changes.
model_system_a.compile(optimizer=optimizer_systemA)
model_system_b.compile(optimizer=optimizer_systemB)

In [40]:
from transformers.keras_callbacks import KerasMetricCallback

# One metric callback per system, each scoring its own validation set with its own label list.
# `metric_callback_systemB` must be defined here: the System B training cell passes it to fit(),
# and with this line commented out that cell raises NameError on a fresh kernel.
metric_callback_systemA = KerasMetricCallback(metric_fn=compute_metrics_systemA, eval_dataset=tf_validation_set_systemA)
metric_callback_systemB = KerasMetricCallback(metric_fn=compute_metrics_systemB, eval_dataset=tf_validation_set_systemB)

### System A training

In [41]:
# Fine-tune the System A model, running the System A metric callback during training.
model_system_a.fit(
    x=tf_train_set_systemA,
    validation_data=tf_validation_set_systemA,
    epochs=num_train_epochs,
    callbacks=[metric_callback_systemA],
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x798d0c1bc790>

### System B training

In [27]:
# Fine-tune the System B model. `callbacks` is documented as a list of callbacks, so the
# single callback is wrapped in a list (matching the System A call) rather than passed bare.
model_system_b.fit(
    x=tf_train_set_systemB,
    validation_data=tf_validation_set_systemB,
    epochs=num_train_epochs,
    callbacks=[metric_callback_systemB],
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x798ce541dcc0>

# Evaluation

## Evaluation System A

- #### Validation Data

In [45]:
# Predict on the (unshuffled) validation set so row i of the predictions lines up with
# row i of the labels below.
# NOTE(review): `.to_tensor()` implies predict() returns ragged logits here — confirm
# against the installed TensorFlow/Keras version.
model_system_a_prediction_validation = model_system_a.predict(tf_validation_set_systemA).logits.to_tensor().numpy()
# Reference label ids straight from the tokenized split (-100 marks positions to skip).
true_system_a_labels = system_a_token["validation"]["labels"]

print(compute_metrics_systemA((model_system_a_prediction_validation, true_system_a_labels)))


{'precision': 0.8932743563570376, 'recall': 0.9073384278637533, 'f1': 0.9002514668901928, 'accuracy': 0.9835044365466965}


- #### Test Data

In [46]:
# Same procedure as for the validation split, now on the (unshuffled) test set.
model_system_a_prediction_test = model_system_a.predict(tf_test_set_systemA).logits.to_tensor().numpy()
# NOTE: `true_system_a_labels` is rebound here — from this cell on it holds the TEST labels,
# not the validation labels assigned in the previous evaluation cell.
true_system_a_labels = system_a_token["test"]["labels"]

print(compute_metrics_systemA((model_system_a_prediction_test, true_system_a_labels)))


{'precision': 0.9249192680301399, 'recall': 0.9381555858841879, 'f1': 0.931490407882765, 'accuracy': 0.9882198688855269}


## Evaluation System B

- #### Validation data

In [47]:
# Predict on the (unshuffled) System B validation set; row order matches the labels below.
# NOTE(review): `.to_tensor()` implies predict() returns ragged logits here — confirm.
model_system_b_prediction_validation = model_system_b.predict(tf_validation_set_systemB).logits.to_tensor().numpy()
# Reference label ids in the System B label space (-100 marks positions to skip).
true_system_b_labels = system_b_token["validation"]["labels"]





In [48]:
# Validation metrics for System B; relies on `true_system_b_labels` still holding the validation labels.
print(compute_metrics_systemB((model_system_b_prediction_validation, true_system_b_labels)))


{'precision': 0.9412855377008653, 'recall': 0.9462792142618172, 'f1': 0.9437757704316324, 'accuracy': 0.9920107859666342}


- #### Test data

In [49]:
# Predict on the (unshuffled) System B test set; row order matches the labels below.
model_system_b_prediction_test = model_system_b.predict(tf_test_set_systemB).logits.to_tensor().numpy()
# NOTE: `true_system_b_labels` is rebound here — from this cell on it holds the TEST labels.
true_system_b_labels = system_b_token["test"]["labels"]





In [50]:
# Test metrics for System B; relies on `true_system_b_labels` holding the test labels from the previous cell.
print(compute_metrics_systemB((model_system_b_prediction_test, true_system_b_labels)))


{'precision': 0.9451610110803325, 'recall': 0.9510474282478986, 'f1': 0.9480950830348421, 'accuracy': 0.9918680210359484}
