In [None]:
!pip install transformers evaluate seqeval wandb

# Import Statements

In [None]:
import bz2
import torch
import numpy as np
import pandas as pd
import transformers
from transformers import pipeline
import random
from tqdm.auto import tqdm
from transformers import DistilBertTokenizerFast
from transformers import Trainer, TrainingArguments, DistilBertForTokenClassification, EarlyStoppingCallback
import evaluate
seqeval = evaluate.load('seqeval')

import os
import wandb
# Environment variables for the Weights & Biases integration.
# Do not hard-code a real key here: export WANDB_API_KEY (or run `wandb login`)
# before starting the kernel. setdefault keeps a key that is already present in
# the environment instead of overwriting it with an empty string.
os.environ.setdefault("WANDB_API_KEY", "")
os.environ["WANDB_PROJECT"]="CS6301_Project"
os.environ["WANDB_WATCH"]="false"

## THE EVALUATE FUNCTION CAN DO THE SAME OPERATIONS
# import seqeval
# from seqeval.scheme import IOB1
# from seqeval.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

In [None]:
# Parse the bz2-compressed corpus files into parallel lists:
#   token_docs[i] -> list of tokens of sentence i
#   tag_docs[i]   -> list of tags of sentence i (one per token)
token_docs = []
tag_docs = []
datasets = ["aij-wikiner-en-wp2.bz2", "aij-wikiner-en-wp3.bz2"]

for dataset in datasets:
    with bz2.open(f"../Data/{dataset}", "rb") as bz_file:
        # Iterate the file line by line instead of materialising every line
        # up front with readlines().
        for raw_line in bz_file:
            doc = raw_line.strip().decode()
            # Skip blank / separator lines.
            if len(doc) <= 1:
                continue

            tokens = []
            tags = []

            for seq in doc.split(" "):
                # Each item is "token|pos|tag". Splitting from the right keeps
                # a token that itself contains "|" intact instead of raising
                # a ValueError on unpacking.
                token, pos, tag = seq.rsplit("|", 2)

                tokens.append(token)
                tags.append(tag)

            token_docs.append(tokens)
            tag_docs.append(tags)

In [None]:
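## NOTE: the corpus uses IOB (IOB1) tagging, not IOB2 tagging.
## In IOB1 the first token of an entity (and therefore any single-token entity) is tagged I-<type>;
## B-<type> is only used when an entity immediately follows another entity of the same type.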

In [None]:
# Start from the full corpus; both names are re-bound to a random sample further down.
texts = token_docs
tags = tag_docs

In [None]:
# Work on a reproducible random 25% sample of the corpus.
# Sizes and indices are derived from token_docs (the full corpus) rather than
# from `texts`, which this cell overwrites: re-running the cell therefore draws
# the same sample again instead of sampling from an already-shrunk range.
print((f"Total amount of data = {len(token_docs)}"))
data_perc = round(len(token_docs) * 0.25)
print(f"Current sample of data = {data_perc}")

# Fixed seed so the same sentence indices are selected on every run.
random.seed(100)
random_samples = random.sample(range(0, len(token_docs)), data_perc)
print(f"First sample index = {random_samples[0]}") ## 76372
texts, tags = [token_docs[i] for i in random_samples], [tag_docs[i] for i in random_samples]

In [None]:
from sklearn.model_selection import train_test_split

# Carve off 10% of the sample as the test set, then 10% of what remains as
# the validation set. Both splits use the same fixed random_state.
train_texts, test_texts, train_tags, test_tags = train_test_split(
    texts, tags, test_size=.1, random_state=100
)
train_texts, val_texts, train_tags, val_tags = train_test_split(
    train_texts, train_tags, test_size=.1, random_state=100
)

# Share of the sampled corpus that ended up in each split, in whole percent.
train_perc, val_perc, test_perc = (
    round(100 * (len(split) / len(texts)))
    for split in (train_texts, val_texts, test_texts)
)

print(f"{train_perc}% of data is TRAINING")
print(f"{val_perc}% of data is VALIDATION")
print(f"{test_perc}% of data is TESTING")

In [None]:
# Absolute number of sentences per split, one line each.
print(
    f"{len(train_texts)} sentences in TRAINING",
    f"{len(val_texts)} sentences in VALIDATION",
    f"{len(test_texts)} sentences in TESTING",
    sep="\n",
)

In [None]:
# Label vocabulary of the sampled corpus.
unique_tags = set(tag for doc in tags for tag in doc)

# Order tags by their last three characters (the entity-type suffix), then by
# the full tag. The second key breaks ties between tags that share a suffix
# (e.g. a B- and an I- tag of the same type). Without it the relative order of
# such tags follows set iteration order, which can change between interpreter
# sessions because string hashing is randomised -- so tag2id could differ
# between the session that trained a checkpoint and the one that reloads it.
sorted_tags = sorted(unique_tags, key=lambda x: (x[-3:], x))

# Bidirectional lookup tables between tag strings and integer class ids.
tag2id = {tag: idx for idx, tag in enumerate(sorted_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

In [None]:
# Display the tag -> id mapping for inspection.
tag2id

In [None]:
# Length of the longest sentence in the sample, counted in items of the
# per-sentence token lists (0 when there are no sentences).
# NOTE(review): this value is later passed as the tokenizer's max_length, which
# presumably counts sub-word tokens plus special tokens -- confirm that the
# longest sentences are not being truncated.
m_len = max((len(sentence) for sentence in texts), default=0)

In [None]:
# Fast tokenizer for the 'distilbert-base-cased' checkpoint (the same checkpoint
# name the model is initialised from).
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

# Testing the Tokenizer Functions

In [None]:
# Encode one training sentence (index 1) with the same tokenizer arguments the
# dataset class uses, and keep only its input ids for inspection.
_sample_batch = tokenizer(
    train_texts[1:2],
    is_split_into_words=True,
    max_length=m_len,
    padding='max_length',
    truncation=True,
)
_encoded_sent = _sample_batch['input_ids'][0]

In [None]:
# Show the raw input ids of the sample sentence ...
print(_encoded_sent)
# ... and the token strings those ids map back to.
print(tokenizer.convert_ids_to_tokens(_encoded_sent))

# Dataset Loading

In [None]:
class WikiNERDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one pre-split sentence per item.

    Args:
        texts: Sequence of sentences, each a list of word strings.
        tags: Sequence of tag lists parallel to ``texts`` (one tag per word).
        tokenizer: Callable tokenizer. It is invoked with
            ``is_split_into_words=True`` and its result must expose
            ``input_ids`` and ``attention_mask``; the result is also handed
            to ``align_labels``.
        max_length: Optional length every item is padded / truncated to.
            When ``None`` (the default) the notebook-level ``m_len`` is
            looked up each time an item is accessed, as before.

    Each item is a dict with tensor values under the keys ``input_ids``,
    ``attention_mask`` and ``labels``.
    """

    def __init__(self, texts, tags, tokenizer, max_length=None):
        # Keep plain Python lists. Wrapping the data in an object ndarray
        # turns sentences that all share one length into a 2-D array, so
        # indexing would hand the tokenizer an ndarray row instead of the
        # list of words that was passed in.
        self.texts = list(texts)
        self.tags = list(tags)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Accept tensor indices as well as plain integers.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        max_length = m_len if self.max_length is None else self.max_length
        encodings = self.tokenizer(self.texts[idx],
                                   is_split_into_words=True,
                                   max_length=max_length,
                                   padding='max_length',
                                   truncation=True)
        # Expand the word-level tags to one label id per produced token.
        labels = align_labels(self.tags[idx], encodings)

        return {
            'input_ids': torch.tensor(encodings.input_ids),
            'attention_mask': torch.tensor(encodings.attention_mask),
            'labels': torch.tensor(labels),
        }
      
def align_labels(tags: list, encodings: "transformers.tokenization_utils_base.BatchEncoding", label_all_tokens=True) -> list:
    """Expand word-level tags to one label id per tokenizer output position.

    The tokenizer may split one word into several sub-word tokens, so the
    word-level tags have to be mapped onto the token sequence.

    Args:
        tags: Tag strings, one per word of the encoded sentence.
        encodings: Tokenizer output for that sentence; only ``word_ids()`` is
            used. (The annotation is a string so that defining the function
            does not itself require the ``transformers`` name to be resolved.)
        label_all_tokens: When True (default) every sub-token of a word gets
            the word's label id. When False only the first sub-token of each
            word is labelled and the remaining ones get -100.

    Returns:
        A list of label ids aligned with ``encodings.word_ids()``. Positions
        whose word id is ``None`` get -100.
    """
    word_ids = encodings.word_ids()
    prev_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != prev_word_idx:
            # First sub-token of a new word always carries the word's label.
            label_ids.append(tag2id[tags[word_idx]])
        else:
            # Continuation sub-token of the same word.
            label_ids.append(tag2id[tags[word_idx]] if label_all_tokens else -100)
        # Remember the word just seen. Without this update every sub-token
        # looked like the start of a new word and label_all_tokens=False had
        # no effect.
        prev_word_idx = word_idx
    return label_ids

In [None]:
# One dataset object per split, all sharing the same tokenizer.
train_dataset, val_dataset, test_dataset = (
    WikiNERDataset(split_texts, split_tags, tokenizer)
    for split_texts, split_tags in (
        (train_texts, train_tags),
        (val_texts, val_tags),
        (test_texts, test_tags),
    )
)

In [None]:
# Token-classification model with one output class per tag. The label maps are
# passed as well so the tag names are stored in the model config, consistent
# with the id2label that is supplied when the saved checkpoint is reloaded.
model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-cased',
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)
# Early-stopping callback (patience 3, threshold 1e-3); it is registered on the
# trainer in the training cell.
cb_early_stop = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=1e-3)

In [None]:
## Testing out seqeval
# Two toy sentences; the first prediction starts the MISC entity one token early.
y_true = [
    ['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'],
    ['B-PER', 'I-PER', 'O'],
]
y_pred = [
    ['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
    ['B-PER', 'I-PER', 'O'],
]

# Strict matching under the IOB1 scheme; the returned dict is the cell output.
seqeval.compute(predictions=y_pred, references=y_true, scheme="IOB1", mode="strict")

In [None]:
def compute_metrics(eval_predictions):
    """Turn raw model outputs into a flat dict of seqeval scores.

    Args:
        eval_predictions: Object exposing ``predictions`` (per-position class
            scores; the arg-max is taken over axis 2) and ``label_ids``
            (reference label ids, with -100 marking positions to ignore).

    Returns:
        Dict with ``overall_precision``, ``overall_recall``, ``overall_f1``
        and ``overall_accuracy`` copied from the seqeval result, plus one
        ``<key>_f1`` entry for every other key seqeval returned.
    """
    gold_ids = eval_predictions.label_ids
    pred_ids = np.argmax(eval_predictions.predictions, axis=2)

    # Drop every position whose reference label is -100, and map the
    # remaining ids back to tag strings.
    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_predictions.append([id2tag[p] for p, _ in kept])
        true_labels.append([id2tag[g] for _, g in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    flattened_results = {
        name: results[name]
        for name in ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    }

    # Every remaining entry is a nested dict; keep only its f1 value.
    for key, value in results.items():
        if key in flattened_results:
            continue
        flattened_results[key + "_f1"] = value["f1"]

    return flattened_results

In [None]:
## By default it will save every 500 steps
# Training configuration: results are written to ./results, logs to ./logs,
# and metrics are reported to Weights & Biases under the run name below.
training_args = TrainingArguments(
    output_dir='./results', 
    report_to="wandb",
    run_name = "WikiNER_train",
    overwrite_output_dir = True,
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    load_best_model_at_end = True,
    evaluation_strategy='steps',     # evaluate on a step-based schedule
    save_total_limit=2,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics
)

# Register the early-stopping callback created alongside the model.
trainer.add_callback(cb_early_stop)

trainer.train()
# Close the Weights & Biases run once training has returned.
wandb.finish()

In [None]:
# Persist the trainer's log history (trainer.state.log_history) as a CSV.
# NOTE(review): consecutive entries are merged pairwise -- presumably a training
# log record followed by the evaluation record of the same step. A trailing
# unpaired entry is dropped. Confirm this matches the logging / evaluation schedule.
history = trainer.state.log_history
history_lst = []
for i in range(0, len(history) - 1, 2):
    dict1 = history[i]
    dict2 = history[i + 1]

    # On duplicate keys the first record of the pair (dict1) wins.
    final_dict = {**dict2, **dict1}
    history_lst.append(final_dict)

# Make sure the output directory exists, so the export does not fail on a
# fresh checkout after training has already finished.
os.makedirs("Results", exist_ok=True)
pd.DataFrame(history_lst).to_csv("Results/train_stats.csv", index=False)

In [None]:
## Saving the model
# trainer.save_model("finalmodel/wikiner_bertdistill")
# NOTE(review): the save call above is commented out, so this cell relies on a
# checkpoint that already exists on disk from an earlier session.

# Reload the checkpoint with both label maps, so the config's id2label and
# label2id agree with each other and with the notebook's tag vocabulary.
model = DistilBertForTokenClassification.from_pretrained(
    'finalmodel/wikiner_bertdistill',
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)

In [None]:
# Wrap the reloaded model and the tokenizer in an "ner" pipeline for raw-text inference.
recognizer = pipeline("ner", model=model, tokenizer=tokenizer)

In [None]:
# Re-join test sentence 900 into a single space-separated string and run the pipeline on it.
recog_result = recognizer(' '.join(test_texts[900]))

In [None]:
# Side-by-side view for test sentence 900: reference entities first, then the
# entities returned by the pipeline.
print(" ".join(test_texts[900]))
print("\tACTUAL", "\t______", "", sep="\n")
for j, tag in enumerate(test_tags[900]):
    # Only tokens that belong to an entity are listed.
    if tag == "O":
        continue
    print(f"\t\t{test_texts[900][j]} {tag}")
print()
print("\tPREDICTION", "\t__________", "", sep="\n")
for entity in recog_result:
    _word, _ent = entity["word"], entity["entity"]
    print(f"\t\t{_word} {_ent}")

In [None]:
# Run the pipeline on two ad-hoc sentences and print the result returned for each.
for out in tqdm(recognizer(["My name is Steve Jobs","This is St.Johns School"]) ):
    print(out)

In [None]:
# Raw pipeline output for test sentence 900.
print(recog_result)

## To get predictions and metrics on the unseen test set using the trainer

In [None]:
# Run the trainer over the held-out test dataset; the result's .metrics are printed in the next cell.
test_data_predictions = trainer.predict(test_dataset)

In [None]:
# Metrics computed for the test-set predictions.
print(test_data_predictions.metrics)

In [None]:
def generate_metric_csv(result, csv_name):
    """Write the headline metrics found in ``result`` to ``Results/<csv_name>.csv``.

    Args:
        result: Mapping of metric name to numeric value. Keys may carry a
            prefix (for example ``test_`` or ``eval_``) or no prefix at all,
            as in the direct output of ``seqeval.compute``.
        csv_name: Name of the CSV file, without the ``.csv`` extension.

    The CSV has two columns: ``metric`` (the metric name with any prefix
    removed) and ``val`` (the value rounded to 4 decimal places). Keys that
    contain none of the recognised metric names are skipped.
    """
    import os  # local import keeps the function usable on its own

    wanted = [
        "overall_f1",
        "overall_recall",
        "overall_accuracy",
        "overall_precision",
        "LOC_f1",
        "MISC_f1",
        "ORG_f1",
        "PER_f1",
    ]

    test_metrics = []
    for key in result.keys():
        matched = next((ext for ext in wanted if ext in key), None)
        if matched is None:
            continue
        # Strip whatever precedes the recognised metric name. Cutting a fixed
        # five characters only worked for 5-character prefixes and mangled
        # keys that have no prefix.
        test_metrics.append(
            {"metric": key[key.index(matched):], "val": round(result[key], 4)}
        )

    # Create the output directory on first use instead of failing on the write.
    os.makedirs("Results", exist_ok=True)
    pd.DataFrame(test_metrics).to_csv(f"Results/{csv_name}.csv", index=False)

In [None]:
# test_metrics / val_metrics are not assigned by any earlier cell visible in
# this notebook, so they are built here to keep the cell runnable on a fresh kernel.
# predict() is used for the validation split too, with an "eval" key prefix.
test_metrics = test_data_predictions.metrics
val_metrics = trainer.predict(val_dataset, metric_key_prefix="eval").metrics

generate_metric_csv(test_metrics,"test_bert")
generate_metric_csv(val_metrics,"validation_bert")