In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, GlobalMaxPooling1D, LSTM, Dropout, Embedding, Bidirectional

from datasets import load_dataset

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AADESH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AADESH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AADESH\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#load dataset
ner_dataset = load_dataset(
    "tner/bc5cdr", 
)

In [3]:
print(f'The dataset is a dictionary with {len(ner_dataset)} splits: \n\n{ner_dataset}')
# It  may be useful to obtain the data in a list format for some sequence tagging methods
train_sentences_ner = [item['tokens'] for item in ner_dataset['train']]
train_labels_ner = [[str(tag) for tag in item['tags']] for item in ner_dataset['train']]

val_sentences_ner = [item['tokens'] for item in ner_dataset['validation']]
val_labels_ner = [[str(tag) for tag in item['tags']] for item in ner_dataset['validation']]

test_sentences_ner = [item['tokens'] for item in ner_dataset['test']]
test_labels_ner = [[str(tag) for tag in item['tags']] for item in ner_dataset['test']]

The dataset is a dictionary with 3 splits: 

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5228
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5330
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5865
    })
})


In [4]:
# Show the different tag values in the dataset:
np.unique(np.concatenate(train_labels_ner))

array(['0', '1', '2', '3', '4'], dtype='<U1')

In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW, get_linear_schedule_with_warmup, DataCollatorForTokenClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader, Dataset

# An example of how to use tokenize_and_align:
tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D") 
label_all_tokens=False


In [6]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, max_length=128, is_split_into_words=True)
    print(tokenized_inputs.keys())
    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

#tokenising
tokenized_dataset = ner_dataset.map(tokenize_and_align_labels, batched=True)

In [7]:
#initialising the model
num_labels=5 #0,1,2,3,4
model = AutoModelForTokenClassification.from_pretrained("huawei-noah/TinyBERT_General_4L_312D", num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# data collation
data_collator = DataCollatorForTokenClassification(tokenizer)

In [9]:
#metric
from datasets import load_metric
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions,axis=2)
    
    true_labels=[[label for label in label_row if label != -100] for label_row in labels]
    true_predictions = [[pred for pred, label in zip(prediction_row,label_row) if label != -100] for prediction_row, label_row in zip(predictions, labels)]
    results = metric.compute(predictions = true_predictions, references=true_labels)
    
    return {
        "precision":results["overall_precision"],
        "recall":results["overall_recall"],
        "f1":results["overall_f1"],
        "accuracy":results["overall_accuracy"],
    }

In [10]:
# training args
training_args = TrainingArguments(output_dir="./results",
evaluation_strategy="steps",
eval_steps=500,
logging_steps=500,
save_steps=500,
num_train_epochs=3,
learning_rate=4e-5,
weight_decay=0.01,
per_device_train_batch_size=16,
per_device_eval_batch_size=16)

In [11]:
#initialising training
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset["train"],
eval_dataset=tokenized_dataset["validation"],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics
)

In [12]:
#training the model
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,0.238,0.144953,0.0,0.0,0.0,0.954264
1000,0.1139,0.12204,0.0,0.0,0.0,0.96126
1500,0.0839,0.124978,0.0,0.0,0.0,0.960596


TrainOutput(global_step=1635, training_loss=0.1396595042050796, metrics={'train_runtime': 3718.4745, 'train_samples_per_second': 7.03, 'train_steps_per_second': 0.44, 'total_flos': 54101504867400.0, 'train_loss': 0.1396595042050796, 'epoch': 5.0})

In [13]:
#validate
trainer.evaluate()

{'eval_loss': 0.12452159821987152,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_accuracy': 0.9607252147623788,
 'eval_runtime': 205.9416,
 'eval_samples_per_second': 25.881,
 'eval_steps_per_second': 1.622,
 'epoch': 5.0}

In [15]:
#applying on test set
predictions,labels, _ = trainer.predict(tokenized_dataset["test"])
predictions=np.argmax(predictions,axis=2)

In [23]:
#applying on small dataset
from sklearn.metrics import classification_report

def flatten_list(nested_list):
    return [item for sublist in nested_list for item in sublist]
small_test_dataset = tokenized_dataset["test"].select(range(200))
predictions, labels, _ = trainer.predict(small_test_dataset)
predictions = np.argmax(predictions, axis=2)
true_labels = [[label for label in label if label != -100] for label in labels]
true_predictions = [
    [p for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
flat_true_labels = flatten_list(true_labels)
flat_true_predictions = flatten_list(true_predictions)
print(classification_report(flat_true_labels, flat_true_predictions))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      3228
           1       0.92      0.80      0.85       150
           2       0.68      0.77      0.72       121
           3       0.62      0.91      0.74        54
           4       0.53      0.80      0.64        10

    accuracy                           0.96      3563
   macro avg       0.75      0.85      0.79      3563
weighted avg       0.97      0.96      0.97      3563

