In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForTokenClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support
from transformers import BertTokenizerFast
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW

In [2]:
if torch.cuda.is_available():
    print("GPU is available")
else:
    print("GPU is not available")

GPU is available


# Data Preparation

In [3]:
%load_ext autoreload
%autoreload 2

# Use HuggingFace's datasets library to access the Emotion dataset
from datasets import load_dataset, load_metric
import numpy as np
import pandas as pd

In [4]:
ner_dataset = load_dataset(
    "tner/bc5cdr", 
)

print(f'The dataset is a dictionary with {len(ner_dataset)} splits: \n\n{ner_dataset}')

The dataset is a dictionary with 3 splits: 

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5228
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5330
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5865
    })
})


In [5]:
label_map = {
    0: "O",
    1: "B-Chemical",
    2: "B-Disease",
    3: "I-Disease",
    4: "I-Chemical"
}

In [6]:
train_sentences_ner = [item['tokens'] for item in ner_dataset['train']]
train_labels_ner = [[label_map[tag] for tag in item['tags']] for item in ner_dataset['train']]

val_sentences_ner = [item['tokens'] for item in ner_dataset['validation']]
val_labels_ner = [[label_map[tag] for tag in item['tags']] for item in ner_dataset['validation']]

test_sentences_ner = [item['tokens'] for item in ner_dataset['test']]
test_labels_ner = [[label_map[tag] for tag in item['tags']] for item in ner_dataset['test']]

In [7]:
import warnings

warnings.filterwarnings('ignore')

# NER

In [8]:
# Show the different tag values in the dataset:
label_list = np.unique(np.concatenate(train_labels_ner))
LENGTH = len(np.unique(np.concatenate(train_labels_ner)))

In [9]:
label_all_tokens = True
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, max_length=128, is_split_into_words=True)
    print(tokenized_inputs.keys())
    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            
            if word_idx is None:
                label_ids.append(-100)
            
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [10]:
task = "ner" 
model_checkpoint = "bert-base-uncased"
BATCH_SIZE = 16

In [11]:
from transformers import AutoTokenizer  
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
tokenized_datasets = ner_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/5228 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


Map:   0%|          | 0/5330 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


Map:   0%|          | 0/5865 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [13]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback

model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', num_labels=LENGTH)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",  
    load_best_model_at_end=True,  
    metric_for_best_model='loss', 
    greater_is_better=False, 
)

In [15]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [16]:
metric = load_metric("seqeval")

In [17]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [18]:
label_list

array(['B-Chemical', 'B-Disease', 'I-Chemical', 'I-Disease', 'O'],
      dtype='<U10')

In [19]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [20]:
hist = trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.142569,0.954454,0.954454,0.954454,0.951191
2,0.189200,0.144242,0.956348,0.962194,0.959262,0.956036
3,0.189200,0.16145,0.957924,0.960613,0.959267,0.95605
4,0.063000,0.181163,0.961679,0.958638,0.960156,0.956111


In [21]:
trainer.evaluate()

{'eval_loss': 0.14256879687309265,
 'eval_precision': 0.9544543219216948,
 'eval_recall': 0.9544543219216948,
 'eval_f1': 0.9544543219216948,
 'eval_accuracy': 0.9511914252812472,
 'eval_runtime': 35.5157,
 'eval_samples_per_second': 150.075,
 'eval_steps_per_second': 9.404,
 'epoch': 4.0}

In [22]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

In [23]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'Chemical': {'precision': 0.9634365353261994,
  'recall': 0.9599726223952105,
  'f1': 0.9617014597394443,
  'number': 121267},
 'Disease': {'precision': 0.8833283833283834,
  'recall': 0.8919824052784164,
  'f1': 0.8876343016315161,
  'number': 20006},
 'overall_precision': 0.9519615976402686,
 'overall_recall': 0.950344368704565,
 'overall_f1': 0.9511522957358327,
 'overall_accuracy': 0.9476678116272351}