# Fine-Tuning a BERT-based model for NER

## Please refer to the respective sections in the book for further details.


## Step 1. Installing libraries and Data loading. 



In [None]:
!pip install datasets transformers seqeval
!pip install accelerate -U


In [4]:
# ! pip install spacy

In [5]:
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
import numpy as np
import pandas as pd
from spacy import displacy
import transformers
from transformers import (AutoModelForTokenClassification,
                          AutoTokenizer,
                          DataCollatorForTokenClassification,
                          pipeline,
                          TrainingArguments,
                          Trainer)

In [6]:
# confirm version > 4.11.0
print(transformers.__version__)

4.35.2


In [7]:
datasets = load_dataset("ade_corpus_v2", "Ade_corpus_v2_drug_ade_relation")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/491k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6821 [00:00<?, ? examples/s]

In [8]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'drug', 'effect', 'indexes'],
        num_rows: 6821
    })
})

In [9]:
datasets["train"][0]

{'text': 'Intravenous azithromycin-induced ototoxicity.',
 'drug': 'azithromycin',
 'effect': 'ototoxicity',
 'indexes': {'drug': {'start_char': [12], 'end_char': [24]},
  'effect': {'start_char': [33], 'end_char': [44]}}}

## Step 2. Data pre-processing

In [10]:
merged_dataset = {}

for item in datasets["train"]:
    text = item["text"]
    if text not in merged_dataset:
        merged_dataset[text] = {
            "text": text,
            "drugs": [item["drug"]],
            "effects": [item["effect"]],
            "drug_starts": set(item["indexes"]["drug"]["start_char"]),
            "drug_ends": set(item["indexes"]["drug"]["end_char"]),
            "effect_starts": set(item["indexes"]["effect"]["start_char"]),
            "effect_ends": set(item["indexes"]["effect"]["end_char"])
        }
    else:
        merged_data = merged_dataset[text]
        merged_data["drugs"].append(item["drug"])
        merged_data["effects"].append(item["effect"])
        merged_data["drug_starts"].update(item["indexes"]["drug"]["start_char"])
        merged_data["drug_ends"].update(item["indexes"]["drug"]["end_char"])
        merged_data["effect_starts"].update(item["indexes"]["effect"]["start_char"])
        merged_data["effect_ends"].update(item["indexes"]["effect"]["end_char"])


In [12]:
df_ade = pd.DataFrame(
  list(merged_dataset.values()))

In [13]:
# df_ade = pd.DataFrame(list(consolidated_dataset.values()))

In [14]:
df_ade.head()

Unnamed: 0,text,drugs,effects,drug_starts,drug_ends,effect_starts,effect_ends
0,Intravenous azithromycin-induced ototoxicity.,[azithromycin],[ototoxicity],{12},{24},{33},{44}
1,"Immobilization, while Paget's bone disease was...",[dihydrotachysterol],[increased calcium-release],{91},{109},{143},{168}
2,Unaccountable severe hypercalcemia in a patien...,[dihydrotachysterol],[hypercalcemia],{84},{102},{21},{34}
3,METHODS: We report two cases of pseudoporphyri...,"[naproxen, oxaprozin]","[pseudoporphyria, pseudoporphyria]","{58, 71}","{80, 66}",{32},{47}
4,"Naproxen, the most common offender, has been a...",[Naproxen],[erythropoietic protoporphyria],{0},{8},{134},{163}


In [15]:
df_ade["drug_starts"] = df_ade["drug_starts"].apply(list).apply(sorted)
df_ade["drug_ends"] = df_ade["drug_ends"].apply(list).apply(sorted)
df_ade["effect_starts"] = df_ade["effect_starts"].apply(list).apply(sorted)
df_ade["effect_ends"] = df_ade["effect_ends"].apply(list).apply(sorted)

In [16]:
df_ade.to_json("dataset.jsonl", orient="records", lines=True)

In [17]:
ade_dataset = load_dataset("json", data_files="dataset.jsonl")

Generating train split: 0 examples [00:00, ? examples/s]

In [18]:
ade_train_test = ade_dataset["train"].train_test_split()

In [19]:
ade_train_test

DatasetDict({
    train: Dataset({
        features: ['text', 'drugs', 'effects', 'drug_starts', 'drug_ends', 'effect_starts', 'effect_ends'],
        num_rows: 3203
    })
    test: Dataset({
        features: ['text', 'drugs', 'effects', 'drug_starts', 'drug_ends', 'effect_starts', 'effect_ends'],
        num_rows: 1068
    })
})

In [20]:
entity_label_list = ['O', 'B-DRUG', 'I-DRUG', 'B-EFFECT', 'I-EFFECT']

custom_sequence = Sequence(feature=ClassLabel(num_classes=5,
                                         names=entity_label_list,
                                         names_file=None, id=None), length=-1, id=None)

ade_train_test["train"].features["ner_tags"] = custom_sequence
ade_train_test["test"].features["ner_tags"] = custom_sequence

## Step 3. Model training

In [21]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
# model_checkpoint = "allenai/scibert_scivocab_uncased"
model_checkpoint = "dmis-lab/biobert-v1.1"
# model_checkpoint = "alvaroalon2/biobert_diseases_ner"
batch_size = 16

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

In [23]:
def create_bio_tags_for_entities(data_row, display_log=False):
    """ This function takes a row from the 'Ade_corpus_v2_drug_ade_relation' dataset
    and creates BIO tags for 'drug' and 'effect' entities.
    """

    sentence = data_row["text"]

    bio_tags = []
    current_label = "O"
    tag_prefix = ""

    drug_span_index = 0
    effect_span_index = 0

    tokenized_data = tokenizer(sentence, return_offsets_mapping=True)

    for index in range(len(tokenized_data["input_ids"])):
        token_start, token_end = tokenized_data["offset_mapping"][index]

        if token_end - token_start == 0:
            bio_tags.append(-100)
            continue

        if drug_span_index < len(data_row["drug_starts"]) and token_start == data_row["drug_starts"][drug_span_index]:
            current_label = "DRUG"
            tag_prefix = "B-"

        elif effect_span_index < len(data_row["effect_starts"]) and token_start == data_row["effect_starts"][effect_span_index]:
            current_label = "EFFECT"
            tag_prefix = "B-"

        bio_tags.append(entity_label_list.index(f"{tag_prefix}{current_label}"))

        if drug_span_index < len(data_row["drug_ends"]) and token_end == data_row["drug_ends"][drug_span_index]:
            current_label = "O"
            tag_prefix = ""
            drug_span_index += 1

        elif effect_span_index < len(data_row["effect_ends"]) and token_end == data_row["effect_ends"][effect_span_index]:
            current_label = "O"
            tag_prefix = ""
            effect_span_index += 1

        if tag_prefix == "B-":
            tag_prefix = "I-"

    if display_log:
        print(f"{data_row}\n")
        original_tokens = tokenizer.convert_ids_to_tokens(tokenized_data["input_ids"])
        for index in range(len(bio_tags)):
            print(original_tokens[index], bio_tags[index])
    tokenized_data["labels"] = bio_tags

    return tokenized_data


In [26]:
create_bio_tags_for_entities(ade_train_test["train"][2], display_log=True)

{'text': 'The changes were progressive regardless of discontinuation of cyclophosphamide and led to severe restrictive ventilatory defect.', 'drugs': ['cyclophosphamide'], 'effects': ['severe restrictive ventilatory defect'], 'drug_starts': [62], 'drug_ends': [78], 'effect_starts': [90], 'effect_ends': [127]}

[CLS] -100
the 0
changes 0
were 0
progressive 0
regardless 0
of 0
discontinuation 0
of 0
cyclo 1
##phosph 2
##amide 2
and 0
led 0
to 0
severe 3
restrictive 4
ventilator 4
##y 4
defect 4
. 0
[SEP] -100


{'input_ids': [102, 111, 1334, 267, 8381, 7161, 131, 21710, 131, 14448, 5952, 5659, 137, 4030, 147, 3167, 17843, 27879, 30126, 7465, 205, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 3), (4, 11), (12, 16), (17, 28), (29, 39), (40, 42), (43, 58), (59, 61), (62, 67), (67, 73), (73, 78), (79, 82), (83, 86), (87, 89), (90, 96), (97, 108), (109, 119), (119, 120), (121, 127), (127, 128), (0, 0)], 'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 3, 4, 4, 4, 4, 0, -100]}

In [27]:
labeled_dataset = ade_train_test.map(create_bio_tags_for_entities)

Map:   0%|          | 0/3203 [00:00<?, ? examples/s]

Map:   0%|          | 0/1068 [00:00<?, ? examples/s]

In [29]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.05,
    logging_steps=1
)

In [30]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [31]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [32]:
def evaluate_model_performance(prediction_data):
    predicted_values, actual_labels = prediction_data
    predicted_values = np.argmax(predicted_values, axis=2)

    refined_predictions = [
        [entity_label_list[pred] for pred, true_label in zip(pred_row, label_row) if true_label != -100]
        for pred_row, label_row in zip(predicted_values, actual_labels)
    ]
    refined_labels = [
        [entity_label_list[true_label] for pred, true_label in zip(pred_row, label_row) if true_label != -100]
        for pred_row, label_row in zip(predicted_values, actual_labels)
    ]

    metric_results = metric.compute(predictions=refined_predictions, references=refined_labels)
    return {
        "precision_score": metric_results["overall_precision"],
        "recall_score": metric_results["overall_recall"],
        "f1_score": metric_results["overall_f1"],
        "accuracy_score": metric_results["overall_accuracy"],
    }


In [33]:
trainer = Trainer(
    model,
    args,
    train_dataset=labeled_dataset["train"],
    eval_dataset=labeled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=evaluate_model_performance,

)

In [34]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision Score,Recall Score,F1 Score,Accuracy Score
1,0.1216,0.142491,0.782477,0.865256,0.821787,0.94908
2,0.1631,0.129841,0.811828,0.896808,0.852205,0.955228
3,0.0176,0.136384,0.819044,0.897179,0.856333,0.957217
4,0.1585,0.135221,0.828601,0.899035,0.862382,0.95841
5,0.0914,0.138306,0.828807,0.894952,0.86061,0.957832


TrainOutput(global_step=1005, training_loss=0.12086897340339066, metrics={'train_runtime': 93.7987, 'train_samples_per_second': 170.738, 'train_steps_per_second': 10.714, 'total_flos': 439952994857220.0, 'train_loss': 0.12086897340339066, 'epoch': 5.0})

## Step 4. Model Evaluation

In [35]:
predicted_results, actual_labels, _ = trainer.predict(labeled_dataset["test"])
predicted_results = np.argmax(predicted_results, axis=2)

processed_predictions = [
    [entity_label_list[each_pred] for each_pred, each_label in zip(single_prediction, single_label) if each_label != -100]
    for single_prediction, single_label in zip(predicted_results, actual_labels)
]
processed_labels = [
    [entity_label_list[each_label] for each_pred, each_label in zip(single_prediction, single_label) if each_label != -100]
    for single_prediction, single_label in zip(predicted_results, actual_labels)
]

evaluation_results = metric.compute(predictions=processed_predictions, references=processed_labels)
evaluation_results


{'DRUG': {'precision': 0.915680473372781,
  'recall': 0.9626749611197511,
  'f1': 0.9385898407884761,
  'number': 1286},
 'EFFECT': {'precision': 0.7533718689788054,
  'recall': 0.8330965909090909,
  'f1': 0.7912310286677909,
  'number': 1408},
 'overall_precision': 0.8288071502234445,
 'overall_recall': 0.8949517446176689,
 'overall_f1': 0.8606103872925219,
 'overall_accuracy': 0.9578315431629959}

### Step 4.1 See Model Outputs

In [36]:
ade_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer, device=0)

In [37]:
ade_ner_model(labeled_dataset["test"][4]["text"])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'LABEL_0',
  'score': 0.9998895,
  'index': 1,
  'word': 'we',
  'start': 0,
  'end': 2},
 {'entity': 'LABEL_0',
  'score': 0.99990284,
  'index': 2,
  'word': 'report',
  'start': 3,
  'end': 9},
 {'entity': 'LABEL_0',
  'score': 0.9998871,
  'index': 3,
  'word': 'a',
  'start': 10,
  'end': 11},
 {'entity': 'LABEL_0',
  'score': 0.9998155,
  'index': 4,
  'word': 'case',
  'start': 12,
  'end': 16},
 {'entity': 'LABEL_0',
  'score': 0.9998499,
  'index': 5,
  'word': 'of',
  'start': 17,
  'end': 19},
 {'entity': 'LABEL_3',
  'score': 0.99345183,
  'index': 6,
  'word': 'ac',
  'start': 20,
  'end': 22},
 {'entity': 'LABEL_4',
  'score': 0.9955485,
  'index': 7,
  'word': '##ne',
  'start': 22,
  'end': 24},
 {'entity': 'LABEL_4',
  'score': 0.9525375,
  'index': 8,
  'word': 'ful',
  'start': 25,
  'end': 28},
 {'entity': 'LABEL_4',
  'score': 0.97534436,
  'index': 9,
  'word': '##min',
  'start': 28,
  'end': 31},
 {'entity': 'LABEL_4',
  'score': 0.97026104,
  'index

In [38]:
def display_entity_annotations(text):
    annotated_tokens = ade_ner_model(text)
    entity_annotations = []

    for token in annotated_tokens:
        entity_type = int(token["entity"][-1])
        if entity_type != 0:
            token["label"] = entity_label_list[entity_type]
            entity_annotations.append(token)

    render_params = [{"text": text, "ents": entity_annotations, "title": None}]

    rendered_html = displacy.render(render_params, style="ent", manual=True, options={
        "colors": {
                   "B-DRUG": "#00FF00",
                   "I-DRUG": "#00FF00",
                   "B-EFFECT": "#ff0000",
                   "I-EFFECT": "#ff0000",
               },
    })
    display(HTML(rendered_html))


In [39]:
examples = [
"Rhabdomyolysis associated with statins (anticholesterol drugs)",
"Seizures caused by withdrawal from benzodiazepines",
"Drowsiness or increase in appetite due to antihistamine use. Some antihistamines are used in sleep aids explicitly because they cause drowsiness",
"Stroke or heart attack associated with sildenafil (Viagra), when used with nitroglycerin",
"Suicide, increased tendency associated to the use of fluoxetine and other selective serotonin reuptake inhibitor (SSRI) antidepressants",
"Tardive dyskinesia associated with use of metoclopramide and many antipsychotic medications"
]


from IPython.core.display import HTML
import spacy
from spacy import displacy

for sample in examples:
    display_entity_annotations(sample)
    print(f"{'=' * 50}\n")


















