In [1]:
!pip install transformers accelerate datasets tokenizers seqeval -q


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m97.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.6 MB/s[0m e

In [2]:
! pip install spacy




In [2]:
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
import numpy as np
import pandas as pd
from spacy import displacy
import transformers
from transformers import (AutoModelForTokenClassification,
                          AutoTokenizer,
                          DataCollatorForTokenClassification,
                          pipeline,
                          TrainingArguments,
                          Trainer)

In [3]:
# confirm version > 4.11.0
print(transformers.__version__)

4.31.0


In [4]:
#dataset ADE-corpus-v2
#https://huggingface.co/datasets/ade_corpus_v2
datasets = load_dataset("ade_corpus_v2", "Ade_corpus_v2_drug_ade_relation")
datasets

Downloading builder script:   0%|          | 0.00/11.7k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.84k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/307k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/868k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'drug', 'effect', 'indexes'],
        num_rows: 6821
    })
})

In [6]:
datasets["train"][0]


{'text': 'Intravenous azithromycin-induced ototoxicity.',
 'drug': 'azithromycin',
 'effect': 'ototoxicity',
 'indexes': {'drug': {'start_char': [12], 'end_char': [24]},
  'effect': {'start_char': [33], 'end_char': [44]}}}

In [5]:
consolidated_dataset = {}

for row in datasets["train"]:
    if row["text"] in consolidated_dataset:
        consolidated_dataset[row["text"]]["drug_indices_start"].update(row["indexes"]["drug"]["start_char"])
        consolidated_dataset[row["text"]]["drug_indices_end"].update(row["indexes"]["drug"]["end_char"])
        consolidated_dataset[row["text"]]["effect_indices_start"].update(row["indexes"]["effect"]["start_char"])
        consolidated_dataset[row["text"]]["effect_indices_end"].update(row["indexes"]["effect"]["end_char"])
        consolidated_dataset[row["text"]]["drug"].append(row["drug"])
        consolidated_dataset[row["text"]]["effect"].append(row["effect"])

    else:
        consolidated_dataset[row["text"]] = {
            "text": row["text"],
            "drug": [row["drug"]],
            "effect": [row["effect"]],
            # use sets because the indices can repeat for various reasons
            "drug_indices_start": set(row["indexes"]["drug"]["start_char"]),
            "drug_indices_end": set(row["indexes"]["drug"]["end_char"]),
            "effect_indices_start": set(row["indexes"]["effect"]["start_char"]),
            "effect_indices_end": set(row["indexes"]["effect"]["end_char"])
        }

df = pd.DataFrame(list(consolidated_dataset.values()))


In [6]:
## since no spans overlap, we can sort to get 1:1 matched index spans
# note that sets don't preserve insertion order

df["drug_indices_start"] = df["drug_indices_start"].apply(list).apply(sorted)
df["drug_indices_end"] = df["drug_indices_end"].apply(list).apply(sorted)
df["effect_indices_start"] = df["effect_indices_start"].apply(list).apply(sorted)
df["effect_indices_end"] = df["effect_indices_end"].apply(list).apply(sorted)


In [18]:
len(df['effect'])

4271

In [7]:
# save to JSON to then import into Dataset object
df.to_json("dataset.jsonl", orient="records", lines=True)

In [8]:
cons_dataset = load_dataset("json", data_files="dataset.jsonl")


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
# no train-test provided, so we create our own
# cons_dataset = cons_dataset["train"].train_test_split(test_size=0.2, seed=42)

cons_dataset = cons_dataset["train"].train_test_split(test_size=0.2, seed=42)



In [19]:
cons_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'drug', 'effect', 'drug_indices_start', 'drug_indices_end', 'effect_indices_start', 'effect_indices_end'],
        num_rows: 3416
    })
    test: Dataset({
        features: ['text', 'drug', 'effect', 'drug_indices_start', 'drug_indices_end', 'effect_indices_start', 'effect_indices_end'],
        num_rows: 855
    })
})

In [20]:
#labeled_dataset
labeled_dataset = cons_dataset.map(generate_row_labels)


Map:   0%|          | 0/3416 [00:00<?, ? examples/s]

Map:   0%|          | 0/855 [00:00<?, ? examples/s]

In [10]:

label_list = ['O', 'B-DRUG', 'I-DRUG', 'B-EFFECT', 'I-EFFECT']

custom_seq = Sequence(feature=ClassLabel(num_classes=5,
                                         names=label_list,
                                         names_file=None, id=None), length=-1, id=None)

cons_dataset["train"].features["ner_tags"] = custom_seq
cons_dataset["test"].features["ner_tags"] = custom_seq

In [11]:
from transformers import AutoTokenizer

# model_checkpoint = "biomednlp/pubmedbert-base-uncased-abstract"
model_checkpoint = "bert-base-uncased"
# model_checkpoint = "allenai/scibert_scivocab_uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
#function to generate BIO tags for effect
def generate_row_labels(row, verbose=False):
    """ Given a row from the consolidated `Ade_corpus_v2_drug_ade_relation` dataset,
    generates BIO tags for drug and effect entities.

    """

    text = row["text"]

    labels = []
    label = "O"
    prefix = ""

    # while iterating through tokens, increment to traverse all drug and effect spans
    drug_index = 0
    effect_index = 0

    tokens = tokenizer(text, return_offsets_mapping=True)

    for n in range(len(tokens["input_ids"])):
        offset_start, offset_end = tokens["offset_mapping"][n]

        # should only happen for [CLS] and [SEP]
        if offset_end - offset_start == 0:
            labels.append(-100)
            continue

        if drug_index < len(row["drug_indices_start"]) and offset_start == row["drug_indices_start"][drug_index]:
            label = "DRUG"
            prefix = "B-"

        elif effect_index < len(row["effect_indices_start"]) and offset_start == row["effect_indices_start"][effect_index]:
            label = "EFFECT"
            prefix = "B-"

        labels.append(label_list.index(f"{prefix}{label}"))

        if drug_index < len(row["drug_indices_end"]) and offset_end == row["drug_indices_end"][drug_index]:
            label = "O"
            prefix = ""
            drug_index += 1

        elif effect_index < len(row["effect_indices_end"]) and offset_end == row["effect_indices_end"][effect_index]:
            label = "O"
            prefix = ""
            effect_index += 1

        # need to transition "inside" if we just entered an entity
        if prefix == "B-":
            prefix = "I-"

    if verbose:
        print(f"{row}\n")
        orig = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
        for n in range(len(labels)):
            print(orig[n], labels[n])
    tokens["labels"] = labels

    return tokens

In [13]:
generate_row_labels(cons_dataset["train"][0], verbose=True)


{'text': 'Anaphylactic reaction to oral prednisone: a case report and review of the literature.', 'drug': ['prednisone'], 'effect': ['Anaphylactic reaction'], 'drug_indices_start': [30], 'drug_indices_end': [40], 'effect_indices_start': [0], 'effect_indices_end': [21]}

[CLS] -100
ana 3
##phy 4
##la 4
##ctic 4
reaction 4
to 0
oral 0
pre 1
##d 2
##nis 2
##one 2
: 0
a 0
case 0
report 0
and 0
review 0
of 0
the 0
literature 0
. 0
[SEP] -100


{'input_ids': [101, 9617, 21281, 2721, 13306, 4668, 2000, 8700, 3653, 2094, 8977, 5643, 1024, 1037, 2553, 3189, 1998, 3319, 1997, 1996, 3906, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 3), (3, 6), (6, 8), (8, 12), (13, 21), (22, 24), (25, 29), (30, 33), (33, 34), (34, 37), (37, 40), (40, 41), (42, 43), (44, 48), (49, 55), (56, 59), (60, 66), (67, 69), (70, 73), (74, 84), (84, 85), (0, 0)], 'labels': [-100, 3, 4, 4, 4, 4, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]}

In [14]:
#fine-tuning
task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "bert-base-uncased"
batch_size = 16

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
!pip install accelerate -U
# !pip install transformers[torch]



In [15]:
model_name = model_checkpoint.split("/")[-1]
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.05,
    logging_steps=1
)



In [16]:
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")


  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [17]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [21]:
trainer = Trainer(
    model,
    args,
    train_dataset=labeled_dataset["train"],
    eval_dataset=labeled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,

)

In [22]:
trainer.train()


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2065,0.205222,0.70646,0.86414,0.777385,0.931846
2,0.076,0.184097,0.756778,0.890018,0.818008,0.939716
3,0.211,0.163117,0.782626,0.886784,0.831456,0.94579
4,0.0527,0.163708,0.800498,0.891867,0.843716,0.949075
5,0.0611,0.167767,0.796212,0.893715,0.842151,0.949381


TrainOutput(global_step=1070, training_loss=0.18571323126589304, metrics={'train_runtime': 1141.7671, 'train_samples_per_second': 14.959, 'train_steps_per_second': 0.937, 'total_flos': 549762873076560.0, 'train_loss': 0.18571323126589304, 'epoch': 5.0})

In [23]:
predictions, labels, _ = trainer.predict(labeled_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'DRUG': {'precision': 0.8836996336996337,
  'recall': 0.9526159921026653,
  'f1': 0.9168646080760094,
  'number': 1013},
 'EFFECT': {'precision': 0.724756918474196,
  'recall': 0.841876629018245,
  'f1': 0.7789389067524115,
  'number': 1151},
 'overall_precision': 0.7962124331000412,
 'overall_recall': 0.8937153419593346,
 'overall_f1': 0.8421510994992379,
 'overall_accuracy': 0.9493811124694377}

In [24]:

import os
from google.colab import drive

drive.mount('/content/drive')

save_dir = '/content/drive/My Drive/Bert_model/ADE_Corpus_V2/model/'

# model.save_pretrained("general_ner_model")


model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


Mounted at /content/drive


('/content/drive/My Drive/Bert_model/ADE_Corpus_V2/model/tokenizer_config.json',
 '/content/drive/My Drive/Bert_model/ADE_Corpus_V2/model/special_tokens_map.json',
 '/content/drive/My Drive/Bert_model/ADE_Corpus_V2/model/vocab.txt',
 '/content/drive/My Drive/Bert_model/ADE_Corpus_V2/model/added_tokens.json',
 '/content/drive/My Drive/Bert_model/ADE_Corpus_V2/model/tokenizer.json')

In [26]:
effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer)


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


In [27]:
effect_ner_model(labeled_dataset["test"][4]["text"])


[{'entity': 'LABEL_0',
  'score': 0.9986234,
  'index': 1,
  'word': 'to',
  'start': 0,
  'end': 2},
 {'entity': 'LABEL_0',
  'score': 0.99914074,
  'index': 2,
  'word': 'the',
  'start': 3,
  'end': 6},
 {'entity': 'LABEL_0',
  'score': 0.99881774,
  'index': 3,
  'word': 'best',
  'start': 7,
  'end': 11},
 {'entity': 'LABEL_0',
  'score': 0.9990426,
  'index': 4,
  'word': 'of',
  'start': 12,
  'end': 14},
 {'entity': 'LABEL_0',
  'score': 0.9990664,
  'index': 5,
  'word': 'our',
  'start': 15,
  'end': 18},
 {'entity': 'LABEL_0',
  'score': 0.99918145,
  'index': 6,
  'word': 'knowledge',
  'start': 19,
  'end': 28},
 {'entity': 'LABEL_0',
  'score': 0.9991099,
  'index': 7,
  'word': ',',
  'start': 28,
  'end': 29},
 {'entity': 'LABEL_0',
  'score': 0.99923456,
  'index': 8,
  'word': 'this',
  'start': 30,
  'end': 34},
 {'entity': 'LABEL_0',
  'score': 0.99919325,
  'index': 9,
  'word': 'is',
  'start': 35,
  'end': 37},
 {'entity': 'LABEL_0',
  'score': 0.9993149,
  'inde