In [None]:
!pip install -U transformers datasets seqeval

In [None]:
from datasets import Dataset, DatasetDict, load_metric
import pandas as pd

def read_bio_tsv(filename):
    """Read a tab-separated token/tag file into parallel per-sentence lists.

    Each non-blank line is expected to hold ``token<TAB>tag``. A blank line
    (after stripping whitespace) ends the current sentence. Lines that do not
    split into exactly two tab-separated fields are skipped.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of token
        lists and ``labels`` is the list of matching tag lists, in file order.
    """
    sentences, labels = [], []
    tokens, tags = [], []

    def _flush():
        # Store the sentence collected so far (if any) and start a new one.
        nonlocal tokens, tags
        if tokens:
            sentences.append(tokens)
            labels.append(tags)
            tokens, tags = [], []

    with open(filename, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                _flush()
                continue
            splits = line.split("\t")
            if len(splits) == 2:
                tokens.append(splits[0])
                tags.append(splits[1])

    # A file that does not end with a blank line still has a pending sentence;
    # without this flush the last sentence would be lost.
    _flush()
    return sentences, labels

# Parse the BIO file into one token list and one tag list per sentence.
sentences, ner_tags = read_bio_tsv("biobert_ner_data.tsv")

# One row per sentence; both columns hold lists.
df = pd.DataFrame({"tokens": sentences, "ner_tags": ner_tags})

# Hold out 10% of the sentences as the test split (fixed seed for a repeatable split),
# then expose the two splits under the names "train" and "test".
train_test = Dataset.from_pandas(df).train_test_split(test_size=0.1, seed=42)
dataset = DatasetDict({"train": train_test["train"], "test": train_test["test"]})


In [None]:
# Collect every distinct tag seen in the data, sorted so the id assignment is stable.
unique_tags = sorted({tag for sentence_tags in ner_tags for tag in sentence_tags})
# Forward map (tag string -> integer id) and its inverse (integer id -> tag string).
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

def encode_labels(example):
    """Add a ``"labels"`` entry holding the integer id of every tag in ``"ner_tags"``.

    The ids come from the module-level ``tag2id`` mapping. ``example`` is
    updated in place and also returned.
    """
    tag_ids = []
    for tag in example["ner_tags"]:
        tag_ids.append(tag2id[tag])
    example["labels"] = tag_ids
    return example

# Apply the encoding to every row of every split.
dataset = dataset.map(encode_labels)


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "dmis-lab/biobert-base-cased-v1.1"

# Token-classification model loaded from the checkpoint, with one output class per tag.
# Both label maps are handed over alongside the label count.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    label2id=tag2id,
    id2label=id2tag,
    num_labels=len(tag2id),
)
# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [None]:
from transformers import DataCollatorForTokenClassification

def tokenize_and_align_labels(example):
    """Tokenize pre-split words and align word-level label ids to the resulting tokens.

    Accepts either a single example or a batch of examples, as passed by
    ``Dataset.map(..., batched=True)``. In a batch, ``example["tokens"]`` is a
    list of word lists and ``example["labels"]`` a list of label-id lists.

    Alignment rules, applied per sequence:
      * tokens with no source word (``word_ids`` entry is ``None``) get ``-100``;
      * the first token of a word gets that word's label id;
      * later tokens of the same word get the word's label id, except that a
        ``B-X`` label is replaced by ``I-X``. If the tag set has no ``I-X``
        the token gets ``-100`` instead of raising ``KeyError``.

    Args:
        example: Mapping with ``"tokens"`` (words) and ``"labels"`` (label ids
            from ``tag2id``), for one example or for a batch.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        aligned ids for a single example, or one such list per row for a batch.
    """
    words = example["tokens"]
    # A batch carries a list of word lists; a single example carries a flat list of words.
    is_batched = bool(words) and isinstance(words[0], (list, tuple))

    tokenized = tokenizer(words, truncation=True, is_split_into_words=True)
    label_rows = example["labels"] if is_batched else [example["labels"]]

    aligned_rows = []
    for row_idx, word_labels in enumerate(label_rows):
        aligned = []
        previous_word_idx = None
        # word_ids() only describes one sequence at a time, so ask for each row explicitly.
        for word_idx in tokenized.word_ids(batch_index=row_idx):
            if word_idx is None:
                aligned.append(-100)
            elif word_idx != previous_word_idx:
                aligned.append(word_labels[word_idx])
            else:
                label = word_labels[word_idx]
                tag = id2tag[label]
                if tag.startswith("B-"):
                    # A continuation piece must not start a new entity.
                    label = tag2id.get("I-" + tag[2:], -100)
                aligned.append(label)
            previous_word_idx = word_idx
        aligned_rows.append(aligned)

    tokenized["labels"] = aligned_rows if is_batched else aligned_rows[0]
    return tokenized

# Collator that assembles token-classification batches using the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# Tokenize every split and attach the aligned labels, processing rows in batches.
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)


In [None]:
import evaluate
from transformers import TrainingArguments, Trainer

# Load the "seqeval" metric through `evaluate`; compute_metrics calls its `.compute(...)`.
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Score predictions with seqeval, ignoring positions labelled ``-100``.

    ``p`` unpacks into ``(predictions, label_ids)``. The predicted class is the
    argmax over the last axis of the predictions (presumably per-token logits;
    confirm against the Trainer). Positions whose reference id is ``-100`` are
    dropped from both sequences. The remaining ids are mapped to tag strings
    through ``id2tag`` and passed to ``seqeval.compute``, whose result is
    returned unchanged.
    """
    scores, gold_ids = p
    predicted_ids = scores.argmax(-1)

    true_preds, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept_pairs = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_preds.append([id2tag[predicted] for predicted, _ in kept_pairs])
        true_labels.append([id2tag[gold] for _, gold in kept_pairs])

    return seqeval.compute(predictions=true_preds, references=true_labels)


In [None]:
import inspect

# Two keyword arguments used here were renamed across transformers releases:
# `evaluation_strategy` -> `eval_strategy` (TrainingArguments) and
# `tokenizer` -> `processing_class` (Trainer). This notebook installs the newest release
# (`-U`), so use whichever spelling the installed version accepts instead of hard-coding one.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

# Evaluate and save a checkpoint once per epoch; three epochs with batch size 8 per device.
training_args = TrainingArguments(
    output_dir="./biobert-ner",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    **{eval_strategy_kwarg: "epoch"},
)

# Train on the "train" split, evaluate on the "test" split with seqeval-based metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

trainer.train()


In [None]:
# Write the trained model and its tokenizer side by side into one directory.
final_dir = "./biobert-ner-final"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)