# **04.2-NER-Bert**

References:
* [Token Classification HuggingFace](https://huggingface.co/learn/nlp-course/chapter7/2)

In [1]:
# Step up one directory so the relative "data/..." paths used below resolve
# (presumably from the notebooks folder to the project root - confirm).
# Re-running this cell in the same kernel moves up another level.
%cd ..

/Users/belensantamaria/Documentos/section_identification


In [2]:
import json
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
from transformers import pipeline

import evaluate

In [3]:
def json_to_df(file_path, label2id):
    """Load an annotated-notes JSON file into a token/label DataFrame.

    Each note's text is split into word and punctuation tokens. For every gold
    section annotation, the token at which its segment starts receives the
    section's label id; every other token keeps label 0.

    Segment start positions are derived by accumulating the token count of the
    preceding segments, so the segments are expected to appear in text order.

    Args:
        file_path: Path to a JSON file with an ``"annotated_entries"`` mapping
            of note id -> entry. Each entry provides ``"note_text"`` and
            ``entry["section_annotation"]["gold"]``, a list of dicts carrying
            ``"segment"`` and ``"label"`` keys.
        label2id: Mapping from section label name to integer id.

    Returns:
        pandas.DataFrame with one row per note and two columns: ``"tokens"``
        (list of str) and ``"ner_tags"`` (integer NumPy array, same length).

    Raises:
        KeyError: If an annotation label is missing from ``label2id`` or an
            expected key is absent from the JSON structure.
    """
    import warnings

    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)

    # Be explicit about the encoding so accented text is decoded the same way
    # on every platform instead of depending on the locale default.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    entries = []
    for note_id, entry in data["annotated_entries"].items():
        tokens = token_pattern.findall(entry["note_text"])
        labels = np.zeros(len(tokens), dtype=int)
        position = 0
        for annotation in entry["section_annotation"]["gold"]:
            if position >= len(tokens):
                # Tokenising the segments one by one can yield more tokens than
                # tokenising the full text (or a trailing segment may be empty).
                # Nothing is left to tag, so stop instead of raising IndexError.
                warnings.warn(
                    f"Note {note_id!r}: section segments cover more tokens than "
                    "the note text; remaining annotations are ignored."
                )
                break
            labels[position] = label2id[annotation["label"]]
            position += len(token_pattern.findall(annotation["segment"]))

        entries.append({"tokens": tokens, "ner_tags": labels})

    return pd.DataFrame(entries)

In [4]:
# Section label names listed in id order: a label's position is its integer id.
label2id = {
    name: idx
    for idx, name in enumerate(
        [
            "O",
            "EXPLORATION",
            "TREATMENT",
            "PRESENT_ILLNESS",
            "EVOLUTION",
            "PAST_MEDICAL_HISTORY",
            "DERIVED_FROM/TO",
            "FAMILY_HISTORY",
        ]
    )
}

# Reverse lookup (integer id -> label name).
id2label = dict(zip(label2id.values(), label2id.keys()))

In [5]:
# Parse the annotated training notes into one row per note
# (columns "tokens" and "ner_tags", as built by json_to_df).
train_path = "data/raw/clinais.train.json"
df_train = json_to_df(train_path, label2id)

In [6]:
# Display the parsed training frame as the cell output.
df_train

Unnamed: 0,tokens,ner_tags
0,"[En, Mayo, de, 1997, ,, una, mujer, de, 29, añ...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[Varón, de, 66, años, controlado, en, Consulta...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[Mujer, de, 51, años, ,, monorrena, derecha, ,...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[Nuestra, paciente, es, una, mujer, de, 77, añ...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, ..."
4,"[Paciente, de, 68, años, de, edad, ,, con, ant...","[3, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
776,"[Mujer, de, 26, años, con, ERC, secundaria, a,...","[3, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
777,"[Varón, de, 41, años, con, ERC, estadio, V, de...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
778,"[Mujer, de, 83, años, con, insuficiencia, rena...","[3, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
779,"[Presentamos, un, varón, de, 71, años, ,, con,...","[3, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
# Hold out 20% of the notes for validation. The fixed random_state keeps the
# split (and therefore every validation metric reported below) identical
# across kernel restarts; without it each run shuffles differently.
train, valid = train_test_split(df_train, test_size=0.2, random_state=42)

dataset = DatasetDict()

# preserve_index=False keeps the shuffled pandas index out of the datasets.
dataset["train"] = Dataset.from_pandas(train, preserve_index=False)
dataset["valid"] = Dataset.from_pandas(valid, preserve_index=False)

In [8]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per tokenizer position.

    Args:
        labels: Sequence of label ids indexed by word position.
        word_ids: For each tokenizer position, the index of the word it came
            from, or ``None`` when the position belongs to no word.

    Returns:
        A list as long as ``word_ids``: ``-100`` wherever the word id is
        ``None``, otherwise the label of the originating word. Every piece of
        a word split into several sub-tokens therefore carries that word's
        label.
    """
    return [
        -100 if word_id is None else labels[word_id]
        for word_id in word_ids
    ]

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split notes and attach aligned labels.

    Relies on the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per note)
            and ``"ner_tags"`` (one list of word-level label ids per note).

    Returns:
        The tokenizer output for the batch, truncated by the tokenizer, with
        an added ``"labels"`` entry holding, for each note, the word labels
        expanded to tokenizer positions by ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

In [9]:
# Pretrained checkpoint name used for both the tokenizer and the model.
model_checkpoint = "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Tokenize every split in batches; the original columns ("tokens", "ner_tags")
# are removed so only the outputs of tokenize_and_align_labels remain.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/157 [00:00<?, ? examples/s]

Map:   0%|          | 0/127 [00:00<?, ? examples/s]

In [10]:
# Batch collator for token classification, built from the same tokenizer;
# handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [11]:
# Load the pretrained checkpoint as a token-classification model and register
# the section label <-> id mappings with it.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier

In [12]:
# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch so that the checkpoint with the best validation F1 can be restored
# when training ends; metric_for_best_model has no effect otherwise.
args = TrainingArguments(
    output_dir="data/NER/model",
    evaluation_strategy="epoch",
    # load_best_model_at_end needs the save strategy to match the evaluation strategy.
    save_strategy="epoch",
    overwrite_output_dir=True,
    learning_rate=5e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    # "f1" is the key returned by compute_metrics.
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    # Saving every epoch: keep only a couple of checkpoints on disk (the best
    # one is always retained when load_best_model_at_end is enabled).
    save_total_limit=2,
    # NOTE(review): hard-codes the Apple MPS backend; confirm before running on
    # other hardware or with a transformers release that dropped this flag.
    use_mps_device=True,
)

In [13]:
# seqeval metric object; compute_metrics calls metric.compute on it.
metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    """Score a batch of predictions with the notebook-level seqeval ``metric``.

    Args:
        eval_preds: Pair ``(logits, labels)``. The predicted class is the
            argmax over the last axis of ``logits``; ``labels`` holds the
            reference ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with the overall ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"`` reported by the metric.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tags: drop ignored positions (-100), map ids to label names.
    # NOTE(review): the label names carry no B-/I- prefix; confirm how seqeval
    # groups such tags into entities.
    gold_tags = []
    for gold_row in labels:
        gold_tags.append([id2label[tag] for tag in gold_row if tag != -100])

    # Predicted tags, kept only where the reference position is not ignored.
    predicted_tags = []
    for pred_row, gold_row in zip(predictions, labels):
        kept = [pred for pred, gold in zip(pred_row, gold_row) if gold != -100]
        predicted_tags.append([id2label[pred] for pred in kept])

    scores = metric.compute(predictions=predicted_tags, references=gold_tags)

    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [14]:
# Wire model, arguments, tokenized splits, collator and metric function
# together, then run the fine-tuning loop.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
# Last expression of the cell: the training summary is shown as its output.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.049559,0.987013,0.124897,0.221736,0.978889
2,No log,0.027346,0.699837,0.705012,0.702415,0.987746
3,No log,0.022329,0.750202,0.762531,0.756316,0.989712
4,No log,0.022032,0.725076,0.788825,0.755608,0.989578
5,No log,0.024817,0.811513,0.718159,0.761988,0.990437
6,No log,0.025201,0.733234,0.808546,0.76905,0.989998
7,0.036400,0.026461,0.788793,0.751849,0.769878,0.990284
8,0.036400,0.028324,0.779449,0.766639,0.772991,0.990609
9,0.036400,0.029393,0.772991,0.766639,0.769802,0.990303
10,0.036400,0.029288,0.780833,0.769926,0.775341,0.990571


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=780, training_loss=0.02481265251453106, metrics={'train_runtime': 3124.228, 'train_samples_per_second': 1.997, 'train_steps_per_second': 0.25, 'total_flos': 1625251915889280.0, 'train_loss': 0.02481265251453106, 'epoch': 10.0})

In [15]:
# Persist the trainer's current model to the directory the inference pipeline loads from.
trainer.save_model("data/NER/model")

In [16]:
# NOTE: `model_checkpoint` is reassigned here. From this cell on it holds the
# local directory of the fine-tuned model, not the pretrained checkpoint name
# assigned earlier, so re-running earlier cells afterwards would pick up this path.
model_checkpoint = "data/NER/model"
# Inference pipeline over the saved model; later cells read "entity_group" and
# "start" from each item it returns.
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="simple")

[]

In [22]:
# Annotated dev notes used for inference and for the exported predictions.
test_path = "data/raw/clinais.dev.json"

# Explicit encoding: the notes contain accented characters, so decoding must
# not depend on the platform's default locale.
with open(test_path, encoding="utf-8") as f:
    data = json.load(f)

In [23]:
# Run the pipeline over the raw text of every dev note and collect one record
# per returned entity: the note it belongs to, its label and the character
# offset at which it starts.
annotations = []

for note_id, entry in data["annotated_entries"].items():
    text = entry["note_text"]
    ents = token_classifier(text)

    for ent in ents:
        d = {
            "note_id": note_id,
            "label": ent["entity_group"],
            "start_offset": ent["start"]
        }
        annotations.append(d)

# One row per predicted entity; consumed by entry_boundaries below.
df = pd.DataFrame(annotations)

In [24]:
# Display the predicted entities (note id, label, start offset) as the cell output.
df

Unnamed: 0,note_id,label,start_offset
0,S0004-06142005000200009-3,PRESENT_ILLNESS,0
1,S0004-06142005000200009-3,EXPLORATION,140
2,S0004-06142005001000015-1,PRESENT_ILLNESS,0
3,S0004-06142005001000015-1,PAST_MEDICAL_HISTORY,112
4,S0004-06142005001000015-1,PRESENT_ILLNESS,154
...,...,...,...
936,S0376-78922009000400002-8,EVOLUTION,741
937,S1135-76062007000100006-1,PRESENT_ILLNESS,0
938,S1135-76062007000100006-1,FAMILY_HISTORY,180
939,S1135-76062007000100006-1,PRESENT_ILLNESS,256


In [25]:
def entry_boundaries(df_test, note_id, boundaries):
    """Build predicted boundary records for one note.

    Every boundary whose ``"start_offset"`` matches the start offset of a
    predicted entity of this note gets that entity's label as ``"boundary"``
    (the first match wins); every other boundary gets ``None``.

    The input boundaries are copied, never modified, so a gold annotation
    passed in keeps its own ``"boundary"`` values.

    Args:
        df_test: DataFrame of predicted entities with ``"note_id"``,
            ``"label"`` and ``"start_offset"`` columns. An empty frame is
            treated as "no predictions".
        note_id: Identifier of the note to select from ``df_test``.
        boundaries: Iterable of dicts, each with a ``"start_offset"`` key.

    Returns:
        A new list of new dicts, one per input boundary, with ``"boundary"``
        set to the predicted label or ``None``.
    """
    # First predicted label for each start offset of this note.
    label_by_offset = {}
    if not df_test.empty:
        note_rows = df_test[df_test["note_id"] == note_id]
        offsets = note_rows["start_offset"].tolist()
        labels = note_rows["label"].tolist()
        for offset, label in zip(offsets, labels):
            label_by_offset.setdefault(offset, label)

    predictions = []
    for boundary in boundaries:
        # Shallow copy: only the top-level "boundary" key is rewritten.
        pred = dict(boundary)
        pred["boundary"] = label_by_offset.get(pred["start_offset"])
        predictions.append(pred)
    return predictions

In [26]:
# Attach the model's boundary predictions to every dev entry and export them.
predictions = {}

for note_id, entry in data["annotated_entries"].items():
    # Give entry_boundaries its own copies of the gold boundary dicts: it fills
    # in the "boundary" field of what it receives, and the gold annotation must
    # not be overwritten by the predictions stored right next to it.
    gold_copy = [dict(boundary) for boundary in entry["boundary_annotation"]["gold"]]
    entry["boundary_annotation"]["prediction"] = entry_boundaries(df, note_id, gold_copy)
    predictions[entry["note_id"]] = entry

with open("data/predictions/predictions_ner_bert.json", "w", encoding="utf-8") as f:
    json.dump({"annotated_entries": predictions}, f)

In [21]:
def compute_metrics(eval_preds):
    """Accuracy-only variant of the evaluation callback.

    NOTE: this redefines the ``compute_metrics`` declared in an earlier cell;
    whichever of the two cells ran last is the one in effect.

    Args:
        eval_preds: Pair ``(logits, labels)``. The predicted class is the
            argmax over the last axis of ``logits``; ``labels`` holds the
            reference ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with a single ``"accuracy"`` entry taken from the notebook-level
        seqeval ``metric``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (-100) and convert ids to label names: the metric
    # works on string tags, exactly as in the earlier definition.
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "accuracy": all_metrics["overall_accuracy"]
    }