# **04.2-NER-Bert**

References:
* [Token Classification HuggingFace](https://huggingface.co/learn/nlp-course/chapter7/2)

In [1]:
%cd ..

/Users/belensantamaria/Documentos/section_identification


In [2]:
import json
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
from transformers import pipeline

import evaluate

In [3]:
def json_to_df(file_path, label2id):

    with open(file_path) as f:
        data = json.load(f)

    entries = []
    for note_id, entry in data["annotated_entries"].items():
        text = entry["note_text"]
        tokens = re.findall(r"\w+|[^\w\s]", text, re.UNICODE)
        labels = np.zeros(len(tokens))
        position = 0
        for annotation in entry["section_annotation"]["gold"]:
            segment = annotation["segment"]
            segment_tokens = re.findall(r"\w+|[^\w\s]", segment, re.UNICODE)
            labels[position] =  label2id[annotation["label"]]
            position += len(segment_tokens)
            
        d = {
            "tokens": tokens,
            "ner_tags": labels.astype(int),
        }    
        entries.append(d)
        
    df = pd.DataFrame(entries)    
    return df

In [4]:
label2id = {
    "O": 0,
    "EXPLORATION": 1,
    "TREATMENT": 2,
    "PRESENT_ILLNESS": 3,
    "EVOLUTION": 4,
    "PAST_MEDICAL_HISTORY": 5,
    "DERIVED_FROM/TO": 6,
    "FAMILY_HISTORY": 7,
}

id2label = {v: k for k, v in label2id.items()}

In [5]:
train_path = "data/raw/clinais.train.json"
df_train = json_to_df(train_path, label2id)

In [6]:
df_train

Unnamed: 0,tokens,ner_tags
0,"[En, Mayo, de, 1997, ,, una, mujer, de, 29, añ...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[Varón, de, 66, años, controlado, en, Consulta...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[Mujer, de, 51, años, ,, monorrena, derecha, ,...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[Nuestra, paciente, es, una, mujer, de, 77, añ...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, ..."
4,"[Paciente, de, 68, años, de, edad, ,, con, ant...","[3, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
776,"[Mujer, de, 26, años, con, ERC, secundaria, a,...","[3, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
777,"[Varón, de, 41, años, con, ERC, estadio, V, de...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
778,"[Mujer, de, 83, años, con, insuficiencia, rena...","[3, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
779,"[Presentamos, un, varón, de, 71, años, ,, con,...","[3, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
train, valid = train_test_split(df_train, test_size=0.2)

dataset = DatasetDict()

dataset["train"] = Dataset.from_pandas(train, preserve_index=False)
dataset["valid"] = Dataset.from_pandas(valid, preserve_index=False)

In [8]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [9]:
model_checkpoint = "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/157 [00:00<?, ? examples/s]

In [10]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [11]:
def model_init():
    return AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)

In [12]:
args = TrainingArguments(
    output_dir="data/NER/model",
    evaluation_strategy = "epoch",
    overwrite_output_dir=True,
    learning_rate=5e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    metric_for_best_model="Weighted B2",
    load_best_model_at_end=False,
    save_total_limit=10,
    use_mps_device=True,
)

metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [13]:
from src.evaluation.evaluate import evaluate_note
from src.evaluation.dataset_model import BoundaryAnnotation
from numpy import average

id2label[0]=None

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[l for l in label if l != -100] for label in labels]
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    number_sections_per_file = []
    scores_per_file = []

    # Compute the scores
    for label, prediction in zip(true_labels, true_predictions):
        number_sections, metrics = evaluate_note(
            reference_boundaries=[BoundaryAnnotation(span=" ", start_offset=i, end_offset=i+1, boundary=id2label[l]) for i, l in enumerate(label)],
            prediction_boundaries=[BoundaryAnnotation(span=" ", start_offset=i, end_offset=i+1, boundary=id2label[l]) for i, l in enumerate(prediction)]
            #reference_boundaries=[{"span": " ", "start_offset": i, "end_offset": i+1, "boundary": l} for i, l in enumerate(label)],
            #prediction_boundaries=[{"span": " ", "start_offset": i, "end_offset": i+1, "boundary": l} for i, l in enumerate(prediction)]
        )
        number_sections_per_file.append(number_sections)
        scores_per_file.append(metrics["metrics"]["B2"])

    # Compute the final scores on the whole dataset
    return {"Weighted B2": average(scores_per_file, weights=number_sections_per_file)}

In [14]:
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier

In [15]:
def hp_space_optuna(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-4),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 3),
        "seed": trial.suggest_int("seed", 1, 40),
        "weight_decay": trial.suggest_float("weight_decay", 0, 0.1),
        "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type", ["cosine_with_restarts", "linear"]),
    }

In [16]:
best_run = trainer.hyperparameter_search(hp_space=hp_space_optuna, n_trials=10, direction="maximize", backend="optuna") 

[I 2023-09-03 21:41:49,217] A new study created in memory with name: no-name-04aa81bd-b278-4f97-b105-929b1422c2e3
Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from

Epoch,Training Loss,Validation Loss,Weighted b2
1,No log,0.056122,0.013408
2,No log,0.044787,0.317583


[I 2023-09-03 21:50:25,971] Trial 0 finished with value: 0.3175829196444388 and parameters: {'learning_rate': 8.781810649983569e-05, 'num_train_epochs': 2, 'seed': 3, 'weight_decay': 0.07852268588640093, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.3175829196444388.
Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be ex

Epoch,Training Loss,Validation Loss,Weighted b2
1,No log,0.042345,0.246389
2,No log,0.022045,0.754715
3,No log,0.021569,0.758518


[I 2023-09-03 22:07:08,505] Trial 1 finished with value: 0.7585182535814342 and parameters: {'learning_rate': 8.329065069640134e-05, 'num_train_epochs': 3, 'seed': 16, 'weight_decay': 0.05560367415336148, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 1 with value: 0.7585182535814342.
Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be e

Epoch,Training Loss,Validation Loss,Weighted b2
1,No log,0.062144,0.013408
2,No log,0.052265,0.013408
3,No log,0.048533,0.142648


[I 2023-09-03 22:22:02,694] Trial 2 finished with value: 0.1426484547544022 and parameters: {'learning_rate': 3.830914955103658e-05, 'num_train_epochs': 3, 'seed': 17, 'weight_decay': 0.045867939845742334, 'lr_scheduler_type': 'linear'}. Best is trial 1 with value: 0.7585182535814342.
Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identi

Epoch,Training Loss,Validation Loss,Weighted b2
1,No log,0.061335,0.013408
2,No log,0.050344,0.161467


[I 2023-09-03 22:31:50,452] Trial 3 finished with value: 0.1614672046880653 and parameters: {'learning_rate': 5.360203869534988e-05, 'num_train_epochs': 2, 'seed': 39, 'weight_decay': 0.08890250438936699, 'lr_scheduler_type': 'linear'}. Best is trial 1 with value: 0.7585182535814342.
Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identic

Epoch,Training Loss,Validation Loss,Weighted b2
1,No log,0.093362,0.013408
2,No log,0.058293,0.013408
3,No log,0.054458,0.044991


[I 2023-09-03 22:46:46,389] Trial 4 finished with value: 0.044991332600617315 and parameters: {'learning_rate': 1.0976450452053677e-05, 'num_train_epochs': 3, 'seed': 18, 'weight_decay': 0.0371879426094601, 'lr_scheduler_type': 'linear'}. Best is trial 1 with value: 0.7585182535814342.
Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly ident

Epoch,Training Loss,Validation Loss,Weighted b2
1,No log,0.038889,0.281576
2,No log,0.024074,0.741798


[I 2023-09-03 22:56:28,719] Trial 5 finished with value: 0.7417975649238479 and parameters: {'learning_rate': 9.745770106120472e-05, 'num_train_epochs': 2, 'seed': 26, 'weight_decay': 0.07262884298280632, 'lr_scheduler_type': 'linear'}. Best is trial 1 with value: 0.7585182535814342.
Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identic

Epoch,Training Loss,Validation Loss,Weighted b2
1,No log,0.044188,0.317232
2,No log,0.038048,0.483071


[I 2023-09-03 23:06:17,631] Trial 6 finished with value: 0.48307106366174557 and parameters: {'learning_rate': 6.430281986694684e-05, 'num_train_epochs': 2, 'seed': 7, 'weight_decay': 0.044545788648285935, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 1 with value: 0.7585182535814342.
Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be 

Epoch,Training Loss,Validation Loss,Weighted b2
1,No log,0.055464,0.039715
2,No log,0.036391,0.368529
3,No log,0.031666,0.614945


[I 2023-09-03 23:21:12,881] Trial 7 finished with value: 0.6149448386402437 and parameters: {'learning_rate': 4.210845325963105e-05, 'num_train_epochs': 3, 'seed': 15, 'weight_decay': 0.08382594344728055, 'lr_scheduler_type': 'linear'}. Best is trial 1 with value: 0.7585182535814342.
Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identic

Epoch,Training Loss,Validation Loss,Weighted b2
1,No log,0.095493,0.013408


[I 2023-09-03 23:26:13,948] Trial 8 pruned. 
Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinic

Epoch,Training Loss,Validation Loss,Weighted b2
1,No log,0.101812,0.013408


[I 2023-09-03 23:31:22,393] Trial 9 pruned. 


In [18]:
best_run

BestRun(run_id='1', objective=0.7585182535814342, hyperparameters={'learning_rate': 8.329065069640134e-05, 'num_train_epochs': 3, 'seed': 16, 'weight_decay': 0.05560367415336148, 'lr_scheduler_type': 'cosine_with_restarts'}, run_summary=None)

In [26]:
model_checkpoint = "data/NER/model/checkpoint-500"
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="simple")

In [27]:
test_path = "data/raw/clinais.dev.json"

with open(test_path) as f:
    data = json.load(f)

In [None]:
annotations = []

for note_id, entry in data["annotated_entries"].items():
    text = entry["note_text"]
    ents = token_classifier(text)

    for ent in ents:
        d = {
            "note_id": note_id,
            "label": ent["entity_group"],
            "start_offset": ent["start"]
        }
        annotations.append(d)

df = pd.DataFrame(annotations)

In [None]:
df

In [None]:
def entry_boundaries(df_test, note_id, boundaries):
    temp_df = df_test[df_test["note_id"]==note_id]
    predictions = boundaries
    for pred in predictions:
        if pred["start_offset"] in temp_df["start_offset"].values:
            pred["boundary"] = temp_df[temp_df["start_offset"]==pred["start_offset"]]["label"].values[0]
        else:
            pred["boundary"] = None
    return predictions   

In [25]:
predictions = {}

for note_id, entry in data["annotated_entries"].items():
    predictions[entry["note_id"]] = entry
    predictions[entry["note_id"]]["boundary_annotation"]["prediction"] = entry_boundaries(df, note_id, entry["boundary_annotation"]["gold"])
    
with open("data/predictions/predictions_ner_bert.json", "w") as f:
    json.dump({"annotated_entries": predictions}, f) 

KeyError: 'note_id'

No puedo aplicar la métrica B2 porque en compute_metrics solo tengo la lista de etiquetas y predicciones de cada token, y necesitaría saber también la posición inicial de cada token para calcular cuánto se ha ido. La he implementado poniendo que cada token es de longitud 1.

Utilizando como métrica el accuracy 10 epochs "Weighted B2": 0.5340219745589161
Utilizando como métrica b2 10 epochs "Weighted B2": 0.5877273720523583,