# **02-Reglas+Clasificación**

Referencias

* [Pysbd](https://github.com/nipunsadvilkar/pySBD/tree/master)
* [Hyperparameter tuning transformers](https://huggingface.co/docs/transformers/hpo_train)

In [1]:
%cd ..

/Users/belensantamaria/Documentos/section_identification


In [2]:
import json
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
from transformers import TextClassificationPipeline

import pysbd

## Creación del conjunto de datos para entrenar el clasificador

In [3]:
def json_to_df(file_path):

    with open(file_path) as f:
        data = json.load(f)

    entries = []
    for note_id, entry in data["annotated_entries"].items():
        for annotation in entry["section_annotation"]["gold"]:
            d = {
                "text": annotation["segment"],
                "label": annotation["label"],
            }
            entries.append(d)
        
    df = pd.DataFrame(entries)    
    return df

In [4]:
train_path = "data/raw/clinais.train.json"
df_train = json_to_df(train_path)

In [5]:
df_train.to_json("data/rule/train.json")
df_train

Unnamed: 0,text,label
0,"En Mayo de 1997, una mujer de 29 años de edad ...",PRESENT_ILLNESS
1,la ecografía y la tomografía axial computeriza...,EXPLORATION
2,Se realizó resección completa de la tumoración...,TREATMENT
3,"Treinta meses después, la paciente presentó en...",EVOLUTION
4,se reintervino y se llevó a cabo una resección...,TREATMENT
...,...,...
6471,por lo que precisó anticoagulación y retirada ...,TREATMENT
6472,A pesar de la utilización de la fistula arteri...,EVOLUTION
6473,"y por petición de la paciente, se replanteó vo...",TREATMENT
6474,Diariamente tiene una ultrafiltrafiltración co...,EVOLUTION


In [6]:
le = preprocessing.LabelEncoder()
le.fit(df_train["label"])

LabelEncoder()

In [7]:
df_train["label"] = le.transform(df_train["label"])

In [8]:
df_train

Unnamed: 0,text,label
0,"En Mayo de 1997, una mujer de 29 años de edad ...",5
1,la ecografía y la tomografía axial computeriza...,2
2,Se realizó resección completa de la tumoración...,6
3,"Treinta meses después, la paciente presentó en...",1
4,se reintervino y se llevó a cabo una resección...,6
...,...,...
6471,por lo que precisó anticoagulación y retirada ...,6
6472,A pesar de la utilización de la fistula arteri...,1
6473,"y por petición de la paciente, se replanteó vo...",6
6474,Diariamente tiene una ultrafiltrafiltración co...,1


## Creo secciones mediante reglas para el conjunto de test

In [9]:
test_path = "data/raw/clinais.dev.json"

with open(test_path) as f:
    data = json.load(f)

sections = []
seg = pysbd.Segmenter(language="es", clean=False, char_span=True)

for note_id, entry in data["annotated_entries"].items():
    for sec in seg.segment(entry["note_text"]):
        d = {
            "note_id": note_id,
            "text": sec.sent,
            # Necesito columna de label pero le pongo 0 a todo y luego lo cambio con las predicciones
            "label": 0,
            "start_offset": sec.start,
            "end_offset": sec.end,
        }
        sections.append(d)

df_test = pd.DataFrame(sections)

In [10]:
df_test

Unnamed: 0,note_id,text,label,start_offset,end_offset
0,S0004-06142005000200009-3,Paciente de 69 a. de edad con un PSA en el mom...,0,0,79
1,S0004-06142005000200009-3,El paciente tenía una biopsia previa por sexta...,0,79,140
2,S0004-06142005000200009-3,Se practicó una E-RME que mostró inicialmente ...,0,140,479
3,S0004-06142005001000015-1,Un paciente varón de 19 años acudió al Servici...,0,0,112
4,S0004-06142005001000015-1,No presentaba ningún antecedente urológico.,0,112,156
...,...,...,...,...,...
1984,S1135-76062007000100006-1,La madre había fallecido súbitamente a los 48 ...,0,180,256
1985,S1135-76062007000100006-1,Una noche salió a cenar con los compañeros de ...,0,256,369
1986,S1135-76062007000100006-1,No manifestó ninguna sintomatología y se acost...,0,369,439
1987,S1135-76062007000100006-1,Poco después el perro comenzó a ladrar por lo ...,0,439,537


## Búsqueda hiperparámetros transformers

In [11]:
model_checkpoint = "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es"

In [12]:
batch_size = 64

In [13]:
labels = df_train.label.unique().tolist()
train, valid = train_test_split(df_train, test_size=0.2)

dataset = DatasetDict()

dataset["train"] = Dataset.from_pandas(train)
dataset["valid"] = Dataset.from_pandas(valid)
dataset["test"] = Dataset.from_pandas(df_test)

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

In [15]:
encoded_data = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/5180 [00:00<?, ? examples/s]

Map:   0%|          | 0/1296 [00:00<?, ? examples/s]

Map:   0%|          | 0/1989 [00:00<?, ? examples/s]

In [16]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels, ignore_mismatched_sizes=True)

In [17]:
num_labels = len(set(dataset["train"]["label"]))

metric_acc = evaluate.load("accuracy")

args = TrainingArguments(
    output_dir="data/rule",
    evaluation_strategy = "epoch",
    overwrite_output_dir=True,
    learning_rate=5e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    metric_for_best_model="accuracy",
    load_best_model_at_end=False,
    save_total_limit=10,
    use_mps_device=True,
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    return metric_acc.compute(predictions=preds, references=labels)


trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_data["train"],
    eval_dataset=encoded_data["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def hp_space_optuna(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-4),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 3),
        "seed": trial.suggest_int("seed", 1, 40),
        "weight_decay": trial.suggest_float("weight_decay", 0, 0.1),
        "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type", ["cosine_with_restarts", "linear"]),
    }

In [19]:
best_run = trainer.hyperparameter_search(hp_space=hp_space_optuna, n_trials=10, direction="maximize", backend="optuna") 

[I 2023-08-21 14:11:10,860] A new study created in memory with name: no-name-4ccffefd-d965-4104-ba31-f0d14cd2f45d
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5066,0.325485,0.909722
2,0.223,0.306686,0.928241


[I 2023-08-21 15:38:16,213] Trial 0 finished with value: 0.9282407407407407 and parameters: {'learning_rate': 4.9989358625559586e-05, 'num_train_epochs': 2, 'seed': 38, 'weight_decay': 0.08984958679102431, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.9282407407407407.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.478,0.258592,0.924383


[I 2023-08-21 16:25:39,496] Trial 1 finished with value: 0.9243827160493827 and parameters: {'learning_rate': 7.226361651030256e-05, 'num_train_epochs': 1, 'seed': 37, 'weight_decay': 0.05020773214857537, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.9282407407407407.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5243,0.26478,0.925926
2,0.235,0.306378,0.925926


[I 2023-08-21 17:51:27,022] Trial 2 finished with value: 0.9259259259259259 and parameters: {'learning_rate': 2.6822551582114098e-05, 'num_train_epochs': 2, 'seed': 16, 'weight_decay': 0.043659791302062156, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.9282407407407407.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5633,0.384145,0.89892
2,0.2839,0.343865,0.919753
3,0.1823,0.370846,0.925926


[I 2023-08-21 20:13:34,786] Trial 3 finished with value: 0.9259259259259259 and parameters: {'learning_rate': 8.6201919717379e-05, 'num_train_epochs': 3, 'seed': 6, 'weight_decay': 0.021857402537814354, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.9282407407407407.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4655,0.281827,0.914352


[I 2023-08-21 21:01:18,190] Trial 4 finished with value: 0.9143518518518519 and parameters: {'learning_rate': 3.856308137687861e-05, 'num_train_epochs': 1, 'seed': 29, 'weight_decay': 0.08584661871656707, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.9282407407407407.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4843,0.269903,0.918981


[I 2023-08-21 21:47:31,471] Trial 5 finished with value: 0.9189814814814815 and parameters: {'learning_rate': 7.354848730586093e-05, 'num_train_epochs': 1, 'seed': 12, 'weight_decay': 0.016830105494168756, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.9282407407407407.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5148,0.257811,0.927469


[I 2023-08-21 22:33:57,357] Trial 6 finished with value: 0.9274691358024691 and parameters: {'learning_rate': 3.035442787101334e-05, 'num_train_epochs': 1, 'seed': 31, 'weight_decay': 0.028974526212703657, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.9282407407407407.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5008,0.353658,0.905093


[I 2023-08-21 23:19:45,529] Trial 7 pruned. 
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5239,0.456264,0.884259


[I 2023-08-22 00:02:17,161] Trial 8 pruned. 
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.526,0.295898,0.906636


[I 2023-08-22 00:43:59,639] Trial 9 pruned. 


In [20]:
best_run

BestRun(run_id='0', objective=0.9282407407407407, hyperparameters={'learning_rate': 4.9989358625559586e-05, 'num_train_epochs': 2, 'seed': 38, 'weight_decay': 0.08984958679102431, 'lr_scheduler_type': 'cosine_with_restarts'}, run_summary=None)

## Predicción de las secciones del conjunto de test

In [21]:
model_checkpoint = "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

best_model = AutoModelForSequenceClassification.from_pretrained("data/rule/run-0/checkpoint-1000")

best_trainer = Trainer(best_model)

In [22]:
predictions, labels, _ = best_trainer.predict(encoded_data["test"])
original_labels = le.inverse_transform(labels)

In [23]:
original_labels

array(['DERIVED_FROM/TO', 'DERIVED_FROM/TO', 'DERIVED_FROM/TO', ...,
       'DERIVED_FROM/TO', 'DERIVED_FROM/TO', 'DERIVED_FROM/TO'],
      dtype=object)

## Transformación de los segmentos y etiquetas para su evaluación

In [24]:
df_test["label"] = original_labels
df_test

Unnamed: 0,note_id,text,label,start_offset,end_offset
0,S0004-06142005000200009-3,Paciente de 69 a. de edad con un PSA en el mom...,DERIVED_FROM/TO,0,79
1,S0004-06142005000200009-3,El paciente tenía una biopsia previa por sexta...,DERIVED_FROM/TO,79,140
2,S0004-06142005000200009-3,Se practicó una E-RME que mostró inicialmente ...,DERIVED_FROM/TO,140,479
3,S0004-06142005001000015-1,Un paciente varón de 19 años acudió al Servici...,DERIVED_FROM/TO,0,112
4,S0004-06142005001000015-1,No presentaba ningún antecedente urológico.,DERIVED_FROM/TO,112,156
...,...,...,...,...,...
1984,S1135-76062007000100006-1,La madre había fallecido súbitamente a los 48 ...,DERIVED_FROM/TO,180,256
1985,S1135-76062007000100006-1,Una noche salió a cenar con los compañeros de ...,DERIVED_FROM/TO,256,369
1986,S1135-76062007000100006-1,No manifestó ninguna sintomatología y se acost...,DERIVED_FROM/TO,369,439
1987,S1135-76062007000100006-1,Poco después el perro comenzó a ladrar por lo ...,DERIVED_FROM/TO,439,537


In [25]:
def entry_boundaries(df_test, note_id, boundaries):
    temp_df = df_test[df_test["note_id"]==note_id]
    predictions = boundaries
    for pred in predictions:
        if pred["start_offset"] in temp_df["start_offset"].values:
            pred["boundary"] = temp_df[temp_df["start_offset"]==pred["start_offset"]]["label"].values[0]
        else:
            pred["boundary"] = None
    return predictions   

In [26]:
test_path = "data/raw/clinais.dev.json"

with open(test_path) as f:
    data = json.load(f)

predictions = {}

for note_id, entry in data["annotated_entries"].items():
    predictions[entry["note_id"]] = entry
    predictions[entry["note_id"]]["boundary_annotation"]["prediction"] = entry_boundaries(df_test, note_id, entry["boundary_annotation"]["gold"])
    
with open("data/predictions/predictions_rule_classsification.json", "w") as f:
    json.dump({"annotated_entries": predictions}, f)  