# **02-Reglas+Clasificación**

Referencias

* [Pysbd](https://github.com/nipunsadvilkar/pySBD/tree/master)
* [Hyperparameter tuning transformers](https://huggingface.co/docs/transformers/hpo_train)

In [1]:
# Move one directory up so the relative "data/..." paths used in the cells below resolve.
# The move is relative: running this cell again climbs one more level.
%cd ..

/Users/belensantamaria/Documentos/section_identification


In [2]:
import json
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
from transformers import pipeline

import pysbd

## Creación del conjunto de datos para entrenar el clasificador

In [3]:
def json_to_df(file_path):
    """Flatten the gold section annotations of a notes JSON file into a DataFrame.

    The file is expected to hold a top-level ``"annotated_entries"`` mapping whose
    values each carry ``["section_annotation"]["gold"]``: a list of annotations
    with a ``"segment"`` (text) and a ``"label"`` key.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        One row per gold annotation with columns ``text`` and ``label``. Both
        columns are present even when the file contains no annotations.
    """
    # Read as UTF-8 explicitly: the notes are Spanish text, and the platform
    # default encoding is not guaranteed to decode accented characters.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    entries = [
        {"text": annotation["segment"], "label": annotation["label"]}
        for entry in data["annotated_entries"].values()
        for annotation in entry["section_annotation"]["gold"]
    ]

    # Naming the columns keeps the schema stable for an empty annotation set.
    return pd.DataFrame(entries, columns=["text", "label"])

In [4]:
# Build the training frame (one row per gold section: text + label) from the raw training JSON.
train_path = "data/raw/clinais.train.json"
df_train = json_to_df(train_path)

In [5]:
# Write the extracted frame to disk (labels are still strings here), then display it.
df_train.to_json("data/rule/train.json")
df_train

Unnamed: 0,text,label
0,"En Mayo de 1997, una mujer de 29 años de edad ...",PRESENT_ILLNESS
1,la ecografía y la tomografía axial computeriza...,EXPLORATION
2,Se realizó resección completa de la tumoración...,TREATMENT
3,"Treinta meses después, la paciente presentó en...",EVOLUTION
4,se reintervino y se llevó a cabo una resección...,TREATMENT
...,...,...
6471,por lo que precisó anticoagulación y retirada ...,TREATMENT
6472,A pesar de la utilización de la fistula arteri...,EVOLUTION
6473,"y por petición de la paciente, se replanteó vo...",TREATMENT
6474,Diariamente tiene una ultrafiltrafiltración co...,EVOLUTION


In [6]:
# Fit a LabelEncoder on the string labels; it is used below to map labels to
# integer ids and, after prediction, back to their names.
le = preprocessing.LabelEncoder()
le.fit(df_train["label"])

LabelEncoder()

In [7]:
# Replace the string labels with their integer ids.
# Guarded so the cell is safe to re-run: once the column already holds integers,
# calling le.transform again would fail because those values are not known labels.
if not pd.api.types.is_integer_dtype(df_train["label"]):
    df_train["label"] = le.transform(df_train["label"])

In [8]:
# Display the frame again to check that the label column now holds integer ids.
df_train

Unnamed: 0,text,label
0,"En Mayo de 1997, una mujer de 29 años de edad ...",5
1,la ecografía y la tomografía axial computeriza...,2
2,Se realizó resección completa de la tumoración...,6
3,"Treinta meses después, la paciente presentó en...",1
4,se reintervino y se llevó a cabo una resección...,6
...,...,...
6471,por lo que precisó anticoagulación y retirada ...,6
6472,A pesar de la utilización de la fistula arteri...,1
6473,"y por petición de la paciente, se replanteó vo...",6
6474,Diariamente tiene una ultrafiltrafiltración co...,1


## Búsqueda de hiperparámetros con transformers

In [None]:
# Name of the pretrained checkpoint; the cells below load both the tokenizer and
# the sequence-classification model from it.
model_checkpoint = "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es"

In [None]:
# NOTE(review): no other cell visible in this notebook reads batch_size (the
# TrainingArguments below are built without it) — confirm whether it should be wired in.
batch_size = 64

In [None]:
# Distinct encoded label ids present in the training frame.
labels = df_train.label.unique().tolist()

# Hold out 20% of the annotated segments for validation. The seed is fixed so the
# split, and therefore the hyperparameter-search results, can be reproduced.
train, valid = train_test_split(df_train, test_size=0.2, random_state=42)

dataset = DatasetDict()

dataset["train"] = Dataset.from_pandas(train)
dataset["valid"] = Dataset.from_pandas(valid)
# No "test" split is registered here: df_test is only created further down the
# notebook, so referencing it in this cell fails with NameError when the notebook
# is run top to bottom on a fresh kernel.

In [None]:
# Load the tokenizer belonging to model_checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens. Returns
    whatever the module-level ``tokenizer`` produces for that input.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    texts = examples["text"]
    return tokenizer(texts, **encoding_options)

In [None]:
# Tokenize every split of the DatasetDict; batched=True passes preprocess_function
# batches of examples instead of one example at a time.
encoded_data = dataset.map(preprocess_function, batched=True)

In [None]:
def model_init():
    """Build a sequence-classification model from ``model_checkpoint``.

    Passed to the Trainer as its ``model_init`` factory. The classification head
    is sized by the module-level ``num_labels``, which must therefore be defined
    before this function is first called.
    """
    load_options = {
        "num_labels": num_labels,
        # Allow loading even when the checkpoint's head shape differs from num_labels.
        "ignore_mismatched_sizes": True,
    }
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, **load_options)

In [None]:
# Size the classification head from the fitted label encoder rather than from the
# random train split: a rare label that lands only in the validation split would
# otherwise be left out of the count while its id still appears in the data.
num_labels = len(le.classes_)

# Accuracy metric used by compute_metrics during evaluation.
metric_acc = evaluate.load("accuracy")

# Baseline training configuration; the hyperparameter search below overrides
# learning_rate, num_train_epochs, weight_decay, seed and lr_scheduler_type per trial.
args = TrainingArguments(
    output_dir="data/rule",
    evaluation_strategy = "epoch",
    overwrite_output_dir=True,
    learning_rate=5e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    metric_for_best_model="accuracy",
    load_best_model_at_end=False,
    save_total_limit=10,
    use_mps_device=True,
)

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class is
    the argmax along axis 1 of the predictions, and the result is whatever
    ``metric_acc.compute`` returns for those predictions and references.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric_acc.compute(predictions=predicted_ids, references=references)


# The Trainer receives a model factory (model_init) instead of a model instance,
# which is what the hyperparameter search below relies on. It trains on the
# tokenized "train" split and evaluates on the tokenized "valid" split.
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_data["train"],
    eval_dataset=encoded_data["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
def hp_space_optuna(trial):
    """Sample one hyperparameter configuration from an Optuna trial.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float``, ``suggest_int`` and
        ``suggest_categorical`` (an Optuna trial).

    Returns
    -------
    dict
        Sampled values keyed by ``learning_rate``, ``num_train_epochs``,
        ``seed``, ``weight_decay`` and ``lr_scheduler_type``.
    """
    scheduler_choices = ["cosine_with_restarts", "linear"]

    # Values are drawn in a fixed order, one suggest call per hyperparameter.
    space = {}
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 1e-4)
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 1, 3)
    space["seed"] = trial.suggest_int("seed", 1, 40)
    space["weight_decay"] = trial.suggest_float("weight_decay", 0, 0.1)
    space["lr_scheduler_type"] = trial.suggest_categorical("lr_scheduler_type", scheduler_choices)
    return space

In [None]:
# Run 10 Optuna trials over the space defined by hp_space_optuna, keeping the trial
# with the highest objective value.
# NOTE(review): no compute_objective is passed, so the objective is the Trainer's
# default — presumably derived from the evaluation metrics; confirm.
best_run = trainer.hyperparameter_search(hp_space=hp_space_optuna, n_trials=10, direction="maximize", backend="optuna") 

In [None]:
# Display the result of the hyperparameter search.
best_run

## Creación de secciones mediante reglas para el conjunto de test

In [9]:
# Load a fine-tuned checkpoint written under the training output directory and wrap
# it in a text-classification pipeline.
# NOTE(review): this rebinds model_checkpoint, which model_init also reads; re-running
# the search cells after this one would start from this local checkpoint instead of
# the pretrained model.
# NOTE(review): the run and step ("run-0/checkpoint-500") are hard-coded — confirm
# that this is the checkpoint selected by the search.
model_checkpoint = "data/rule/run-0/checkpoint-500"
classifier = pipeline("text-classification", model=model_checkpoint)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [10]:
test_path = "data/raw/clinais.dev.json"

# Read as UTF-8 explicitly: the notes are Spanish text, and the platform default
# encoding is not guaranteed to decode accented characters.
with open(test_path, encoding="utf-8") as f:
    data = json.load(f)

sections = []
# Rule-based Spanish sentence segmenter. char_span=True makes each segment expose
# .sent, .start and .end, which are stored below as the section text and offsets.
seg = pysbd.Segmenter(language="es", clean=False, char_span=True)

for note_id, entry in data["annotated_entries"].items():
    for sec in seg.segment(entry["note_text"]):
        # Truncate to the same 512-token limit used when tokenizing the training
        # data, so an unusually long segment cannot exceed the model's input size.
        prediction = classifier(sec.sent, truncation=True, max_length=512)[0]
        d = {
            "note_id": note_id,
            "text": sec.sent,
            # The pipeline reports labels as "LABEL_<id>"; keep only the integer id.
            "label": int(prediction["label"].replace("LABEL_", "")),
            "start_offset": sec.start,
            "end_offset": sec.end,
        }
        sections.append(d)

df_test = pd.DataFrame(sections)
# Map the integer ids back to the original label names.
df_test["label"] = le.inverse_transform(df_test["label"])

In [11]:
# Display the predicted sections: one row per segment with its note id, text,
# predicted label and character offsets.
df_test

Unnamed: 0,note_id,text,label,start_offset,end_offset
0,S0004-06142005000200009-3,Paciente de 69 a. de edad con un PSA en el mom...,PRESENT_ILLNESS,0,79
1,S0004-06142005000200009-3,El paciente tenía una biopsia previa por sexta...,PAST_MEDICAL_HISTORY,79,140
2,S0004-06142005000200009-3,Se practicó una E-RME que mostró inicialmente ...,EXPLORATION,140,479
3,S0004-06142005001000015-1,Un paciente varón de 19 años acudió al Servici...,PRESENT_ILLNESS,0,112
4,S0004-06142005001000015-1,No presentaba ningún antecedente urológico.,PAST_MEDICAL_HISTORY,112,156
...,...,...,...,...,...
1984,S1135-76062007000100006-1,La madre había fallecido súbitamente a los 48 ...,FAMILY_HISTORY,180,256
1985,S1135-76062007000100006-1,Una noche salió a cenar con los compañeros de ...,PRESENT_ILLNESS,256,369
1986,S1135-76062007000100006-1,No manifestó ninguna sintomatología y se acost...,EVOLUTION,369,439
1987,S1135-76062007000100006-1,Poco después el perro comenzó a ladrar por lo ...,EVOLUTION,439,537


## Transformación de los segmentos y etiquetas para su evaluación

In [13]:
def entry_boundaries(df_test, note_id, boundaries):
    """Assign a predicted section label to each boundary of one note.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Predicted sections with at least the columns ``note_id``,
        ``start_offset`` and ``label``.
    note_id : str
        Identifier of the note whose boundaries are being labelled.
    boundaries : list of dict
        Boundary records, each carrying a ``"start_offset"`` key. They are not
        modified.

    Returns
    -------
    list of dict
        A copy of every boundary record whose ``"boundary"`` value is the label of
        the predicted section starting at that offset, or ``None`` when no
        predicted section starts there. If several predicted sections share a
        start offset, the first one in ``df_test`` wins.
    """
    note_rows = df_test[df_test["note_id"] == note_id]

    # Index the note's predictions once by start offset; setdefault keeps the
    # first label seen for a repeated offset.
    label_by_start = {}
    for start, label in zip(note_rows["start_offset"].tolist(), note_rows["label"].tolist()):
        label_by_start.setdefault(start, label)

    predictions = []
    for boundary in boundaries:
        # Work on a copy: the caller passes the gold boundaries, and writing into
        # them would overwrite the gold labels with the predicted ones.
        pred = dict(boundary)
        pred["boundary"] = label_by_start.get(pred["start_offset"])
        predictions.append(pred)
    return predictions

In [14]:
test_path = "data/raw/clinais.dev.json"

# Reload the notes so that each entry starts from its original annotations.
# UTF-8 is requested explicitly because the notes are Spanish text.
with open(test_path, encoding="utf-8") as f:
    data = json.load(f)

predictions = {}

for note_id, entry in data["annotated_entries"].items():
    gold_boundaries = entry["boundary_annotation"]["gold"]
    # Hand entry_boundaries copies of the gold records so the gold labels kept in
    # the output file are never overwritten by the predicted ones.
    entry["boundary_annotation"]["prediction"] = entry_boundaries(
        df_test, note_id, [dict(boundary) for boundary in gold_boundaries]
    )
    predictions[entry["note_id"]] = entry

with open("data/predictions/predictions_rule_classsification.json", "w", encoding="utf-8") as f:
    json.dump({"annotated_entries": predictions}, f)