In [None]:
# !sudo pip install transformers peft evaluate seqeval
!sudo pip install pytorch-crf

In [None]:
from datasets import load_from_disk
import srsly

# Directory holding the on-disk dataset; the label mapping file sits inside it.
DATASET_NAME = (
    "/resources/data/restricted/anonymization/annonimization-dataset-pruned-2023-08-16"
)
dataset = load_from_disk(DATASET_NAME)

# label -> code mapping, read from the JSON file stored with the dataset
with open(f"{DATASET_NAME}/label_mapping.json") as file:
    label2code = srsly.json_loads(file.read())

# inverse lookup: code -> label
code2label = dict((code, label) for label, code in label2code.items())

# Shuffle the training split with a fixed seed, then show a short summary.
dataset["train"] = dataset["train"].shuffle(seed=42)
print(dataset)
print("nlabels:", len(code2label))

In [None]:
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
)
import evaluate

# Checkpoint to fine-tune; the commented entries are alternatives kept for reference.
# model_checkpoint = "roberta-large"
# model_checkpoint = "PlanTL-GOB-ES/RoBERTalex"
model_checkpoint = "dccuchile/bert-base-spanish-wwm-cased"

seqeval = evaluate.load("seqeval")

# Tokenizer and padding collator for token classification batches.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Token-classification backbone with one output per entry of the label mapping.
backbone = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2code),
    id2label=code2label,
    label2id=label2code,
)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags with the sub-word tokens.

    Args:
        examples: Batch with a "tokens" entry (one list of words per example) and a
            "tags" entry (one tag code per word of the matching example).

    Returns:
        The tokenizer output with an extra "labels" entry. The first sub-token of
        each word carries that word's tag; tokens without a word (``word_id`` is
        ``None``) and continuation sub-tokens are labelled -100.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for batch_index, word_tags in enumerate(examples["tags"]):
        label_ids = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            # Only the first sub-token of a word keeps the word's tag.
            starts_word = word_idx is not None and word_idx != previous_word_idx
            label_ids.append(word_tags[word_idx] if starts_word else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Apply the tokenization/label alignment to every split, processing examples in batches.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
len(tokenized_dataset["train"])

In [None]:
import torch
from torch import nn
from torchcrf import CRF
from transformers.modeling_outputs import TokenClassifierOutput


class TransformerCRF(nn.Module):
    """Token-classification backbone with a CRF layer used for the training loss.

    The backbone's ``logits`` are used as CRF emission scores. When ``labels`` are
    given, the loss is the negative CRF log-likelihood of the label sequence. The
    returned ``logits`` are the backbone emissions; no CRF decoding is applied here.
    """

    def __init__(
        self,
        model: nn.Module,
        num_labels: int,
        **kwargs,
    ):
        """Wrap ``model`` and create a CRF over ``num_labels`` tags.

        Args:
            model: Token-classification model. Its output must expose ``logits``,
                ``hidden_states`` and ``attentions``, and it must have a ``config``
                attribute.
            num_labels: Number of tags handled by the CRF layer.
            **kwargs: Forwarded to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        self.backbone = model
        self.num_labels = num_labels
        # Label value marking positions that carry no real tag.
        self.special_token = -100
        self.crf = CRF(self.num_labels, batch_first=True)
        self.config = self.backbone.config

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=True,
    ):
        """Run the backbone and, if ``labels`` are given, compute the CRF loss.

        Args:
            labels: Optional tensor of tag ids; entries equal to ``special_token``
                are replaced by tag 0 on a private copy before the CRF sees them.
            return_dict: When falsy, a tuple ``(loss?, logits, hidden_states?,
                attentions?)`` is returned instead of a ``TokenClassifierOutput``
                (entries that are ``None`` are omitted).
            Remaining arguments are passed to the backbone unchanged.

        Returns:
            ``TokenClassifierOutput`` (or a tuple, see ``return_dict``) with the
            negative CRF log-likelihood as ``loss`` and the backbone ``logits``.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # Always ask the backbone for a structured output so its fields can be read
        # by name, whatever output format the caller requested from this wrapper.
        outputs = self.backbone(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        logits = outputs.logits

        loss = None
        if labels is not None:
            # Work on a copy: slicing a tensor (`labels[:]`) yields a view, so
            # editing it in place would overwrite the caller's -100 markers.
            crf_labels = labels.clone()
            crf_labels[crf_labels == self.special_token] = 0

            # Exclude padded positions from the likelihood when a mask is available.
            mask = attention_mask.bool() if attention_mask is not None else None
            log_likelihood = self.crf(logits, crf_labels, mask=mask)
            loss = 0 - log_likelihood

        if not return_dict:
            extras = tuple(
                value
                for value in (outputs.hidden_states, outputs.attentions)
                if value is not None
            )
            output = (logits,) + extras
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
# Wrap the backbone with a CRF layer sized to the number of labels in the mapping.
crfmodel = TransformerCRF(backbone, num_labels=len(code2label))

In [None]:
text = "El imputado Ramiro Ramallo Martinez DNI 88.384.425 declarado"
inputs = tokenizer(text, return_tensors="pt")
import torch


# Smoke test: push one tokenized training example through the plain backbone and
# through the CRF wrapper, and display the wrapper's output.
example = tokenized_dataset["train"][0]
tags = example["tags"]

# Keep only the model inputs (instead of popping every other column) and add a
# batch dimension of size 1.
batch = {
    name: torch.tensor([example[name]])
    for name in ("input_ids", "token_type_ids", "attention_mask", "labels")
}

with torch.no_grad():
    backbone_output = backbone(**batch)
    crf_output = crfmodel(**batch)
crf_output

In [None]:
from peft import TaskType, get_peft_model, LoraConfig


# LoRA configuration for the token-classification model.
lora_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    # Besides the LoRA adapters, train and save the classification head and the
    # CRF layer in full; without this entry the CRF parameters are not marked as
    # trainable and are not written to the adapter checkpoint.
    modules_to_save=["classifier", "crf"],
)

In [None]:
# Wrap the CRF model according to `lora_config` and report how many parameters
# are trainable.
model = get_peft_model(crfmodel, lora_config)
model.print_trainable_parameters()

In [None]:
import numpy as np

seqeval = evaluate.load("seqeval")


def compute_metrics(eval_preds):
    """Compute seqeval precision, recall, F1 and accuracy for evaluation output.

    Args:
        eval_preds: Pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores with the class axis last (axis 2); ``labels`` holds the
            label codes, with -100 on positions that must be skipped.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken from
        seqeval's overall scores.
    """
    predictions, labels = eval_preds
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for prediction_row, label_row in zip(predictions, labels):
        # Drop the positions flagged with -100 in the labels.
        kept = [(p, l) for p, l in zip(prediction_row, label_row) if l != -100]
        # Unknown codes fall back to "O" on both sides, so seqeval never sees None.
        true_predictions.append([code2label.get(p, "O") for p, _ in kept])
        true_labels.append([code2label.get(l, "O") for _, l in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
from transformers import TrainingArguments, Trainer

# Name used for the Trainer output directory (and later for the saved model path).
MODEL_NAME = "beto-crf-lora-aymurai-ner"

training_args = TrainingArguments(
    output_dir=MODEL_NAME,
    # optimisation
    num_train_epochs=20,
    learning_rate=1e-3,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=12,
    per_device_eval_batch_size=16,
    # step-based logging, evaluation and checkpointing
    logging_steps=500,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
)

In [None]:
from transformers import EarlyStoppingCallback

# Stop training after 3 evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
trainer.train()

## Save model

In [None]:
import os

MODEL_PATH = f"/resources/models/anonymization/{MODEL_NAME}"

# Store the label mapping next to the model so it can be reloaded with it.
os.makedirs(f"{MODEL_PATH}/config", exist_ok=True)
with open(f"{MODEL_PATH}/config/label_mapping.json", "w") as file:
    # Named so it does not shadow the standard-library `json` module.
    label_mapping_json = srsly.json_dumps(label2code)
    file.write(label_mapping_json)

model.save_pretrained(f"{MODEL_PATH}/lora")

In [None]:
from peft import (
    PeftConfig,
    PeftModelForTokenClassification,
    LoraConfig,
    TaskType,
    get_peft_model,
    PeftModel,
)
from transformers import AutoTokenizer, AutoModelForTokenClassification


# NOTE(review): this path differs from the directory written in the "Save model"
# section (f"/resources/models/anonymization/{MODEL_NAME}/lora") — confirm which
# adapter is meant to be loaded here.
MODEL_PATH = "./beto-lora-aymurai-ner/model"

# Load peft config for pre-trained checkpoint etc.
peft_config = PeftConfig.from_pretrained(MODEL_PATH)

# load base model and tokenizer
# NOTE(review): the base model rebuilt here is a plain token-classification model,
# not wrapped in TransformerCRF, while `single_predict` further down calls
# `model.crf.decode` — confirm whether the CRF wrapper should be restored here.
model = AutoModelForTokenClassification.from_pretrained(
    peft_config.base_model_name_or_path,
    num_labels=len(label2code.keys()),
    id2label=code2label,
    label2id=label2code,
)

# Attach the saved adapter weights and switch to evaluation mode.
model = PeftModelForTokenClassification.from_pretrained(model, MODEL_PATH)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

In [None]:
import torch

# Quick check on one sentence: print every token with the label of its top logit.
text = "El imputado Ramiro Ramallo Martinez DNI 88.384.425 declarado"
inputs = tokenizer(text, return_tensors="pt")
tokens = inputs.tokens()

model.to("cpu")
with torch.no_grad():
    logits = model(**inputs).logits

# Index of the highest-scoring class for every token.
predictions = logits.argmax(dim=2)

id2label = model.config.id2label
for token, prediction in zip(tokens, predictions[0].numpy()):
    print((token, id2label[prediction]))

In [None]:
import re
from itertools import groupby

import numpy as np
from more_itertools import unzip, collapse
from aymurai.logging import get_logger

logger = get_logger(__name__)


def postprocessor(
    token_ids: list[str], scores: list[list[float]], aggregator: str = "max"
) -> tuple[str, str, float]:
    """Merge the sub-word tokens of one word into a single (text, label, score).

    Args:
        token_ids: Sub-word tokens of the word, as accepted by
            ``tokenizer.convert_tokens_to_string``.
        scores: One row of per-label scores for each sub-word token.
        aggregator: "max" takes the single highest score over all sub-words and
            its label; "first" takes the best label of the first sub-word only.

    Returns:
        Tuple ``(text, label, score)``: the word text with all whitespace removed,
        the label name (``"O"`` when the label id is not in ``code2label``) and the
        selected score.

    Raises:
        NotImplementedError: If ``aggregator`` is neither "max" nor "first".
    """
    text = tokenizer.convert_tokens_to_string(token_ids)
    text = re.sub(r"\s+", "", text)

    # (n_subwords, n_labels) matrix of scores
    score_matrix = np.stack([np.asarray(row) for row in scores])

    if aggregator == "max":
        # argmax on a 2-D array returns a flattened index; convert it back to
        # (row, column) so the column is a valid label id.
        row, label_id = np.unravel_index(np.argmax(score_matrix), score_matrix.shape)
        score = score_matrix[row, label_id]
    elif aggregator == "first":
        label_id = np.argmax(score_matrix[0])
        score = score_matrix[0, label_id]
    else:
        raise NotImplementedError(f"aggregation: `{aggregator}` not implemented.")
    label = code2label.get(int(label_id), "O")

    return text, label, score


def single_predict(text: str, aggregator: str = "max"):
    """Predict one label per whitespace-separated word of ``text``.

    The text is split on whitespace, tokenized as pre-split words and passed
    through ``model``; the logits are turned into per-label probabilities with a
    softmax and the sub-word tokens of each word are merged by ``postprocessor``.

    Args:
        text: Input text.
        aggregator: Sub-word aggregation strategy forwarded to ``postprocessor``.

    Returns:
        List with one ``postprocessor`` result per word, in order. Empty when
        ``text`` contains no words.
    """
    words = text.split()
    if not words:
        return []

    inputs = tokenizer(
        words, return_tensors="pt", is_split_into_words=True, truncation=True
    )
    word_ids = inputs.word_ids()
    tokens = inputs.tokens()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Per-token probabilities over the labels.
    scores = torch.softmax(logits, dim=-1).numpy()

    preds = []
    # Consecutive tokens sharing a word id belong to the same word.
    for word_id, group in groupby(
        zip(word_ids, tokens, scores[0]), key=lambda item: item[0]
    ):
        if word_id is None:  # tokens that belong to no word, e.g. [CLS]
            continue
        _, word_tokens, word_scores = zip(*group)
        preds.append(postprocessor(word_tokens, word_scores, aggregator=aggregator))

    return preds


single_predict(text, aggregator="first")

In [None]:
single_predict(text, aggregator="first")

# Evaluation

In [None]:
from datasets import load_from_disk
import srsly

# Reload the dataset and the label mapping for the evaluation section
# (same path, mapping and seed as the loading cell at the top of the notebook).
DATASET_NAME = (
    "/resources/data/restricted/anonymization/annonimization-dataset-pruned-2023-08-16"
)
dataset = load_from_disk(DATASET_NAME)

with open(f"{DATASET_NAME}/label_mapping.json") as file:
    label2code = srsly.json_loads(file.read())
    code2label = {v: k for k, v in label2code.items()}

dataset["train"] = dataset["train"].shuffle(seed=42)
print(dataset)
print("nlabels:", len(code2label))

### Dev evaluation

In [None]:
from tqdm import tqdm
import pandas as pd

logger.setLevel("ERROR")

# Build the evaluation table for the validation split: one row per word with its
# label and prediction, plus an all-NaN separator row after each paragraph.
paragraph_frames = []
for paragraph in tqdm(dataset["validation"]):
    text = " ".join(paragraph["tokens"])
    preds = single_predict(text, aggregator="first")

    preds = pd.DataFrame(preds, columns=["token", "pred", "score"])
    preds.insert(loc=1, column="label", value=paragraph["tags"])
    preds["label"] = preds["label"].apply(code2label.get)
    preds.loc[-1] = np.nan  # separator row; removed later with dropna
    paragraph_frames.append(preds)

# Concatenate once at the end instead of growing the frame inside the loop.
predictions = (
    pd.concat(paragraph_frames, ignore_index=True)
    if paragraph_frames
    else pd.DataFrame()
)

predictions.to_csv("dev-evaluation-beto-crf.csv")

# TODO

- [ ] Manejar párrafos con más de 512 tokens (en el entrenamiento se truncaron)

In [None]:
# Working copy of the predictions without the all-NaN separator rows.
df = predictions.dropna()

In [None]:
df.info()

In [None]:
df["label"].value_counts(normalize=True)

In [None]:
df.head()

In [None]:
# Exact match
df["match"] = df["label"] == df["pred"]
df["match"].value_counts(normalize=True)

In [None]:
# Exact-match rate restricted to rows whose label is not "O".
df.loc[df["label"] != "O", "match"].value_counts(normalize=True)

In [None]:
# Exact-match rate restricted to rows whose label is "O".
df.loc[df["label"] == "O", "match"].value_counts(normalize=True)

In [None]:
# Rows labelled "O" for which the prediction does not match the label.
df.loc[(df["label"] == "O") & (df["match"] != 1)]

In [None]:
def normalize_class(x: str) -> str:
    """Return the tag name without its leading ``B-`` or ``I-`` prefix.

    Only a prefix at the start of the string is removed, so a "B-" or "I-" that
    appears inside the class name itself is left untouched.
    """
    return re.sub(r"^[BI]-", "", x)

# Label and prediction columns after applying `normalize_class` to each value.
df["normalized_label"] = df["label"].map(normalize_class)
df["normalized_pred"] = df["pred"].map(normalize_class)

In [None]:
df.head()

In [None]:
# Normalized exact match
df["normalized_match"] = df["normalized_label"] == df["normalized_pred"]
df["normalized_match"].value_counts(normalize=True)

In [None]:
# Normalized match rate restricted to rows whose normalized label is not "O".
df.loc[df["normalized_label"] != "O", "normalized_match"].value_counts(normalize=True)

In [None]:
# Normalized match rate restricted to rows whose normalized label is "O".
df.loc[df["normalized_label"] == "O", "normalized_match"].value_counts(normalize=True)

In [None]:
df["normalized_pred"].value_counts(normalize=True)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
print(classification_report(df["label"], df["pred"]))

In [None]:
print(classification_report(df["normalized_label"], df["normalized_pred"]))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Classes shown in the matrix: every distinct normalized label in the data.
labels = df["normalized_label"].unique()

# Raw counts, used only to order the classes by number of correct predictions.
cm = confusion_matrix(
    df["normalized_label"],
    df["normalized_pred"],
    labels=labels,
)
order = np.argsort(-cm.diagonal())

# Row-normalized matrix (each row is divided by the number of true samples).
cm_norm = confusion_matrix(
    df["normalized_label"],
    df["normalized_pred"],
    labels=labels,
    normalize="true",
)
cm_sorted = cm_norm[order, :][:, order]

labels_sorted = labels[order]

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    cm_sorted,
    vmin=0.0,
    vmax=1.0,
    cmap="Blues",
    annot=True,
    fmt=".2f",
    cbar=False,
    xticklabels=labels_sorted,
    yticklabels=labels_sorted,
    ax=ax,
)

# confusion_matrix puts the true classes on the rows and the predictions on the columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix", fontdict={"fontsize": 20});