In [None]:
import srsly
from huggingface_hub import hf_hub_download
from peft import AutoPeftModelForTokenClassification
from transformers import AutoTokenizer

# Hub repository that holds the LoRA adapter and its label mapping file.
MODEL_NAME = "aymurai/anonymizer-beto-cased-lora"

# Label name -> class id mapping published in the same repository.
LABEL2CODE_PATH = hf_hub_download(repo_id=MODEL_NAME, filename="label2code.json")

with open(LABEL2CODE_PATH) as file:
    label2code = srsly.json_loads(file.read())

# Inverse mapping (class id -> label name), used to decode predictions.
code2label = {code: label for label, code in label2code.items()}

model = AutoPeftModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2code),
    id2label=code2label,
    label2id=label2code,
)

# The tokenizer is loaded from the base checkpoint named in the adapter config.
base_model_name = model.peft_config["default"].base_model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

In [None]:
import re
from itertools import groupby

import torch
import numpy as np
from more_itertools import unzip

from aymurai.logging import get_logger

# Notebook-wide logger obtained from the project's logging helper.
logger = get_logger(__name__)


def postprocessor(
    token_ids: list[str], scores: list[np.ndarray], aggregator: str = "first"
) -> tuple[str, str, float]:
    """Collapse the subword pieces of one word into a single labelled prediction.

    Args:
        token_ids: Subword tokens of the word, in order. Despite the name,
            these are token strings (they are handed to
            `tokenizer.convert_tokens_to_string`), not vocabulary ids.
        scores: One vector of class probabilities per subword, aligned with
            `token_ids`.
        aggregator: How the word-level class is chosen:
            - "first": top class of the first subword.
            - "max": class holding the single highest probability across
              all subwords of the word.

    Returns:
        `(text, label, score)`: the reconstructed word with all whitespace
        removed, the predicted label name ("O" when the class id is missing
        from `code2label`) and the probability of the winning class.

    Raises:
        NotImplementedError: If `aggregator` is not a supported strategy.
    """
    # Validate up front so an unsupported strategy fails before any work is done.
    if aggregator not in ("first", "max"):
        raise NotImplementedError(f"aggregation: `{aggregator}` not implemented.")

    text = tokenizer.convert_tokens_to_string(token_ids)
    text = re.sub(r"\s+", "", text)

    if aggregator == "max":
        stacked = np.asarray(scores)
        # argmax over the stacked (subword, class) array yields a *flat*
        # index; reduce it modulo the number of classes to get the class id.
        label_id = np.argmax(stacked) % stacked.shape[-1]
        score = np.max(stacked)
    else:  # "first"
        first = np.asarray(scores[0])
        label_id = np.argmax(first)
        score = np.max(first)

    # Unknown class ids fall back to the "outside" tag.
    label = code2label.get(int(label_id), "O")

    return text, label, score


def single_predict(text: str, aggregator: str = "first"):
    """Predict one label for every whitespace-separated word of `text`.

    The words are tokenized with `truncation=True`, so words beyond the
    tokenizer's length limit are not part of the result.

    Args:
        text: Raw text; it is split on whitespace into words.
        aggregator: Subword aggregation strategy forwarded to `postprocessor`.

    Returns:
        A list with one `postprocessor` result per word that was tokenized,
        in input order. An empty list when `text` contains no words.
    """
    words = text.split()
    if not words:
        # Nothing to tag; also avoids running the model on an empty input.
        return []

    inputs = tokenizer(
        words, return_tensors="pt", is_split_into_words=True, truncation=True
    )
    word_ids = inputs.word_ids()
    tokens = inputs.tokens()

    with torch.no_grad():
        logits = model(**inputs).logits.numpy()

    # Numerically stable softmax over the class axis.
    maxes = np.max(logits, axis=-1, keepdims=True)
    shifted_exp = np.exp(logits - maxes)
    scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)

    predictions = []
    # scores[0]: a single sequence was encoded, so take the only batch entry.
    pieces = zip(word_ids, tokens, scores[0])
    for word_id, group in groupby(pieces, key=lambda piece: piece[0]):
        if word_id is None:
            # Tokens without a word id (e.g. [CLS]) do not belong to any word.
            continue
        _, word_tokens, word_scores = zip(*group)
        predictions.append(
            postprocessor(word_tokens, word_scores, aggregator=aggregator)
        )

    return predictions


# Smoke-test the prediction pipeline on a short example sentence.
text = "El imputado Ramiro Ramallo Martinez DNI 88.384.425 declarado"
single_predict(text, aggregator="first")

# Evaluation

In [None]:
from datasets import load_from_disk

# Location on disk of the saved dataset.
DATASET_NAME = (
    "/resources/data/restricted/anonymization/annonimization-dataset-pruned-2023-08-16"
)
dataset = load_from_disk(DATASET_NAME)

# Shuffle the training split with a fixed seed.
train_split = dataset["train"]
dataset["train"] = train_split.shuffle(seed=42)

print(dataset)
print("nlabels:", len(code2label))

## Dev evaluation

In [None]:
import pandas as pd
from tqdm import tqdm

logger.setLevel("ERROR")

# Word-level predictions for the validation ("dev") split, one row per word.
frames = []
for paragraph in tqdm(dataset["validation"]):
    text = " ".join(paragraph["tokens"])
    preds = single_predict(text, aggregator="first")

    preds = pd.DataFrame(preds, columns=["token", "pred", "score"])
    preds.insert(loc=1, column="label", value=paragraph["tags"])
    preds["label"] = preds["label"].apply(code2label.get)
    # All-NaN row that separates paragraphs in the exported CSV.
    preds.loc[-1] = np.nan

    frames.append(preds)

# Concatenate once: growing the frame inside the loop re-copies every
# previously collected row on each iteration.
if frames:
    predictions = pd.concat(frames, ignore_index=True)
else:
    predictions = pd.DataFrame(columns=["token", "label", "pred", "score"])

predictions.to_csv("dev-evaluation.csv")

# TODO

- [ ] manejar párrafos con más de 512 tokens (en training se truncó)

In [None]:
# Working copy of the predictions without the NaN separator rows.
df = predictions.dropna().copy()

In [None]:
# Summary of the evaluation frame.
df.info()

In [None]:
# Relative frequency of each reference label.
df["label"].value_counts(normalize=True)

In [None]:
# Preview the first rows of the evaluation frame.
df.head()

In [None]:
# Exact match: the predicted tag equals the reference tag (BIO prefix included).
df["match"] = df["label"].eq(df["pred"])
df["match"].value_counts(normalize=True)

In [None]:
# Exact-match rate restricted to words whose reference tag is not "O".
df.loc[df["label"].ne("O"), "match"].value_counts(normalize=True)

In [None]:
# Exact-match rate restricted to words whose reference tag is "O".
df.loc[df["label"].eq("O"), "match"].value_counts(normalize=True)

In [None]:
# Words tagged "O" in the reference for which the model predicted something else.
df.loc[df["label"].eq("O") & ~df["match"]]

In [None]:
def normalize_class(x: str) -> str:
    """Strip the leading BIO marker ("B-" or "I-") from a tag name.

    Only the prefix is removed, so a hyphenated class name keeps its inner
    text (e.g. "B-SUB-ID" -> "SUB-ID"). Tags without a marker, such as "O",
    are returned unchanged.
    """
    return re.sub(r"^[BI]-", "", x)

# Class names without the B-/I- marker, for both reference and prediction.
label_column, pred_column = df["label"], df["pred"]
df["normalized_label"] = label_column.apply(normalize_class)
df["normalized_pred"] = pred_column.apply(normalize_class)

In [None]:
# Preview the first rows of the evaluation frame.
df.head()

In [None]:
# Normalized exact match: compare classes with the B-/I- marker removed.
df["normalized_match"] = df["normalized_label"].eq(df["normalized_pred"])
df["normalized_match"].value_counts(normalize=True)

In [None]:
# Normalized match rate restricted to words whose reference class is not "O".
df.loc[df["normalized_label"].ne("O"), "normalized_match"].value_counts(normalize=True)

In [None]:
# Normalized match rate restricted to words whose reference class is "O".
df.loc[df["normalized_label"].eq("O"), "normalized_match"].value_counts(normalize=True)

In [None]:
# Relative frequency of each predicted class (normalized class names).
df["normalized_pred"].value_counts(normalize=True)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Per-tag report on the raw tags (B-/I- markers kept).
tag_report = classification_report(y_true=df["label"], y_pred=df["pred"])
print(tag_report)

In [None]:
# Per-class report on the normalized class names.
class_report = classification_report(
    y_true=df["normalized_label"], y_pred=df["normalized_pred"]
)
print(class_report)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Row-normalized confusion matrix of the normalized classes, with classes
# ordered by their number of correct predictions (largest first).
fig, ax = plt.subplots(figsize=(20, 20))

# Use every class seen in either column: confusion_matrix ignores samples whose
# true or predicted label is missing from `labels`, which would silently drop
# predictions of a class that never occurs in the reference labels.
labels = np.asarray(
    pd.concat([df["normalized_label"], df["normalized_pred"]]).unique()
)

# Raw counts are only used to rank the classes by correct predictions.
cm = confusion_matrix(
    df["normalized_label"],
    df["normalized_pred"],
    labels=labels,
)
order = np.argsort(-cm.diagonal())

# Each row is normalized by the number of reference samples of that class.
cm_norm = confusion_matrix(
    df["normalized_label"],
    df["normalized_pred"],
    labels=labels,
    normalize="true",
)
cm_sorted = cm_norm[order, :][:, order]

labels_sorted = labels[order]

sns.heatmap(
    cm_sorted,
    vmin=0.0,
    vmax=1.0,
    cmap="Blues",
    annot=True,
    fmt=".2f",
    cbar=False,
    xticklabels=labels_sorted,
    yticklabels=labels_sorted,
    ax=ax,
)

ax.set_title("Confusion Matrix", fontdict={"fontsize": 20})
ax.set_xlabel("Predicted class")
ax.set_ylabel("True class")
plt.show()