In [None]:
# Dependencies (uncomment to install into the kernel's environment; avoid `sudo pip`):
# %pip install transformers peft evaluate seqeval datasets srsly

In [None]:
from datasets import load_from_disk
import srsly

# Root directory that holds the annotated anonymization dataset.
ANNOT_DIR = "/resources/data/restricted/anonymization"

# Dataset stored on disk under ANNOT_DIR.
dataset = load_from_disk(f"{ANNOT_DIR}/hg_dataset")

# label -> code mapping stored next to the dataset. The encoding is given
# explicitly so that reading the JSON does not depend on the platform locale.
with open(f"{ANNOT_DIR}/hg_dataset/label_mapping.json", encoding="utf-8") as file:
    label2code = srsly.json_loads(file.read())

# Inverse mapping (code -> label), built once the file handle is closed.
code2label = {code: label for label, code in label2code.items()}

print(dataset)
print("nlabels:", len(code2label))

In [None]:
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
)
import evaluate

# Checkpoint to fine-tune. Alternative kept for reference:
# model_checkpoint = "roberta-large"
model_checkpoint = "dccuchile/bert-base-spanish-wwm-cased"

# Tokenizer and batch collator are built from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Token-classification model with one output per entry of the label mapping;
# both directions of the mapping are handed to the model configuration.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2code),
    id2label=code2label,
    label2id=label2code,
)

# Sequence-labelling metric used by compute_metrics.
seqeval = evaluate.load("seqeval")

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags with the tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            example) and a "tags" entry (one list of label ids per example,
            indexed by word position).

    Returns:
        The tokenizer output for the batch with an added "labels" entry.
        For every example the label list has one item per produced token:
        the word's tag on the first token of each word, and -100 on tokens
        that have no word id and on the remaining tokens of a word.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for batch_index, word_tags in enumerate(examples["tags"]):
        label_ids = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            # Only the first token of a word carries the word's tag.
            starts_new_word = word_idx is not None and word_idx != previous_word_idx
            label_ids.append(word_tags[word_idx] if starts_new_word else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Apply tokenize_and_align_labels over the dataset in batches (batched=True).
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
# Number of examples in the tokenized "train" split.
len(tokenized_dataset["train"])

In [None]:
from peft import LoraConfig, TaskType, get_peft_model


# LoRA adapter configuration for a token-classification task, set up for
# training (inference_mode=False).
# r / lora_alpha / lora_dropout are the LoRA hyperparameters; see the PEFT
# LoraConfig documentation for their exact meaning.
# NOTE(review): bias="all" presumably makes bias terms trainable in addition
# to the LoRA weights -- confirm against the PEFT docs.
lora_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="all",
)

In [None]:
# Wrap the base model with the LoRA configuration and report how many
# parameters are trainable.
# NOTE: this rebinds `model`, so the cell is not idempotent -- re-running it
# without re-creating the base model passes the already-wrapped model to
# get_peft_model again.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
import numpy as np

# Metric object used by compute_metrics below. NOTE(review): the same metric
# is already loaded in the model-setup cell; this repeats that load.
seqeval = evaluate.load("seqeval")


def compute_metrics(eval_preds):
    """Compute overall seqeval metrics for a set of evaluation outputs.

    Args:
        eval_preds: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores (the argmax over axis 2 is taken as the predicted
            id) and ``labels`` holds the reference label ids, where -100 marks
            positions that are excluded from scoring.

    Returns:
        Dict with the "precision", "recall", "f1" and "accuracy" values taken
        from seqeval's overall results.
    """
    predictions, labels = eval_preds
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    # Single pass over the sequences: keep only positions with a real label.
    for prediction, label in zip(predictions, labels):
        kept_predictions = []
        kept_labels = []
        for p, l in zip(prediction, label):
            if l == -100:
                continue
            # Ids missing from code2label fall back to "O" on BOTH sides, so
            # seqeval never receives None as a tag.
            kept_predictions.append(code2label.get(int(p), "O"))
            kept_labels.append(code2label.get(int(l), "O"))
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: step-based evaluation and checkpointing, with
# logging and saving every 500 steps, and the best checkpoint reloaded when
# training ends.
# NOTE(review): eval_steps is not set -- presumably it falls back to
# logging_steps; confirm for the installed transformers version.
# NOTE(review): no metric_for_best_model is given, so "best" presumably means
# lowest evaluation loss -- confirm.
training_args = TrainingArguments(
    output_dir="beto-lora-aymurai-ner",
    learning_rate=1e-3,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=16,
    # predict_with_generate=True,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_steps=500,
    save_steps=500,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
)

In [None]:
from transformers import EarlyStoppingCallback

# Trainer over the tokenized "train" / "validation" splits, scored with
# compute_metrics. The early-stopping callback is configured with a patience
# of 3 (presumably counted in evaluation rounds -- confirm).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# Directory the trained model is written to; the reload cell reads from the
# same location.
MODEL_PATH = "./beto-lora-aymurai-ner/model/"

# NOTE(review): `model` is the PEFT-wrapped model at this point, so presumably
# only the adapter weights and config are written -- confirm.
model.save_pretrained(MODEL_PATH)

In [None]:
from peft import (
    PeftConfig,
    PeftModelForTokenClassification,
    LoraConfig,
    TaskType,
    get_peft_model,
    PeftModel,
)
from transformers import AutoTokenizer, AutoModelForTokenClassification


MODEL_PATH = "./beto-lora-aymurai-ner/model"

# The saved PEFT config names the base checkpoint the adapter belongs to.
peft_config = PeftConfig.from_pretrained(MODEL_PATH)

# Tokenizer of the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

# Rebuild the base token-classification model with the same label mapping,
# then load the saved adapter on top of it and switch to evaluation mode.
model = AutoModelForTokenClassification.from_pretrained(
    peft_config.base_model_name_or_path,
    num_labels=len(label2code),
    id2label=code2label,
    label2id=label2code,
)
model = PeftModelForTokenClassification.from_pretrained(model, MODEL_PATH)
model.eval()

In [None]:
import torch

# Sample sentence for a manual sanity check of the reloaded model.
text = "El imputado Ramiro Ramallo Martinez DNI 88.384.425 declarado"

# Forward pass on CPU with gradient tracking disabled.
model.to("cpu")
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Highest-scoring label id for every token position.
predictions = logits.argmax(dim=2)
tokens = inputs.tokens()

# Show each token next to the label name the model assigns to it.
for token, label_id in zip(tokens, predictions[0].tolist()):
    print((token, model.config.id2label[label_id]))

In [None]:
# Join the tokens from the previous cell back into a single string.
tokenizer.convert_tokens_to_string(tokens)

In [None]:
from transformers import pipeline

# Token-classification pipeline built from the reloaded model and tokenizer.
pipe = pipeline(task="token-classification", model=model, tokenizer=tokenizer)

In [None]:
# Run the pipeline on the sample sentence defined above.
pipe(text)