# 1. Project Overview

This notebook fine-tunes a small Portuguese BERT model for Named Entity Recognition (NER) on Brazilian legal texts using the LeNER-Br dataset. It supports Deliverable 1 of IF1015 - Advanced Topics in Information Systems 6 (application definition plus partial training results). The workflow below keeps the full training pipeline intact while improving documentation and clarity.

In [None]:
# Optional workspace setup. Uncomment to set HF_TOKEN here if it is not already exported.
# import os
# os.environ["HF_TOKEN"] = "<your-hf-token>"

# 2. Dataset: LeNER-Br (Brazilian Legal NER)

LeNER-Br contains Brazilian court decisions annotated with six entity types: PESSOA (person), ORGANIZACAO (organization), LOCAL (location), TEMPO (time), LEGISLACAO (legislation), and JURISPRUDENCIA (case law). The dataset is loaded from Hugging Face (`peluz/lener_br`) and comes with train, validation, and test splits.

In [None]:
import inspect
import os

import evaluate
import matplotlib.pyplot as plt
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)


## Load the dataset and inspect splits

Authenticate with `HF_TOKEN` if the dataset requires a gated download. The next cells show a sample and basic sentence-length statistics to guide the tokenization setup.

In [None]:
dataset_id = "peluz/lener_br"

# HF_TOKEN is optional: os.getenv returns None when the variable is unset,
# and that value is passed straight through to load_dataset.
hf_token = os.getenv("HF_TOKEN")

lener = load_dataset(dataset_id, token=hf_token)
print(lener)

# Record the number of examples in every split and report each one.
split_sizes = {}
for split_name in lener:
    split_sizes[split_name] = len(lener[split_name])
for split_name in split_sizes:
    print(f"{split_name}: {split_sizes[split_name]} examples")


In [None]:
# Show the first training record in full (all of its fields) as a sanity check.
first_train_example = lener["train"][0]
print(first_train_example)


### Sentence-length statistics

Quick descriptive stats to choose a reasonable maximum sequence length for BERT.

In [None]:
# Length of each training sentence, measured in dataset tokens (the entries of
# the "tokens" column), i.e. before any subword tokenization is applied.
sizes = [len(words) for words in lener["train"]["tokens"]]

max_len = max(sizes)
avg_len = sum(sizes) / len(sizes)

print(f"Average tokens per sentence: {avg_len:.2f}")
print(f"Max sentence length: {max_len}")

# Boxplot of the length distribution; showmeans adds the mean marker.
fig, ax = plt.subplots(figsize=(8, 5))
ax.boxplot(sizes, vert=True, showmeans=True)
ax.set_title("Sentence length distribution (train)")
ax.set_ylabel("Number of tokens")
ax.grid(True, linestyle="--", alpha=0.6)
plt.show()


# 3. Label Space and Task Definition

LeNER-Br uses BIO tags (`B-`/`I-` prefixes) over six legal entity types (PESSOA, ORGANIZACAO, LOCAL, TEMPO, LEGISLACAO, JURISPRUDENCIA) plus the outside tag `O`. The id-to-label and label-to-id mappings built below are reused for the token-classification head.

In [None]:
# The tag names are read from the dataset's own feature metadata, so the
# integer ids in "ner_tags" index directly into label_list.
ner_feature = lener["train"].features["ner_tags"]
label_list = ner_feature.feature.names
num_labels = len(label_list)

# Two-way lookup between integer ids and tag strings.
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

print(label_list)
print(f"Number of labels: {num_labels}")


# 4. Model: Portuguese BERT (small LM)

We fine-tune `neuralmind/bert-base-portuguese-cased` for token classification. The `Trainer` uses `seqeval` to report precision, recall, F1, and accuracy.

In [None]:
checkpoint = "neuralmind/bert-base-portuguese-cased"

# Load the pretrained checkpoint for token classification; the label mappings
# are passed in so they are attached to the model at load time.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# seqeval metric, used by compute_metrics and the per-entity report.
metric = evaluate.load("seqeval")

def align_predictions(predictions, labels):
    """Convert raw scores and label ids into per-sequence lists of tag strings.

    Args:
        predictions: array of per-position class scores; the argmax over the
            last axis is taken as the predicted label id.
        labels: per-sequence label ids, with -100 marking positions to skip.

    Returns:
        A ``(true_preds, true_labels)`` tuple: two parallel lists holding, for
        each sequence, the predicted and reference tag strings at every
        position whose reference label is not -100.
    """
    predicted_ids = np.argmax(predictions, axis=-1)

    kept_preds, kept_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, labels):
        # Keep only positions that carry a real label (-100 means "ignore").
        pairs = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        kept_preds.append([label_list[p] for p, _ in pairs])
        kept_labels.append([label_list[g] for _, g in pairs])

    return kept_preds, kept_labels


def compute_metrics(p):
    """Compute the overall seqeval scores for one evaluation pass.

    Args:
        p: a ``(predictions, labels)`` pair as handed over by the ``Trainer``.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy``, taken
        from the metric's ``overall_*`` entries.
    """
    raw_scores, gold_ids = p
    pred_tags, gold_tags = align_predictions(raw_scores, gold_ids)
    scores = metric.compute(predictions=pred_tags, references=gold_tags)
    return {
        key: scores[f"overall_{key}"]
        for key in ("precision", "recall", "f1", "accuracy")
    }


# 5. Tokenization and Label Alignment

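Word-level labels are aligned to subword tokens. Every subtoken repeats its parent word's label, while special tokens (e.g. `[CLS]`, `[SEP]`) are labelled `-100` so the loss ignores them; padding added later by the data collator is labelled `-100` as well. `MAX_LEN` is set based on the sentence-length stats above (note that those stats count words, so the subword sequences are somewhat longer).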

In [None]:
# Upper bound on the tokenized sequence length; longer inputs are truncated
# by tokenize_and_align.
MAX_LEN = 256

# Tokenizer matching the fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_and_align(examples):
    """Tokenize a batch of pre-split sentences and expand word labels to subwords.

    Args:
        examples: batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of label ids per sentence).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Positions with
        no source word (``word_id is None``) get -100; every other position
        gets the label of the word it came from.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=MAX_LEN,
    )

    # First and continuation pieces of a word receive the same label.
    # NOTE(review): a word tagged B-X therefore yields B-X on each of its
    # pieces -- confirm this is the intended scheme for seqeval scoring.
    encoded["labels"] = [
        [
            -100 if word_idx is None else word_tags[word_idx]
            for word_idx in encoded.word_ids(batch_index=row)
        ]
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded


# Raw columns are dropped after mapping so only the tokenizer outputs and the
# aligned "labels" remain.
raw_columns = ["tokens", "ner_tags", "id"]

tokenized_ds = lener.map(tokenize_and_align, batched=True, remove_columns=raw_columns)

print(tokenized_ds["train"][0].keys())


# 6. Training Configuration

Hyperparameters for the initial run (Deliverable 1): learning rate 5e-5, 3 epochs, batch size 8/8, weight decay 0.01, mixed precision on GPU (`fp16`), max sequence length 256. Evaluation and checkpoints run at each epoch.


In [None]:
# Pads each batch dynamically (inputs and labels) at collation time.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Mixed precision is only enabled when a CUDA device is present.
use_fp16 = torch.cuda.is_available()

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (the old name was deprecated and then removed). Pick whichever keyword the
# installed version accepts so the cell runs on both old and new releases.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="results_lenerbr",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    fp16=use_fp16,
    report_to="none",
    **{eval_strategy_key: "epoch"},
)

# Likewise, the Trainer's `tokenizer` argument was superseded by
# `processing_class` in newer releases; choose the supported name.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_key: tokenizer},
)

trainer.train()

# Re-run prediction on the validation split to get the full seqeval report
# (overall scores plus the per-entity breakdown printed in the next section).
eval_predictions = trainer.predict(tokenized_ds["validation"])
true_preds, true_labels = align_predictions(
    eval_predictions.predictions, eval_predictions.label_ids
)
label_metrics = metric.compute(predictions=true_preds, references=true_labels)

# Overall scores only, in the same shape compute_metrics returns.
core_metrics = {
    key: label_metrics[f"overall_{key}"]
    for key in ("precision", "recall", "f1", "accuracy")
}


# 7. Initial Training Results (Deliverable 1)

Aggregate validation metrics and per-entity scores are printed below for quick reference.

In [None]:
# Overall validation scores.
print("Validation metrics (seqeval):")
for metric_name in core_metrics:
    print(f"  {metric_name.capitalize():<10}: {core_metrics[metric_name]:.4f}")

# Every key that is not an "overall_*" aggregate is a per-entity score dict.
entity_rows = []
for label, scores in label_metrics.items():
    if label.startswith("overall_"):
        continue
    entity_rows.append(
        (label, scores["precision"], scores["recall"], scores["f1"], scores["number"])
    )

if entity_rows:
    header = f"{'Label':<18} {'P':>6} {'R':>6} {'F1':>6} {'N':>6}"
    print("\nPer-entity scores:")
    print(header)
    print("-" * len(header))
    for row in entity_rows:
        label, p, r, f1, n = row
        print(f"{label:<18} {p:6.3f} {r:6.3f} {f1:6.3f} {n:6d}")

# Persist the fine-tuned weights together with the tokenizer files.
save_dir = "models/lenerbr_bert_base"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"\nModel and tokenizer saved to: {save_dir}")


# 8. Qualitative Analysis: Example Predictions

A single test sentence is decoded to illustrate token-level predictions after fine-tuning.

In [None]:
example = lener["test"][0]
tokens = example["tokens"]

# Use the same length cap as training. Without an explicit max_length,
# truncation falls back to whatever maximum the tokenizer has configured,
# which may not match the MAX_LEN the model was fine-tuned with.
encoding = tokenizer(
    tokens,
    is_split_into_words=True,
    return_tensors="pt",
    truncation=True,
    max_length=MAX_LEN,
)

# word_ids() must be read before `encoding` is rebuilt as a plain dict below.
word_ids = encoding.word_ids()

device = "cuda" if torch.cuda.is_available() else "cpu"
encoding = {k: v.to(device) for k, v in encoding.items()}
model.to(device)
model.eval()

with torch.no_grad():
    outputs = model(**encoding)

# Predicted label id for every position of the single sequence in the batch.
pred_ids = outputs.logits.argmax(-1).cpu().numpy()[0]

pred_labels = []
clean_tokens = []
used = set()

# Report one prediction per word: the one made on the word's first piece.
# Positions without a source word (word_id is None) are skipped.
for idx, word_id in enumerate(word_ids):
    if word_id is None or word_id in used:
        continue
    used.add(word_id)
    clean_tokens.append(tokens[word_id])
    pred_labels.append(label_list[pred_ids[idx]])

print("Example sentence:")
print(" ".join(clean_tokens))

print("\nToken-level predictions:")
print(f"{'Token':<20} {'Predicted tag'}")
print("-" * 34)
for token, label in zip(clean_tokens, pred_labels):
    print(f"{token:<20} {label}")


# 9. Next Steps: Robustness, Interpretability, Adversarial Attacks

Planned follow-ups: robustness checks (noise/context perturbations), interpretability analyses (attention/explanations), and adversarial stress-testing for the NER model.