In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset, load_from_disk
import evaluate
import numpy as np
import torch
import os

model_dir = "./pii_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTokenClassification.from_pretrained(model_dir)
model.eval()

dataset_name = "ai4privacy/pii-masking-400k"
dataset_path = "saved_datasets/ai4privacy_pii-masking-400k"

if os.path.exists(dataset_path):
    dataset = load_from_disk(dataset_path)
else:
    dataset = load_dataset(dataset_name)

unique_labels = set()
for row in dataset["train"]["mbert_token_classes"]:
    unique_labels.update(row)
label_list = sorted(unique_labels)
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for i, label in enumerate(label_list)}

val_samples = dataset["validation"].select(range(100))

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["mbert_tokens"],
        is_split_into_words=True,
        truncation=True,
        return_tensors=None
    )
    labels = []
    for i, label in enumerate(examples["mbert_token_classes"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        prev_word_id = None
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            elif word_id != prev_word_id:
                label_ids.append(label_to_id[label[word_id]])
            else:
                label_ids.append(label_to_id[label[word_id]])
            prev_word_id = word_id
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized = val_samples.map(tokenize_and_align_labels, batched=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './saved_models/pii_model'. Use `repo_type` argument if needed.

In [2]:
all_preds = []
all_labels = []

for example in tokenized:
    input_ids = torch.tensor([example["input_ids"]]).to(device)
    attention_mask = torch.tensor([example["attention_mask"]]).to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1).squeeze().cpu().numpy()
    labels = np.array(example["labels"])

    word_preds = [id_to_label[p] for (p, l) in zip(preds, labels) if l != -100]
    word_labels = [id_to_label[l] for (p, l) in zip(preds, labels) if l != -100]

    all_preds.append(word_preds)
    all_labels.append(word_labels)

metric = evaluate.load("seqeval")
results = metric.compute(predictions=all_preds, references=all_labels)
print("Evaluation Metrics:")
for k, v in results.items():
    print(f"{k}: {v}")


Evaluation Metrics:
ACCOUNTNUM: {'precision': np.float64(0.5), 'recall': np.float64(0.5833333333333334), 'f1': np.float64(0.5384615384615384), 'number': np.int64(12)}
BUILDINGNUM: {'precision': np.float64(0.864406779661017), 'recall': np.float64(0.75), 'f1': np.float64(0.8031496062992127), 'number': np.int64(68)}
CITY: {'precision': np.float64(0.8205128205128205), 'recall': np.float64(0.8648648648648649), 'f1': np.float64(0.8421052631578947), 'number': np.int64(37)}
CREDITCARDNUMBER: {'precision': np.float64(0.6551724137931034), 'recall': np.float64(1.0), 'f1': np.float64(0.7916666666666666), 'number': np.int64(19)}
DATEOFBIRTH: {'precision': np.float64(0.8888888888888888), 'recall': np.float64(0.8648648648648649), 'f1': np.float64(0.8767123287671232), 'number': np.int64(37)}
DRIVERLICENSENUM: {'precision': np.float64(0.7), 'recall': np.float64(0.7777777777777778), 'f1': np.float64(0.7368421052631577), 'number': np.int64(9)}
EMAIL: {'precision': np.float64(1.0), 'recall': np.float64(0.

In [7]:
def predict_pii(sentence: str):
    # Tokenize input (raw string → subword tokens)
    encoding = tokenizer(sentence, return_tensors="pt", truncation=True, is_split_into_words=False)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Run model
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).squeeze().cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().cpu().numpy())
    labels = [id_to_label[pred] for pred in predictions]

    # Print output
    print("TOKEN".ljust(20), "PREDICTED LABEL")
    print("-" * 40)
    for token, label in zip(tokens, labels):
        print(f"{token.ljust(20)} {label}")

predict_pii("My SSN is 123456789)")


TOKEN                PREDICTED LABEL
----------------------------------------
[CLS]                O
My                   O
SS                   O
##N                  O
is                   O
123                  B-TAXNUM
##45                 I-TAXNUM
##6                  I-TAXNUM
##7                  I-TAXNUM
##8                  I-TAXNUM
##9                  I-TAXNUM
)                    O
[SEP]                O
