In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset, load_from_disk
import evaluate
import numpy as np
import torch
import os

# Load the fine-tuned token-classification model and its tokenizer from the
# local checkpoint directory, then switch the model to evaluation mode.
model_dir = "./pii_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTokenClassification.from_pretrained(model_dir)
model.eval()

# Load dataset (use same tokenizer fields): prefer the copy saved on disk and
# fall back to loading it by name when that directory does not exist.
dataset_name = "ai4privacy/pii-masking-400k"
dataset_path = "saved_datasets/ai4privacy_pii-masking-400k"

if os.path.exists(dataset_path):
    dataset = load_from_disk(dataset_path)
else:
    dataset = load_dataset(dataset_name)

# Build label mappings from every tag that occurs in the training split.
# Ids are assigned by sorted tag order.
# NOTE(review): this assumes the checkpoint was trained with this exact
# sorted-tag -> id mapping; confirm against model.config.id2label.
unique_labels = set()
for row in dataset["train"]["mbert_token_classes"]:
    unique_labels.update(row)
label_list = sorted(unique_labels)
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))

# Sample up to 100 validation examples; clamp so a smaller validation split
# does not make select() fail on out-of-range indices.
num_val_samples = min(100, len(dataset["validation"]))
val_samples = dataset["validation"].select(range(num_val_samples))

# Tokenize & align
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split tokens and align a label id to each piece.

    Args:
        examples: A batch mapping with "mbert_tokens" (one list of tokens per
            example) and "mbert_token_classes" (one list of tags per example,
            indexed by the same token positions).

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per example. Positions with no source token (word id
        ``None``) get -100; every other position gets the id of its source
        token's tag.
    """
    tokenized_inputs = tokenizer(
        examples["mbert_tokens"],
        is_split_into_words=True,
        truncation=True,
        return_tensors=None,
    )
    labels = []
    for i, label in enumerate(examples["mbert_token_classes"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # Every piece of a token inherits that token's tag (continuation
        # pieces are NOT masked), so first and later pieces are treated alike.
        labels.append([
            -100 if word_id is None else label_to_id[label[word_id]]
            for word_id in word_ids
        ])
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize the sampled validation examples and attach aligned label ids.
tokenized = val_samples.map(tokenize_and_align_labels, batched=True)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Run inference one example at a time (batch size 1, so no padding is needed).
all_preds = []
all_labels = []

for example in tokenized:
    input_ids = torch.tensor([example["input_ids"]]).to(device)
    attention_mask = torch.tensor([example["attention_mask"]]).to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # Take the single batch row explicitly: squeeze() would also drop the
    # sequence axis for a length-1 input and leave a 0-d array.
    preds = torch.argmax(outputs.logits, dim=-1)[0].cpu().numpy()
    labels = np.array(example["labels"])

    # Convert predictions and labels to tag names, ignoring -100 positions.
    word_preds = []
    word_labels = []
    for pred_id, label_id in zip(preds, labels):
        if label_id != -100:
            word_preds.append(id_to_label[int(pred_id)])
            word_labels.append(id_to_label[int(label_id)])

    all_preds.append(word_preds)
    all_labels.append(word_labels)

# Evaluate the predicted tag sequences against the references with seqeval.
metric = evaluate.load("seqeval")
results = metric.compute(predictions=all_preds, references=all_labels)
print("Evaluation Metrics:")
for k, v in results.items():
    print(f"{k}: {v}")


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'os' is not defined