# Updated NER with DeBERTa (BIO tags, seqeval)
This notebook provides an updated, reproducible pipeline to fine-tune a DeBERTa model for token classification (NER) using the PII detection dataset. It includes:

- Installation of required libraries,
- Loading the dataset from specified paths,
- Preprocessing labels to BIO format and aligning them to subtokens (special tokens and, by default, non-first subtokens of a word are labeled -100),
- Fine-tuning a DeBERTa model using Hugging Face Trainer with seqeval metric,
- Inference demonstration using transformers.pipeline with `aggregation_strategy='simple'`.

Ensure dataset files are available at the specified paths or update DATA_ROOTS accordingly.

In [None]:
# Install required packages
# `-q` keeps pip output quiet. `wandb` is installed here even though the
# training cell passes report_to='none', so no experiment tracker is used.
%pip install -q transformers[torch] datasets accelerate evaluate seqeval wandb

In [None]:
# Imports and configuration
import os
from pathlib import Path
import json
import numpy as np
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification, TrainingArguments, Trainer
import evaluate

# Dataset paths
# Directories searched in order; the first one containing a requested file wins.
DATA_ROOTS = [
    '/kaggle/input/pii-detection-removal-from-educational-data',
    './data',
]


def find_file(name):
    """Return the path of ``name`` under the first root that has it, else None.

    Args:
        name: File name (or relative path) to look for beneath each entry of
            ``DATA_ROOTS``.

    Returns:
        The matching path as a string, or ``None`` when no root contains it.
    """
    candidates = (Path(root) / name for root in DATA_ROOTS)
    return next((str(found) for found in candidates if found.exists()), None)

# Resolve each expected file; a value of None means no root contained it.
TRAIN_FILE, TEST_FILE, SAMPLE_SUB = map(
    find_file, ('train.json', 'test.json', 'sample_submission.csv')
)

# Echo the resolved paths so a missing file is visible before loading.
for caption, located in (
    ('Train file:', TRAIN_FILE),
    ('Test file:', TEST_FILE),
    ('Sample submission:', SAMPLE_SUB),
):
    print(caption, located)

In [None]:
# Load dataset
if not TRAIN_FILE:
    print('Dataset files not found. Please update DATA_ROOTS or place files in ./data.')
else:
    # The test split is optional: register it only when its file was located.
    data_files = {'train': TRAIN_FILE}
    if TEST_FILE:
        data_files['test'] = TEST_FILE
    ds = load_dataset('json', data_files=data_files)
    print(ds)

In [None]:
# Inspect dataset examples
# Prints the Python type of every field for up to three training rows;
# skipped entirely when the loading cell did not create `ds`.
if 'ds' in locals():
    train_split = ds['train']
    preview_count = min(3, len(train_split))
    for row_idx in range(preview_count):
        record = train_split[row_idx]
        print('Example:')
        for field, content in record.items():
            print(f'{field}: {type(content)}')
        print()

In [None]:
# Helper functions for preprocessing
def whitespace_tokenize_with_spans(text):
    """Split ``text`` on whitespace and report each token's character span.

    Args:
        text: The string to tokenize.

    Returns:
        A ``(tokens, spans)`` pair where ``spans[k]`` is the ``(start, end)``
        character offsets (end exclusive) of ``tokens[k]`` within ``text``.
    """
    tokens = text.split()
    spans = []
    cursor = 0  # search resumes just past the previous token
    for piece in tokens:
        begin = text.find(piece, cursor)
        if begin < 0:
            # Fallback kept from the original: anchor at the cursor.
            begin = cursor
        cursor = begin + len(piece)
        spans.append((begin, cursor))
    return tokens, spans

def spans_to_bio(tokens, token_spans, entities):
    """Convert character-span entities into one BIO tag per token.

    Args:
        tokens: Token strings; only the count is used.
        token_spans: ``(start, end)`` character offsets, one per token.
        entities: Dicts with ``'start'`` and ``'end'`` offsets and a type under
            ``'label'`` (or ``'entity'`` when ``'label'`` is absent).

    Returns:
        A list of tags, ``'O'`` by default. For each entity, the first token
        overlapping it gets ``'B-<type>'`` and later overlapping tokens get
        ``'I-<type>'``. Entities are applied in order, so a later entity
        overwrites tags written by an earlier one.
    """
    labels = ['O'] * len(tokens)
    for ent in entities:
        ent_start = ent['start']
        ent_end = ent['end']
        tag = ent.get('label', ent.get('entity'))
        # A token overlaps when it ends after the entity starts and starts
        # before the entity ends.
        covered = [
            idx
            for idx, (tok_start, tok_end) in enumerate(token_spans)
            if tok_end > ent_start and tok_start < ent_end
        ]
        for rank, idx in enumerate(covered):
            labels[idx] = ('B-' if rank == 0 else 'I-') + str(tag)
    return labels

def align_labels_to_tokens(tokenizer, words, labels, label_to_id, label_all_tokens=False):
    """Tokenize pre-split ``words`` and attach one label id per subtoken.

    Args:
        tokenizer: Callable tokenizer whose result exposes ``word_ids()``.
        words: The word strings of one example.
        labels: One tag per entry of ``words``.
        label_to_id: Mapping from tag string to integer id.
        label_all_tokens: When true, every subtoken of a word receives the
            word's label id; otherwise only the first subtoken does.

    Returns:
        The tokenizer's encoding with an added ``'labels'`` entry. Positions
        with no source word, and (unless ``label_all_tokens``) non-first
        subtokens of a word, are set to -100.
    """
    encoding = tokenizer(words, is_split_into_words=True, truncation=True, padding=False)
    aligned = []
    last_seen = None
    for current in encoding.word_ids():
        if current is None:
            label_id = -100
        elif current != last_seen or label_all_tokens:
            label_id = label_to_id[labels[current]]
        else:
            label_id = -100
        aligned.append(label_id)
        last_seen = current
    encoding['labels'] = aligned
    return encoding

In [None]:
# Initialize tokenizer and build dataset
MODEL_NAME = 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.model_max_length = 512


def build_token_classification_dataset(ds, tokenizer, split='train', label_all_tokens=False):
    """Tokenize one split of ``ds`` and align word-level BIO tags to subtokens.

    This function was called below but never defined in the notebook, so the
    cell failed with ``NameError``. It is assembled from the helpers defined
    in the preprocessing cell.

    NOTE(review): the row schema is an assumption to confirm against the data.
    Rows are expected to carry pre-split ``'tokens'`` with parallel BIO
    ``'labels'`` (presumably the layout of the competition's train.json).
    Rows lacking either field fall back to raw text under ``'full_text'`` or
    ``'text'`` plus character-span ``'entities'`` in the shape that
    ``spans_to_bio`` reads.

    Args:
        ds: Mapping of split name to an iterable of dict-like rows.
        tokenizer: Tokenizer forwarded to ``align_labels_to_tokens``.
        split: Name of the split to process.
        label_all_tokens: Forwarded to ``align_labels_to_tokens``.

    Returns:
        ``(tokenized_examples, label_list, label_to_id)`` where
        ``tokenized_examples`` maps each encoding column (including
        ``'labels'``) to a list with one entry per example, ``label_list`` is
        ``'O'`` followed by the remaining tags in sorted order, and
        ``label_to_id`` maps each tag to its index in ``label_list``.

    Raises:
        ValueError: If a row has neither usable schema, or its words and tags
            differ in length.
    """
    # First pass: collect (words, tags) so the label vocabulary is complete
    # before any tag is mapped to an id.
    per_example = []
    for row_number, example in enumerate(ds[split]):
        words = example.get('tokens')
        tags = example.get('labels')
        if words is None or tags is None:
            text = example.get('full_text', example.get('text'))
            if text is None:
                raise ValueError(
                    f'Row {row_number} of split {split!r} has neither tokens/labels '
                    f'nor text; available fields: {sorted(example)}'
                )
            words, spans = whitespace_tokenize_with_spans(text)
            tags = spans_to_bio(words, spans, example.get('entities') or [])
        if len(words) != len(tags):
            raise ValueError(
                f'Row {row_number} of split {split!r} has {len(words)} tokens '
                f'but {len(tags)} labels'
            )
        per_example.append((list(words), list(tags)))

    seen_tags = {tag for _, tags in per_example for tag in tags}
    label_list = ['O'] + sorted(seen_tags - {'O'})
    label_to_id = {label: idx for idx, label in enumerate(label_list)}

    # Second pass: tokenize and gather each encoding field into a column.
    tokenized_examples = {}
    for words, tags in per_example:
        encoding = align_labels_to_tokens(
            tokenizer, words, tags, label_to_id, label_all_tokens=label_all_tokens
        )
        for column, values in encoding.items():
            tokenized_examples.setdefault(column, []).append(values)
    return tokenized_examples, label_list, label_to_id


if 'ds' in locals():
    tokenized_examples, label_list, label_to_id = build_token_classification_dataset(ds, tokenizer)
    print('Labels:', label_list)

In [None]:
# Model setup
# Runs only when the dataset cell produced `label_list`.
if 'label_list' in locals():
    # Both directions of the tag <-> id mapping are stored in the model config.
    id2label = dict(enumerate(label_list))
    label2id = {name: idx for idx, name in id2label.items()}
    config = AutoConfig.from_pretrained(
        MODEL_NAME,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
    )
    model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config)

    data_collator = DataCollatorForTokenClassification(tokenizer)

    from datasets import Dataset
    train_ds = Dataset.from_dict(tokenized_examples)
    dataset_for_trainer = DatasetDict({'train': train_ds})

    seqeval = evaluate.load('seqeval')

    def compute_metrics(p):
        """Score predictions with seqeval, skipping positions labeled -100.

        Args:
            p: A ``(predictions, labels)`` pair; the class is taken as the
                argmax over axis 2 of ``predictions``.

        Returns:
            Dict with seqeval's overall ``precision``, ``recall`` and ``f1``.
        """
        scores, gold = p
        predicted_ids = np.argmax(scores, axis=2)
        true_predictions = []
        true_labels = []
        for pred_row, gold_row in zip(predicted_ids, gold):
            # Keep only the positions that carry a real label.
            kept = [(pred, lab) for pred, lab in zip(pred_row, gold_row) if lab != -100]
            true_predictions.append([label_list[pred] for pred, _ in kept])
            true_labels.append([label_list[lab] for _, lab in kept])
        results = seqeval.compute(predictions=true_predictions, references=true_labels)
        return {
            'precision': results['overall_precision'],
            'recall': results['overall_recall'],
            'f1': results['overall_f1'],
        }

In [None]:
# Training setup
# Runs only when the model-setup cell created `model`.
if 'model' in locals():
    import torch

    # fp16 mixed precision requires a CUDA device, so enable it only when one
    # is present; on CPU-only machines training falls back to full precision.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.empty_cache()

    training_args = TrainingArguments(
        output_dir='./deberta-ner-output',
        eval_strategy='no',  # no evaluation split is passed to the Trainer below
        learning_rate=3e-5,
        per_device_train_batch_size=1,  # Minimal batch size for GPU memory
        num_train_epochs=2,
        weight_decay=0.01,
        warmup_ratio=0.1,
        lr_scheduler_type='cosine',
        save_total_limit=1,
        fp16=use_cuda,
        report_to='none',
        gradient_checkpointing=True,  # Trades extra compute for lower memory use
        dataloader_num_workers=0,
        logging_steps=50,
        save_steps=200,
        optim='adamw_torch',
    )

    # NOTE(review): newer transformers releases prefer `processing_class=`
    # over `tokenizer=`; kept as-is for compatibility — confirm against the
    # installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_for_trainer['train'],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Fine-tune, then save to the directory the inference cell looks for.
    trainer.train()
    trainer.save_model('./deberta-ner-final')
    print('Training completed and model saved.')

In [None]:
# Inference with pipeline
from transformers import pipeline

# Use the fine-tuned checkpoint when it exists on disk; otherwise fall back
# to the base checkpoint name.
model_dir = MODEL_NAME
if Path('./deberta-ner-final').exists():
    model_dir = './deberta-ner-final'

ner_pipe = pipeline(
    'ner',
    model=model_dir,
    tokenizer=tokenizer,
    aggregation_strategy='simple',
)

text = 'Contact John Doe at john.doe@example.com or call +1 555-555-5555.'
entities = ner_pipe(text)
print('Aggregated Entities:')
for e in entities:
    print(e)