# Token classification (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Install required libraries for token classification and model training
!uv pip install datasets evaluate transformers[sentencepiece]
!uv pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !uv pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

You will need to set up git: adapt your email and name in the following cell.

In [None]:
# Set the global git identity (author email and name), not credentials
# Replace both placeholder values with your own before running this cell
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
# Log in to the Hugging Face Hub; later cells push the trained model there
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Load the CoNLL-2003 dataset for Named Entity Recognition (NER)
# This dataset contains news articles with entities labeled as PER, ORG, LOC, MISC
# NOTE(review): "conll2003" is resolved by name on the Hub - confirm it still loads
# with the `datasets` version that the unpinned install above pulls in
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")

In [None]:
# Display the loaded dataset object; later cells index it by split name ("train", "validation")
raw_datasets

In [None]:
# Show the tokens (already split into words) of the first training example
raw_datasets["train"][0]["tokens"]

In [None]:
# Show the NER tags of the same example: one integer class id per token
raw_datasets["train"][0]["ner_tags"]

In [None]:
# Fetch the feature definition of the "ner_tags" column from the train split
# It carries the class names needed to decode the integer tags (read in the next cell)
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

In [None]:
# Read the list of label names; a name's position in the list is its integer tag
# Naming scheme: O (outside any entity), B-XXX (beginning of an entity), I-XXX (inside an entity)
label_names = ner_feature.feature.names
label_names

In [None]:
# Print the first training sentence with each label directly underneath its word
# Every column is as wide as the longer of the word and its label, plus one separating space
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]

word_cells = []
label_cells = []
for word, label in zip(words, labels):
    full_label = label_names[label]
    column_width = max(len(word), len(full_label)) + 1
    # ljust pads on the right with spaces up to the shared column width
    word_cells.append(word.ljust(column_width))
    label_cells.append(full_label.ljust(column_width))

line1 = "".join(word_cells)
line2 = "".join(label_cells)

print(line1)
print(line2)

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint (cased BERT base)
# model_checkpoint is reused further down when the model itself is loaded
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Check that this is a fast tokenizer; the word_ids() calls below depend on it
tokenizer.is_fast

In [None]:
# Tokenize the first example and display the resulting (sub)word tokens
# is_split_into_words=True tells the tokenizer the input is a list of words, not a raw string
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

In [None]:
# For every token, show the index of the original word it came from
# None marks the special tokens [CLS] and [SEP], which belong to no word
inputs.word_ids()

In [None]:
# Expand word-level NER labels to token level after subword tokenization
def align_labels_with_tokens(labels, word_ids):
    """Return one label per token, given one label per word.

    Args:
        labels: Word-level integer labels, indexed by word position.
        word_ids: For each token, the index of its source word, or None
            for a special token.

    Returns:
        A list as long as ``word_ids``. Special tokens get -100. The first
        token of a word gets the word's label. Each further token of the same
        word gets that label too, bumped by one when it is odd (the B-XXX ->
        I-XXX step under this notebook's label numbering).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: -100 is the value later filtered out before scoring
            aligned.append(-100)
        elif word_id != previous_word:
            # First token of a new word keeps the word's own label
            aligned.append(labels[word_id])
        else:
            # Continuation of the current word: an odd (B-XXX) id becomes the next (I-XXX) id
            tag = labels[word_id]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word_id

    return aligned

In [None]:
# Try the alignment on the first example: print the word-level labels,
# then the token-level labels (longer, with -100 at the special-token positions)
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

In [None]:
# Batched preprocessing: tokenize pre-split words and attach token-level labels
def tokenize_and_align_labels(examples):
    """Tokenize a batch of examples and add aligned labels.

    Args:
        examples: A batch with a "tokens" entry (one word list per example)
            and an "ner_tags" entry (one word-level label list per example).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry that
        holds one token-level label list per example.
    """
    encodings = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(position) gives the token -> word mapping of one example in the batch
    encodings["labels"] = [
        align_labels_with_tokens(word_tags, encodings.word_ids(position))
        for position, word_tags in enumerate(examples["ner_tags"])
    ]
    return encodings

In [None]:
# Run the preprocessing over the whole dataset object, one batch of examples at a time
# remove_columns drops the original columns (names taken from the train split), so only
# the tokenizer outputs and the new "labels" column remain
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [None]:
# Create the collator that assembles a list of examples into one padded batch
# Padding is done per batch, so every sequence in a batch ends up with the same length
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Try the collator on the first two training examples and display the batched labels
# Notice how the labels of the shorter sequence are padded with -100
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

In [None]:
# Print the un-padded labels of the same two examples to compare with the batch above
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

In [None]:
# Install seqeval, needed before the "seqeval" metric is loaded in the next cell
!uv pip install seqeval

In [None]:
# Load the seqeval metric, which scores complete entities rather than single tokens
# It is fed lists of label strings (see the example two cells below)
import evaluate

metric = evaluate.load("seqeval")

In [None]:
# Convert the first example's integer tags to label strings, the form the metric is given below
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

In [None]:
# Metric demo: copy the reference labels and introduce a single error (B-MISC -> O)
# The result shows how one missed entity affects precision, recall and F1
predictions = labels.copy()
predictions[2] = "O"  # Change "B-MISC" to "O" (miss one entity)
metric.compute(predictions=[predictions], references=[labels])

In [None]:
# Metric callback handed to the Trainer for evaluation
import numpy as np


def compute_metrics(eval_preds):
    """Turn raw evaluation outputs into overall seqeval scores.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The class axis of ``logits``
            is the last one; ``labels`` uses -100 at positions to ignore.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy",
        taken from the metric's ``overall_*`` entries.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    def decode(class_ids, gold_ids):
        # Keep only positions whose gold label is not the ignored index, then map ids to names
        return [
            label_names[class_id]
            for class_id, gold_id in zip(class_ids, gold_ids)
            if gold_id != -100
        ]

    references = [decode(gold_row, gold_row) for gold_row in labels]
    hypotheses = [
        decode(predicted_row, gold_row)
        for predicted_row, gold_row in zip(predicted_ids, labels)
    ]

    scores = metric.compute(predictions=hypotheses, references=references)
    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [None]:
# Build both label lookups for the model configuration: id -> name and name -> id
id2label = dict(enumerate(label_names))
label2id = {name: index for index, name in id2label.items()}

In [None]:
# Load the pretrained checkpoint with a token-classification head
# The label mappings set the number of output classes (9: O plus B-/I- for each of the 4 entity types)
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Sanity check: the configured number of labels should equal len(label_names)
model.config.num_labels

In [None]:
# Same login call as at the top of the notebook, repeated here ahead of the cells that push to the Hub
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Configure training arguments for fine-tuning
# `eval_strategy` is the current name of the argument formerly called `evaluation_strategy`;
# current transformers releases reject the old keyword with a TypeError
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",  # Output directory name
    eval_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",  # Save checkpoint after each epoch
    learning_rate=2e-5,  # Initial learning rate
    num_train_epochs=3,  # Number of training epochs
    weight_decay=0.01,  # Weight decay applied by the optimizer
    push_to_hub=True,  # Automatically push model to Hugging Face Hub
)

In [None]:
# Build the Trainer from the pieces defined above and fine-tune the model
# `processing_class` is the current name of the argument formerly called `tokenizer`;
# passing the tokenizer lets the Trainer save it alongside the model
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)
trainer.train()  # Start training process

In [None]:
# Push the final state of the training run to the Hub with an explicit commit message
trainer.push_to_hub(commit_message="Training complete")

In [None]:
# Alternative to the Trainer: a manual training loop built on PyTorch DataLoaders
# Both loaders batch 8 examples and pad them with the same data_collator; only training data is shuffled
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,  # Shuffle training data
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [None]:
# Reload the pretrained checkpoint so the manual loop starts from scratch
# rather than continuing from the model the Trainer fine-tuned above
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Create an AdamW optimizer over all model parameters, with the same learning rate as the Trainer run
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Create an Accelerator with its default settings and let it prepare the model,
# the optimizer and both dataloaders; the returned objects replace the originals
# NOTE(review): no mixed-precision option is passed here - any such behaviour would
# have to come from the external accelerate configuration
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
# Create a linear learning-rate schedule that spans the whole training run
# The step count is read from train_dataloader as returned by accelerator.prepare() above,
# so it is computed from the prepared dataloader, not the original one
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",  # Linear decay schedule
    optimizer=optimizer,
    num_warmup_steps=0,  # No warmup steps
    num_training_steps=num_training_steps,
)

In [None]:
# Work out the full Hub repository name for the model trained with the manual loop
# NOTE(review): `Repository` is the git-based helper of huggingface_hub - confirm it is
# still provided by the installed version, since the install above is unpinned
from huggingface_hub import Repository, get_full_repo_name

model_name = "bert-finetuned-ner-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

In [None]:
# Clone the Hub repository into output_dir; the training loop saves into this
# folder and pushes it after every epoch
output_dir = "bert-finetuned-ner-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

In [None]:
# Convert a batch of predicted and gold label ids into label-name lists for the metric
def postprocess(predictions, labels):
    """Decode predicted and gold label ids, dropping ignored positions.

    Args:
        predictions: Tensor of predicted class ids, one row per sequence.
        labels: Tensor of gold class ids with the same layout; -100 marks
            positions to skip.

    Returns:
        ``(true_labels, true_predictions)`` - gold label names FIRST, predicted
        label names second. Both are lists holding one list of label strings
        per sequence.
    """
    # Detach from the graph and copy to host memory before converting to numpy
    predicted_ids = predictions.detach().cpu().clone().numpy()
    gold_ids = labels.detach().cpu().clone().numpy()

    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([label_names[g] for g in gold_row if g != -100])

    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = []
        for p, g in zip(predicted_row, gold_row):
            # The gold label decides which positions count, not the prediction
            if g != -100:
                kept.append(label_names[p])
        true_predictions.append(kept)

    return true_labels, true_predictions

In [None]:
# Manual training loop: train for one epoch, evaluate, then save and push - repeated per epoch
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training phase
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)  # Backward pass routed through accelerate

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation phase
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Pad predictions and labels to a common sequence length across processes;
        # -100 as the pad value means the padding is filtered out again in postprocess()
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        # Gather predictions and labels from all processes
        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # postprocess() returns (true_labels, true_predictions), in that order.
        # Unpacking them the other way round feeds the metric swapped arguments,
        # which swaps the reported precision and recall.
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    # Compute and print evaluation results for this epoch
    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save model and push to hub after each epoch
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        # Only the main process writes the tokenizer and pushes
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

In [None]:
# Save the model once more after the loop has finished
# This repeats the save performed at the end of every epoch inside the loop and does not push
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

In [None]:
# Try a fine-tuned model on a sentence through the token-classification pipeline
# aggregation_strategy="simple" groups B- and I- tokens into complete entities
# NOTE(review): this reassigns model_checkpoint, which earlier cells use for the base
# checkpoint ("bert-base-cased"); re-running those cells afterwards would load this model instead
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")