# LayoutLM Model Training: Invoice Field Classification

This notebook trains a `LayoutLMForTokenClassification` model to recognize key fields in scanned invoices, including:
- `COMPANY`
- `DATE`
- `ADDRESS`
- `TOTAL`

It uses preprocessed OCR+label data and aligns text tokens with bounding boxes and entity labels.

---

### Pipeline Overview:
1. **Load Dataset**: Preprocessed invoice dataset from disk
2. **Define Label Map**: Maps field names to numeric class labels
3. **Preprocess**:
   - Tokenize invoice words
   - Align labels & bounding boxes with word-level tokens
4. **Train**:
   - Uses Hugging Face `Trainer` with LayoutLM
   - Trains for 3 epochs with logging + saving per epoch

>  Make sure the dataset at `data/processed_invoice_dataset` contains:
> - `image_path`, `words`, `boxes`, and `labels` fields


In [None]:
import os
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast, TrainingArguments, Trainer
from datasets import load_from_disk
from PIL import Image

# Load the dataset
dataset = load_from_disk("../data/processed_invoice_dataset")

# Label map
label_map = {
    'O': 0,
    'B-COMPANY': 1,
    'B-DATE': 2,
    'B-ADDRESS': 3,
    'B-TOTAL': 4
}

# Load tokenizer for LayoutLM
tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")

# Normalize boxes to 0–1000 scale
def normalize_box(box, width=1000, height=1000):
    return [
        int(box[0] * 1000 / width),
        int(box[1] * 1000 / height),
        int(box[2] * 1000 / width),
        int(box[3] * 1000 / height),
    ]

# Preprocessing function
def preprocess(example):
    from PIL import Image
    image_path = example["image_path"]
    words = example["words"]
    boxes = example["boxes"]
    labels = example["labels"]

    image = Image.open(image_path).convert("RGB")
    width, height = image.size

    # Normalize boxes
    norm_boxes = [normalize_box(box, width, height) for box in boxes]

    # Tokenize and get word IDs
    encoding = tokenizer(
        words,
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )

    word_ids = encoding.word_ids()
    aligned_labels = []
    aligned_boxes = []

    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(0)  # Padding
            aligned_boxes.append([0, 0, 0, 0])
        else:
            aligned_labels.append(label_map.get(labels[word_idx], 0))
            aligned_boxes.append(norm_boxes[word_idx])

    encoding["labels"] = aligned_labels
    encoding["bbox"] = aligned_boxes
    del encoding["offset_mapping"]

    return encoding


# Map preprocessing to dataset
tokenized_dataset = dataset.map(preprocess, batched=False)

# Load model
model = LayoutLMForTokenClassification.from_pretrained(
    "microsoft/layoutlm-base-uncased",
    num_labels=len(label_map),
    id2label={v: k for k, v in label_map.items()},
    label2id=label_map
)

# Training configuration
training_args = TrainingArguments(
    output_dir="./models/layoutlm_invoice",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Train the model
trainer.train()


Map: 100%|██████████| 626/626 [00:21<00:00, 28.75 examples/s]
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.6817
20,0.2414
30,0.2063
40,0.1509
50,0.1242
60,0.1694
70,0.1445
80,0.13
90,0.1117
100,0.1054




TrainOutput(global_step=939, training_loss=0.06651967967026277, metrics={'train_runtime': 4545.4903, 'train_samples_per_second': 0.413, 'train_steps_per_second': 0.207, 'total_flos': 494135871547392.0, 'train_loss': 0.06651967967026277, 'epoch': 3.0})

In [1]:
import os
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast, TrainingArguments, Trainer
from datasets import load_from_disk
from PIL import Image

# Load the processed dataset
dataset = load_from_disk("../data/processed_invoice_dataset")

# Only keep the labels you're training on
label_map = {
    'O': 0,
    'B-COMPANY': 1,
    'B-DATE': 2,
    'B-TOTAL': 3
}
id2label = {v: k for k, v in label_map.items()}
label2id = label_map

# Load tokenizer
tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")

# Normalize box coordinates to 0–1000
def normalize_box(box, width, height):
    return [
        int(box[0] * 1000 / width),
        int(box[1] * 1000 / height),
        int(box[2] * 1000 / width),
        int(box[3] * 1000 / height),
    ]

# Preprocess dataset entries
def preprocess(batch):
    encodings = []

    for image_path, words, boxes, labels in zip(
        batch["image_path"], batch["words"], batch["boxes"], batch["labels"]
    ):
        image = Image.open(image_path).convert("RGB")
        width, height = image.size

        # Truncate if too long
        if len(words) > 512:
            words = words[:512]
            boxes = boxes[:512]
            labels = labels[:512]

        norm_boxes = [normalize_box(b, width, height) for b in boxes]

        encoding = tokenizer(
            words,
            is_split_into_words=True,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_offsets_mapping=True,
        )

        word_ids = encoding.word_ids()
        aligned_labels = []
        aligned_boxes = []

        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(label_map['O'])
                aligned_boxes.append([0, 0, 0, 0])
            else:
                aligned_labels.append(label_map.get(labels[word_idx], label_map['O']))
                aligned_boxes.append(norm_boxes[word_idx])

        encoding["labels"] = aligned_labels
        encoding["bbox"] = aligned_boxes
        del encoding["offset_mapping"]
        encodings.append(encoding)

    return {
        "input_ids": [e["input_ids"] for e in encodings],
        "attention_mask": [e["attention_mask"] for e in encodings],
        "labels": [e["labels"] for e in encodings],
        "bbox": [e["bbox"] for e in encodings],
    }

# Apply preprocessing
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset.column_names
)
tokenized_dataset.set_format("torch")


# Load model
model = LayoutLMForTokenClassification.from_pretrained(
    "microsoft/layoutlm-base-uncased",
    num_labels=len(label_map),
    id2label=id2label,
    label2id=label2id
)

# Set training arguments
training_args = TrainingArguments(
    output_dir="models/layoutlm_invoice",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

# Initialize Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

# Save tokenizer with the model
tokenizer.save_pretrained("../models/layoutlm_invoice")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.4742
20,0.1606
30,0.1176
40,0.0807
50,0.0768
60,0.1129
70,0.0648
80,0.0806
90,0.0619
100,0.0672




('../models/layoutlm_invoice\\tokenizer_config.json',
 '../models/layoutlm_invoice\\special_tokens_map.json',
 '../models/layoutlm_invoice\\vocab.txt',
 '../models/layoutlm_invoice\\added_tokens.json',
 '../models/layoutlm_invoice\\tokenizer.json')

In [2]:
# Save the trained model manually from the last checkpoint
trainer.save_model("../models/layoutlm_invoice")
tokenizer.save_pretrained("../models/layoutlm_invoice")


('../models/layoutlm_invoice\\tokenizer_config.json',
 '../models/layoutlm_invoice\\special_tokens_map.json',
 '../models/layoutlm_invoice\\vocab.txt',
 '../models/layoutlm_invoice\\added_tokens.json',
 '../models/layoutlm_invoice\\tokenizer.json')