# OCR Label Generation for Invoice Dataset

This notebook processes scanned invoice images and their corresponding key-value JSON files to generate:
- Word-level OCR data using `pytesseract`
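- Bounding boxes for each word as `[x0, y0, x1, y1]` in pixel coordinates (not yet normalized; scale them, e.g. to LayoutLM's 0–1000 range, before training)
- Token-level labels (`B-COMPANY`, `B-DATE`, `B-ADDRESS`, `B-TOTAL`, or `O`) assigned by matching each OCR word against the annotated field text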

The final output is saved to disk in Hugging Face `datasets` format and, once the bounding boxes have been normalized, can be used to train models such as **LayoutLM** or **LayoutLMv2**.

---

### Steps Performed:
1. Load invoice images and `.txt` annotation files (with fields in JSON)
2. Extract OCR text and bounding boxes from images
3. Match extracted tokens to annotation values to assign labels
4. Save the output dataset using `datasets.Dataset` format

In [1]:
# Import necessary libraries
import os
import json
from pathlib import Path
from PIL import Image
import pytesseract
from datasets import Dataset
from pytesseract import Output

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset locations, expressed relative to the current working directory
data_root = Path("..") / "data" / "sroie"
image_dir = data_root.joinpath("images")
anno_dir = data_root.joinpath("annotations")

# Accumulator for the per-image records built by the processing loop
entries = list()
# Annotation keys that are turned into token labels (checked in this order)
label_fields = "company date address total".split()

In [3]:
# Loop through each image and its corresponding key-value .txt.
#
# For every <name>.jpg in image_dir with a matching <name>.txt in anno_dir:
#   1. read the JSON key-value annotation,
#   2. run Tesseract to get word-level text and boxes,
#   3. label each word B-<FIELD> when it equals one of the whitespace-separated
#      tokens of that field's annotated value (case-insensitive), else "O",
#   4. append one record per image to `entries`.
#
# `entries` is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every record.
entries = []

# Sorted so the row order of the saved dataset is the same on every run
# (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(image_dir)):
    if not filename.endswith(".jpg"):
        continue

    img_path = image_dir / filename
    # Swap only the final extension; str.replace would also rewrite a ".jpg"
    # that happens to occur earlier in the file name.
    anno_path = (anno_dir / filename).with_suffix(".txt")

    if not anno_path.exists():
        print(f"Annotation missing for {filename}, skipping.")
        continue

    try:
        with open(anno_path, "r", encoding="utf-8") as f:
            key_data = json.load(f)

        # Lower-cased ground-truth tokens per field, built once per file.
        # str() keeps a non-string value (e.g. a total stored as a number)
        # from raising on .lower(); a missing or null field yields no tokens.
        field_tokens = {}
        for field in label_fields:
            value = key_data.get(field)
            text = "" if value is None else str(value)
            field_tokens[field] = set(text.lower().split())

        # convert() returns an independent copy, so the file handle can be
        # released as soon as the RGB image exists.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        ocr_data = pytesseract.image_to_data(image, output_type=Output.DICT)

        words = []
        boxes = []
        labels = []

        for raw_word, x, y, w, h in zip(
            ocr_data["text"],
            ocr_data["left"],
            ocr_data["top"],
            ocr_data["width"],
            ocr_data["height"],
        ):
            word = raw_word.strip()
            if not word:
                # Tesseract emits empty strings for layout-only rows.
                continue

            # [x0, y0, x1, y1] in pixel coordinates (not normalized).
            box = [x, y, x + w, y + h]

            # Whole-token comparison: a substring test would tag stray
            # fragments such as "a", "1" or "." as part of a field.
            # The first field in label_fields that matches wins.
            token = word.lower()
            assigned_label = "O"
            for field in label_fields:
                if token in field_tokens[field]:
                    assigned_label = f"B-{field.upper()}"
                    break

            words.append(word)
            boxes.append(box)
            labels.append(assigned_label)

        entries.append({
            "image_path": str(img_path.resolve()).replace("\\", "/"),
            "words": words,
            "boxes": boxes,
            "labels": labels
        })

    except Exception as e:
        # Best-effort: report the failing file and carry on with the rest.
        print(f"Failed for {filename}: {e}")

# Save as HuggingFace dataset
if entries:
    dataset = Dataset.from_list(entries)
    dataset.save_to_disk("data/processed_invoice_dataset")
    print("Saved processed dataset to data/processed_invoice_dataset")
else:
    print("No entries created.")

Saving the dataset (1/1 shards): 100%|██████████| 626/626 [00:00<00:00, 38910.97 examples/s]

Saved processed dataset to data/processed_invoice_dataset





In [1]:
# ─────────────────────────────────────────────
#  0. Imports
# ─────────────────────────────────────────────
import os
import json
import re
from pathlib import Path

from PIL import Image
import pytesseract
from pytesseract import Output
from datasets import Dataset

# ─────────────────────────────────────────────
#  1. Paths
# ─────────────────────────────────────────────
# Root of the raw dataset; change this if the data lives elsewhere.
data_root = Path("..") / "data" / "sroie"
image_dir = data_root.joinpath("images")
anno_dir = data_root.joinpath("annotations")

# Destination directory handed to Dataset.save_to_disk
output_path = "../data/processed_invoice_dataset"

# ─────────────────────────────────────────────
#  2. Label configuration
# ─────────────────────────────────────────────
# Annotation keys turned into token labels; "address" is left out for now.
label_fields = "company date total".split()

def clean_text(text: str) -> str:
    """Normalize text for matching.

    Every run of whitespace is removed (not collapsed to a space) and the
    remaining characters are lowercased.
    """
    pieces = re.split(r"\s+", text)
    squeezed = "".join(pieces)
    return squeezed.lower()

# ─────────────────────────────────────────────
#  3. Main loop – build entries list
# ─────────────────────────────────────────────
# One record per successfully processed image:
#   image_path – absolute path with forward slashes
#   words      – non-empty OCR tokens
#   boxes      – [x0, y0, x1, y1] per token, pixel coordinates
#   labels     – "B-<FIELD>" or "O" per token
entries = []

# Sorted so the row order of the saved dataset is the same on every run
# (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(image_dir)):
    if not filename.lower().endswith(".jpg"):
        continue

    img_path  = image_dir / filename
    # with_suffix swaps the extension regardless of its case; the previous
    # str.replace(".jpg", ...) left names such as "X.JPG" unchanged, so those
    # images were always reported as having no annotation.
    anno_path = (anno_dir / filename).with_suffix(".txt")

    if not anno_path.exists():
        print(f"[WARN] Annotation missing for {filename}, skipping.")
        continue

    try:
        with open(anno_path, "r", encoding="utf-8") as f:
            anno_data = json.load(f)

        # Normalized ground-truth tokens per field, built once per file
        # instead of once per OCR word. A missing or null field yields no
        # tokens (str(None) would otherwise produce the token "none").
        gt_tokens = {}
        for field in label_fields:
            raw_value = anno_data.get(field)
            gt_value  = "" if raw_value is None else str(raw_value)
            gt_tokens[field] = {clean_text(part) for part in gt_value.split()}

        # convert() returns an independent copy, so the file handle can be
        # released as soon as the RGB image exists.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        ocr_data = pytesseract.image_to_data(image, output_type=Output.DICT)

        words, boxes, labels = [], [], []

        for raw_word, x, y, w, h in zip(
            ocr_data["text"],
            ocr_data["left"],
            ocr_data["top"],
            ocr_data["width"],
            ocr_data["height"],
        ):
            word = raw_word.strip()
            if not word:
                # Tesseract emits empty strings for layout-only rows.
                continue

            # bounding-box in pixel coords (normalize later in training)
            box = [x, y, x + w, y + h]

            # ---------- label assignment ----------
            # A word is labelled when it equals one whole token of the field
            # value, so multi-word values still match word by word. A
            # substring test would also tag fragments such as "a", "1" or "."
            # that merely occur inside a ground-truth token.
            # The first field in label_fields that matches wins.
            assigned_label = "O"
            clean_word     = clean_text(word)

            for field in label_fields:
                if clean_word in gt_tokens[field]:
                    assigned_label = f"B-{field.upper()}"
                    break
            # --------------------------------------

            words.append(word)
            boxes.append(box)
            labels.append(assigned_label)

        entries.append({
            "image_path": str(img_path.resolve()).replace("\\", "/"),
            "words":      words,
            "boxes":      boxes,
            "labels":     labels
        })

    except Exception as e:
        # Best-effort: report the failing file and carry on with the rest.
        print(f"[ERROR] Failed for {filename}: {e}")

# ─────────────────────────────────────────────
#  4. Save as Hugging Face dataset
# ─────────────────────────────────────────────
if entries:
    dataset = Dataset.from_list(entries)
    dataset.save_to_disk(output_path)
    print(f"Saved processed dataset to {output_path} (n={len(entries)})")
else:
    print("No entries created – check paths or annotations.")


  from .autonotebook import tqdm as notebook_tqdm
Saving the dataset (1/1 shards): 100%|██████████| 626/626 [00:00<00:00, 56910.75 examples/s]

Saved processed dataset to ../data/processed_invoice_dataset (n=626)



