In [67]:
import os
import re
from rapidfuzz import distance
import pytesseract

In [None]:
# Helper functions
# Based on code from https://github.com/niclasgriesshaber/llm_historical_dataset_benchmarking/blob/main/src/benchmarking/txt_accuracy.py

def clean_text_nonorm(text, index_numbers=True):
    """
    Minimal cleaning:
      - Remove index numbers (if specified)
      - Remove linebreaks/tabs (replace with space)
      - Remove all instances of \"- \" (dash space; word separated by line break)
      - Collapse multiple spaces
      - Strip leading/trailing
      - Preserve punctuation, casing, accented letters
    """
    text = re.sub(r" *\[ *[0-9]+ *\] *", " ", text) if not index_numbers else text
    text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').replace('- ', '')
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def clean_text_normalized(text, index_numbers):
    """
    Fully normalized:
      - Remove linebreaks/tabs
      - Remove all instances of \"- \" (dash space; word separated by line break)
      - Remove all non-ASCII (accented letters are dropped)
      - Convert to lowercase
      - Remove punctuation => keep only [a-z0-9] plus spaces
      - Collapse multiple spaces
      - Strip leading/trailing
    """
    # Remove linebreaks/tabs
    text = clean_text_nonorm(text, index_numbers)

    # Remove all non-ASCII
    text = text.encode("ascii", errors="ignore").decode("ascii")

    # Lowercase
    text = text.lower()

    # Keep only [a-z0-9] + space
    text = re.sub(r"[^a-z0-9 ]+", "", text)

    # Collapse multiple spaces again
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def compute_metrics(ref_text, hyp_text, normalized=False, index_numbers=True):
    """
    Compute Levenshtein distance, CER, WER.
    If normalized=True => use clean_text_normalized,
    else => use clean_text_nonorm.
    If index_numbers=True => keep index numbers
    else => remove index numbers
    """
    if normalized:
        ref_clean = clean_text_normalized(ref_text, index_numbers)
        hyp_clean = clean_text_normalized(hyp_text, index_numbers)
    else:
        ref_clean = clean_text_nonorm(ref_text, index_numbers)
        hyp_clean = clean_text_nonorm(hyp_text, index_numbers)

    dist_char = distance.Levenshtein.distance(ref_clean, hyp_clean)
    ref_len = len(ref_clean)

    cer = dist_char / ref_len if ref_len > 0 else 0.0

    # For WER, split by whitespace
    ref_words = ref_clean.split()
    hyp_words = hyp_clean.split()
    dist_word = distance.Levenshtein.distance("\n".join(ref_words), "\n".join(hyp_words))
    wer = dist_word / len(ref_words) if len(ref_words) > 0 else 0.0

    return dist_char, cer, wer

In [69]:
ground_truth = open('../../data/ground-truth/txt/kbaa-p#100.txt')
ground_truth_text = ground_truth.read()
ground_truth_text_clean = clean_text_normalized(ground_truth_text, index_numbers=False)

ocr_text = pytesseract.image_to_string('../../data/tiffs/kbaa-p#100.tif')
ocr_text_clean = clean_text_normalized(ocr_text, index_numbers=False)

In [70]:
print(ground_truth_text_clean)
print(ocr_text_clean)

entries 19201938 100 field josephfisher hugh field joseph e three years in texasboston abel tompkins 1836 47 p auto p 4147 dlc his experiences in the war with mexico field richard b 1843 richard field lexington mo 1930 38 p mok missouri lawyer and judge tells also of his youth on a farm field stephen johnson 18161899 personal reminiscences of early days in california with other sketches san francisco 1880 248 p whi by a judge and member of the state legislature fifer joseph wilson b 1840 private joe fifer memories of war peace bloomington ill pantagraph pr co 1936 104 p whi civil war soldier and governor of illinois finch edwin ward b 1831 the frontier army and professional life of edwin w finchny press of simmonds manning dawson 1909 119 p auto p 164 dlc a new york physician tells of his boyhood on a michigan homestead and of his service with the union forces as a surgeon finck henry theophilos 18541926 my adventures in the golden age of music ny funk wagnalls 1926 462 p wu music crit

In [71]:
compute_metrics(ground_truth_text, ocr_text, normalized=False, index_numbers=False)

(110, 0.029522275899087493, 0.18487394957983194)