This reads in a file containing news articles, each separated by a line of ############### hashes. These are OCR texts with many OCR errors in them. This script calls an AI model to auto-correct the bad OCR. The prompt is designed to be cautious: to avoid over-correction and hallucination, and to preserve historic spellings. This doesn't mean it's foolproof. Since there are many errors in the OCR anyway, the result will be much improved for further processing and reading, but it is better to check important facts and quotes against the original. The main risk is that, because the text looks good, it may lull you into a false sense of security and mask errors.

In [None]:
from openai import OpenAI
import pathlib
import re

# ---------------------------
# CONFIG
# ---------------------------
MAINFOLDER = "./Texts/"
FILENAME = "NewsArticles.txt"

# Input file, and the corrected output written alongside it with an "AICLEAN_" prefix.
INFILE = MAINFOLDER + FILENAME
OUTFILE = MAINFOLDER + "AICLEAN_" + FILENAME

# Model name passed to the API call in ocr_correct().
MODEL = "gpt-4.1"

# ---------------------------
# MODE
# ---------------------------
# "articles"  = split the file on ###### delimiter lines and correct each article
# "paged_doc" = treat entire file as a single document, keep page markers protected
#
# Set the mode in exactly one place; a second assignment would silently
# override whatever is chosen here.
MODE = "paged_doc"  # <-- articles or paged_doc

# Matches lines like: ===== page-02.png =====  (exactly, per your OCR output)
PAGE_MARKER_RE = re.compile(r"(?m)^===== page-\d+\.png =====$")

# Chunking settings
MAX_CHARS_PER_CHUNK = 7000      # upper bound on the size of each chunk sent for correction
LONG_ARTICLE_THRESHOLD = 12000  # only chunk if body exceeds this

# Core instructions shared by every mode. The final PROMPT is assembled further
# down from this text plus one STYLE_* block and one mode-specific add-on.
BASE_PROMPT = """Here is a historical colonial 19th century text.
It has OCR errors in it. Please correct the OCR errors.

Do not change any words that are already valid.
Do not change historical spellings, place names, names, titles, abbreviations, or numbers,
unless they are part of an OCR error.
Make the minimum number of edits.
Prefer single-character fixes over word substitutions.

Preserve the original formatting as much as possible (line breaks, indentation, headings,
salutations, lists, signatures), except where a line break is clearly an OCR artefact
(e.g., a word split across lines).

If a word or short span is clearly not a valid English word or historical spelling
(for example, random letter sequences, mixed-case junk, or digit–letter mixtures),
treat it as an OCR error.

If you can confidently infer the intended text from context, correct it.
If the inference is uncertain, put the inferred text in square brackets.
If it cannot be inferred, output [illegible].

When uncertainty affects a verb phrase or grammatical construction,
bracket the entire inferred phrase rather than a single word.

Join words split by hyphen at line breaks when clearly a line-wrap hyphenation.
"""

# Add-on for MODE == "articles". Currently empty, so it contributes nothing to
# the assembled PROMPT (empty sections are filtered out when joining).
PROMPT_ARTICLES = """\
"""

# Add-on for MODE == "paged_doc": tells the model to leave page-marker lines
# untouched and to read straight across them.
PROMPT_PAGED_DOC = """\
IMPORTANT: Page marker lines like "===== page-02.png =====" are structural markers.
Do not change these lines in any way (not spacing, not punctuation, not casing, not digits).
Treat page markers as scan boundaries only; ignore them for meaning.
Sentences and paragraphs may continue across a page marker.
"""

# ---------------------------
# STYLE ADD-ONS
# ---------------------------
# Style guidance appended when MODE == "articles".
STYLE_NEWS = """The text is a newspaper item. Preserve the original newspaper style, including headings,
small-caps, and punctuation conventions, and do not modernise language or spelling."""

# Style guidance appended when MODE == "paged_doc".
STYLE_GOV = """The text is government correspondence/policy. Preserve formatting and structure such as
headings, salutations, numbered clauses, marginal notes, file/reference numbers, and signatures.
Do not rewrite prose or reflow lists into paragraphs. Preserve abbreviations and codes (e.g., "No. 14/1841",
"Encl.", "Ref.", "£ s. d.") unless the OCR error is obvious."""

# Mode -> (style guidance, mode-specific add-on) appended after the base prompt.
_MODE_SECTIONS = {
    "paged_doc": (STYLE_GOV, PROMPT_PAGED_DOC),
    "articles": (STYLE_NEWS, PROMPT_ARTICLES),
}
if MODE not in _MODE_SECTIONS:
    raise ValueError(f"Unknown MODE: {MODE}")

# Base instructions first, then the mode's sections, then the closing instruction.
parts = [BASE_PROMPT.rstrip()]
parts.extend(section.rstrip() for section in _MODE_SECTIONS[MODE])
parts.append("Return only the corrected text.")

# Empty sections (e.g. an empty add-on) are dropped so no stray blank blocks appear.
PROMPT = "\n\n".join(filter(None, parts))

# Split articles on delimiter lines that are hashes (6+), e.g. "########"
# Spaces or tabs before/after the hashes on the same line are tolerated.
DELIM_LINE_RE = re.compile(r"(?m)^[ \t]*#{6,}[ \t]*$")

# Shared API client used by ocr_correct().
# NOTE(review): presumably picks up credentials from the environment — confirm.
client = OpenAI()

# ---------------------------
# HELPERS
# ---------------------------
def split_keep_delims(text: str):
    """
    Cut text at every delimiter line and return the pieces in order.

    The result alternates [chunk, delim, chunk, delim, ..., chunk]: the text
    before each delimiter, the delimiter line itself, and finally whatever
    follows the last delimiter (possibly an empty string).
    """
    pieces = []
    cursor = 0
    for hit in DELIM_LINE_RE.finditer(text):
        start, end = hit.span()
        # text preceding the delimiter, then the delimiter line as matched
        pieces += [text[cursor:start], text[start:end]]
        cursor = end
    pieces.append(text[cursor:])
    return pieces


def extract_citation_and_body(article: str):
    """
    Split an article into (citation, body).

    The citation is the first line that contains something other than
    whitespace; the body is everything that follows that line's text
    (so it normally begins with the newline that ended the citation).
    Returns ("", "") when the article has no such line.
    """
    trimmed = article.strip("\n")
    offset = 0  # index in `trimmed` where the current line starts
    for line in trimmed.split("\n"):
        if line.strip():
            return line, trimmed[offset + len(line):]
        offset += len(line) + 1  # skip the line and its newline
    return "", ""


def normalise_newlines_for_ocr(text: str) -> str:
    """
    Tidy line breaks before correction while keeping paragraph structure.

    Steps, applied in order:
    1. Convert Windows / old-Mac line endings to plain newlines.
    2. Squeeze runs of three or more newlines down to a single blank line.
    3. Re-join words hyphenated across a line wrap (letter, hyphen, up to
       three newlines, lowercase letter).
    4. Turn a blank line into a single newline when the text on both sides
       looks like one sentence continuing (heuristic).
    """
    result = text.replace("\r\n", "\n").replace("\r", "\n")

    rewrites = (
        # keep paragraphs but remove giant gaps
        (r"\n{3,}", "\n\n"),
        # sacri-<newlines>fice -> sacrifice; digit ranges such as 1841-<newline>42 never match
        (r"([A-Za-z])-\n{1,3}([a-z])", r"\1\2"),
        # blank line in the middle of a sentence -> ordinary line break
        (r"([A-Za-z0-9,;:\)\]\"'’”])\n\n(?=[A-Za-z0-9\"'‘’“”\(\[])", r"\1\n"),
    )
    for pattern, replacement in rewrites:
        result = re.sub(pattern, replacement, result)

    return result

def _chunk_oversized_paragraph(paragraph: str, max_chars: int):
    """Split one paragraph longer than max_chars on single newlines, hard-slicing any line that is itself too long."""
    pieces = []
    linebuf = ""
    for line in paragraph.split("\n"):
        candidate = f"{linebuf}\n{line}" if linebuf else line
        if len(candidate) <= max_chars:
            linebuf = candidate
            continue

        # The buffered lines are as full as they can get: emit them.
        if linebuf:
            pieces.append(linebuf)

        # Hard-slice the line whenever it alone exceeds the limit, regardless
        # of whether something was buffered before it. The remainder (or the
        # whole line, if it fits) starts the next buffer.
        while len(line) > max_chars:
            pieces.append(line[:max_chars])
            line = line[max_chars:]
        linebuf = line

    if linebuf:
        pieces.append(linebuf)
    return pieces


def chunk_by_paragraphs(text: str, max_chars: int):
    """
    Split text into chunks of at most max_chars characters.

    Paragraphs (separated by a blank line) are packed greedily into chunks and
    are kept whole whenever they fit. A paragraph longer than max_chars is
    split on single newlines instead, and a single line that is still too
    long is hard-sliced into max_chars-sized pieces.

    Args:
        text: The text to split. Leading/trailing newlines are ignored.
        max_chars: Maximum length of each returned chunk; must be >= 1.

    Returns:
        A list of non-empty chunk strings in document order ([] for empty text).

    Raises:
        ValueError: If max_chars is less than 1 (hard-slicing could never
            make progress).
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    text = text.strip("\n")
    if not text:
        return []

    chunks = []
    buf = ""  # paragraphs accumulated for the chunk currently being built

    for para in text.split("\n\n"):
        para = para.strip("\n")
        if not para:
            continue

        candidate = f"{buf}\n\n{para}" if buf else para
        if len(candidate) <= max_chars:
            buf = candidate
            continue

        # Adding this paragraph would overflow: close the current chunk.
        if buf:
            chunks.append(buf)
            buf = ""

        if len(para) <= max_chars:
            buf = para
        else:
            chunks.extend(_chunk_oversized_paragraph(para, max_chars))

    if buf:
        chunks.append(buf)
    return chunks


def ocr_correct(body_text: str) -> str:
    """
    Send one piece of text to the model and return the corrected text.

    Uses the Responses API with structured input blocks: the instructions
    (PROMPT) go in a system message and the OCR text in a user message.

    Args:
        body_text: The OCR text to correct.

    Returns:
        The model's output text.

    Raises:
        RuntimeError: If the API marks the response as incomplete (for
            example because the output-token limit was reached). Returning
            such output would silently truncate the document, so callers
            should fall back to the original text instead.
        Exception: Any error raised by the API client is propagated.
    """
    messages = [
        {
            "role": "system",
            "content": [{"type": "input_text", "text": PROMPT}],
        },
        {
            "role": "user",
            "content": [{"type": "input_text", "text": body_text}],
        },
    ]
    resp = client.responses.create(
        model=MODEL,
        input=messages,
        temperature=0,
        max_output_tokens=8000,
    )

    # A response cut off mid-way still carries partial output_text; treat it
    # as a failure rather than passing truncated text downstream.
    if getattr(resp, "status", None) == "incomplete":
        details = getattr(resp, "incomplete_details", None)
        reason = getattr(details, "reason", None) or "unknown reason"
        raise RuntimeError(
            f"OCR correction response incomplete ({reason}); output may be truncated"
        )

    return resp.output_text

# Two or more punctuation/symbol characters (optionally followed by whitespace)
# at the very end of a chunk are taken as a sign of OCR garble at the boundary.
GARBAGE_TAIL_RE = re.compile(r"(?:[^\w\s]|[»«\^_<>]){2,}\s*$")

def merge_boundary_garble(chunks, max_chars, overflow=800):
    """
    Glue a chunk that ends in garble onto the chunk that follows it.

    Done before anything is sent for correction so the model sees the garbled
    boundary with its following context. A pair is merged (joined by a single
    newline) only when the combined length stays within max_chars + overflow;
    otherwise both chunks are left as they are. The merged chunk is not
    re-examined against the chunk after it.
    """
    limit = max_chars + overflow
    result = []
    skip_next = False

    for pos, current in enumerate(chunks):
        if skip_next:
            # already absorbed into the previous chunk
            skip_next = False
            continue

        has_follower = pos + 1 < len(chunks)
        if has_follower and GARBAGE_TAIL_RE.search(current):
            glued = current.rstrip("\n") + "\n" + chunks[pos + 1].lstrip("\n")
            if len(glued) <= limit:
                result.append(glued)
                skip_next = True
                continue

        result.append(current)

    return result


def _correct_or_keep(text: str, label: str) -> str:
    """Correct one piece of text; on any failure print a warning and return the text unchanged."""
    try:
        return ocr_correct(text).strip("\n")
    except Exception as exc:
        # Deliberate best-effort: never drop content, but make the failure
        # visible so uncorrected text is not mistaken for corrected text.
        print(
            f"WARNING: OCR correction failed for {label} "
            f"({type(exc).__name__}: {exc}); keeping original text."
        )
        return text


def ocr_correct_with_chunking(body_text: str) -> str:
    """
    Correct a body of text, splitting it into chunks first when it is long.

    Strategy:
    - Bodies up to LONG_ARTICLE_THRESHOLD characters are sent in one call.
    - Longer bodies are chunked by paragraphs; a chunk that ends with obvious
      garble is merged with the next chunk before sending.
    - No prefix/suffix expansion and NO trimming (prevents dropped/chopped text).
    - If correction fails for a chunk, a warning is printed and the original
      chunk is kept (content is never dropped).

    Args:
        body_text: The text to correct. Leading/trailing newlines are ignored.

    Returns:
        The corrected text, with chunks rejoined by blank lines, or "" when
        body_text is empty.
    """
    body_text = body_text.strip("\n")
    if not body_text:
        return ""

    if len(body_text) <= LONG_ARTICLE_THRESHOLD:
        return _correct_or_keep(body_text, "text")

    chunks = chunk_by_paragraphs(body_text, MAX_CHARS_PER_CHUNK)
    chunks = merge_boundary_garble(chunks, MAX_CHARS_PER_CHUNK)

    # Drop chunks that are empty once surrounding newlines are removed.
    stripped = (ch.strip("\n") for ch in chunks)
    pending = [ch for ch in stripped if ch]

    total = len(pending)
    corrected_chunks = [
        _correct_or_keep(ch, f"chunk {num}/{total}")
        for num, ch in enumerate(pending, start=1)
    ]

    return "\n\n".join(corrected_chunks).strip("\n")

def isolate_page_markers(text: str) -> str:
    """
    Put blank lines around every page-marker line.

    With a blank line on each side, a marker forms a paragraph of its own, so
    paragraph-based chunking and newline clean-up do not fuse it with the
    neighbouring text. The marker line itself is left exactly as it was.
    Line endings are converted to plain newlines first.
    """
    unix_text = text.replace("\r\n", "\n").replace("\r", "\n")

    def pad(marker):
        # two newlines on either side of the untouched marker line
        return "\n\n" + marker.group(1) + "\n\n"

    padded = re.sub(r"(?m)^(===== page-\d+\.png =====)$", pad, unix_text)

    # Padding can pile up newlines; cap any run at three (a little slack is kept).
    return re.sub(r"\n{4,}", "\n\n\n", padded)


def validate_page_markers(original_text: str, corrected_text: str):
    """
    Check that the page markers came through correction intact.

    Compares the marker lines found in both texts. Nothing is raised or
    returned; problems are reported by printing a warning:
    - different marker counts: the counts plus up to ten corrected lines that
      look marker-like are printed;
    - same count but different markers/order: a single warning is printed.
    """
    expected = PAGE_MARKER_RE.findall(original_text)
    found = PAGE_MARKER_RE.findall(corrected_text)

    if len(expected) == len(found):
        # Same count: still make sure the sequence itself is unchanged.
        if expected != found:
            print("WARNING: Page markers are present but differ from the original sequence.")
        return

    print(f"WARNING: Page marker count changed: original={len(expected)} corrected={len(found)}")

    # Collect corrected lines that resemble a marker to help diagnose what changed.
    suspects = []
    for line in corrected_text.splitlines():
        if "page-" in line.lower() or "===== " in line:
            suspects.append(line)

    print("Sample corrected lines containing 'page-' or '=====':")
    for line in suspects[:10]:
        print("   ", line)

# ---------------------------
# MAIN
# ---------------------------
raw = pathlib.Path(INFILE).read_text(encoding="utf-8")

if MODE == "paged_doc":
    # Whole file is one document:
    #   markers become their own paragraphs -> line breaks are tidied ->
    #   text is corrected (chunked when long) -> markers are verified.
    raw2 = isolate_page_markers(raw)
    cleaned = normalise_newlines_for_ocr(raw2)
    corrected = ocr_correct_with_chunking(cleaned)
    validate_page_markers(raw2, corrected)

    pathlib.Path(OUTFILE).write_text(corrected.strip() + "\n", encoding="utf-8")
    print(f"Done (paged_doc) -> {OUTFILE}")

else:
    # "articles" mode: every piece between delimiter lines is one article.
    parts = split_keep_delims(raw)
    out_parts = []

    for part in parts:
        # Delimiter lines are re-created when joining; blank pieces carry nothing.
        if DELIM_LINE_RE.fullmatch(part.strip("\n")) or not part.strip():
            continue

        citation, body = extract_citation_and_body(part)

        if not citation:
            # No citation line (rare): correct the entire piece.
            cleaned = normalise_newlines_for_ocr(part)
            corrected = ocr_correct_with_chunking(cleaned)
            out_parts.append(corrected.strip("\n"))
        elif not body.strip():
            # Citation with nothing after it: keep the citation as is.
            out_parts.append(citation.strip())
        else:
            # Only the body goes to the model; the citation line is kept verbatim.
            body_for_model = normalise_newlines_for_ocr(body)
            corrected_body = ocr_correct_with_chunking(body_for_model)

            # Exactly one blank line between citation and corrected body.
            rebuilt = citation.rstrip() + "\n\n" + corrected_body.lstrip("\n")
            out_parts.append(rebuilt.strip("\n"))

    # Articles are written back separated by a standard delimiter line.
    out_text = "\n\n########\n\n".join(out_parts).strip() + "\n"
    pathlib.Path(OUTFILE).write_text(out_text, encoding="utf-8")

    print(f"Done -> {OUTFILE}")

