Identify larger blocks of text with very bad OCR. These typically occur in the corners of badly damaged pages, or where the original scan has very dark or low-contrast regions, so the text could not be distinguished well. AI does an excellent job of correcting a word or two, or a sentence with a few glitches where most characters are correct. Sections this badly damaged, however, cannot be expected to be corrected by AI, yet they would still be processed and returned by AI correction as if they were sensible English, making the resulting 'hallucinations' hard to recognise. This checker looks for blocks of very bad OCR and ignores small OCR errors, so that you can double-check the flagged sections and correct them manually. The idea is that AI handles the bulk of the easy material, while the small amount of very bad material that remains must be corrected by a human.

In [None]:
import re
import math
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional

# -----------------------------
# Config
# -----------------------------

# --- Structural patterns -----------------------------------------------
# An article separator: a line consisting only of four or more '#' characters
# (optionally surrounded by whitespace).
HASH_SPLIT_RE = re.compile(r'^\s*#{4,}\s*$', re.MULTILINE)  # line of >=4 hashes
# Any run of whitespace.
MULTISPACE_RE = re.compile(r'\s+')

# --- Tokenisation --------------------------------------------------------
# A whitespace-delimited chunk, punctuation included.
TOKEN_RE = re.compile(r"\S+")
# An ASCII-letter word, optionally with one apostrophe suffix ("don't").
WORD_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?")

# --- OCR noise signatures ------------------------------------------------
# Single characters treated as OCR junk, plus the Unicode replacement char.
JUNK_CHAR_RE = re.compile(r"[«»•·¦¬©®°±×÷§¶€™“”‘’–…]|[\uFFFD]")
# The same punctuation character three or more times in a row ("!!!", "///").
REPEATED_PUNCT_RE = re.compile(r"([^\w\s])\1{2,}")   # e.g. "!!!", "???", "///"
# An all-alphanumeric ASCII token containing at least one letter AND one digit.
WEIRD_MIX_RE = re.compile(r"(?=.*[A-Za-z])(?=.*\d)[A-Za-z0-9]+")  # token with both letters+digits

# --- Vocabulary ------------------------------------------------------------
# Small set of very frequent English words; their presence is used as evidence
# that a block contains real language.
COMMON_WORDS = {
    "the", "and", "to", "of", "in", "a", "that", "is", "was",
    "for", "on", "with", "as", "by", "at", "it", "from", "this",
    "be", "are", "or", "an", "have", "not", "which", "but", "they",
    "has", "were", "had", "their", "his", "her", "we", "you", "he",
    "she", "them", "been", "will", "would", "there", "one", "all", "so",
    "if", "no", "into", "up", "out", "do", "did", "than", "then",
}

@dataclass
class BlockScore:
    """Per-block OCR quality metrics plus the combined badness score.

    Instances are produced by score_block(); every ratio is computed with
    safe_div and is therefore 0.0 when its denominator is zero.
    """
    # Number of characters in the stripped block.
    length_chars: int
    # Fraction of characters for which str.isalpha() is true.
    alpha_ratio: float
    # Fraction of characters with a code point above 127.
    non_ascii_ratio: float
    # Fraction of characters that are neither alphanumeric nor whitespace.
    punct_ratio: float
    # Fraction of characters for which str.isdigit() is true.
    digit_ratio: float
    # Vowels (a, e, i, o, u) divided by the number of alphabetic characters.
    vowel_ratio: float
    # Mean length of whitespace-delimited tokens.
    avg_token_len: float
    # Fraction of tokens fully matching WEIRD_MIX_RE (letters and digits mixed).
    weird_mix_ratio: float
    # Number of JUNK_CHAR_RE matches in the block.
    junk_char_hits: int
    # Number of REPEATED_PUNCT_RE matches (runs of 3+ identical punctuation).
    repeated_punct_hits: int
    # Fraction of WORD_RE words (lower-cased) found in COMMON_WORDS.
    common_word_ratio: float
    # Fraction of tokens judged gibberish by score_block's per-token heuristic.
    short_gibberish_token_ratio: float
    # Weighted sum of the penalties above; higher means worse OCR.
    score: float

def split_articles(text: str) -> List[str]:
    """Break the full text into articles at hash-only separator lines.

    Args:
        text: Entire file contents.

    Returns:
        The chunks found between separator lines, with leading and trailing
        newlines removed. Chunks containing only whitespace are dropped.
    """
    articles = []
    for chunk in HASH_SPLIT_RE.split(text):
        # Skip the empty/whitespace-only pieces left between adjacent separators.
        if chunk.strip():
            articles.append(chunk.strip("\n"))
    return articles

def extract_citation_and_body(article: str) -> Tuple[str, str]:
    """Separate an article into its citation line and the remaining body.

    The citation is the first line that contains any non-whitespace
    character, stripped of surrounding whitespace. The body is every line
    after it, joined with newlines and with leading/trailing newlines removed.

    Args:
        article: Text of a single article.

    Returns:
        A ``(citation, body)`` tuple. When no non-blank line exists the
        citation is ``""`` and the body is the whole article (newline-trimmed).
    """
    lines = article.splitlines()

    # Index of the first non-blank line, or None when every line is blank.
    first = next((idx for idx, line in enumerate(lines) if line.strip()), None)

    if first is None:
        heading, remainder = "", lines
    else:
        heading, remainder = lines[first].strip(), lines[first + 1:]

    return heading, "\n".join(remainder).strip("\n")

def split_paragraphs(body: str) -> List[str]:
    """Split an article body into paragraphs at blank lines.

    Args:
        body: Article text without its citation line.

    Returns:
        The paragraphs, each stripped of surrounding whitespace; pieces that
        are empty after stripping are omitted.
    """
    # One or more blank (possibly whitespace-only) lines end a paragraph.
    pieces = re.split(r"\n\s*\n+", body)
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]

def safe_div(a: float, b: float) -> float:
    """Return ``a / b``, or 0.0 when ``b`` is falsy (e.g. zero)."""
    if not b:
        return 0.0
    return a / b

def score_block(block: str) -> BlockScore:
    """Measure how garbage-like a block of OCR text is.

    The block is stripped, then character-level, token-level and word-level
    statistics are gathered and folded into one additive "badness" score
    (higher means worse).

    Args:
        block: Text to analyse, normally one paragraph.

    Returns:
        A BlockScore holding every individual metric plus the combined score.
        Ratios whose denominator is zero (e.g. an empty block) are 0.0.
    """
    text = block.strip()
    n_chars = len(text)

    tokens = TOKEN_RE.findall(text)
    n_tokens = len(tokens)
    words_lc = [word.lower() for word in WORD_RE.findall(text)]

    # ---- character tallies, gathered in one pass --------------------------
    n_alpha = n_digit = n_punct = n_non_ascii = n_vowel = 0
    for ch in text:
        if ch.isalpha():
            n_alpha += 1
            if ch.lower() in "aeiou":
                n_vowel += 1
        if ch.isdigit():
            n_digit += 1
        if not ch.isalnum() and not ch.isspace():
            n_punct += 1
        if ord(ch) > 127:
            n_non_ascii += 1

    alpha_ratio = safe_div(n_alpha, n_chars)
    digit_ratio = safe_div(n_digit, n_chars)
    punct_ratio = safe_div(n_punct, n_chars)
    non_ascii_ratio = safe_div(n_non_ascii, n_chars)
    # Vowels are measured against letters only, not against all characters.
    vowel_ratio = safe_div(n_vowel, n_alpha)

    # ---- token-level statistics -------------------------------------------
    avg_token_len = safe_div(sum(map(len, tokens)), n_tokens)

    n_weird_mix = len([tok for tok in tokens if WEIRD_MIX_RE.fullmatch(tok)])
    weird_mix_ratio = safe_div(n_weird_mix, n_tokens)

    junk_char_hits = sum(1 for _ in JUNK_CHAR_RE.finditer(text))
    repeated_punct_hits = sum(1 for _ in REPEATED_PUNCT_RE.finditer(text))

    n_common = len([word for word in words_lc if word in COMMON_WORDS])
    common_word_ratio = safe_div(n_common, len(words_lc))

    def is_gibberish_token(tok: str) -> bool:
        # Drop non-word characters; a token made only of them is gibberish.
        core = re.sub(r"\W+", "", tok)
        if not core:
            return True
        n_letters = sum(1 for c in core if c.isalpha())
        if n_letters < 4:
            # Short token: gibberish unless at least 60% of it is letters.
            return safe_div(n_letters, len(core)) < 0.6
        # Longer token: gibberish when under 15% of its letters are vowels
        # (e.g. "xkcdq" or "strngth").
        n_tok_vowels = sum(
            1 for c in core if c.isalpha() and c.lower() in "aeiou"
        )
        return safe_div(n_tok_vowels, n_letters) < 0.15

    n_gibberish = sum(1 for tok in tokens if is_gibberish_token(tok))
    short_gibberish_token_ratio = safe_div(n_gibberish, n_tokens)

    # ---- combined badness score -------------------------------------------
    # Each entry is one weighted penalty. They are added strictly in this
    # order with plain float addition so the total is reproducible.
    penalties = (
        2.5 * max(0.0, 0.70 - alpha_ratio),                  # too few letters overall
        2.0 * max(0.0, punct_ratio - 0.20),                  # punctuation overload
        1.8 * max(0.0, non_ascii_ratio - 0.02),              # weird chars / replacement glyphs
        1.2 * max(0.0, digit_ratio - 0.10),                  # digit soup
        2.0 * max(0.0, 0.25 - vowel_ratio),                  # consonant soup
        2.2 * max(0.0, short_gibberish_token_ratio - 0.25),  # many garbage tokens
        1.3 * weird_mix_ratio,                               # letter+digit tokens
        0.6 * min(5, junk_char_hits),                        # OCR junk chars (capped at 5)
        0.6 * min(5, repeated_punct_hits),                   # "!!!" "///" etc (capped at 5)
        1.8 * max(0.0, 0.07 - common_word_ratio),            # basically no common words
    )
    score = 0.0
    for penalty in penalties:
        score += penalty

    return BlockScore(
        length_chars=n_chars,
        alpha_ratio=alpha_ratio,
        non_ascii_ratio=non_ascii_ratio,
        punct_ratio=punct_ratio,
        digit_ratio=digit_ratio,
        vowel_ratio=vowel_ratio,
        avg_token_len=avg_token_len,
        weird_mix_ratio=weird_mix_ratio,
        junk_char_hits=junk_char_hits,
        repeated_punct_hits=repeated_punct_hits,
        common_word_ratio=common_word_ratio,
        short_gibberish_token_ratio=short_gibberish_token_ratio,
        score=score,
    )

def is_very_bad(block: str, sc: BlockScore,
                min_chars: int = 180,
                min_score: float = 2.4) -> bool:
    """Decide whether a scored block is obviously bad OCR.

    Meant for blatant garbage only, not mild glitches.

    Args:
        block: The block text (not consulted; the decision uses ``sc`` only).
        sc: Metrics previously computed for the block.
        min_chars: Blocks shorter than this are never flagged.
        min_score: Combined-score threshold used when fewer than two hard
            triggers fire.

    Returns:
        True when the block is long enough and either at least two hard
        triggers fire or ``sc.score`` reaches ``min_score``.
    """
    # Too short to count as a "block" of bad OCR.
    if sc.length_chars < min_chars:
        return False

    # Count the hard "this is trash" signals (corner-damage patterns).
    fired = 0
    if sc.alpha_ratio < 0.55 and sc.punct_ratio > 0.18:
        fired += 1
    if sc.non_ascii_ratio > 0.05:
        fired += 1
    if sc.short_gibberish_token_ratio > 0.45:
        fired += 1
    if sc.repeated_punct_hits >= 2:
        fired += 1
    if sc.junk_char_hits >= 3:
        fired += 1

    # Two or more independent signals are decisive regardless of the score.
    if fired >= 2:
        return True

    # Otherwise fall back on the combined score.
    return sc.score >= min_score

def find_bad_blocks_in_file(path: str,
                            min_chars: int = 180,
                            min_score: float = 2.4,
                            max_blocks_per_article: int = 3) -> List[Dict]:
    """Scan a text file and collect its worst-OCR paragraphs.

    The file is read as UTF-8 (undecodable bytes are replaced), split into
    articles and then paragraphs; each paragraph is scored and tested with
    is_very_bad.

    Args:
        path: Location of the text file to scan.
        min_chars: Minimum paragraph length, forwarded to is_very_bad.
        min_score: Score threshold, forwarded to is_very_bad.
        max_blocks_per_article: Upper bound on reported paragraphs per
            article; the highest-scoring ones are kept.

    Returns:
        One dict per flagged paragraph, grouped by article in file order and,
        within an article, ordered from highest to lowest score.
    """
    with open(path, "r", encoding="utf-8", errors="replace") as handle:
        raw_text = handle.read()

    report = []
    for article_no, article in enumerate(split_articles(raw_text), start=1):
        citation, body = extract_citation_and_body(article)
        if not body.strip():
            # Citation only, nothing to inspect.
            continue

        # (paragraph number, paragraph text, metrics) for every paragraph.
        scored = (
            (para_no, para, score_block(para))
            for para_no, para in enumerate(split_paragraphs(body), start=1)
        )
        candidates = [
            item for item in scored
            if is_very_bad(item[1], item[2], min_chars=min_chars, min_score=min_score)
        ]

        # Keep only the N worst per article so the report stays readable.
        worst = sorted(candidates, key=lambda item: item[2].score, reverse=True)
        worst = worst[:max_blocks_per_article]

        report.extend(
            {
                "article_index": article_no,
                "citation": citation,
                "paragraph_index": para_no,
                "bad_block": para,
                "score": metrics.score,
                "alpha_ratio": metrics.alpha_ratio,
                "punct_ratio": metrics.punct_ratio,
                "non_ascii_ratio": metrics.non_ascii_ratio,
                "gib_ratio": metrics.short_gibberish_token_ratio,
                "common_word_ratio": metrics.common_word_ratio,
            }
            for para_no, para, metrics in worst
        )

    return report

def print_report(results: List[Dict], show_scores: bool = True, max_chars: int = 900):
    """Print a human-readable listing of flagged blocks to stdout.

    Args:
        results: Dicts as produced by find_bad_blocks_in_file.
        show_scores: When true, include a line of metrics for each block.
        max_chars: Blocks longer than this are cut to this many characters
            and marked with a trailing ellipsis.
    """
    if not results:
        print("No very-bad OCR blocks detected.")
        return

    rule = "=" * 90
    for number, entry in enumerate(results, start=1):
        print(rule)
        print(f"{number}. {entry['citation']}")
        print(f"(Article #{entry['article_index']}, Paragraph #{entry['paragraph_index']})")

        if show_scores:
            metrics = " | ".join((
                f"score={entry['score']:.2f}",
                f"alpha={entry['alpha_ratio']:.2f}",
                f"punct={entry['punct_ratio']:.2f}",
                f"non_ascii={entry['non_ascii_ratio']:.2f}",
                f"gib={entry['gib_ratio']:.2f}",
                f"common={entry['common_word_ratio']:.2f}",
            ))
            print(metrics)

        text = entry["bad_block"].strip()
        if len(text) <= max_chars:
            excerpt = text
        else:
            excerpt = text[:max_chars].rstrip() + " …"

        # Blank line, the excerpt, then another blank line.
        print("", excerpt, "", sep="\n")

# -----------------------------
# Run
# -----------------------------
# Input file: articles separated by lines of four or more '#' characters.
path = "./Texts/NewsArticles.txt"

# Scan the file, keeping at most three flagged paragraphs per article.
results = find_bad_blocks_in_file(
    path,
    min_chars=180,
    min_score=2.4,
    max_blocks_per_article=3,
)

# Show each flagged block (truncated to 900 characters) with its metrics.
print_report(results, show_scores=True, max_chars=900)