In [1]:
import re
import easyocr
from PIL import Image
from pdf2image import convert_from_path
from difflib import SequenceMatcher
import docx
import os
import numpy as np

def ocr_pdf(pdf_path, lang="en", dpi=350, first_page=None, last_page=None):
    """
    Perform OCR on PDF using EasyOCR
    """
    # Initialize EasyOCR reader
    reader = easyocr.Reader([lang])
    
    pages = convert_from_path(pdf_path, dpi=dpi, first_page=first_page, last_page=last_page)
    page_texts = []
    
    for i, page in enumerate(pages, 1):
        # Convert PIL image to numpy array
        img_array = np.array(page)
        
        # Perform OCR
        results = reader.readtext(img_array, detail=0, paragraph=True)
        
        # Combine all text blocks
        page_text = "\n".join(results)
        page_texts.append(page_text)
        
        print(f"[debug] OCR'd Page {i}: {len(page_text)} chars, {len(results)} text blocks")
    
    return page_texts

def read_docx(file_path):
    """Read text from a Word document"""
    doc = docx.Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return '\n'.join(full_text)

def preprocess_text(text):
    """
    Preprocess text by removing extra whitespace, normalizing case, 
    and removing special characters for better comparison
    """
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,;:!?()\-]', '', text)
    return text.strip()

def calculate_error_metrics(ocr_text, ground_truth):
    """
    Calculate various error metrics between OCR text and ground truth
    """
    # Preprocess both texts
    ocr_clean = preprocess_text(ocr_text)
    gt_clean = preprocess_text(ground_truth)
    
    # Calculate similarity ratio
    matcher = SequenceMatcher(None, ocr_clean, gt_clean)
    similarity_ratio = matcher.ratio()
    
    # Calculate error rate
    error_rate = (1 - similarity_ratio) * 100
    
    # Find matching blocks and differences
    matching_blocks = matcher.get_matching_blocks()
    total_chars = max(len(ocr_clean), len(gt_clean))
    matched_chars = sum(block.size for block in matching_blocks)
    
    return {
        'error_rate': error_rate,
        'accuracy': similarity_ratio * 100,
        'matched_chars': matched_chars,
        'total_chars': total_chars,
        'matcher': matcher,
        'ocr_clean': ocr_clean,
        'gt_clean': gt_clean
    }

def highlight_differences(ocr_text, ground_truth, matcher):
    """
    Generate a text with highlights showing differences between OCR and ground truth
    """
    result = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            result.append(ground_truth[j1:j2])
        elif tag == 'replace':
            result.append(f"[OCR: {ocr_text[i1:i2]} -> GT: {ground_truth[j1:j2]}]")
        elif tag == 'delete':
            result.append(f"[OCR: {ocr_text[i1:i2]} -> MISSING]")
        elif tag == 'insert':
            result.append(f"[MISSING IN OCR -> GT: {ground_truth[j1:j2]}]")
    return ''.join(result)

def main():
    # Perform OCR on the PDF using EasyOCR
    pdf_file = "23138.pdf"
    print(f"Performing OCR on {pdf_file} using EasyOCR...")
    ocr_pages = ocr_pdf(pdf_file)
    
    # Read ground truth from Word document
    gt_file = "Shohan Nayak.docx"
    print(f"Reading ground truth from {gt_file}...")
    ground_truth = read_docx(gt_file)
    
    # Save OCR results
    full_ocr_text = "".join(f"\n\n--- Page {i} ---\n{t}" for i, t in enumerate(ocr_pages, 1))
    with open("23138_ocr_easyocr.txt", "w", encoding="utf-8") as f:
        f.write(full_ocr_text)
    print("\nSaved full OCR to 23136_ocr_easyocr.txt")
    
    print("\n" + "="*60)
    print("OCR ACCURACY ANALYSIS REPORT (EasyOCR)")
    print("="*60)
    
    # Calculate metrics for each page
    page_metrics = []
    for i, page_text in enumerate(ocr_pages, 1):
        metrics = calculate_error_metrics(page_text, ground_truth)
        page_metrics.append(metrics)
        print(f"Page {i}: Error Rate = {metrics['error_rate']:.2f}%, Accuracy = {metrics['accuracy']:.2f}%")
    
    # Calculate overall metrics
    combined_ocr = " ".join(ocr_pages)
    overall_metrics = calculate_error_metrics(combined_ocr, ground_truth)
    
    print(f"\nOverall: Error Rate = {overall_metrics['error_rate']:.2f}%, Accuracy = {overall_metrics['accuracy']:.2f}%")
    print(f"Matched Characters: {overall_metrics['matched_chars']} / {overall_metrics['total_chars']}")
    
    # Show detailed differences for the first page
    print("\n" + "="*60)
    print("DETAILED DIFFERENCES (Page 1)")
    print("="*60)
    diff_text = highlight_differences(
        page_metrics[0]['ocr_clean'], 
        page_metrics[0]['gt_clean'], 
        page_metrics[0]['matcher']
    )
    print(diff_text[:1000] + "..." if len(diff_text) > 1000 else diff_text)
    
    # Save detailed comparison to file
    with open("ocr_comparison_report_easyocr.txt", "w", encoding="utf-8") as f:
        f.write("OCR ACCURACY ANALYSIS REPORT (EasyOCR)\n")
        f.write("="*60 + "\n")
        for i, metrics in enumerate(page_metrics, 1):
            f.write(f"Page {i}: Error Rate = {metrics['error_rate']:.2f}%, Accuracy = {metrics['accuracy']:.2f}%\n")
        f.write(f"\nOverall: Error Rate = {overall_metrics['error_rate']:.2f}%, Accuracy = {overall_metrics['accuracy']:.2f}%\n")
        f.write(f"Matched Characters: {overall_metrics['matched_chars']} / {overall_metrics['total_chars']}\n\n")
        
        f.write("DETAILED DIFFERENCES\n")
        f.write("="*60 + "\n")
        for i, metrics in enumerate(page_metrics, 1):
            f.write(f"\n--- Page {i} Differences ---\n")
            diff = highlight_differences(metrics['ocr_clean'], metrics['gt_clean'], metrics['matcher'])
            f.write(diff + "\n")
    
    print("\nSaved detailed comparison report to ocr_comparison_report_easyocr.txt")

if __name__ == "__main__":
    main()

Performing OCR on 23138.pdf using EasyOCR...




[debug] OCR'd Page 1: 1103 chars, 4 text blocks
[debug] OCR'd Page 2: 690 chars, 9 text blocks
Reading ground truth from Shohan Nayak.docx...

Saved full OCR to 23136_ocr_easyocr.txt

OCR ACCURACY ANALYSIS REPORT (EasyOCR)
Page 1: Error Rate = 96.92%, Accuracy = 3.08%
Page 2: Error Rate = 99.03%, Accuracy = 0.97%

Overall: Error Rate = 96.99%, Accuracy = 3.01%
Matched Characters: 50 / 1709

DETAILED DIFFERENCES (Page 1)
[OCR: techno main salt l -> GT: name: shohan nay]ak[OCR: e formerly techno india, salt lake ) name_ _shalan_nayak roll no. -> GT:  roll no.:] 13[OCR: . -> MISSING]0[OCR: . -> MISSING]30[OCR: 4. -> GT: 82]3[OCR: 2. -> MISSING]1[OCR: . -> MISSING]3[OCR: .. stream_  -> GT: 8 part a classification in regression are the two most common supervised tas]k[OCR: el hm) (m imdstsriess ( pcc-alml6ot) su -> GT: s. to check the accuracy of the prediction of a validation set. there are two model parameters are there in a linear regression pro]b[OCR: ject alfliczlun : llaclis kar-semes

# Word Error Rate

In [3]:
pdf = "Ideal Text.pdf"
ground_truth = "Ideal Gound Truth.docx"
Text = "Text.txt"
Report = "Report.txt"

import re
import easyocr
from PIL import Image
from pdf2image import convert_from_path
import docx
import os
import numpy as np

# ---------- OCR + IO ----------

def ocr_pdf(pdf_path, lang="en", dpi=350, first_page=None, last_page=None):
    """
    Perform OCR on PDF using EasyOCR
    """
    # Initialize EasyOCR reader
    reader = easyocr.Reader([lang])

    pages = convert_from_path(pdf_path, dpi=dpi, first_page=first_page, last_page=last_page)
    page_texts = []

    for i, page in enumerate(pages, 1):
        # Convert PIL image to numpy array
        img_array = np.array(page)

        # Perform OCR
        results = reader.readtext(img_array, detail=0, paragraph=True)

        # Combine all text blocks
        page_text = "\n".join(results)
        page_texts.append(page_text)

        print(f"[debug] OCR'd Page {i}: {len(page_text)} chars, {len(results)} text blocks")

    return page_texts

def read_docx(file_path):
    """Read text from a Word document"""
    doc = docx.Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return '\n'.join(full_text)

# ---------- Preprocessing ----------

def preprocess_text(text):
    """
    Normalize text lightly: lowercase, collapse whitespace, remove non-word punctuation
    (keep basic punctuation, though WER uses word tokens only).
    """
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,;:!?()\-]', '', text)
    return text.strip()

def tokenize_words(text):
    """
    Tokenize into words for WER (alphanumeric + underscore).
    """
    return re.findall(r'\w+', text.lower())

# ---------- WER (word-level edit distance with ops) ----------

def wer_with_ops(hyp_words, ref_words):
    """
    Compute WER with dynamic programming and return:
    - S (substitutions), D (deletions), I (insertions), N (# reference words)
    - ops: list of (op, ref_word, hyp_word) where op in {"equal","sub","del","ins"}
    """
    m, n = len(ref_words), len(hyp_words)

    # dp[i][j] = minimum edits to convert ref[0:i] -> hyp[0:j]
    dp = [[0]*(n+1) for _ in range(m+1)]
    back = [[None]*(n+1) for _ in range(m+1)]

    # Initialize
    for i in range(1, m+1):
        dp[i][0] = i
        back[i][0] = ('del', i-1, None)  # delete ref[i-1]
    for j in range(1, n+1):
        dp[0][j] = j
        back[0][j] = ('ins', None, j-1)  # insert hyp[j-1]

    # Fill
    for i in range(1, m+1):
        for j in range(1, n+1):
            if ref_words[i-1] == hyp_words[j-1]:
                dp[i][j] = dp[i-1][j-1]
                back[i][j] = ('eq', i-1, j-1)
            else:
                sub_cost = dp[i-1][j-1] + 1
                ins_cost = dp[i][j-1] + 1
                del_cost = dp[i-1][j] + 1
                best = min(sub_cost, ins_cost, del_cost)
                dp[i][j] = best
                if best == sub_cost:
                    back[i][j] = ('sub', i-1, j-1)
                elif best == ins_cost:
                    back[i][j] = ('ins', i, j-1)
                else:
                    back[i][j] = ('del', i-1, j)

    # Backtrack to get operations
    ops = []
    i, j = m, n
    while i > 0 or j > 0:
        op, ii, jj = back[i][j] if back[i][j] is not None else (None, None, None)
        if op == 'eq':
            ops.append(('equal', ref_words[ii], hyp_words[jj]))
            i, j = ii, jj
        elif op == 'sub':
            ops.append(('sub', ref_words[ii], hyp_words[jj]))
            i, j = ii, jj
        elif op == 'ins':
            ops.append(('ins', None, hyp_words[jj]))
            j = jj
        elif op == 'del':
            ops.append(('del', ref_words[ii], None))
            i = ii
        else:
            # Fallbacks
            if i > 0:
                ops.append(('del', ref_words[i-1], None))
                i -= 1
            elif j > 0:
                ops.append(('ins', None, hyp_words[j-1]))
                j -= 1

    ops.reverse()

    # Count S/D/I
    S = sum(1 for o,_,_ in ops if o == 'sub')
    D = sum(1 for o,_,_ in ops if o == 'del')
    I = sum(1 for o,_,_ in ops if o == 'ins')
    N = max(1, m)  # avoid div-by-zero; if ref empty, set N=1 by convention

    return S, D, I, N, ops

def calculate_wer_metrics(ocr_text, ground_truth):
    """
    Calculate WER metrics between OCR text and ground truth, returning:
    - wer_percent, S, D, I, N, ops, ref_words, hyp_words
    """
    # Preprocess and tokenize
    ocr_clean = preprocess_text(ocr_text)
    gt_clean = preprocess_text(ground_truth)

    hyp_words = tokenize_words(ocr_clean)
    ref_words = tokenize_words(gt_clean)

    S, D, I, N, ops = wer_with_ops(hyp_words, ref_words)
    wer = (S + D + I) / N * 100.0

    return {
        'wer_percent': wer,
        'S': S,
        'D': D,
        'I': I,
        'N': N,
        'ops': ops,
        'ref_words': ref_words,
        'hyp_words': hyp_words
    }

# ---------- Highlight (word-level) ----------

def highlight_differences_words(ops):
    """
    Build a readable diff string at the word level using ops from wer_with_ops().
    equal -> plain word
    sub   -> [OCR: hyp -> GT: ref]
    del   -> [MISSING: ref]
    ins   -> [EXTRA OCR: hyp]
    """
    out = []
    for op, ref_w, hyp_w in ops:
        if op == 'equal':
            out.append(ref_w)
        elif op == 'sub':
            out.append(f"[OCR: {hyp_w} -> GT: {ref_w}]")
        elif op == 'del':
            out.append(f"[MISSING: {ref_w}]")
        elif op == 'ins':
            out.append(f"[EXTRA OCR: {hyp_w}]")
    return ' '.join(out)

# ---------- Main ----------

def main():
    # Perform OCR on the PDF using EasyOCR
    pdf_file = pdf
    print(f"Performing OCR on {pdf_file} using EasyOCR...")
    ocr_pages = ocr_pdf(pdf_file)

    # Read ground truth from Word document
    gt_file = ground_truth
    print(f"Reading ground truth from {gt_file}...")
    gt_text = read_docx(gt_file)      # ✅ rename here

    # Save OCR results
    full_ocr_text = "".join(f"\n\n--- Page {i} ---\n{t}" for i, t in enumerate(ocr_pages, 1))
    with open(Text, "w", encoding="utf-8") as f:
        f.write(full_ocr_text)
    print(f"\nSaved full OCR to {Text}")

    print("\n" + "="*60)
    print("OCR ACCURACY ANALYSIS REPORT (WER, EasyOCR)")
    print("="*60)

    # Page-wise WER (each page vs full ground truth, to mirror your original style)
    page_metrics = []
    for i, page_text in enumerate(ocr_pages, 1):
        m = calculate_wer_metrics(page_text, gt_text)    # ✅ use gt_text
        page_metrics.append(m)
        print(f"Page {i}: WER = {m['wer_percent']:.2f}%  "
              f"(S={m['S']}, D={m['D']}, I={m['I']}, N={m['N']})")

    # Overall WER (all pages concatenated vs full ground truth)
    combined_ocr = " ".join(ocr_pages)
    overall = calculate_wer_metrics(combined_ocr, gt_text)  # ✅ use gt_text
    print(f"\nOverall: WER = {overall['wer_percent']:.2f}%  "
          f"(S={overall['S']}, D={overall['D']}, I={overall['I']}, N={overall['N']})")

    # Detailed differences for the first page (word-level)
    print("\n" + "="*60)
    print("DETAILED WORD-LEVEL DIFFERENCES (Page 1)")
    print("="*60)
    diff_text = highlight_differences_words(page_metrics[0]['ops'])
    print(diff_text[:1000] + "..." if len(diff_text) > 1000 else diff_text)

    # Save detailed comparison to file
    with open(Report, "w", encoding="utf-8") as f:
        f.write("OCR ACCURACY ANALYSIS REPORT (WER, EasyOCR)\n")
        f.write("="*60 + "\n")
        for i, m in enumerate(page_metrics, 1):
            f.write(f"Page {i}: WER = {m['wer_percent']:.2f}%  "
                    f"(S={m['S']}, D={m['D']}, I={m['I']}, N={m['N']})\n")
        f.write(f"\nOverall: WER = {overall['wer_percent']:.2f}%  "
                f"(S={overall['S']}, D={overall['D']}, I={overall['I']}, N={overall['N']})\n\n")

        f.write("DETAILED WORD-LEVEL DIFFERENCES\n")
        f.write("="*60 + "\n")
        for i, m in enumerate(page_metrics, 1):
            f.write(f"\n--- Page {i} Differences ---\n")
            f.write(highlight_differences_words(m['ops']) + "\n")

    print(f"\nSaved detailed comparison report to {Report}")

if __name__ == "__main__":
    main()

Performing OCR on Ideal Text.pdf using EasyOCR...




[debug] OCR'd Page 1: 249 chars, 10 text blocks
Reading ground truth from Ideal Gound Truth.docx...

Saved full OCR to Text.txt

OCR ACCURACY ANALYSIS REPORT (WER, EasyOCR)
Page 1: WER = 104.00%  (S=49, D=0, I=3, N=50)

Overall: WER = 104.00%  (S=49, D=0, I=3, N=50)

DETAILED WORD-LEVEL DIFFERENCES (Page 1)
[OCR: move -> GT: a] [OCR: j0 -> GT: move] [OCR: 3o8 -> GT: to] [OCR: ar -> GT: stop] [OCR: _ -> GT: mr] [OCR: gibbel -> GT: gaitskell] [OCR: fven -> GT: from] [OCR: wowinal -> GT: nominating] [OCR: n3 -> GT: any] [OCR: a -> GT: more] [OCR: lone_ -> GT: labour] [OCR: labaur -> GT: life] [OCR: lfe_ -> GT: peers] [OCR: per -> GT: is] [OCR: lo -> GT: to] [OCR: o_ -> GT: be] [OCR: luadr_ -> GT: made] at [EXTRA OCR: meohia] [EXTRA OCR: ak] [EXTRA OCR: labcuv] [OCR: m -> GT: a] [OCR: donorrol -> GT: meeting] [OCR: ar -> GT: of] [OCR: aclae -> GT: labour] [OCR: fool -> GT: m] [OCR: loa -> GT: ps] [OCR: pux -> GT: tomorrow] [OCR: dowsu -> GT: mr] [OCR: 0l -> GT: michael] [OCR: vejoluhot -> 

# Character Error Rate

In [6]:
import re
import easyocr
from PIL import Image
from pdf2image import convert_from_path
import docx
import os
import numpy as np

# ---------- OCR + IO ----------

def ocr_pdf(pdf_path, lang="en", dpi=350, first_page=None, last_page=None):
    """
    Perform OCR on PDF using EasyOCR
    """
    # Initialize EasyOCR reader
    reader = easyocr.Reader([lang])

    pages = convert_from_path(pdf_path, dpi=dpi, first_page=first_page, last_page=last_page)
    page_texts = []

    for i, page in enumerate(pages, 1):
        # Convert PIL image to numpy array
        img_array = np.array(page)

        # Perform OCR
        results = reader.readtext(img_array, detail=0, paragraph=True)

        # Combine all text blocks
        page_text = "\n".join(results)
        page_texts.append(page_text)

        print(f"[debug] OCR'd Page {i}: {len(page_text)} chars, {len(results)} text blocks")

    return page_texts

def read_docx(file_path):
    """Read text from a Word document"""
    doc = docx.Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return '\n'.join(full_text)

# ---------- Preprocessing ----------

def preprocess_text(text):
    """
    Normalize text lightly: lowercase, collapse whitespace, remove non-word punctuation.
    (CER is computed over characters after this normalization.)
    """
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,;:!?()\-]', '', text)
    return text.strip()

def tokenize_chars(text, include_spaces=False):
    """
    Convert text into a list of characters for CER.
    If include_spaces=False (default), spaces are removed before tokenization.
    """
    if not include_spaces:
        text = text.replace(' ', '')
    return list(text)

# ---------- CER (character-level edit distance with ops) ----------

def cer_with_ops(hyp_chars, ref_chars):
    """
    Compute CER with dynamic programming and return:
    - S (substitutions), D (deletions), I (insertions), N (# reference chars)
    - ops: list of (op, ref_char, hyp_char) where op in {"equal","sub","del","ins"}
    """
    m, n = len(ref_chars), len(hyp_chars)

    # dp[i][j] = minimum edits to convert ref[0:i] -> hyp[0:j]
    dp = [[0]*(n+1) for _ in range(m+1)]
    back = [[None]*(n+1) for _ in range(m+1)]

    # Initialize
    for i in range(1, m+1):
        dp[i][0] = i
        back[i][0] = ('del', i-1, None)  # delete ref[i-1]
    for j in range(1, n+1):
        dp[0][j] = j
        back[0][j] = ('ins', None, j-1)  # insert hyp[j-1]

    # Fill
    for i in range(1, m+1):
        for j in range(1, n+1):
            if ref_chars[i-1] == hyp_chars[j-1]:
                dp[i][j] = dp[i-1][j-1]
                back[i][j] = ('eq', i-1, j-1)
            else:
                sub_cost = dp[i-1][j-1] + 1
                ins_cost = dp[i][j-1] + 1
                del_cost = dp[i-1][j] + 1
                best = min(sub_cost, ins_cost, del_cost)
                dp[i][j] = best
                if best == sub_cost:
                    back[i][j] = ('sub', i-1, j-1)
                elif best == ins_cost:
                    back[i][j] = ('ins', i, j-1)
                else:
                    back[i][j] = ('del', i-1, j)

    # Backtrack to get operations
    ops = []
    i, j = m, n
    while i > 0 or j > 0:
        op, ii, jj = back[i][j] if back[i][j] is not None else (None, None, None)
        if op == 'eq':
            ops.append(('equal', ref_chars[ii], hyp_chars[jj]))
            i, j = ii, jj
        elif op == 'sub':
            ops.append(('sub', ref_chars[ii], hyp_chars[jj]))
            i, j = ii, jj
        elif op == 'ins':
            ops.append(('ins', None, hyp_chars[jj]))
            j = jj
        elif op == 'del':
            ops.append(('del', ref_chars[ii], None))
            i = ii
        else:
            # Fallbacks
            if i > 0:
                ops.append(('del', ref_chars[i-1], None))
                i -= 1
            elif j > 0:
                ops.append(('ins', None, hyp_chars[j-1]))
                j -= 1

    ops.reverse()

    # Count S/D/I
    S = sum(1 for o,_,_ in ops if o == 'sub')
    D = sum(1 for o,_,_ in ops if o == 'del')
    I = sum(1 for o,_,_ in ops if o == 'ins')
    N = max(1, m)  # avoid div-by-zero if ref empty

    return S, D, I, N, ops

def calculate_cer_metrics(ocr_text, ground_truth, include_spaces=False):
    """
    Calculate CER metrics between OCR text and ground truth, returning:
    - cer_percent, S, D, I, N, ops, ref_chars, hyp_chars
    """
    # Preprocess
    ocr_clean = preprocess_text(ocr_text)
    gt_clean = preprocess_text(ground_truth)

    hyp_chars = tokenize_chars(ocr_clean, include_spaces=include_spaces)
    ref_chars = tokenize_chars(gt_clean, include_spaces=include_spaces)

    S, D, I, N, ops = cer_with_ops(hyp_chars, ref_chars)
    cer = (S + D + I) / N * 100.0

    return {
        'cer_percent': cer,
        'S': S,
        'D': D,
        'I': I,
        'N': N,
        'ops': ops,
        'ref_chars': ref_chars,
        'hyp_chars': hyp_chars
    }

# ---------- Highlight (character-level) ----------

def highlight_differences_chars(ops):
    """
    Build a readable diff string at the character level.
    equal -> char as-is
    sub   -> [OCR:h -> GT:r]
    del   -> [MISSING:r]
    ins   -> [EXTRA OCR:h]
    """
    out = []
    for op, ref_c, hyp_c in ops:
        if op == 'equal':
            out.append(ref_c)
        elif op == 'sub':
            out.append(f"[OCR:{hyp_c} -> GT:{ref_c}]")
        elif op == 'del':
            out.append(f"[MISSING:{ref_c}]")
        elif op == 'ins':
            out.append(f"[EXTRA OCR:{hyp_c}]")
    return ''.join(out)

# ---------- Main ----------

def main():
    # Choose whether to include spaces in CER computation
    INCLUDE_SPACES = False  # set True if you want spaces to count as characters

    # Perform OCR on the PDF using EasyOCR
    pdf_file = pdf
    print(f"Performing OCR on {pdf_file} using EasyOCR...")
    ocr_pages = ocr_pdf(pdf_file)

    # Read ground truth from Word document
    gt_file = ground_truth
    print(f"Reading ground truth from {gt_file}...")
    gt_text = read_docx(gt_file)   # ✅ FIX: renamed to gt_text

    # Save OCR results
    full_ocr_text = "".join(f"\n\n--- Page {i} ---\n{t}" for i, t in enumerate(ocr_pages, 1))
    with open(Text, "w", encoding="utf-8") as f:
        f.write(full_ocr_text)
    print("\nSaved full OCR to Text.txt")

    print("\n" + "="*60)
    print("OCR ACCURACY ANALYSIS REPORT (CER, EasyOCR)")
    print("="*60)
    print(f"[config] CER includes spaces? {INCLUDE_SPACES}")

    # Page-wise CER (each page vs full ground truth, mirroring your style)
    page_metrics = []
    for i, page_text in enumerate(ocr_pages, 1):
        m = calculate_cer_metrics(page_text, gt_text, include_spaces=INCLUDE_SPACES)  # ✅ use gt_text
        page_metrics.append(m)
        print(f"Page {i}: CER = {m['cer_percent']:.2f}%  "
              f"(S={m['S']}, D={m['D']}, I={m['I']}, N={m['N']})")

    # Overall CER (all pages concatenated vs full ground truth)
    combined_ocr = " ".join(ocr_pages)
    overall = calculate_cer_metrics(combined_ocr, gt_text, include_spaces=INCLUDE_SPACES)  # ✅ use gt_text
    print(f"\nOverall: CER = {overall['cer_percent']:.2f}%  "
          f"(S={overall['S']}, D={overall['D']}, I={overall['I']}, N={overall['N']})")

    # Detailed differences for the first page (character-level)
    print("\n" + "="*60)
    print("DETAILED CHARACTER-LEVEL DIFFERENCES (Page 1)")
    print("="*60)
    diff_text = highlight_differences_chars(page_metrics[0]['ops'])
    print(diff_text[:1000] + "..." if len(diff_text) > 1000 else diff_text)

    # Save detailed comparison to file
    with open(Report, "w", encoding="utf-8") as f:
        f.write("OCR ACCURACY ANALYSIS REPORT (CER, EasyOCR)\n")
        f.write("="*60 + "\n")
        f.write(f"[config] CER includes spaces? {INCLUDE_SPACES}\n")
        for i, m in enumerate(page_metrics, 1):
            f.write(f"Page {i}: CER = {m['cer_percent']:.2f}%  "
                    f"(S={m['S']}, D={m['D']}, I={m['I']}, N={m['N']})\n")
        f.write(f"\nOverall: CER = {overall['cer_percent']:.2f}%  "
                f"(S={overall['S']}, D={overall['D']}, I={overall['I']}, N={overall['N']})\n\n")

        f.write("DETAILED CHARACTER-LEVEL DIFFERENCES\n")
        f.write("="*60 + "\n")
        for i, m in enumerate(page_metrics, 1):
            f.write(f"\n--- Page {i} Differences ---\n")
            f.write(highlight_differences_chars(m['ops']) + "\n")

    print("\nSaved detailed comparison report to Report.txt")

if __name__ == "__main__":
    main()

Performing OCR on Ideal Text.pdf using EasyOCR...




[debug] OCR'd Page 1: 249 chars, 10 text blocks
Reading ground truth from Ideal Gound Truth.docx...

Saved full OCR to Text.txt

OCR ACCURACY ANALYSIS REPORT (CER, EasyOCR)
[config] CER includes spaces? False
Page 1: CER = 60.19%  (S=100, D=19, I=5, N=206)

Overall: CER = 60.19%  (S=100, D=19, I=5, N=206)

DETAILED CHARACTER-LEVEL DIFFERENCES (Page 1)
[MISSING:a]move[MISSING:t][OCR:j -> GT:o][OCR:0 -> GT:s][OCR:3 -> GT:t]o[OCR:8 -> GT:p][OCR:a -> GT:m]r[OCR:_ -> GT:.]g[MISSING:a]i[MISSING:t][OCR:b -> GT:s][OCR:b -> GT:k]e[MISSING:l]lf[OCR:v -> GT:r][OCR:e -> GT:o][OCR:n -> GT:m][OCR:w -> GT:n]o[OCR:w -> GT:m]ina[OCR:l -> GT:t][OCR:; -> GT:i]n[OCR:3 -> GT:g]a[MISSING:n][MISSING:y][OCR:l -> GT:m]o[OCR:n -> GT:r]e[EXTRA OCR:_]lab[OCR:a -> GT:o]url[MISSING:i]fe[OCR:_ -> GT:p][OCR:p -> GT:e]er[OCR:) -> GT:s][OCR:l -> GT:i][OCR:o -> GT:s][OCR:( -> GT:t]o[OCR:_ -> GT:b][OCR:l -> GT:e][OCR:u -> GT:m]ad[EXTRA OCR:r][OCR:_ -> GT:e]at[MISSING:a]me[OCR:o -> GT:e][OCR:h -> GT:t]i[MISSING:n][OCR:a -

# BLEU Error Rate

In [8]:
import re
import math
import easyocr
from PIL import Image
from pdf2image import convert_from_path
import docx
import os
import numpy as np
from collections import Counter

# ---------- OCR + IO ----------

def ocr_pdf(pdf_path, lang="en", dpi=350, first_page=None, last_page=None):
    """
    Perform OCR on PDF using EasyOCR
    """
    # Initialize EasyOCR reader
    reader = easyocr.Reader([lang])

    pages = convert_from_path(pdf_path, dpi=dpi, first_page=first_page, last_page=last_page)
    page_texts = []

    for i, page in enumerate(pages, 1):
        # Convert PIL image to numpy array
        img_array = np.array(page)

        # Perform OCR
        results = reader.readtext(img_array, detail=0, paragraph=True)

        # Combine all text blocks
        page_text = "\n".join(results)
        page_texts.append(page_text)

        print(f"[debug] OCR'd Page {i}: {len(page_text)} chars, {len(results)} text blocks")

    return page_texts

def read_docx(file_path):
    """Read text from a Word document"""
    doc = docx.Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return '\n'.join(full_text)

# ---------- Preprocessing & Tokenization ----------

def preprocess_text(text):
    """
    Light normalization for BLEU: lowercase, collapse whitespace, strip unusual chars
    (we compute BLEU on word tokens).
    """
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,;:!?()\-]', '', text)
    return text.strip()

def tokenize_words(text):
    """Tokenize into words (alphanumeric + underscore)."""
    return re.findall(r'\w+', text.lower())

# ---------- BLEU Implementation (no external libs) ----------

def make_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(max(0, len(tokens)-n+1))]

def modified_precision(hyp, ref, n):
    """
    BLEU modified n-gram precision with clipping.
    Returns numerator, denominator.
    """
    hyp_ngrams = make_ngrams(hyp, n)
    ref_ngrams = make_ngrams(ref, n)

    if not hyp_ngrams:
        return 0, 0

    hyp_counts = Counter(hyp_ngrams)
    ref_counts = Counter(ref_ngrams)

    clipped = {g: min(c, ref_counts.get(g, 0)) for g, c in hyp_counts.items()}
    num = sum(clipped.values())
    den = sum(hyp_counts.values())
    return num, den

def brevity_penalty(c, r):
    """
    c = length of hypothesis in words
    r = length of reference in words
    """
    if c == 0:
        return 0.0
    if c > r:
        return 1.0
    return math.exp(1 - (r / c))  # BP = e^(1 - r/c)

def compute_bleu(hyp_tokens, ref_tokens, max_n=4, smoothing=True):
    """
    Compute BLEU score (0..1) with equal weights for 1..max_n.
    Uses simple add-one smoothing for zero counts.
    """
    weights = [1.0 / max_n] * max_n

    precisions = []
    for n in range(1, max_n + 1):
        num, den = modified_precision(hyp_tokens, ref_tokens, n)
        if den == 0:
            p_n = 0.0
        else:
            if num == 0 and smoothing:
                p_n = (num + 1) / (den + 1)
            else:
                p_n = num / den
        precisions.append(p_n)

    if all(p == 0 for p in precisions):
        geo_mean = 0.0
    else:
        sum_logs = 0.0
        for w, p in zip(weights, precisions):
            p = max(p, 1e-16)  # avoid log(0)
            sum_logs += w * math.log(p)
        geo_mean = math.exp(sum_logs)

    BP = brevity_penalty(len(hyp_tokens), len(ref_tokens))
    bleu = BP * geo_mean
    return bleu, BP, precisions

def calculate_bleu_metrics(ocr_text, ground_truth, max_n=4, smoothing=True):
    """
    Returns:
      - bleu_percent (0..100), bleu_error_percent (100 - BLEU%)
      - brevity_penalty
      - precisions list (p1..pN as 0..1)
      - hyp_len, ref_len
    """
    hyp = tokenize_words(preprocess_text(ocr_text))
    ref = tokenize_words(preprocess_text(ground_truth))
    bleu, bp, precisions = compute_bleu(hyp, ref, max_n=max_n, smoothing=smoothing)
    bleu_percent = bleu * 100.0
    return {
        'bleu_percent': bleu_percent,
        'bleu_error_percent': 100.0 - bleu_percent,
        'brevity_penalty': bp,
        'precisions': precisions,
        'hyp_len': len(hyp),
        'ref_len': len(ref),
    }

# ---------- Main ----------

def main():
    # Settings
    MAX_N = 4        # BLEU-4
    SMOOTHING = True # add-one smoothing to avoid zero scores on short/noisy text

    # Perform OCR on the PDF using EasyOCR
    pdf_file = pdf
    print(f"Performing OCR on {pdf_file} using EasyOCR...")
    ocr_pages = ocr_pdf(pdf_file)

    # ✅ FIX: use different variable names for file path and text
    gt_file = ground_truth
    print(f"Reading ground truth from {gt_file}...")
    gt_text = read_docx(gt_file)  # renamed variable

    # Save OCR results
    full_ocr_text = "".join(f"\n\n--- Page {i} ---\n{t}" for i, t in enumerate(ocr_pages, 1))
    with open(Text, "w", encoding="utf-8") as f:
        f.write(full_ocr_text)
    print("\nSaved full OCR to Text.txt")

    print("\n" + "="*60)
    print("OCR ACCURACY ANALYSIS REPORT (BLEU, EasyOCR)")
    print("="*60)
    print(f"[config] BLEU max n-gram: {MAX_N}, smoothing: {SMOOTHING}")

    # Page-wise BLEU
    page_metrics = []
    for i, page_text in enumerate(ocr_pages, 1):
        m = calculate_bleu_metrics(page_text, gt_text, max_n=MAX_N, smoothing=SMOOTHING)  # use gt_text
        page_metrics.append(m)
        pcts = ", ".join(f"p{n}={m['precisions'][n-1]*100:.1f}%" for n in range(1, MAX_N+1))
        print(
            f"Page {i}: BLEU = {m['bleu_percent']:.2f}%  "
            f"(Error={m['bleu_error_percent']:.2f}%)  "
            f"BP={m['brevity_penalty']:.3f}  "
            f"[{pcts}]  (hyp={m['hyp_len']}, ref={m['ref_len']})"
        )

    # Overall BLEU
    combined_ocr = " ".join(ocr_pages)
    overall = calculate_bleu_metrics(combined_ocr, gt_text, max_n=MAX_N, smoothing=SMOOTHING)  # use gt_text
    pcts_overall = ", ".join(f"p{n}={overall['precisions'][n-1]*100:.1f}%" for n in range(1, MAX_N+1))
    print(
        f"\nOverall: BLEU = {overall['bleu_percent']:.2f}%  "
        f"(Error={overall['bleu_error_percent']:.2f}%)  "
        f"BP={overall['brevity_penalty']:.3f}  "
        f"[{pcts_overall}]  (hyp={overall['hyp_len']}, ref={overall['ref_len']})"
    )

    # Save report
    with open(Report, "w", encoding="utf-8") as f:
        f.write("OCR ACCURACY ANALYSIS REPORT (BLEU, EasyOCR)\n")
        f.write("="*60 + "\n")
        f.write(f"[config] BLEU max n-gram: {MAX_N}, smoothing: {SMOOTHING}\n\n")
        for i, m in enumerate(page_metrics, 1):
            pcts = ", ".join(f"p{n}={m['precisions'][n-1]*100:.1f}%" for n in range(1, MAX_N+1))
            f.write(
                f"Page {i}: BLEU = {m['bleu_percent']:.2f}%  "
                f"(Error={m['bleu_error_percent']:.2f}%)  "
                f"BP={m['brevity_penalty']:.3f}  "
                f"[{pcts}]  (hyp={m['hyp_len']}, ref={m['ref_len']})\n"
            )
        f.write("\n")
        f.write(
            f"Overall: BLEU = {overall['bleu_percent']:.2f}%  "
            f"(Error={overall['bleu_error_percent']:.2f}%)  "
            f"BP={overall['brevity_penalty']:.3f}  "
            f"[{pcts_overall}]  (hyp={overall['hyp_len']}, ref={overall['ref_len']})\n"
        )

    print("\nSaved BLEU report to Report.txt")

if __name__ == "__main__":
    main()

Performing OCR on Ideal Text.pdf using EasyOCR...




[debug] OCR'd Page 1: 249 chars, 10 text blocks
Reading ground truth from Ideal Gound Truth.docx...

Saved full OCR to Text.txt

OCR ACCURACY ANALYSIS REPORT (BLEU, EasyOCR)
[config] BLEU max n-gram: 4, smoothing: True
Page 1: BLEU = 2.71%  (Error=97.29%)  BP=1.000  [p1=7.5%, p2=1.9%, p3=1.9%, p4=2.0%]  (hyp=53, ref=50)

Overall: BLEU = 2.71%  (Error=97.29%)  BP=1.000  [p1=7.5%, p2=1.9%, p3=1.9%, p4=2.0%]  (hyp=53, ref=50)

Saved BLEU report to Report.txt


# ROUGE Error Rate

In [11]:
import re
import math
import easyocr
from PIL import Image
from pdf2image import convert_from_path
import docx
import os
import numpy as np
from collections import Counter

# ---------- OCR + IO ----------

def ocr_pdf(pdf_path, lang="en", dpi=350, first_page=None, last_page=None):
    """
    Perform OCR on PDF using EasyOCR
    """
    # Initialize EasyOCR reader
    reader = easyocr.Reader([lang])

    pages = convert_from_path(pdf_path, dpi=dpi, first_page=first_page, last_page=last_page)
    page_texts = []

    for i, page in enumerate(pages, 1):
        # Convert PIL image to numpy array
        img_array = np.array(page)

        # Perform OCR
        results = reader.readtext(img_array, detail=0, paragraph=True)

        # Combine all text blocks
        page_text = "\n".join(results)
        page_texts.append(page_text)

        print(f"[debug] OCR'd Page {i}: {len(page_text)} chars, {len(results)} text blocks")

    return page_texts

def read_docx(file_path):
    """Read text from a Word document"""
    doc = docx.Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return '\n'.join(full_text)

# ---------- Preprocessing & Tokenization ----------

def preprocess_text(text):
    """
    Light normalization for ROUGE: lowercase, collapse whitespace, strip unusual chars
    (we compute ROUGE on word tokens).
    """
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,;:!?()\-]', '', text)
    return text.strip()

def tokenize_words(text):
    """Tokenize into words (alphanumeric + underscore)."""
    return re.findall(r'\w+', text.lower())

def make_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(max(0, len(tokens)-n+1))]

# ---------- ROUGE-N (recall) ----------

def rouge_n_recall(hyp_tokens, ref_tokens, n=1):
    """
    ROUGE-N recall = (# of clipped overlapping n-grams) / (total n-grams in reference).
    Returns recall in [0,1].
    """
    ref_ngrams = make_ngrams(ref_tokens, n)
    hyp_ngrams = make_ngrams(hyp_tokens, n)

    ref_counts = Counter(ref_ngrams)
    hyp_counts = Counter(hyp_ngrams)

    overlap = 0
    for g, rc in ref_counts.items():
        overlap += min(rc, hyp_counts.get(g, 0))

    denom = sum(ref_counts.values())
    if denom == 0:
        return 0.0
    return overlap / denom

# ---------- ROUGE-L (LCS-based) ----------

def lcs_length(x, y):
    """
    Classic DP for LCS length over token sequences.
    """
    m, n = len(x), len(y)
    dp = [[0]*(n+1) for _ in range(m+1)]
    for i in range(m):
        xi = x[i]
        for j in range(n):
            if xi == y[j]:
                dp[i+1][j+1] = dp[i][j] + 1
            else:
                dp[i+1][j+1] = max(dp[i][j+1], dp[i+1][j])
    return dp[m][n]

def rouge_l_scores(hyp_tokens, ref_tokens):
    """
    Compute ROUGE-L recall, precision, and F1 based on LCS.
    Returns (R, P, F1) in [0,1].
    """
    lcs = lcs_length(ref_tokens, hyp_tokens)
    ref_len = len(ref_tokens)
    hyp_len = len(hyp_tokens)

    R = 0.0 if ref_len == 0 else lcs / ref_len
    P = 0.0 if hyp_len == 0 else lcs / hyp_len
    if R == 0.0 and P == 0.0:
        F1 = 0.0
    else:
        F1 = (2 * R * P) / (R + P)
    return R, P, F1

# ---------- ROUGE metrics wrapper ----------

def calculate_rouge_metrics(ocr_text, ground_truth):
    """
    Returns:
      - rouge1_recall_percent, rouge2_recall_percent
      - rougeL_recall_percent, rougeL_precision_percent, rougeL_f1_percent
      - error rates for recalls (100 - recall%)
      - lengths
    """
    hyp = tokenize_words(preprocess_text(ocr_text))
    ref = tokenize_words(preprocess_text(ground_truth))

    r1 = rouge_n_recall(hyp, ref, n=1)  # ROUGE-1 recall
    r2 = rouge_n_recall(hyp, ref, n=2)  # ROUGE-2 recall
    rl_R, rl_P, rl_F1 = rouge_l_scores(hyp, ref)

    return {
        'rouge1_recall_percent': r1 * 100.0,
        'rouge2_recall_percent': r2 * 100.0,
        'rougeL_recall_percent': rl_R * 100.0,
        'rougeL_precision_percent': rl_P * 100.0,
        'rougeL_f1_percent': rl_F1 * 100.0,
        # "Error rates" as complements of recall:
        'rouge1_error_percent': 100.0 - (r1 * 100.0),
        'rouge2_error_percent': 100.0 - (r2 * 100.0),
        'rougeL_error_percent': 100.0 - (rl_R * 100.0),
        'hyp_len': len(hyp),
        'ref_len': len(ref),
    }

# ---------- Main ----------

def main():
    # Perform OCR on the PDF using EasyOCR
    pdf_file = pdf
    print(f"Performing OCR on {pdf_file} using EasyOCR...")
    ocr_pages = ocr_pdf(pdf_file)

    # ✅ FIX: avoid variable shadowing
    gt_file = ground_truth
    print(f"Reading ground truth from {gt_file}...")
    gt_text = read_docx(gt_file)   # <-- renamed variable

    # Save OCR results
    full_ocr_text = "".join(f"\n\n--- Page {i} ---\n{t}" for i, t in enumerate(ocr_pages, 1))
    with open(Text, "w", encoding="utf-8") as f:
        f.write(full_ocr_text)
    print("\nSaved full OCR to Text.txt")

    print("\n" + "="*60)
    print("OCR ACCURACY ANALYSIS REPORT (ROUGE, EasyOCR)")
    print("="*60)

    # Page-wise ROUGE (each page vs full ground truth)
    page_metrics = []
    for i, page_text in enumerate(ocr_pages, 1):
        m = calculate_rouge_metrics(page_text, gt_text)   # ✅ use gt_text
        page_metrics.append(m)
        print(
            f"Page {i}: "
            f"R1={m['rouge1_recall_percent']:.2f}% (Err={m['rouge1_error_percent']:.2f}%)  "
            f"R2={m['rouge2_recall_percent']:.2f}% (Err={m['rouge2_error_percent']:.2f}%)  "
            f"RL_R={m['rougeL_recall_percent']:.2f}% (Err={m['rougeL_error_percent']:.2f}%)  "
            f"RL_P={m['rougeL_precision_percent']:.2f}%  RL_F1={m['rougeL_f1_percent']:.2f}%  "
            f"(hyp={m['hyp_len']}, ref={m['ref_len']})"
        )

    # Overall ROUGE (all pages concatenated vs full ground truth)
    combined_ocr = " ".join(ocr_pages)
    overall = calculate_rouge_metrics(combined_ocr, gt_text)  # ✅ use gt_text
    print(
        f"\nOverall: "
        f"R1={overall['rouge1_recall_percent']:.2f}% (Err={overall['rouge1_error_percent']:.2f}%)  "
        f"R2={overall['rouge2_recall_percent']:.2f}% (Err={overall['rouge2_error_percent']:.2f}%)  "
        f"RL_R={overall['rougeL_recall_percent']:.2f}% (Err={overall['rougeL_error_percent']:.2f}%)  "
        f"RL_P={overall['rougeL_precision_percent']:.2f}%  RL_F1={overall['rougeL_f1_percent']:.2f}%  "
        f"(hyp={overall['hyp_len']}, ref={overall['ref_len']})"
    )

    # Save report
    with open(Report, "w", encoding="utf-8") as f:
        f.write("OCR ACCURACY ANALYSIS REPORT (ROUGE, EasyOCR)\n")
        f.write("="*60 + "\n")
        for i, m in enumerate(page_metrics, 1):
            f.write(
                f"Page {i}: "
                f"R1={m['rouge1_recall_percent']:.2f}% (Err={m['rouge1_error_percent']:.2f}%)  "
                f"R2={m['rouge2_recall_percent']:.2f}% (Err={m['rouge2_error_percent']:.2f}%)  "
                f"RL_R={m['rougeL_recall_percent']:.2f}% (Err={m['rougeL_error_percent']:.2f}%)  "
                f"RL_P={m['rougeL_precision_percent']:.2f}%  RL_F1={m['rougeL_f1_percent']:.2f}%  "
                f"(hyp={m['hyp_len']}, ref={m['ref_len']})\n"
            )
        f.write("\n")
        f.write(
            f"Overall: "
            f"R1={overall['rouge1_recall_percent']:.2f}% (Err={overall['rouge1_error_percent']:.2f}%)  "
            f"R2={overall['rouge2_recall_percent']:.2f}% (Err={overall['rouge2_error_percent']:.2f}%)  "
            f"RL_R={overall['rougeL_recall_percent']:.2f}% (Err={overall['rougeL_error_percent']:.2f}%)  "
            f"RL_P={overall['rougeL_precision_percent']:.2f}%  RL_F1={overall['rougeL_f1_percent']:.2f}%  "
            f"(hyp={overall['hyp_len']}, ref={overall['ref_len']})\n"
        )

    print("\nSaved ROUGE report to Report.txt")

if __name__ == "__main__":
    main()

Performing OCR on Ideal Text.pdf using EasyOCR...




[debug] OCR'd Page 1: 249 chars, 10 text blocks
Reading ground truth from Ideal Gound Truth.docx...

Saved full OCR to Text.txt

OCR ACCURACY ANALYSIS REPORT (ROUGE, EasyOCR)
Page 1: R1=8.00% (Err=92.00%)  R2=0.00% (Err=100.00%)  RL_R=6.00% (Err=94.00%)  RL_P=5.66%  RL_F1=5.83%  (hyp=53, ref=50)

Overall: R1=8.00% (Err=92.00%)  R2=0.00% (Err=100.00%)  RL_R=6.00% (Err=94.00%)  RL_P=5.66%  RL_F1=5.83%  (hyp=53, ref=50)

Saved ROUGE report to Report.txt


# BERT Score Error Rate

In [13]:
import re
import math
import easyocr
from PIL import Image
from pdf2image import convert_from_path
import docx
import os
import numpy as np
from collections import Counter

# NEW: BERTScore
from bert_score import score as bertscore_score

# ---------- OCR + IO ----------

def ocr_pdf(pdf_path, lang="en", dpi=350, first_page=None, last_page=None):
    """
    Perform OCR on PDF using EasyOCR
    """
    # Initialize EasyOCR reader
    reader = easyocr.Reader([lang])

    pages = convert_from_path(pdf_path, dpi=dpi, first_page=first_page, last_page=last_page)
    page_texts = []

    for i, page in enumerate(pages, 1):
        # Convert PIL image to numpy array
        img_array = np.array(page)

        # Perform OCR
        results = reader.readtext(img_array, detail=0, paragraph=True)

        # Combine all text blocks
        page_text = "\n".join(results)
        page_texts.append(page_text)

        print(f"[debug] OCR'd Page {i}: {len(page_text)} chars, {len(results)} text blocks")

    return page_texts

def read_docx(file_path):
    """Read text from a Word document"""
    doc = docx.Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return '\n'.join(full_text)

# ---------- Minimal preprocessing (optional) ----------

def preprocess_text_for_bertscore(text):
    """
    Light normalization. BERTScore works on raw text, but normalizing whitespace
    helps when OCR creates odd spacing.
    """
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# ---------- BERTScore metrics wrapper ----------

def calculate_bertscore_metrics(ocr_text, ground_truth,
                                lang="en",
                                model_type=None,
                                rescale_with_baseline=True,
                                use_idf=False):
    """
    Compute BERTScore Precision, Recall, and F1 between OCR hypothesis and reference.
    Returns % scores and % error rates (100 - score%).
    """
    hyp = preprocess_text_for_bertscore(ocr_text)
    ref = preprocess_text_for_bertscore(ground_truth)

    # BERTScore expects lists of strings (cands, refs)
    cands = [hyp]
    refs = [ref]

    P, R, F1 = bertscore_score(
        cands,
        refs,
        lang=lang,
        rescale_with_baseline=rescale_with_baseline,
        model_type=model_type,   # None => library picks a good default for lang
        idf=use_idf
    )

    p = float(P.mean().item())
    r = float(R.mean().item())
    f1 = float(F1.mean().item())

    p_pct = p * 100.0
    r_pct = r * 100.0
    f1_pct = f1 * 100.0

    return {
        'bert_precision_percent': p_pct,
        'bert_recall_percent': r_pct,
        'bert_f1_percent': f1_pct,
        'bert_precision_error_percent': 100.0 - p_pct,
        'bert_recall_error_percent': 100.0 - r_pct,
        'bert_f1_error_percent': 100.0 - f1_pct,
        'hyp_len_chars': len(hyp),
        'ref_len_chars': len(ref),
    }

# ---------- Main ----------

def main():
    # Perform OCR on the PDF using EasyOCR
    pdf_file = pdf
    print(f"Performing OCR on {pdf_file} using EasyOCR...")
    ocr_pages = ocr_pdf(pdf_file)

    # ✅ FIX: use separate variable for file and text
    gt_file = ground_truth
    print(f"Reading ground truth from {gt_file}...")
    gt_text = read_docx(gt_file)   # renamed to avoid shadowing

    # Save OCR results
    full_ocr_text = "".join(f"\n\n--- Page {i} ---\n{t}" for i, t in enumerate(ocr_pages, 1))
    with open(Text, "w", encoding="utf-8") as f:
        f.write(full_ocr_text)
    print("\nSaved full OCR to Text.txt")

    print("\n" + "="*60)
    print("OCR ACCURACY ANALYSIS REPORT (BERTScore, EasyOCR)")
    print("="*60)

    # Page-wise BERTScore (each page vs full ground truth)
    page_metrics = []
    for i, page_text in enumerate(ocr_pages, 1):
        m = calculate_bertscore_metrics(
            page_text,
            gt_text,  # ✅ use gt_text here
            lang="en",
            model_type=None,                # let library choose default for English
            rescale_with_baseline=True,     # recommended
            use_idf=False
        )
        page_metrics.append(m)
        print(
            f"Page {i}: "
            f"P={m['bert_precision_percent']:.2f}% (Err={m['bert_precision_error_percent']:.2f}%)  "
            f"R={m['bert_recall_percent']:.2f}% (Err={m['bert_recall_error_percent']:.2f}%)  "
            f"F1={m['bert_f1_percent']:.2f}% (Err={m['bert_f1_error_percent']:.2f}%)  "
            f"(hyp_chars={m['hyp_len_chars']}, ref_chars={m['ref_len_chars']})"
        )

    # Overall BERTScore (all pages concatenated vs full ground truth)
    combined_ocr = " ".join(ocr_pages)
    overall = calculate_bertscore_metrics(
        combined_ocr,
        gt_text,  # ✅ use gt_text
        lang="en",
        model_type=None,
        rescale_with_baseline=True,
        use_idf=False
    )
    print(
        f"\nOverall: "
        f"P={overall['bert_precision_percent']:.2f}% (Err={overall['bert_precision_error_percent']:.2f}%)  "
        f"R={overall['bert_recall_percent']:.2f}% (Err={overall['bert_recall_error_percent']:.2f}%)  "
        f"F1={overall['bert_f1_percent']:.2f}% (Err={overall['bert_f1_error_percent']:.2f}%)  "
        f"(hyp_chars={overall['hyp_len_chars']}, ref_chars={overall['ref_len_chars']})"
    )

    # Save report
    with open(Report, "w", encoding="utf-8") as f:
        f.write("OCR ACCURACY ANALYSIS REPORT (BERTScore, EasyOCR)\n")
        f.write("="*60 + "\n")
        for i, m in enumerate(page_metrics, 1):
            f.write(
                f"Page {i}: "
                f"P={m['bert_precision_percent']:.2f}% (Err={m['bert_precision_error_percent']:.2f}%)  "
                f"R={m['bert_recall_percent']:.2f}% (Err={m['bert_recall_error_percent']:.2f}%)  "
                f"F1={m['bert_f1_percent']:.2f}% (Err={m['bert_f1_error_percent']:.2f}%)  "
                f"(hyp_chars={m['hyp_len_chars']}, ref_chars={m['ref_len_chars']})\n"
            )
        f.write("\n")
        f.write(
            f"Overall: "
            f"P={overall['bert_precision_percent']:.2f}% (Err={overall['bert_precision_error_percent']:.2f}%)  "
            f"R={overall['bert_recall_percent']:.2f}% (Err={overall['bert_recall_error_percent']:.2f}%)  "
            f"F1={overall['bert_f1_percent']:.2f}% (Err={overall['bert_f1_error_percent']:.2f}%)  "
            f"(hyp_chars={overall['hyp_len_chars']}, ref_chars={overall['ref_len_chars']})\n"
        )

    print("\nSaved BERTScore report to Report.txt")

if __name__ == "__main__":
    main()

Performing OCR on Ideal Text.pdf using EasyOCR...




[debug] OCR'd Page 1: 249 chars, 10 text blocks
Reading ground truth from Ideal Gound Truth.docx...

Saved full OCR to Text.txt

OCR ACCURACY ANALYSIS REPORT (BERTScore, EasyOCR)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Page 1: P=-78.85% (Err=178.85%)  R=-18.80% (Err=118.80%)  F1=-50.61% (Err=150.61%)  (hyp_chars=249, ref_chars=255)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Overall: P=-78.85% (Err=178.85%)  R=-18.80% (Err=118.80%)  F1=-50.61% (Err=150.61%)  (hyp_chars=249, ref_chars=255)

Saved BERTScore report to Report.txt
