In [37]:
!pip install -r requirements.txt




# IMPORTS

In [38]:
#IMPORTS
import os
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from jiwer import wer, cer
from pdf2image import convert_from_path
import docx
import torch


# STEP 1: Preprocess scanned page

In [39]:
# ------------------------------------------------------
# STEP 1: Preprocess scanned page
# ------------------------------------------------------
def preprocess_image(pil_image):
    """Turn a scanned page into an inverted binary image for line segmentation.

    Steps: convert to grayscale, flatten the illumination by dividing the page
    by a morphological-closing estimate of itself, apply an inverted adaptive
    mean threshold, then remove small specks with a 2x2 opening.

    Args:
        pil_image: PIL image of one page (anything supporting ``.convert("RGB")``).

    Returns:
        2-D uint8 array produced by ``THRESH_BINARY_INV`` (values 0 or 255).
        NOTE(review): for dark ink on light paper this presumably makes the
        strokes white on black — downstream code inverts it again before OCR.
    """
    rgb_pixels = np.array(pil_image.convert("RGB"))
    grayscale = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2GRAY)

    # A 15x15 closing of the page serves as the illumination estimate;
    # dividing by it (scaled to 255) evens out the brightness.
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15))
    background = cv2.morphologyEx(grayscale, cv2.MORPH_CLOSE, closing_kernel)
    flattened = cv2.divide(grayscale, background, scale=255)

    # Inverted adaptive mean threshold: block size 25, constant 15.
    thresholded = cv2.adaptiveThreshold(
        flattened,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY_INV,
        25,
        15,
    )

    # Morphological opening with a 2x2 element drops isolated specks.
    speck_kernel = np.ones((2, 2), np.uint8)
    return cv2.morphologyEx(thresholded, cv2.MORPH_OPEN, speck_kernel)

# STEP 2: Line segmentation

In [40]:
# ------------------------------------------------------
# STEP 2: Line segmentation
# ------------------------------------------------------
def segment_lines(binary_img, min_height=20):
    """Cut a binarized page into text-line crops, ordered top to bottom.

    The page is dilated with a wide, short (100x3) rectangle before the
    external contours are extracted; each contour's bounding box that is
    taller than ``min_height`` becomes one crop of the *undilated* input.

    Args:
        binary_img: 2-D binary page image (as returned by ``preprocess_image``).
        min_height: Boxes whose height is not strictly greater than this are dropped.

    Returns:
        List of 2-D array slices of ``binary_img``, sorted by the top edge of
        their bounding box.
    """
    smear_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (100, 3))
    smeared = cv2.dilate(binary_img, smear_kernel, iterations=1)

    found, _ = cv2.findContours(smeared, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Bounding boxes are (x, y, w, h); sort on y so crops come out top-down.
    boxes = sorted((cv2.boundingRect(contour) for contour in found),
                   key=lambda box: box[1])

    return [
        binary_img[top:top + height, left:left + width]
        for left, top, width, height in boxes
        if height > min_height
    ]

# STEP 3: Visualize segmented lines (optional)

In [41]:
# ------------------------------------------------------
# STEP 3: Visualize segmented lines (optional)
# ------------------------------------------------------
def visualize_lines(lines, max_lines=5):
    """Display the first ``max_lines`` segmented line crops, stacked vertically.

    Each crop is inverted (``255 - crop``) before being drawn in grayscale
    with the axes hidden.

    Args:
        lines: Sequence of 2-D line images (as returned by ``segment_lines``).
        max_lines: Number of subplot rows and the maximum number of crops shown.
    """
    plt.figure(figsize=(10, 10))
    preview = lines[:max_lines]
    for row, line_img in enumerate(preview, start=1):
        plt.subplot(max_lines, 1, row)
        plt.imshow(255 - line_img, cmap='gray')
        plt.axis('off')
    plt.show()


# STEP 4: Initialize TrOCR model

In [42]:
# ------------------------------------------------------
# STEP 4: Initialize TrOCR model
# ------------------------------------------------------
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Processor and model are both loaded from the same "microsoft/trocr-base-handwritten"
# checkpoint; these module-level objects are used by recognize_line().
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
# NOTE(review): the recorded output of this cell warns that the
# encoder.pooler.dense weights are newly initialized — presumably they are not
# used during generation, but confirm before relying on the predictions.
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(device)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# STEP 5: Recognize a single line

In [43]:
# ------------------------------------------------------
# STEP 5: Recognize a single line
# ------------------------------------------------------
def recognize_line(line_img):
    """Run TrOCR on one segmented line crop and return the decoded text.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        line_img: 2-D uint8 line crop from the inverted binary page; it is
            inverted back (``255 - line_img``) and converted to RGB before
            being handed to the processor.

    Returns:
        The first decoded sequence with special tokens removed and
        surrounding whitespace stripped.
    """
    rgb_line = Image.fromarray(255 - line_img).convert("RGB")  # invert for TrOCR
    encoded = processor(images=rgb_line, return_tensors="pt")
    token_ids = model.generate(encoded.pixel_values.to(device))
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()



# STEP 6: Process a full PDF

In [44]:
# ------------------------------------------------------
# STEP 6: Process a full PDF
# ------------------------------------------------------
def recognize_pdf(pdf_path, dpi=300):
    """Run the full OCR pipeline on every page of a PDF.

    Each page is rasterized with ``convert_from_path``, binarized with
    ``preprocess_image``, split with ``segment_lines`` and every line is
    recognized with ``recognize_line``. Progress and each recognized line
    are printed as they are produced.

    Args:
        pdf_path: Path of the PDF to read.
        dpi: Rasterization resolution passed to ``convert_from_path``.

    Returns:
        List with one string per page; the lines of a page are joined with
        ``"\\n"``.
    """
    pages = convert_from_path(pdf_path, dpi=dpi)
    total_pages = len(pages)
    all_page_texts = []

    for page_no, page in enumerate(pages, 1):
        # The emoji here was stored as mojibake (UTF-8 bytes read as cp1252).
        print(f"\n📄 Processing Page {page_no}/{total_pages} ...")
        binary = preprocess_image(page)
        lines = segment_lines(binary)

        page_texts = []
        for line_no, line in enumerate(lines, 1):
            txt = recognize_line(line)
            page_texts.append(txt)
            print(f"  Line {line_no}: {txt}")

        all_page_texts.append("\n".join(page_texts))
    return all_page_texts


# STEP 7: Read ground truth from DOCX

In [45]:
# ------------------------------------------------------
# STEP 7: Read ground truth from DOCX
# ------------------------------------------------------
def read_docx_text(docx_path):
    """Read a DOCX file and return its non-empty paragraphs as one string.

    Args:
        docx_path: Path of the DOCX file holding the ground-truth transcript.

    Returns:
        The stripped text of every paragraph that is non-empty after
        stripping, joined with ``"\\n"`` in document order.
    """
    kept = []
    for paragraph in docx.Document(docx_path).paragraphs:
        stripped = paragraph.text.strip()
        if stripped:
            kept.append(stripped)
    return "\n".join(kept)

# STEP 8: Evaluate OCR output

In [46]:
# ------------------------------------------------------
# STEP 8: Evaluate OCR output
# ------------------------------------------------------
def evaluate_text(predicted_text, ground_truth_text):
    """Compute and print case-insensitive CER and WER for an OCR result.

    Both texts are lower-cased before being passed to jiwer's ``cer`` and
    ``wer`` (ground truth first, prediction second).

    Args:
        predicted_text: Text produced by the OCR pipeline.
        ground_truth_text: Reference transcript.

    Returns:
        Tuple ``(cer_score, wer_score)`` as returned by jiwer.
    """
    reference = ground_truth_text.lower()
    hypothesis = predicted_text.lower()

    cer_score = cer(reference, hypothesis)
    wer_score = wer(reference, hypothesis)

    summary_lines = (
        "\n==================== OCR EVALUATION ====================",
        f"CER: {cer_score:.4f}",
        f"WER: {wer_score:.4f}",
        "========================================================",
    )
    for summary_line in summary_lines:
        print(summary_line)

    return cer_score, wer_score

In [47]:
!apt-get install -y poppler-utils


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.12).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [48]:
from pdf2image import convert_from_path


# STEP 9: Main execution

In [None]:
# ------------------------------------------------------
# STEP 9: Main execution
# ------------------------------------------------------
pdf_path = "15.pdf"  # input PDF of handwritten answer script
gt_path = "15.docx"  # ground truth DOCX file

# Derive the output file names from the input PDF so they always match the
# paper being processed (the OCR text used to go to a stale, hard-coded
# "008_trocr_output.txt" even though paper 15 was processed).
paper_id = os.path.splitext(os.path.basename(pdf_path))[0]
ocr_output_path = f"{paper_id}_trocr_output.txt"
report_path = f"{paper_id}_TrOCR_comparison_report.txt"

print("🚀 Starting OCR Pipeline")
page_texts = recognize_pdf(pdf_path)

# Combine recognized text (pages separated by a blank line)
recognized_text = "\n\n".join(page_texts)

# Save OCR text
with open(ocr_output_path, "w", encoding="utf-8") as f:
    f.write(recognized_text)
print(f"\n✅ OCR text saved to {ocr_output_path}")

# Load ground truth
ground_truth = read_docx_text(gt_path)
print("\n✅ Ground truth loaded from", gt_path)

# Evaluate (prints CER / WER)
evaluate_text(recognized_text, ground_truth)

# Save final report: recognized text followed by the ground truth
with open(report_path, "w", encoding="utf-8") as f:
    f.write("OCR ACCURACY REPORT (TrOCR - Handwritten)\n")
    f.write("=" * 60 + "\n")
    f.write(recognized_text + "\n\n")
    f.write("=" * 60 + "\nGround Truth:\n")
    f.write(ground_truth)
print(f"✅ Full report saved to {report_path}")


🚀 Starting OCR Pipeline

📄 Processing Page 1/3 ...
  Line 1: I
  Line 2: " 12/25 , " many
  Line 3: 2. 5.
  Line 4: 0 0 0 0
  Line 5: spares .


# STEP 10: Additional metrics (BLEU, ROUGE, BERTScore) and reporting

This section adds new metric computations and saves results to the requested
output folders and an Excel master file. The original code above is left exactly
as provided. The cells below add functionality without modifying the original code.

In [None]:
# Install additional required packages for BLEU / ROUGE / BERTScore and Excel handling
# (This cell is optional if your environment already has these installed)
!pip install -q sacrebleu rouge-score bert-score pandas openpyxl

In [None]:
# Additional imports for metrics, Excel handling and utilities
import re
import pandas as pd
import sacrebleu
from rouge_score import rouge_scorer
from bert_score import score as bertscore_score
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Improved paper number extractor (matches any digit sequence)
def extract_paper_number(pdf_path, gt_path):
    """Extract the paper number (first run of digits) from the input file names.

    The file-name stems are searched first (PDF, then ground truth) so that
    digits in parent directories, e.g. ``/data/2024/08.pdf``, are not mistaken
    for the paper number. If neither stem contains digits, the whole combined
    path string is searched, as the original implementation did.

    Args:
        pdf_path: Path of the scanned PDF (str or path-like; may be empty/None).
        gt_path: Path of the ground-truth file (str or path-like; may be empty/None).

    Returns:
        The matched digit string with leading zeros preserved, or the literal
        ``"papernumber"`` when no digits are found anywhere.
    """
    for candidate in (pdf_path, gt_path):
        # Only the file name without directories and extension is considered here.
        stem = os.path.splitext(os.path.basename(str(candidate or "")))[0]
        stem_match = re.search(r'(\d+)', stem)
        if stem_match:
            return stem_match.group(1)

    # Fallback: original behaviour, first digit run anywhere in either path.
    combined = f"{pdf_path} {gt_path}"
    m = re.search(r'(\d+)', combined)   # match 1+ digits
    return m.group(1) if m else "papernumber"

# Compute BLEU, ROUGE-L, BERTScore (returning error rates in percent)
def compute_additional_metrics(pred_text, gt_text, bert_lang='en', use_cuda=False):
    """Compute BLEU, ROUGE-L and BERTScore for one prediction/reference pair.

    Each metric is computed in its own ``try`` block; when a computation
    raises, the failure is printed and that score stays at 0.0 (so its error
    rate is reported as 100.0).

    Args:
        pred_text: OCR output text.
        gt_text: Ground-truth text.
        bert_lang: Language code passed to BERTScore.
        use_cuda: When true BERTScore runs on ``"cuda"``, otherwise ``"cpu"``.

    Returns:
        Dict with ``bleu_score`` (sacreBLEU score, 0-100), ``rouge_l_f``
        (ROUGE-L F-measure, 0-1), ``bert_f1`` (baseline-rescaled BERTScore F1)
        and the matching ``*_error`` values, each ``100 - score_in_percent``.
    """
    # BLEU: one hypothesis against one reference stream.
    bleu_value = 0.0
    try:
        bleu_value = sacrebleu.corpus_bleu([pred_text], [[gt_text]]).score  # 0-100
    except Exception as err:
        print("BLEU computation failed:", err)

    # ROUGE-L: F-measure with stemming enabled (reference first, prediction second).
    rouge_value = 0.0
    try:
        rouge_result = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(gt_text, pred_text)
        rouge_value = rouge_result['rougeL'].fmeasure  # 0-1
    except Exception as err:
        print("ROUGE computation failed:", err)

    # BERTScore: only the F1 of the single pair is kept.
    bert_value = 0.0
    try:
        target_device = "cuda" if use_cuda else "cpu"
        _, _, f1_values = bertscore_score(
            [pred_text],
            [gt_text],
            lang=bert_lang,
            rescale_with_baseline=True,
            device=target_device,
        )
        first_f1 = f1_values[0]
        # The entry may be a tensor (has .item()) or a plain number.
        if hasattr(first_f1, "item"):
            bert_value = float(first_f1.item())
        else:
            bert_value = float(first_f1)
    except Exception as err:
        print("BERTScore computation failed:", err)

    summary = {}
    summary['bleu_score'] = float(bleu_value)
    summary['bleu_error'] = float(100.0 - float(bleu_value))
    summary['rouge_l_f'] = float(rouge_value)
    summary['rouge_error'] = float(100.0 - (float(rouge_value) * 100.0))
    summary['bert_f1'] = float(bert_value)
    summary['bert_error'] = float(100.0 - (bert_value * 100.0))
    return summary

# Main wrapper function to compute metrics and save reports & Excel file
def _format_metrics_block(cer_score, wer_score, metrics):
    """Return the 'METRICS' section shared by the error and comparison reports."""
    return "".join([
        "=" * 60 + "\nMETRICS (errors shown as percentages):\n\n",
        f"Char Error rate (%): {cer_score*100:.4f}\n",
        f"Word Error rate (%): {wer_score*100:.4f}\n",
        f"Bleu Error rate (%): {metrics['bleu_error']:.4f}\n",
        f"Rouge Error rate (%): {metrics['rouge_error']:.4f}\n",
        f"Bert Error rate (%): {metrics['bert_error']:.4f}\n",
    ])


def generate_all_error_rates(predicted_text, ground_truth_text,
                             pdf_path=None, gt_path=None,
                             roll_number=None, traditional_model_name="Traditional OCR Model",
                             output_root="/content/TrOCR_updated_algo"):
    """Compute all error rates for one paper and write the reports and Excel master.

    Files written under ``output_root``:
      * ``error_report/<tag>_error_rates.txt`` - prediction, ground truth, metrics
      * ``output_text/<tag>_trocr_output.txt`` - the raw prediction
      * ``comparison_report/<tag>_TrOCR_comparison_report.txt``
      * ``error_report/error_rates_master.xlsx`` - one row is appended per call,
        so re-running for the same paper adds another row.

    Args:
        predicted_text: OCR output text.
        ground_truth_text: Reference transcript.
        pdf_path: Source PDF path; only used to derive the tag when
            ``roll_number`` is falsy.
        gt_path: Ground-truth path; used the same way as ``pdf_path``.
        roll_number: Explicit paper tag. When falsy the tag comes from
            ``extract_paper_number``.
        traditional_model_name: Value stored in the 'Traditional OCR Model' column.
        output_root: Directory under which the three output folders are created.

    Returns:
        Dict with ``cer``, ``wer``, the BLEU/ROUGE/BERT scores and error rates,
        and the four output paths as strings.
    """
    # Imported locally: the except handlers below call traceback.print_exc(),
    # but none of the notebook's import cells bring `traceback` into scope.
    import traceback

    # Choose paper number
    papernum_tag = str(roll_number) if roll_number else extract_paper_number(pdf_path or "", gt_path or "")

    # Directories
    root = Path(output_root)
    error_report_dir = root / "error_report"
    comparison_dir = root / "comparison_report"
    output_text_dir = root / "output_text"
    for directory in (error_report_dir, comparison_dir, output_text_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # Compute CER and WER via evaluate_text if present
    try:
        cer_score, wer_score = evaluate_text(predicted_text, ground_truth_text)
    except Exception as e:
        print("evaluate_text failed, recomputing with jiwer:", e)
        from jiwer import wer, cer
        cer_score = cer(ground_truth_text.lower(), predicted_text.lower())
        wer_score = wer(ground_truth_text.lower(), predicted_text.lower())

    # Additional metrics; on failure every extra metric is reported as 100% error
    try:
        metrics = compute_additional_metrics(predicted_text, ground_truth_text, use_cuda=torch.cuda.is_available())
    except Exception as e:
        print("compute_additional_metrics failed:", e)
        traceback.print_exc()
        metrics = {'bleu_score':0.0,'bleu_error':100.0,'rouge_l_f':0.0,'rouge_error':100.0,'bert_f1':0.0,'bert_error':100.0}

    # Paths for outputs
    error_report_path = error_report_dir / f"{papernum_tag}_error_rates.txt"
    trocr_out_path    = output_text_dir / f"{papernum_tag}_trocr_output.txt"
    comp_path         = comparison_dir / f"{papernum_tag}_TrOCR_comparison_report.txt"
    excel_path        = error_report_dir / "error_rates_master.xlsx"

    metrics_block = _format_metrics_block(cer_score, wer_score, metrics)

    # Write error report
    with open(error_report_path, "w", encoding="utf-8") as rf:
        rf.write(f"Paper: {papernum_tag}\n")
        rf.write("="*60 + "\n")
        rf.write("Predicted OCR Text:\n\n")
        rf.write(predicted_text + "\n\n")
        rf.write("="*60 + "\nGround Truth:\n\n")
        rf.write(ground_truth_text + "\n\n")
        rf.write(metrics_block)

    print(f"✅ Saved detailed error report to: {error_report_path}")

    # Save TrOCR output
    with open(trocr_out_path, "w", encoding="utf-8") as f:
        f.write(predicted_text)
    print(f"✅ Saved TrOCR output to: {trocr_out_path}")

    # Save comparison report
    with open(comp_path, "w", encoding="utf-8") as f:
        f.write("OCR ACCURACY REPORT (TrOCR - Handwritten)\n")
        f.write("=" * 60 + "\n")
        f.write(predicted_text + "\n\n")
        f.write("=" * 60 + "\nGround Truth:\n")
        f.write(ground_truth_text + "\n\n")
        f.write(metrics_block)

    print(f"✅ Saved comparison report to: {comp_path}")

    # Prepare row for Excel master
    row = {
        'Roll Number': papernum_tag,
        'Traditional OCR Model': traditional_model_name,
        'Word Error rate (%)': wer_score*100,
        'Char Error rate (%)': cer_score*100,
        'Bleu Error rate (%)': metrics['bleu_error'],
        'Rouge Error rate (%)': metrics['rouge_error'],
        'Bert Error rate (%)': metrics['bert_error']
    }

    # Read/append/create Excel; a failure here is reported but does not abort,
    # because the text reports have already been written.
    try:
        if excel_path.exists():
            df_master = pd.read_excel(excel_path)
            df_master = pd.concat([df_master, pd.DataFrame([row])], ignore_index=True)
        else:
            df_master = pd.DataFrame([row])
        df_master.to_excel(excel_path, index=False)
        print(f"✅ Excel master saved/updated at: {excel_path}")
    except Exception as e:
        print("Failed to write Excel master:", e)
        traceback.print_exc()

    return {
        'cer': cer_score,
        'wer': wer_score,
        'bleu_score': metrics['bleu_score'],
        'bleu_error': metrics['bleu_error'],
        'rouge_l_f': metrics['rouge_l_f'],
        'rouge_error': metrics['rouge_error'],
        'bert_f1': metrics['bert_f1'],
        'bert_error': metrics['bert_error'],
        'error_report_path': str(error_report_path),
        'comparison_report_path': str(comp_path),
        'trocr_output_path': str(trocr_out_path),
        'excel_path': str(excel_path)
    }

# ------------------------ CALL the function (ensure recognized_text & ground_truth exist) ------------------------
# Reuse pdf_path / gt_path from STEP 9 so the reports are always labelled with the paper that was
# actually recognized. The explicit roll_number (the PDF's file stem, e.g. "15") prevents
# "papernumber" filenames when the path contains no digits.
results = generate_all_error_rates(
    predicted_text=recognized_text,
    ground_truth_text=ground_truth,
    pdf_path=pdf_path,
    gt_path=gt_path,
    roll_number=os.path.splitext(os.path.basename(pdf_path))[0],
    traditional_model_name="Traditional OCR Model",
    output_root="/content/TrOCR_updated_algo"
)

# Print saved file locations
print("\n=== Saved outputs ===")
print("Error report:", results['error_report_path'])
print("Comparison report:", results['comparison_report_path'])
print("TrOCR output:", results['trocr_output_path'])
print("Excel master:", results['excel_path'])
print("=====================\n")

In [None]:
# # STEP 11: Run the extended evaluation (override the original file variables if you wish)
# # The original variables in the earlier cell remain unchanged. Here we point to the user's
# # ground truth and scanned PDF locations as requested.

# # Example: replace '08' with whichever paper number files you have in the specified folders.
# pdf_path = "08.pdf"  # input PDF of handwritten answer script
# gt_path = "08.docx"  # ground truth DOCX file

# print("🚀 Starting extended evaluation (this will call the existing recognize_pdf & read_docx_text functions)")

# # Recognize text using pre-existing recognize_pdf function
# page_texts = recognize_pdf(pdf_path)

# # Combine recognized text
# recognized_text = "\n\n".join(page_texts)

# # Save TrOCR output to the requested folder (the generate_all_error_rates also saves it again)
# # But we save a local intermediate copy here as well
# local_out = "008_trocr_output.txt"
# with open(local_out, "w", encoding="utf-8") as f:
#     f.write(recognized_text)
# print("\n✅ OCR text saved locally to", local_out)

# # Load ground truth (reuse existing function)
# ground_truth = read_docx_text(gt_path)
# print("\n✅ Ground truth loaded from", gt_path)

# # Call the combined metric generator (this will create the error_report, excel, comparison, and output_text files)
# results = generate_all_error_rates(recognized_text, ground_truth, pdf_path=pdf_path, gt_path=gt_path, traditional_model_name='YourTraditionalOCR')

# print("\n=== SUMMARY OF GENERATED METRICS ===")
# for k, v in results.items():
#     if k.endswith('_path'):
#         print(f"{k}: {v}")
#     else:
#         print(f"{k}: {v}")