In [32]:
!pip install -r requirements.txt



# **Header and Description**

In [31]:
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 38 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.11 [186 kB]
Fetched 186 kB in 0s (733 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126718 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.11_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.11) ...
Setting up poppler-utils (22.02.0-2ubuntu0.11) ...
Processing triggers for man-db (2.10.2-1) ...


In [33]:
# ======================================================
#  TrOCR Handwritten OCR Pipeline for Scanned PDF Scripts
# ======================================================
# Author: OpenAI Assistant (GPT-5)
# Description:
# - Convert PDF pages to images
# - Preprocess (grayscale, threshold, denoise)
# - Segment handwritten lines
# - Run TrOCR model for text recognition
# - Compare output with ground truth from DOCX
# ======================================================

# --- STEP 0: Install dependencies (uncomment if needed) ---
# !pip install opencv-python pillow numpy transformers torch torchvision jiwer matplotlib pdf2image python-docx


# **Imports**

In [34]:
import os
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from jiwer import wer, cer
from pdf2image import convert_from_path
import docx
import torch


# **STEP 1: Preprocess Scanned Page**

In [35]:
# ------------------------------------------------------
# STEP 1: Preprocess scanned page
# ------------------------------------------------------
def preprocess_image(pil_image):
    """Preprocess scanned handwritten page: grayscale, normalize illumination, threshold."""
    img = np.array(pil_image.convert("RGB"))
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Normalize illumination using morphological closing
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15))
    bg = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, kernel)
    norm = cv2.divide(gray, bg, scale=255)

    # Adaptive threshold
    binary = cv2.adaptiveThreshold(norm, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                   cv2.THRESH_BINARY_INV, 25, 15)

    # Denoise small specks
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8))
    return binary


# **STEP 2: Line Segmentation**

In [36]:
# ------------------------------------------------------
# STEP 2: Line segmentation
# ------------------------------------------------------
def segment_lines(binary_img, min_height=20):
    """Segment text lines from binary image."""
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (100, 3))
    dilated = cv2.dilate(binary_img, kernel, iterations=1)

    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    lines = []
    for cnt in sorted(contours, key=lambda x: cv2.boundingRect(x)[1]):
        x, y, w, h = cv2.boundingRect(cnt)
        if h > min_height:
            line = binary_img[y:y + h, x:x + w]
            lines.append(line)
    return lines


# **STEP 3: Visualize Lines (Optional)**

In [37]:
# ------------------------------------------------------
# STEP 3: Visualize segmented lines (optional)
# ------------------------------------------------------
def visualize_lines(lines, max_lines=5):
    plt.figure(figsize=(10, 10))
    for i, l in enumerate(lines[:max_lines]):
        plt.subplot(max_lines, 1, i + 1)
        plt.imshow(255 - l, cmap='gray')
        plt.axis('off')
    plt.show()


# **STEP 4: Initialize TrOCR Model**

In [38]:
# ------------------------------------------------------
# STEP 4: Initialize TrOCR model
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(device)


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **STEP 5: Recognize a Single Line**

In [39]:
# ------------------------------------------------------
# STEP 5: Recognize a single line
# ------------------------------------------------------
def recognize_line(line_img):
    """Recognize handwritten text line using TrOCR."""
    image = Image.fromarray(255 - line_img).convert("RGB")  # invert for TrOCR
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
    generated_ids = model.generate(pixel_values)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return text.strip()


# **STEP 6: Process a Full PDF**

In [40]:
# ------------------------------------------------------
# STEP 6: Process a full PDF
# ------------------------------------------------------
def recognize_pdf(pdf_path, dpi=300):
    """Convert PDF to images, preprocess, segment, recognize each page."""
    pages = convert_from_path(pdf_path, dpi=dpi)
    all_page_texts = []

    for i, page in enumerate(pages, 1):
        print(f"\n📄 Processing Page {i}/{len(pages)} ...")
        binary = preprocess_image(page)
        lines = segment_lines(binary)

        page_texts = []
        for idx, line in enumerate(lines):
            txt = recognize_line(line)
            page_texts.append(txt)
            print(f"  Line {idx + 1}: {txt}")

        all_page_texts.append("\n".join(page_texts))
    return all_page_texts


# **STEP 7: Read Ground Truth**

In [41]:
# ------------------------------------------------------
# STEP 7: Read ground truth from DOCX
# ------------------------------------------------------
def read_docx_text(docx_path):
    doc = docx.Document(docx_path)
    text = "\n".join(p.text.strip() for p in doc.paragraphs if p.text.strip())
    return text


# **STEP 8: Evaluate OCR Output**

In [42]:
# ------------------------------------------------------
# STEP 8: Evaluate OCR output
# ------------------------------------------------------
def evaluate_text(predicted_text, ground_truth_text):
    cer_score = cer(ground_truth_text.lower(), predicted_text.lower())
    wer_score = wer(ground_truth_text.lower(), predicted_text.lower())
    print("\n==================== OCR EVALUATION ====================")
    print(f"CER: {cer_score:.4f}")
    print(f"WER: {wer_score:.4f}")
    print("========================================================")
    return cer_score, wer_score


# **STEP 9: Main Execution**

In [43]:
# ------------------------------------------------------
# STEP 9: Main execution
# ------------------------------------------------------
pdf_path = "008_scanned.pdf"  # input PDF of handwritten answer script
gt_path = "008_ground_truth.docx"  # ground truth DOCX file

print("🚀 Starting OCR Pipeline")
page_texts = recognize_pdf(pdf_path)

# Combine recognized text
recognized_text = "\n\n".join(page_texts)

# Save OCR text
with open("008_trocr_output.txt", "w", encoding="utf-8") as f:
    f.write(recognized_text)
print("\n✅ OCR text saved to 008_trocr_output.txt")

# Load ground truth
ground_truth = read_docx_text(gt_path)
print("\n✅ Ground truth loaded from", gt_path)

# Evaluate
evaluate_text(recognized_text, ground_truth)

# Save final report
with open("008_TrOCR_comparison_report.txt", "w", encoding="utf-8") as f:
    f.write("OCR ACCURACY REPORT (TrOCR - Handwritten)\n")
    f.write("=" * 60 + "\n")
    f.write(recognized_text + "\n\n")
    f.write("=" * 60 + "\nGround Truth:\n")
    f.write(ground_truth)
print("✅ Full report saved to 008_TrOCR_comparison_report.txt")


🚀 Starting OCR Pipeline

📄 Processing Page 1/4 ...
  Line 1: " " I did not go out . "
  Line 2: " " I understand it
  Line 3: a b.a.a.a.b.a.a.a.a.a
  Line 4: techno main Salt Lake .
  Line 5: " Formerly , techno India , Salt Lake ) ... .
  Line 6: " 51, " D. 0 )
  Line 7: Nameafter graduating with the
  Line 8: " " K. " -
  Line 9: Roll No. " 12030822008____0000sroom .CEA.A.200pick
  Line 10: 1952 53
  Line 11: 2 Part A.
  Line 12: # the most common expenials leaving ducks are- Classifications ,
  Line 13: 0,
  Line 14: mini bottoms . " Let
  Line 15: " " " ... " " ... . "
  Line 16: 3 Language judgement personnel in CO ) and this form ( or just the
  Line 17: " 7. ) " 5.5 :
  Line 18: a member of the Spanish Spanish American diplomat
  Line 19: # 0 -
  Line 20: This is now noxious mind since its return to
  Line 21: and it was the best of the best of the best of the best of the best of the
  Line 22: 0.
  Line 23: Part B.
  Line 24: " 2"
  Line 25: " I'll get it to it ... ... . "
  Li