<a href="https://colab.research.google.com/github/CUHK-DH-Lab/CUHK-DH-Lab.github.io/blob/main/MendozaOCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import fitz  # PyMuPDF
import os

# Define the PDF file path
pdf_file = 'bub_gb_6QiHhHwp8MMC.pdf'

# Create a directory to save the extracted images (one PNG per page)
output_folder = 'extracted_images_for_ocr'
os.makedirs(output_folder, exist_ok=True)

# Rendering resolution. 300 DPI is a common choice for OCR; go higher (e.g. 600)
# if the print is small. The zoom factor is expressed relative to 72 units per inch,
# and the same matrix is reused for every page instead of being rebuilt in the loop.
dpi = 300
zoom = dpi / 72.0
mat = fitz.Matrix(zoom, zoom)

print(f"Opening PDF: {pdf_file}")

try:
    # The context manager closes the document even when rendering or saving
    # a page raises, so the file handle is never leaked.
    with fitz.open(pdf_file) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            # Render the page to a raster image at the chosen resolution
            pix = page.get_pixmap(matrix=mat)

            # Zero-padded, 1-based page number keeps the files sortable by name
            output_image_path = os.path.join(output_folder, f"page_{page_num+1:04d}.png")

            pix.save(output_image_path)
            print(f"Saved {output_image_path}")

    print(f"Image extraction complete. Images saved to '{output_folder}' directory.")

except FileNotFoundError:
    print(f"Error: The file '{pdf_file}' was not found. Please make sure it's uploaded or correctly path-configured.")
except Exception as e:
    print(f"An error occurred: {e}")

Opening PDF: bub_gb_6QiHhHwp8MMC.pdf
Saved extracted_images_for_ocr/page_0001.png
Saved extracted_images_for_ocr/page_0002.png
Saved extracted_images_for_ocr/page_0003.png
Saved extracted_images_for_ocr/page_0004.png
Saved extracted_images_for_ocr/page_0005.png
Saved extracted_images_for_ocr/page_0006.png
Saved extracted_images_for_ocr/page_0007.png
Saved extracted_images_for_ocr/page_0008.png
Saved extracted_images_for_ocr/page_0009.png
Saved extracted_images_for_ocr/page_0010.png
Saved extracted_images_for_ocr/page_0011.png
Saved extracted_images_for_ocr/page_0012.png
Saved extracted_images_for_ocr/page_0013.png
Saved extracted_images_for_ocr/page_0014.png
Saved extracted_images_for_ocr/page_0015.png
Saved extracted_images_for_ocr/page_0016.png
Saved extracted_images_for_ocr/page_0017.png
Saved extracted_images_for_ocr/page_0018.png
Saved extracted_images_for_ocr/page_0019.png
Saved extracted_images_for_ocr/page_0020.png
Saved extracted_images_for_ocr/page_0021.png
Saved extracted_im

In [None]:
# Block 1: Setup, Authentication, and Function Definitions (Run this after PyMuPDF code)

import os
import io
import json
import vertexai
import asyncio
from PIL import Image
# Import necessary Vertex AI libraries
from vertexai.generative_models import GenerativeModel, Part, Image as VertexImage
from vertexai.preview.generative_models import Image as PreviewImage # Fallback

# -----------------------------------------------------------
# --- Configuration Variables: UPDATE THESE ---
# -----------------------------------------------------------

# Path to your uploaded GCP JSON key file.
# NOTE(review): this is a Colab-style absolute path and the file name appears to embed
# the service-account key ID; consider reading the path from an environment variable.
JSON_FILE_PATH = '/content/renaissance-ocr-4aabe5b8dc65.json'

# Folder holding the page images written by the PDF-rendering cell above
INPUT_IMAGE_FOLDER = 'extracted_images_for_ocr'

# GCP project details; PROJECT_ID and LOCATION are passed to vertexai.init() below
PROJECT_ID = 'renaissance-ocr'
LOCATION = 'global'
GEMINI_MODEL = 'gemini-2.5-flash' # Using flash for speed, can change to pro
# Text file that collects the per-page OCR results
OUTPUT_TEXT_FILE = 'Extracted_OCR_Text_Spanish_Full_Async.txt'

# Instruction prompt (written in Spanish) tailored for Early Modern Spanish.
# It is sent to the model together with each page image.
# NOTE(review): the prompt talks about correcting OCR errors in an "input text", but
# process_image_with_gemini_async sends only this prompt plus the image — confirm the
# wording is what is wanted for image-only transcription.
# NOTE(review): rule 6 contains the typo "Emimina" (presumably "Elimina"); left as-is
# because the string is sent to the model verbatim.
OCR_PROMPT = """
Eres un corrector experto de textos en español moderno temprano. TU TAREA es corregir ÚNICAMENTE los errores de OCR en el texto de entrada basándose en la imagen proporcionada. No reescribas ni alteres el uso auténtico del período.

REGLAS ESPECÍFICAS PARA ESPAÑOL MODERNO TEMPRANO:
1.  **Preservar el texto original (uso auténtico):** Mantén la redacción, gramática, sintaxis, y la ortografía histórica original. Corrige SOLAMENTE errores evidentes de OCR.
2.  **Normalización mínima:** Mantén el uso original de `u`/`v`, `i`/`j`. Reemplazar `&` con `y`. Asegurar que la `ñ` esté correcta.  ſ = s, ß = ss, ç =z or c
3.  **Desguionizado e Hiphenación:** Elimina todos los guiones de final de línea que unan una palabra partida.
4.  **Formato:** Conserva todas las pausas de párrafo y la capitalización exactamente igual que en la entrada. Elimina mobiliario de página (e.g., "A ij", signaturas).
5.  **Salida:** Produce ÚNICAMENTE el texto en español corregido. NO añadas comentarios, explicaciones, ni formato adicional.
6. **Emimina números de página de OCR** eliminar cualquier mención de números de página de OCR
"""
# -----------------------------------------------------------

# -----------------------------------------------------------
# --- Authenticate and Initialize the Vertex AI Client ---
# -----------------------------------------------------------
# Point the Google client libraries at the service-account key, then create the
# module-level `model` object used by the OCR functions defined below.
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found at: {JSON_FILE_PATH}.")
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    # Initialize the model globally
    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully for project '{PROJECT_ID}'.")
except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # Re-raise so the cell stops with the full traceback. Calling exit() inside a
    # notebook asks the kernel itself to shut down, which discards all state and
    # hides the original error.
    raise

# -----------------------------------------------------------
# --- Define Asynchronous OCR Function ---
# -----------------------------------------------------------

# ASYNC function to call the API
async def process_image_with_gemini_async(image_path):
    """Send one page image plus OCR_PROMPT to the Gemini model and return its text.

    Errors are never raised to the caller; they are reported as the text element
    of the returned tuple instead.

    Args:
        image_path: Path of the local image file to transcribe.

    Returns:
        A ``(text, image_path)`` tuple where ``text`` is one of:
        the model's response text; ``"No text detected"`` when the response text
        is empty; ``"Error loading image ..."`` when the image cannot be loaded;
        or ``"API call failed for ..."`` when the model call raises.
    """
    # Stage 1: read the image from disk; a failure here is reported, not raised.
    try:
        page_image = VertexImage.load_from_file(image_path)
    except Exception as load_error:
        return f"Error loading image {image_path}: {load_error}", image_path

    request_parts = [OCR_PROMPT, page_image]

    # Stage 2: non-blocking model call; an empty reply is replaced by a placeholder.
    try:
        reply = await model.generate_content_async(request_parts)
        transcription = reply.text or "No text detected"
    except Exception as api_error:
        return f"API call failed for {image_path}: {api_error}", image_path

    return transcription, image_path


Found 452 previously completed images in Extracted_OCR_Text_Spanish_Full_Async.txt. Resuming pipeline.
All images already processed successfully. No new tasks to run.


In [None]:
import os
import asyncio
# Assuming GEMINI_MODEL, INPUT_IMAGE_FOLDER, OUTPUT_TEXT_FILE,
# and process_image_with_gemini_async are defined in previous cells/blocks.

# Output file that is scanned for already-completed pages and then appended to.
# It is re-declared here so this cell always checks this specific file; the name
# must match the file the earlier run wrote, otherwise nothing is seen as completed.
OUTPUT_TEXT_FILE = 'Extracted_OCR_Text_Spanish_Full_Async.txt'

async def main_async_pipeline_resume_from_images():
    """Run Gemini OCR on every page image that is not yet in the output file.

    Lists the ``.png``/``.jpeg`` files in INPUT_IMAGE_FOLDER, scans
    OUTPUT_TEXT_FILE for ``--- Start of OCR from <name> ---`` markers, and sends
    only the images without a marker to ``process_image_with_gemini_async``.
    Successful results are appended to the file in file-name order.

    A result whose text is one of the error strings produced by
    ``process_image_with_gemini_async`` is NOT written. The page therefore gets
    no marker and is retried the next time this cell runs, instead of being
    silently recorded as completed.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    import re  # local import: this cell's import block does not include re

    max_in_flight = 5  # cap on simultaneous API calls so the quota is not flooded

    def _is_failure(text, image_path):
        # process_image_with_gemini_async reports errors as text with these prefixes
        return text.startswith((
            f"Error loading image {image_path}:",
            f"API call failed for {image_path}:",
        ))

    # 1. Collect every candidate image, sorted by name
    image_files_list = sorted(
        os.path.join(INPUT_IMAGE_FOLDER, f)
        for f in os.listdir(INPUT_IMAGE_FOLDER)
        if f.endswith(('.png', '.jpeg'))
    )

    if not image_files_list:
        print(f"Pipeline halted. No images found in '{INPUT_IMAGE_FOLDER}'.")
        return

    # 2. Check the existing output file for completed pages
    completed_images = set()
    if os.path.exists(OUTPUT_TEXT_FILE):
        with open(OUTPUT_TEXT_FILE, 'r', encoding='utf-8') as f_out:
            content = f_out.read()
        # Accept any .png/.jpeg name so the marker scan agrees with the file filter above
        completed_images = set(
            re.findall(r"--- Start of OCR from (.+?\.(?:png|jpeg)) ---", content)
        )
        print(f"Found {len(completed_images)} previously completed images in {OUTPUT_TEXT_FILE}. Resuming pipeline.")
    else:
        # Create a new file if it doesn't exist so later appends have a header
        with open(OUTPUT_TEXT_FILE, 'w', encoding='utf-8') as f:
            f.write(f"Gemini OCR Results [ASYNC PROCESSING] - Initializing new run.\n" + "=" * 60 + "\n")
        print(f"No existing file found. Starting a fresh OCR run.")

    # 3. Keep only the images that have no marker yet
    images_to_process = [
        img_path for img_path in image_files_list
        if os.path.basename(img_path) not in completed_images
    ]

    if not images_to_process:
        print("All images already processed successfully. No new tasks to run.")
        return

    print(f"\nStarting ASYNC Gemini OCR process for {len(images_to_process)} remaining images using {GEMINI_MODEL}...")

    # The semaphore is created inside the running coroutine and limits how many
    # requests are in flight at the same time.
    gate = asyncio.Semaphore(max_in_flight)

    async def _bounded(image_path):
        async with gate:
            return await process_image_with_gemini_async(image_path)

    try:
        results = await asyncio.gather(*(_bounded(p) for p in images_to_process))
    except Exception as e:
        print(f"\nAn error occurred during an async task: {e}")
        print("The pipeline stopped prematurely. You can run this cell again to resume from the last completed page.")
        return

    # 4. Append new results to the existing file in file-name order
    print(f"\n--- Appending New Gemini OCR Results ---")

    sorted_new_results = sorted(results, key=lambda x: x[1])
    failed_pages = []
    written = 0

    with open(OUTPUT_TEXT_FILE, 'a', encoding='utf-8') as f_out:
        for text, image_path in sorted_new_results:
            page_name = os.path.basename(image_path)
            if _is_failure(text, image_path):
                # Leave no marker behind so the next run picks this page up again
                failed_pages.append(page_name)
                print(f"  -> FAILED {page_name}: {text}")
                continue
            section_text = f"\n\n--- Start of OCR from {page_name} ---\n\n{text}\n\n--- End of OCR from {page_name} ---\n\n"
            f_out.write(section_text)
            written += 1
            print(f"  -> Appended results for {page_name}")

    print(f"\nAll newly extracted text has been appended to: {OUTPUT_TEXT_FILE}")
    # Previously completed pages plus the ones written in this run
    done_total = len(image_files_list) - len(images_to_process) + written
    print(f"Total images processed so far: {done_total} of {len(image_files_list)}")
    if failed_pages:
        print(f"{len(failed_pages)} page(s) failed and were not written; run this cell again to retry: {', '.join(failed_pages)}")


# Execute the new async main function in the notebook environment.
# A bare top-level `await` relies on the notebook running the cell inside an
# event loop; a plain Python script would need asyncio.run(...) instead.
await main_async_pipeline_resume_from_images()


Found 452 previously completed images in Extracted_OCR_Text_Spanish_Full_Async.txt. Resuming pipeline.
All images already processed successfully. No new tasks to run.


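Some pages are missing from the output, so the pipeline needs to be more robust: the next cells add retries with exponential backoff and a cap on concurrent requests.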

In [None]:
!pip install tenacity




In [None]:
import os
import asyncio
import re
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from google.api_core.exceptions import ResourceExhausted, ServiceUnavailable, InternalServerError
# Ensure all previous imports (like VertexImage, OCR_PROMPT, etc.) from Block 1 are still available

# --- Configuration for Resilience ---
# Upper bound on how many Gemini requests may be in flight at the same time
MAX_CONCURRENT_REQUESTS = 5
# Exception types that the @retry decorator below treats as retryable;
# any other exception is raised on the first attempt.
RETRYABLE_EXCEPTIONS = (ResourceExhausted, ServiceUnavailable, InternalServerError)
# Shared by every OCR task in this notebook session to enforce the limit above
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
# -----------------------------------

@retry(
    retry=retry_if_exception_type(RETRYABLE_EXCEPTIONS),
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    reraise=True,
)
async def process_image_with_gemini_async_robust(image_path):
    """Transcribe one page image with Gemini, retrying on retryable API errors.

    The retry decorator re-invokes this coroutine for exceptions listed in
    RETRYABLE_EXCEPTIONS, up to 5 attempts with exponential waits bounded to
    4-60 seconds; with ``reraise=True`` the last exception is raised unchanged.
    The shared semaphore is acquired anew on every attempt.

    Args:
        image_path: Path of the local image file to transcribe.

    Returns:
        A ``(text, image_path)`` tuple; ``text`` is ``""`` when the response
        text is empty. The path is returned so callers can sort the results.

    Raises:
        Any exception from loading the image or calling the model, after it has
        been logged with ``print``.
    """
    async with semaphore:
        try:
            page_image = VertexImage.load_from_file(image_path)
            reply = await model.generate_content_async([OCR_PROMPT, page_image])
            return reply.text or "", image_path
        except Exception as attempt_error:
            # Log every failed attempt, then hand the error to the retry decorator
            print(f"-> Attempt failed for {os.path.basename(image_path)}: {attempt_error}")
            raise


async def main_async_pipeline_rerun_all_ordered():
    """Re-run Gemini OCR on every page image and write one ordered output file.

    Lists the ``.png``/``.jpeg`` files in INPUT_IMAGE_FOLDER in sorted order,
    runs ``process_image_with_gemini_async_robust`` on all of them concurrently,
    and writes the results to OUTPUT_TEXT_FILE in page order.

    The existing output file is only replaced once every page has succeeded.
    If any page fails permanently, the failures are listed and the previous
    file is left untouched, so a failed rerun never destroys earlier results.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # 1. Collect all images in sorted order
    image_files_list = sorted(
        os.path.join(INPUT_IMAGE_FOLDER, f)
        for f in os.listdir(INPUT_IMAGE_FOLDER)
        if f.endswith(('.png', '.jpeg'))
    )

    if not image_files_list:
        print(f"Pipeline halted. No images found in '{INPUT_IMAGE_FOLDER}'.")
        return

    print(f"\nStarting ASYNC Gemini OCR process for ALL {len(image_files_list)} images (Results will be stored in memory first)...")

    # 2. Create all tasks
    tasks = [process_image_with_gemini_async_robust(image_path) for image_path in image_files_list]

    # 3. Run tasks concurrently. return_exceptions=True waits for every task to
    # finish and hands failures back as values, so no task is left running in the
    # background and every failed page can be reported, not just the first.
    outcomes = await asyncio.gather(*tasks, return_exceptions=True)

    failures = [
        (image_path, outcome)
        for image_path, outcome in zip(image_files_list, outcomes)
        if isinstance(outcome, BaseException)
    ]
    if failures:
        print(f"\nFATAL PIPELINE ERROR: A permanent error occurred in one or more tasks after all retries. Error: {failures[0][1]}")
        for image_path, error in failures:
            print(f"  -> {os.path.basename(image_path)}: {error!r}")
        print(f"{len(failures)} of {len(image_files_list)} pages failed. No output was written; any existing '{OUTPUT_TEXT_FILE}' was left untouched.")
        return

    print(f"\n--- All {len(outcomes)} images processed successfully. Sorting and saving... ---")

    # 4. gather() returns results in the order of `tasks`, which follows the sorted
    # file list, so `outcomes` is already in page order.
    sorted_results = outcomes

    # 5. Write ALL results in one pass; mode 'w' replaces any previous file only now
    with open(OUTPUT_TEXT_FILE, 'w', encoding='utf-8') as f_out:
        header = f"Gemini OCR Results for the full document ({len(sorted_results)} pages) [ASYNC PROCESSING - ORDERED OUTPUT]\n" + "=" * 60 + "\n"
        f_out.write(header)

        for text, image_path in sorted_results:
            section_text = f"\n\n--- Start of OCR from {os.path.basename(image_path)} ---\n\n{text}\n\n--- End of OCR from {os.path.basename(image_path)} ---\n\n"
            f_out.write(section_text)

    print(f"\n--- Processing Summary ---")
    print(f"Total tasks attempted and saved: {len(image_files_list)}")
    print(f"All extracted text has been saved sequentially to: {OUTPUT_TEXT_FILE}")


# Execute the main function to rerun all images.
# A bare top-level `await` relies on the notebook running the cell inside an
# event loop; a plain Python script would need asyncio.run(...) instead.
await main_async_pipeline_rerun_all_ordered()


Deleted existing output file 'Extracted_OCR_Text_Spanish_Full_Async.txt' to force a full rerun.

Starting ASYNC Gemini OCR process for ALL 452 images (Results will be stored in memory first)...
-> Attempt failed for page_0020.png: 429 Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429 for more details.
-> Attempt failed for page_0260.png: 429 Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429 for more details.
-> Attempt failed for page_0001.png: 429 Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429 for more details.
-> Attempt failed for page_0021.png: 429 Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429 for more details.
-> Attempt failed for page_0406.png: 429 Resource exha

CancelledError: 

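This approach does not hold up when the API starts rejecting requests (429 Resource exhausted). Instead, run Google Cloud Vision OCR on each page first, then use Gemini only to polish the resulting text.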

In [None]:
# --- IMPORTS AND CONFIGURATION (Corrected) ---
import os
import io
import json
import vertexai
import asyncio
import re
# Make sure these are installed: !pip install --upgrade google-cloud-vision vertexai google-cloud tenacity
from google.cloud import vision
from vertexai.generative_models import GenerativeModel, Part, Image as VertexImage
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from google.api_core.exceptions import ResourceExhausted, ServiceUnavailable, InternalServerError

# --- Configuration Variables: UPDATE THESE ---
# NOTE(review): Colab-style absolute path to the service-account key file; consider
# reading it from an environment variable instead of hardcoding it.
JSON_FILE_PATH = '/content/renaissance-ocr-4aabe5b8dc65.json'
# Folder holding the page images rendered from the PDF
INPUT_IMAGE_FOLDER = 'extracted_images_for_ocr'
PROJECT_ID = 'renaissance-ocr'

# Region passed to vertexai.init() below. An earlier cell used 'global'; this cell
# switches to a regional endpoint.
# NOTE(review): whether 'global' was actually the cause of the earlier failures is
# not shown in this notebook — confirm.
LOCATION = 'us-central1'

GEMINI_MODEL = 'gemini-2.5-flash'
# Separate output file for the Vision + Gemini pipeline, so results from the
# earlier Gemini-only runs are not overwritten
OUTPUT_TEXT_FILE = 'Extracted_OCR_Text_Spanish_Full_Async_Vision_Polished.txt'

# Polishing prompt for Gemini. It is sent together with the Vision OCR text
# (the "input text" the prompt refers to) and the page image.
OCR_PROMPT = """
You are an expert text corrector in early modern Spanish. YOUR TASK is to correct ONLY OCR errors in the input text based on the provided image. Do not rewrite or alter the authentic use of the period.

SPECIFIC RULES FOR EARLY MODERN SPANISH:
1.  **Preserve the original text (authentic use):** Maintain the original wording, grammar, syntax, and historical spelling. Correct ONLY obvious OCR errors.
2.  **Minimum normalization:** Maintain the original use of `u`/`v`, `i`/`j`. Replace `&` with `y`. Ensure that `ñ` is correct. ſ = s, ß = ss, ç = z or c
3.  **Dehyphenation and Hyphenation:** Remove all end-of-line hyphens that join a split word.
4.  **Formatting:** Preserve all paragraph breaks and capitalization exactly as in the input. Remove page furniture (e.g., "A ij", signatures).
5.  **Output:** Produce ONLY the corrected Spanish text. DO NOT add comments, explanations, or additional formatting.
6. **Remove OCR page numbers** remove any mention of OCR page numbers
"""
# --- Resilience Configuration ---
# Upper bound on how many Gemini requests may be in flight at the same time
MAX_CONCURRENT_REQUESTS = 5
# Exception types the @retry decorator below treats as retryable
RETRYABLE_EXCEPTIONS = (ResourceExhausted, ServiceUnavailable, InternalServerError)
# Shared by every polishing task to enforce MAX_CONCURRENT_REQUESTS
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

# --- Authentication and Client Initialization ---
# Point the Google client libraries at the service-account key, then create the
# module-level `model` and `vision_client` objects used by the functions below.
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found at: {JSON_FILE_PATH}.")
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH

    # Initialize Vertex AI with the configured region
    vertexai.init(project=PROJECT_ID, location=LOCATION)

    model = GenerativeModel(GEMINI_MODEL)
    # Created after GOOGLE_APPLICATION_CREDENTIALS is set, so the client can
    # pick up the service-account key
    vision_client = vision.ImageAnnotatorClient()
    print(f"Gemini and Cloud Vision clients initialized correctly for region: {LOCATION}.")
except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # Re-raise so the cell stops with the full traceback. Calling exit() inside a
    # notebook asks the kernel itself to shut down, which discards all state and
    # hides the original error.
    raise

# --- Definition of Processing Functions ---
def run_vision_ocr(image_path):
    """Run Google Cloud Vision document text detection on one image (synchronous).

    Uses the module-level ``vision_client`` created in the initialization step.

    Args:
        image_path: Path of the local image file to send to the Vision API.

    Returns:
        The ``full_text_annotation.text`` string of the Vision response.

    Raises:
        RuntimeError: If the response carries a non-empty error message.
        OSError: If the image file cannot be opened or read.
    """
    with open(image_path, 'rb') as image_file:
        content = image_file.read()
    image = vision.Image(content=content)
    response = vision_client.document_text_detection(image=image)
    # The API reports per-request problems inside the response, not as an exception
    if response.error.message:
        # RuntimeError is still an Exception, so existing `except Exception`
        # callers keep working, while new callers can catch it specifically.
        raise RuntimeError(f"Vision API Error: {response.error.message}")
    return response.full_text_annotation.text

@retry(
    retry=retry_if_exception_type(RETRYABLE_EXCEPTIONS),
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    reraise=True,
)
async def polish_text_with_gemini_async(image_path, initial_ocr_text):
    """Ask Gemini to correct the Vision OCR text of one page against its image.

    The request consists of OCR_PROMPT, the Vision text wrapped in an
    "INPUT TEXT" section, and the page image. The retry decorator re-invokes
    this coroutine for exceptions listed in RETRYABLE_EXCEPTIONS, up to 5
    attempts with exponential waits bounded to 4-60 seconds; with
    ``reraise=True`` the last exception is raised unchanged.

    Args:
        image_path: Path of the local page image.
        initial_ocr_text: Text produced for this page by Google Cloud Vision.

    Returns:
        A ``(text, image_path)`` tuple; ``text`` is ``""`` when the response
        text is empty.

    Raises:
        Any exception from loading the image or calling the model, after it has
        been logged with ``print``.
    """
    async with semaphore:
        try:
            page_image = VertexImage.load_from_file(image_path)
            vision_text_part = f"\n\nINPUT TEXT (from Google Vision):\n{initial_ocr_text}\n\n"
            reply = await model.generate_content_async(
                [OCR_PROMPT, vision_text_part, page_image]
            )
            return reply.text or "", image_path
        except Exception as attempt_error:
            # Log every failed attempt, then hand the error to the retry decorator
            print(f"-> Gemini polishing attempt failed for {os.path.basename(image_path)}: {attempt_error}")
            raise

# --- MAIN EXECUTION PIPELINE FUNCTION ---

async def main_async_pipeline_vision_polish_ordered():
    """Run the two-phase OCR pipeline and write the pages in page order.

    Phase 1 calls ``run_vision_ocr`` on every ``.png``/``.jpeg`` file in
    INPUT_IMAGE_FOLDER, one after another. Phase 2 sends each Vision text with
    its image to ``polish_text_with_gemini_async``, concurrently. The polished
    pages are then written to OUTPUT_TEXT_FILE ordered by image path.

    The existing output file is only replaced once both phases have succeeded
    for every page; on any failure it is left untouched.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    image_files_list = sorted(
        os.path.join(INPUT_IMAGE_FOLDER, f)
        for f in os.listdir(INPUT_IMAGE_FOLDER)
        if f.endswith(('.png', '.jpeg'))
    )

    if not image_files_list:
        print(f"Pipeline stopped. No images found in '{INPUT_IMAGE_FOLDER}'.")
        return

    print(f"\nStarting the two-phase OCR process for {len(image_files_list)} images.")

    # Phase 1: synchronous Vision OCR; the first failure aborts the run
    print("Running Google Vision OCR on all images...")
    vision_results = {}
    for image_path in image_files_list:
        try:
            vision_results[image_path] = run_vision_ocr(image_path)
        except Exception as e:
            print(f"Fatal Google Vision error in {os.path.basename(image_path)}: {e}")
            return

    # Phase 2: concurrent Gemini polishing
    print(f"\nStarting ASYNCHRONOUS polishing phase with Gemini using {GEMINI_MODEL}...")
    tasks = [
        polish_text_with_gemini_async(image_path, text)
        for image_path, text in vision_results.items()
    ]

    # return_exceptions=True waits for every task and hands failures back as
    # values, so no task is left running in the background and every failed
    # page can be reported, not just the first.
    outcomes = await asyncio.gather(*tasks, return_exceptions=True)

    failures = [
        (image_path, outcome)
        for image_path, outcome in zip(vision_results, outcomes)
        if isinstance(outcome, BaseException)
    ]
    if failures:
        print(f"\nFATAL ERROR IN PIPELINE: A permanent error occurred during Gemini polishing after retries. Error: {failures[0][1]}")
        for image_path, error in failures:
            print(f"  -> {os.path.basename(image_path)}: {error!r}")
        print(f"{len(failures)} of {len(image_files_list)} pages failed. No output was written; any existing '{OUTPUT_TEXT_FILE}' was left untouched.")
        return

    print(f"\n--- All images polished successfully. Ordering and saving... ---")

    # Order by image path (element 1 of each (text, image_path) tuple). Sorting the
    # tuples themselves would order the pages by their text content instead.
    sorted_polished_results = sorted(outcomes, key=lambda x: x[1])

    # Mode 'w' replaces any previous file only now that every page is available
    with open(OUTPUT_TEXT_FILE, 'w', encoding='utf-8') as f_out:
        header = f"Gemini Polished OCR Results ({len(sorted_polished_results)} pages) [VISION+GEMINI ORDERED OUTPUT]\n" + "=" * 80 + "\n"
        f_out.write(header)

        for text, image_path in sorted_polished_results:
            section_text = f"\n\n--- Start of OCR from {os.path.basename(image_path)} ---\n\n{text}\n\n--- End of OCR from {os.path.basename(image_path)} ---\n\n"
            f_out.write(section_text)

    print(f"\n--- Process Summary ---")
    print(f"Total images processed and saved: {len(image_files_list)}")
    print(f"All extracted text has been saved sequentially in: {OUTPUT_TEXT_FILE}")


# --- Execute the main function ---
# A bare top-level `await` relies on the notebook running the cell inside an
# event loop; a plain Python script would need asyncio.run(...) instead.
await main_async_pipeline_vision_polish_ordered()


Gemini and Cloud Vision clients initialized correctly for region: us-central1.

Starting the two-phase OCR process for 452 images.
Running Google Vision OCR on all images...

Starting ASYNCHRONOUS polishing phase with Gemini using gemini-2.5-flash...

--- All images polished successfully. Ordering and saving... ---

--- Process Summary ---
Total images processed and saved: 452
All extracted text has been saved sequentially in: Extracted_OCR_Text_Spanish_Full_Async_Vision_Polished.txt
