<a href="https://colab.research.google.com/github/CUHK-DH-Lab/CUHK-DH-Lab.github.io/blob/main/Google_Vision_OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Extracting Text from Images with Google Cloud Vision API (Python)
This notebook demonstrates how to use Google Cloud's Vision API to perform Optical Character Recognition (OCR) on an image file. We start with the basic text_detection feature and then switch to document_text_detection, which is optimized for dense text documents like scans or photos of pages.

In [None]:
!pip install google-cloud-vision requests Pillow


Collecting google-cloud-vision
  Downloading google_cloud_vision-3.12.0-py3-none-any.whl.metadata (9.8 kB)
Downloading google_cloud_vision-3.12.0-py3-none-any.whl (538 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m538.2/538.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-vision
Successfully installed google-cloud-vision-3.12.0


In [None]:
import io
import os
from google.cloud import vision
from google.oauth2 import service_account

# --- Configuration ---
# Service-account key file uploaded to the Colab runtime.
# Update this path if the filename is slightly different.
JSON_FILE_PATH = '/content/renaissance-ocr-4aabe5b8dc65.json'
# Image to OCR; a relative path is resolved against the current working directory.
IMAGE_FILENAME = 'IMG_2056.JPG'
# ---------------------

# -----------------------------------------------------------
# 1. Authenticate using the local service account key file
# -----------------------------------------------------------
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found at: {JSON_FILE_PATH}")

    # The library reads the key file directly from disk.
    credentials = service_account.Credentials.from_service_account_file(JSON_FILE_PATH)
    # Module-level client reused by the OCR function defined below.
    client = vision.ImageAnnotatorClient(credentials=credentials)
    print("Authentication successful using local JSON file.")

except Exception as e:
    print(f"Authentication failed: {e}")
    # Re-raise so the cell stops here. In a notebook kernel `exit()` requests a
    # kernel shutdown (losing all state) rather than just halting this cell.
    raise


# -----------------------------------------------------------
# 2. Define the function to perform OCR from a local file
# -----------------------------------------------------------
def detect_text_from_local_file(image_path):
    """Run Vision API text detection on a local image and print the recognised text.

    Uses the module-level ``client`` and ``vision`` names set up earlier in
    the notebook.

    Args:
        image_path: Path of the image file whose bytes are sent to the API.

    Returns:
        None. The detected text is printed. If ``image_path`` does not exist,
        an error message is printed and no API call is made.

    Raises:
        Exception: If the API response carries an error message.
    """
    if not os.path.exists(image_path):
        print(f"Error: The image file '{image_path}' was not found in the current directory.")
        return

    print(f"--- Attempting OCR on local image file: {image_path} ---")

    # Read the raw bytes; they are passed inline to the Vision API.
    with open(image_path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    api_response = client.text_detection(image=image)

    # Check for an API-level error before touching the results; otherwise a
    # failed request would first be reported as "No text detected."
    if api_response.error.message:
        error_url = 'https://cloud.google.com'
        raise Exception(
            f"{api_response.error.message} For more info on error messages, check: {error_url}"
        )

    texts = api_response.text_annotations
    if texts:
        print('\nDetected Text (Full):')
        print("-" * 20)
        # The first annotation's description is printed as the full text.
        print(texts[0].description)
        print("-" * 20)
    else:
        print('\nNo text detected.')


# -----------------------------------------------------------
# 3. Main execution block: Run the function
# -----------------------------------------------------------
if __name__ == "__main__":
    # Entry point: OCR the image named in the configuration section.
    detect_text_from_local_file(image_path=IMAGE_FILENAME)


Authentication successful using local JSON file.
--- Attempting OCR on local image file: IMG_2056.JPG ---

Detected Text (Full):
--------------------
MAXES
ठ
Lusiadis Leonina
Libri Duodecim.
Carmen SrCeroicum
reni (simo
Lusitania
Principi Petro
Dicatum.
Patre Ignatio Archamone
Neapolitano è Societate Jesu
in Goana Provincia operario
Je Rebus Gestis Lusitanorum
in Regionibus
Vlera Marinis
Compediosa Carratio.
--------------------
Authentication successful using local JSON file.
--- Attempting OCR on local image file: IMG_2056.JPG ---

Detected Text (Full):
--------------------
MAXES
ठ
Lusiadis Leonina
Libri Duodecim.
Carmen SrCeroicum
reni (simo
Lusitania
Principi Petro
Dicatum.
Patre Ignatio Archamone
Neapolitano è Societate Jesu
in Goana Provincia operario
Je Rebus Gestis Lusitanorum
in Regionibus
Vlera Marinis
Compediosa Carratio.
--------------------


In [None]:
import io
import os
import json
from google.cloud import vision
from google.oauth2 import service_account

# --- Configuration ---
# Service-account key file and the image to OCR (same values as the previous cell).
JSON_FILE_PATH = '/content/renaissance-ocr-4aabe5b8dc65.json'
IMAGE_FILENAME = 'IMG_2056.JPG'
# ---------------------

# -----------------------------------------------------------
# 1. Authenticate using the local service account key file
# -----------------------------------------------------------
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found at: {JSON_FILE_PATH}")

    credentials = service_account.Credentials.from_service_account_file(JSON_FILE_PATH)
    # Module-level client reused by the OCR function defined below.
    client = vision.ImageAnnotatorClient(credentials=credentials)
    print("Authentication successful using local JSON file.")

except Exception as e:
    print(f"Authentication failed: {e}")
    # Re-raise so the cell stops here. In a notebook kernel `exit()` requests a
    # kernel shutdown (losing all state) rather than just halting this cell.
    raise


# -----------------------------------------------------------
# 2. Define the function to perform OCR from a local file
# -----------------------------------------------------------
def detect_document_text_from_local_file(image_path):
    """Run Vision API document text detection on a local image and print the text.

    Uses the module-level ``client`` and ``vision`` names set up earlier in
    the notebook.

    Args:
        image_path: Path of the image file whose bytes are sent to the API.

    Returns:
        None. The detected text is printed. If ``image_path`` does not exist,
        an error message is printed and no API call is made.

    Raises:
        Exception: If the API response carries an error message.
    """
    if not os.path.exists(image_path):
        print(f"Error: The image file '{image_path}' was not found in the current directory.")
        return

    print(f"--- Attempting OCR on local image file using DOCUMENT mode: {image_path} ---")

    with open(image_path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)

    # Document mode: document_text_detection instead of text_detection.
    api_response = client.document_text_detection(image=image)

    # Check for an API-level error before touching the results; otherwise a
    # failed request would first be reported as "No text detected".
    if api_response.error.message:
        error_url = 'https://cloud.google.com'
        raise Exception(
            f"{api_response.error.message} For more info on error messages, check: {error_url}"
        )

    # In document mode the whole text is read from full_text_annotation.
    full_text = api_response.full_text_annotation.text

    if full_text:
        print('\nDetected Text (Full Document Mode):')
        print("-" * 20)
        print(full_text)
        print("-" * 20)
    else:
        print('\nNo text detected in document mode.')


# -----------------------------------------------------------
# 3. Main execution block: Run the function
# -----------------------------------------------------------
if __name__ == "__main__":
    # Entry point: OCR the configured image in document mode.
    detect_document_text_from_local_file(image_path=IMAGE_FILENAME)


Authentication successful using local JSON file.
--- Attempting OCR on local image file using DOCUMENT mode: IMG_2056.JPG ---

Detected Text (Full Document Mode):
--------------------
MAXES
ठ
Lusiadis Leonina
Libri Duodecim.
Carmen SrCeroicum
reni (simo
Lusitania
Principi Petro
Dicatum.
Patre Ignatio Archamone
Neapolitano è Societate Jesu
in Goana Provincia operario
Je Rebus Gestis Lusitanorum
in Regionibus
Vlera Marinis
Compediosa Carratio.
--------------------


Next, add a language hint for Latin.

In [None]:
# NOTE: this cell reuses the imports, configuration and `client` created by the previous cell; run that cell first.

# -----------------------------------------------------------
# 2. Define the function to perform OCR with Language Hint
# -----------------------------------------------------------
def detect_document_text_with_latin_hint(image_path):
    """Run document text detection with a Latin language hint and print the text.

    Uses the module-level ``client``, ``vision`` and ``os`` names; this cell
    does not define them itself, so an earlier cell must have been run.

    Args:
        image_path: Path of the image file whose bytes are sent to the API.

    Returns:
        None. The detected text is printed. If ``image_path`` does not exist,
        an error message is printed and no API call is made.

    Raises:
        Exception: If the API response carries an error message.
    """
    if not os.path.exists(image_path):
        print(f"Error: The image file '{image_path}' was not found.")
        return

    print(f"--- Attempting OCR on local image file with Latin hint: {image_path} ---")

    with open(image_path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)

    # 'la' is passed to the API as a language hint for Latin.
    image_context = vision.ImageContext(language_hints=['la'])

    api_response = client.document_text_detection(
        image=image,
        image_context=image_context
    )

    # Check for an API-level error before touching the results; otherwise a
    # failed request would first be reported as "No text detected".
    if api_response.error.message:
        error_url = 'https://cloud.google.com'
        raise Exception(
            f"{api_response.error.message} For more info on error messages, check: {error_url}"
        )

    full_text = api_response.full_text_annotation.text

    if full_text:
        print('\nDetected Text (Document Mode, Latin Hint):')
        print("-" * 20)
        print(full_text)
        print("-" * 20)
    else:
        print('\nNo text detected in document mode.')

# Entry point: OCR the configured image in document mode with the Latin hint.
if __name__ == "__main__":
    detect_document_text_with_latin_hint(image_path=IMAGE_FILENAME)


--- Attempting OCR on local image file with Latin hint: IMG_2056.JPG ---

Detected Text (Document Mode, Latin Hint):
--------------------
MAXES
d
Lusiadis Leonina
Libri Duodecim.
Carmen SrCeroicum
reni (simo
Lusitania
Principi Petro
Dicatum.
Patre Ignatio Archamone
Neapolitano è Societate Jesu
in Goana Provincia operario
Je Rebus Gestis Lusitanorum
in Regionibus
Vlera Marinis
Compediosa Carratio.
--------------------


Now, let's start from a PDF and render a few of its first pages as images.

In [None]:
# --- Google Colab Code ---

# 1. Install necessary libraries
# PyMuPDF provides high-quality PDF rendering capabilities.
!pip install PyMuPDF Pillow

import fitz  # This is PyMuPDF
import io
from PIL import Image
import os

# --- Configuration ---
PDF_FILE_NAME = 'confuciussinarum00conf_0.pdf'
START_PAGE = 7  # Page number to start from (1-based, inclusive)
END_PAGE = 14   # Page number to end at (1-based, inclusive)
OUTPUT_FOLDER = 'extracted_images_pages_7_to_14'
DPI = 300 # Set a high DPI for better OCR quality (e.g., 300 or 500)
# ---------------------

# Create output directory if it doesn't exist
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    print(f"Created directory: {OUTPUT_FOLDER}")

# Check if the PDF file exists in the Colab environment
if not os.path.exists(PDF_FILE_NAME):
    print(f"Error: The PDF file '{PDF_FILE_NAME}' was not found in the Colab environment.")
    print("Please upload the file using the Colab file browser (folder icon on the left sidebar).")
else:
    print(f"--- Starting image extraction from {PDF_FILE_NAME} (Pages {START_PAGE} to {END_PAGE}) ---")
    try:
        # Reject ranges that would index from the end of the document or
        # silently render nothing.
        if START_PAGE < 1 or START_PAGE > END_PAGE:
            raise ValueError(f"Invalid page range: {START_PAGE} to {END_PAGE}")

        # The zoom factor is the same for every page, so build the matrix once
        # (zoom factor based on DPI/72).
        zoom = DPI / 72
        matrix = fitz.Matrix(zoom, zoom)

        # The context manager closes the document even if rendering fails part-way.
        with fitz.open(PDF_FILE_NAME) as doc:
            # Page numbers in fitz are 0-indexed, so we subtract 1
            start_index = START_PAGE - 1
            end_index = END_PAGE - 1

            if end_index >= doc.page_count:
                print(f"Warning: PDF only has {doc.page_count} pages. Extracting up to the last page.")
                end_index = doc.page_count - 1

            for page_num in range(start_index, end_index + 1):
                # Render the page to a pixmap and save it as a PNG named by its 1-based page number.
                pix = doc.load_page(page_num).get_pixmap(matrix=matrix, alpha=False)
                image_filename = os.path.join(OUTPUT_FOLDER, f'page_{page_num + 1:02d}.png')
                pix.save(image_filename)

                print(f"Saved {image_filename}")

        print(f"--- Extraction complete. Images saved in '{OUTPUT_FOLDER}/' directory. ---")

    except Exception as e:
        print(f"An error occurred during PDF processing: {e}")



Collecting PyMuPDF
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.7
Created directory: extracted_images_pages_7_to_14
--- Starting image extraction from confuciussinarum00conf_0.pdf (Pages 7 to 14) ---
Saved extracted_images_pages_7_to_14/page_07.png
Saved extracted_images_pages_7_to_14/page_08.png
Saved extracted_images_pages_7_to_14/page_09.png
Saved extracted_images_pages_7_to_14/page_10.png
Saved extracted_images_pages_7_to_14/page_11.png
Saved extracted_images_pages_7_to_14/page_12.png
Saved extracted_images_pages_7_to_14/page_13.png
Saved extracted_images_pages_7_to_14/page_14.png
--- Extraction complete. Images saved in 'extracted_images_pages_7_to_14/' directory. ---


Now let's OCR this using the Google Vision API. We will create a txt file of the extracted text, and print the first 200 words just to make sure it works.

In [None]:
# --- Google Colab Complete OCR Pipeline (FIXED CODE) ---

# 1. Install necessary libraries (Run this first!)
!pip install PyMuPDF Pillow google-cloud-vision requests

import io
import os
import json
import fitz # PyMuPDF
from PIL import Image
from google.cloud import vision
from google.oauth2 import service_account

# -----------------------------------------------------------
# --- Configuration Variables
# -----------------------------------------------------------
JSON_FILE_PATH = '/content/renaissance-ocr-4aabe5b8dc65.json'  # service-account key file
PDF_FILE_NAME = 'confuciussinarum00conf_0.pdf'
START_PAGE = 7   # first page to process (1-based, inclusive)
END_PAGE = 14    # last page to process (1-based, inclusive)
DPI = 300 # High DPI for quality OCR
OUTPUT_IMAGE_FOLDER = 'extracted_images'
OUTPUT_TEXT_FILE = 'Extracted_OCR_Text.txt'
# -----------------------------------------------------------


# -----------------------------------------------------------
# 2. Authenticate using the local service account key file.
#    Creates the module-level `client` used by the OCR loop further down.
# -----------------------------------------------------------
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found at: {JSON_FILE_PATH}")

    credentials = service_account.Credentials.from_service_account_file(JSON_FILE_PATH)
    client = vision.ImageAnnotatorClient(credentials=credentials)
    print("Authentication successful using local JSON file.")

except Exception as e:
    print(f"Authentication failed: {e}")
    # Re-raise so the cell stops here. In a notebook kernel `exit()` requests a
    # kernel shutdown (losing all state) rather than just halting this cell.
    raise

# -----------------------------------------------------------
# 3. No separate OCR function in this cell: the extraction and OCR steps run
#    inline in the main block below.
# -----------------------------------------------------------
# The main block uses the module-level 'client' created by the authentication
# step above, so that step must have completed successfully first.

# -----------------------------------------------------------
# 4. Main execution block: Extract images first, then OCR them
# -----------------------------------------------------------
if __name__ == '__main__':
    # --- Image Extraction Phase ---
    os.makedirs(OUTPUT_IMAGE_FOLDER, exist_ok=True)

    if os.path.exists(PDF_FILE_NAME):
        print(f"\nStarting image extraction from {PDF_FILE_NAME}...")
        image_files_to_process = []

        # The zoom factor is the same for every page, so build the matrix once.
        zoom = DPI / 72
        matrix = fitz.Matrix(zoom, zoom)

        # The context manager closes the document even if rendering fails part-way.
        with fitz.open(PDF_FILE_NAME) as doc:
            # fitz pages are 0-indexed; clamp the end to the last page of the PDF.
            start_index = START_PAGE - 1
            end_index = min(END_PAGE - 1, doc.page_count - 1)

            for page_num in range(start_index, end_index + 1):
                pix = doc.load_page(page_num).get_pixmap(matrix=matrix, alpha=False)
                image_filename = os.path.join(OUTPUT_IMAGE_FOLDER, f'page_{page_num + 1:02d}.png')
                pix.save(image_filename)
                image_files_to_process.append(image_filename)
        print("Image extraction complete.")

        # --- OCR Phase ---
        print(f"\nStarting OCR process on {len(image_files_to_process)} images...")

        # The same Latin language hint is sent with every page.
        image_context = vision.ImageContext(language_hints=['la'])

        header = f"OCR Results for {PDF_FILE_NAME} (Pages {START_PAGE} to {END_PAGE})\n" + "=" * 60 + "\n"
        # Collect the pieces in a list and join once instead of repeated string +=.
        text_sections = [header]

        with open(OUTPUT_TEXT_FILE, 'w', encoding='utf-8') as f_out:
            f_out.write(header)

            for image_file in image_files_to_process:
                with open(image_file, 'rb') as image_file_handle:
                    content = image_file_handle.read()
                image = vision.Image(content=content)
                api_response = client.document_text_detection(image=image, image_context=image_context)

                # Report API failures instead of silently treating them as an
                # empty page, then carry on with the remaining pages.
                if api_response.error.message:
                    print(f"  -> OCR failed for {image_file}: {api_response.error.message}")
                    continue

                page_text = api_response.full_text_annotation.text or ""
                if not page_text:
                    print(f"  -> No text detected in {image_file}.")
                    continue

                section_text = f"\n\n--- Start of OCR from {os.path.basename(image_file)} ---\n\n{page_text}\n\n--- End of OCR from {os.path.basename(image_file)} ---\n\n"
                f_out.write(section_text)
                text_sections.append(section_text)

        full_document_text = "".join(text_sections)

        print("\n--- OCR Pipeline Finished ---")
        print(f"All extracted text has been saved to: {OUTPUT_TEXT_FILE}")

        # --- Print First 200 Words ---
        print("\n" + "=" * 60)
        print("FIRST 200 WORDS OF EXTRACTED TEXT:")
        print("=" * 60)
        words = full_document_text.split()
        print(' '.join(words[:200]))
        if len(words) > 200:
            print("\n[... Text truncated for display purposes ...]")
        print("=" * 60)

    else:
        print(f"Could not find PDF file '{PDF_FILE_NAME}'. Please upload it to Colab.")



Authentication successful using local JSON file.

Starting image extraction from confuciussinarum00conf_0.pdf...
Image extraction complete.

Starting OCR process on 8 images...

--- OCR Pipeline Finished ---
All extracted text has been saved to: Extracted_OCR_Text.txt

FIRST 200 WORDS OF EXTRACTED TEXT:

[... Text truncated for display purposes ...]


Now, let's try a Google Gemini model through the Vertex AI API.

First, though, let's check that we can access the API by using it like a chatbot.

In [None]:
import os
import vertexai
from vertexai.generative_models import GenerativeModel, Part, Image

# -----------------------------------------------------------
# --- Configuration Variables: UPDATE THESE ---
# -----------------------------------------------------------

# REPLACE THIS with the path to your service account JSON key file.
# Example: '/Users/YourUser/Desktop/your-project-key.json'
JSON_FILE_PATH = '/content/renaissance-ocr-4aabe5b8dc65.json'

# REPLACE THESE with your specific project details
PROJECT_ID = 'renaissance-ocr'
LOCATION = 'global'
GEMINI_MODEL = 'gemini-2.5-flash'
# -----------------------------------------------------------


# -----------------------------------------------------------
# 1. Authenticate and Initialize the Vertex AI Client
# -----------------------------------------------------------
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found at: {JSON_FILE_PATH}")

    # Point the GOOGLE_APPLICATION_CREDENTIALS environment variable at the key file.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH

    # Initialize the Vertex AI SDK
    vertexai.init(project=PROJECT_ID, location=LOCATION)

    # Module-level model instance used by ask_question() below.
    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully.")

except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # Re-raise so the cell stops here. In a notebook kernel `exit()` requests a
    # kernel shutdown (losing all state) rather than just halting this cell.
    raise

# -----------------------------------------------------------
# 2. Define the function to ask a simple text question
# -----------------------------------------------------------
def ask_question(question_text):
    """Send one text-only prompt to the module-level Gemini ``model``.

    Args:
        question_text: The prompt to send.

    Returns:
        The response text, "No answer received." when the response text is
        empty, or "Error during API call." when the call raises (the error is
        printed rather than propagated).
    """
    print(f"\nSending prompt to Gemini: '{question_text}'")

    try:
        # A text-only request: the contents list holds just the prompt.
        reply = model.generate_content([question_text]).text
        return reply if reply else "No answer received."
    except Exception as err:
        print(f"API call failed: {err}")
        return "Error during API call."

# -----------------------------------------------------------
# 3. Main execution to ask the capital of France
# -----------------------------------------------------------
if __name__ == '__main__':
    # Define the specific question you want to ask
    user_question = "What is the capital of France?"

    # Get the answer using the function
    capital_city = ask_question(user_question)

    # Print the final result
    print("\n" + "=" * 40)
    print("--- Gemini Chatbox Answer ---")
    print("=" * 40)
    print(f"Question: {user_question}")
    print(f"Answer: {capital_city}")
    print("=" * 40)




Gemini Model 'gemini-2.5-flash' initialized successfully.

Sending prompt to Gemini: 'What is the capital of France?'

--- Gemini Chatbox Answer ---
Question: What is the capital of France?
Answer: The capital of France is **Paris**.


Now, let's try our task. This may take some time...

In [None]:
# --- Google Colab OCR Pipeline using Gemini (Vertex AI) ---

# 1. Install necessary libraries
!pip install google-cloud-aiplatform PyMuPDF Pillow requests --upgrade

import io
import os
import json
# No need for PIL import if we use load_from_file
import vertexai
from vertexai.generative_models import GenerativeModel, Part, Image
# Import the preview namespace as a fallback
from vertexai.preview.generative_models import Image as PreviewImage


# -----------------------------------------------------------
# --- Configuration Variables: UPDATE THESE ---
# -----------------------------------------------------------
# Service-account key file; the next cell exports it via GOOGLE_APPLICATION_CREDENTIALS.
JSON_FILE_PATH = '/content/renaissance-ocr-4aabe5b8dc65.json'
# Folder scanned for page images (.png / .jpeg) by the Gemini OCR cell.
OUTPUT_IMAGE_FOLDER = 'extracted_images'
# File the Gemini transcription is written to.
OUTPUT_TEXT_FILE = 'Extracted_OCR_Text_Gemini.txt'
# Project and location passed to vertexai.init(); model name passed to GenerativeModel().
PROJECT_ID = 'renaissance-ocr'
LOCATION = 'global'
GEMINI_MODEL = 'gemini-2.5-flash'
# Instruction sent together with every page image.
# NOTE(review): "OLD" presumably refers to Oxford Latin Dictionary spelling conventions - confirm.
OCR_PROMPT = "Transcribe all visible text from this image. Do not print any comments or reasoning. Just the text from the book reconnecting any words that are split across lines and hyphenated. For context, it is a Latin book about China. Please correct the OCR to the most plausible Latin without being too creative. Standardize Latin spellings to OLD. Do not include page numbers from the image or anything apart from the text."
# -----------------------------------------------------------




In [None]:


# -----------------------------------------------------------
# 2. Authenticate and Initialize the Vertex AI Client
#    (uses the imports and configuration from the previous cell)
# -----------------------------------------------------------
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found at: {JSON_FILE_PATH}")

    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    # Module-level model instance used by process_image_with_gemini() below.
    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully.")

except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # Re-raise so the cell stops here. In a notebook kernel `exit()` requests a
    # kernel shutdown (losing all state) rather than just halting this cell.
    raise

# -----------------------------------------------------------
# 3. Define the function to perform OCR using the Gemini Model
# -----------------------------------------------------------
def process_image_with_gemini(image_path, output_file_handle):
    """Transcribe one image with the module-level Gemini ``model`` and record it.

    Args:
        image_path: Path of the image file to load and send with ``OCR_PROMPT``.
        output_file_handle: Writable text handle that receives the section.

    Returns:
        The section that was written (the transcription wrapped in start/end
        marker lines naming the image file), or "" when the image could not
        be loaded, the API call failed, or no text came back.
    """
    print(f"  -> Processing image: {image_path}")

    try:
        picture = Image.load_from_file(image_path)
    except AttributeError:
        # Fall back to the preview namespace when Image lacks load_from_file.
        picture = PreviewImage.load_from_file(image_path)
    except Exception as e:
        print(f"Could not load image file {image_path}: {e}")
        return ""

    # The request is the prompt followed by the image.
    request_parts = [OCR_PROMPT, picture]

    try:
        transcription = model.generate_content(request_parts).text or ""
    except Exception as e:
        print(f"API call failed for {image_path}: {e}")
        transcription = ""

    if not transcription:
        print(f'  -> No text detected in {image_path}.')
        return ""

    page_name = os.path.basename(image_path)
    section_text = (
        f"\n\n--- Start of OCR from {page_name} ---\n\n"
        f"{transcription}"
        f"\n\n--- End of OCR from {page_name} ---\n\n"
    )
    output_file_handle.write(section_text)
    return section_text


# -----------------------------------------------------------
# 4. Main execution block: Process images in the folder
# -----------------------------------------------------------
if __name__ == '__main__':
    # Gather the page images. Each failure case prints a message and leaves the
    # list empty, so the cell ends without calling `exit()` (which would shut
    # the notebook kernel down).
    image_files_to_process = []
    if not os.path.exists(OUTPUT_IMAGE_FOLDER):
        print(f"Image folder '{OUTPUT_IMAGE_FOLDER}' not found. Please run the previous PDF extraction code first.")
    else:
        # Case-insensitive match so '.jpg' and upper-case extensions are picked up too.
        image_files_to_process = sorted(
            os.path.join(OUTPUT_IMAGE_FOLDER, f)
            for f in os.listdir(OUTPUT_IMAGE_FOLDER)
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        )
        if not image_files_to_process:
            print(f"No images found in the '{OUTPUT_IMAGE_FOLDER}' folder.")

    if image_files_to_process:
        print(f"\nStarting Gemini OCR process on {len(image_files_to_process)} images...")

        # NOTE: the page range in the header is hard-coded; it is not derived
        # from the files actually found in the folder.
        header = "Gemini OCR Results for Pages 7 to 14\n" + "=" * 60 + "\n"
        # Collect the pieces in a list and join once instead of repeated string +=.
        text_sections = [header]

        with open(OUTPUT_TEXT_FILE, 'w', encoding='utf-8') as f_out:
            f_out.write(header)
            for image_file in image_files_to_process:
                # process_image_with_gemini writes the section itself and returns it ("" on failure).
                text_sections.append(process_image_with_gemini(image_file, f_out))

        full_document_text = "".join(text_sections)

        print("\n--- Gemini OCR Pipeline Finished ---")
        print(f"All extracted text has been saved to: {OUTPUT_TEXT_FILE}")

        # --- Print First 200 Words ---
        print("\n" + "=" * 60)
        print("FIRST 200 WORDS OF EXTRACTED TEXT (Gemini):")
        print("=" * 60)
        words = full_document_text.split()
        print(' '.join(words[:200]))
        if len(words) > 200:
            print("\n[... Text truncated for display purposes ...]")
        print("=" * 60)


Gemini Model 'gemini-2.5-flash' initialized successfully.

Starting Gemini OCR process on 8 images...
  -> Processing image: extracted_images/page_07.png
  -> Processing image: extracted_images/page_08.png
  -> Processing image: extracted_images/page_09.png
  -> Processing image: extracted_images/page_10.png
  -> Processing image: extracted_images/page_11.png
  -> Processing image: extracted_images/page_12.png
  -> Processing image: extracted_images/page_13.png
  -> Processing image: extracted_images/page_14.png

--- Gemini OCR Pipeline Finished ---
All extracted text has been saved to: Extracted_OCR_Text_Gemini.txt

FIRST 200 WORDS OF EXTRACTED TEXT (Gemini):

[... Text truncated for display purposes ...]


An alternative method is to OCR with Google Vision first, and then correct the OCR output with Gemini.

In [None]:
import os
from vertexai.generative_models import GenerativeModel, Part
import vertexai

# --- Configuration Constants (Define these as per your environment) ---
# Replace with your actual project details
PROJECT_ID = 'renaissance-ocr'
LOCATION = 'global'
GEMINI_MODEL = 'gemini-2.5-flash' # A fast model suitable for text processing
# NOTE(review): unlike the earlier cells, this path is relative (no '/content/' prefix),
# so it is resolved against the current working directory - confirm this is intended.
JSON_FILE_PATH = 'renaissance-ocr-4aabe5b8dc65.json' # Path to your GCP credentials file

# Input is the text file written by the Vision OCR pipeline cell; output is the corrected copy.
INPUT_TEXT_FILE = 'Extracted_OCR_Text.txt'
OUTPUT_CORRECTED_FILE = 'Corrected_OCR_Text.txt'

# Define the chunk size in characters (adjust as needed to fit the model's context window)
# 10,000 characters is a safe starting point.
CHUNK_SIZE = 10000

# Define the prompt for the AI.
# It is sent as the first part of every correction request, followed by the
# line "Here is the text to correct:" and the chunk itself.
# NOTE(review): "OLD conventions" presumably means Oxford Latin Dictionary spelling - confirm.
CORRECTION_PROMPT = """
You are an expert proofreader of early modern Latin. TASK: Correct ONLY OCR errors in the input text. Do not rewrite or alter authentic early modern usage. RULES: 1. Preserve the original wording, grammar, and syntax. Fix ONLY: - misread letters - broken or split words - incorrect or missing spacing - duplicated characters - corrupted punctuation caused by OCR 2. Apply minimal normalization (OLD conventions): - u = vowel, v = consonant - use i only (never j) - expand æ → ae, œ → oe - replace & or &amp; with et Do NOT modernize vocabulary or regularize historical spellings unless the form is clearly an OCR mistake. 3. De hyphenate all words broken across line breaks: - remove hyphens that occur at the end of a line, - join the two fragments into a single continuous word, - and correct any spacing errors created by the line break. Do NOT remove hyphens that belong to real Latin compounds. Preserve all paragraph breaks and capitalization. Do not add new punctuation. 4. Keep all paragraph breaks and capitalization exactly as in the input. 5. Remove page furniture only when clearly not part of the running text: examples: “A ij”, “EPISTOLA.”, catchwords, signature marks. 6. Preserve Chinese or other exotic romanizations exactly as printed once corrected. Do NOT “improve,” standardize, or re interpret them (e.g., keep forms like “Tai Ki Gin”). 7. Output ONLY the corrected Latin text. Do NOT add commentary, explanation, or formatting.
"""


# -----------------------------------------------------------
# 1. Authenticate and Initialize the Vertex AI Client
# -----------------------------------------------------------
try:
    if not os.path.exists(JSON_FILE_PATH):
        # In Colab, you might need to upload this file or use Colab secrets,
        # e.g. `from google.colab import files; files.upload()`, and then
        # update JSON_FILE_PATH accordingly.
        # The handler below prints the message, so it is not printed here as well.
        raise FileNotFoundError(f"Credentials file not found at: {JSON_FILE_PATH}")

    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    # Module-level model instance used by correct_ocr_chunk_with_gemini() below.
    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully.")

except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # Re-raise so the cell stops here. In a notebook kernel `exit()` requests a
    # kernel shutdown (losing all state) rather than just halting this cell.
    raise

# -----------------------------------------------------------
# 2. Define a function to read a file in chunks
# -----------------------------------------------------------
def read_in_chunks(file_path, chunk_size=CHUNK_SIZE):
    """Yield a UTF-8 text file in pieces of at most ``chunk_size`` characters.

    Each piece is cut just after the last space, tab or newline inside the
    window, so a word is not split across two pieces. This matters because
    the caller writes a blank line between corrected pieces. A window with no
    whitespace at all is yielded whole. Joining the yielded pieces gives back
    the text as read.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Maximum number of characters per yielded piece (>= 1).

    Yields:
        Non-empty strings, in file order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 (raised on first iteration).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    with open(file_path, 'r', encoding='utf-8') as f:
        carry = ""  # tail of the previous window that followed its last whitespace
        while True:
            fresh = f.read(chunk_size - len(carry))
            window = carry + fresh

            if not fresh:
                # End of file: flush whatever is still pending.
                if window:
                    yield window
                break

            if len(window) < chunk_size:
                # Short read; keep accumulating until the window is full or EOF.
                carry = window
                continue

            # Index just past the last whitespace character (0 when there is none).
            cut = max(window.rfind(sep) for sep in ("\n", " ", "\t")) + 1
            if cut == 0 or cut == len(window):
                yield window
                carry = ""
            else:
                yield window[:cut]
                carry = window[cut:]

# -----------------------------------------------------------
# 3. Define a function to correct a chunk using Gemini AI
# -----------------------------------------------------------
def correct_ocr_chunk_with_gemini(chunk_text):
    """Ask the module-level Gemini ``model`` to correct one chunk of OCR text.

    The request consists of CORRECTION_PROMPT, a short lead-in sentence and
    the chunk itself.

    Args:
        chunk_text: The raw OCR text to be corrected.

    Returns:
        The model's reply with surrounding whitespace stripped. If the call
        (or reading the reply text) raises, the error is printed and
        ``chunk_text`` is returned unchanged.
    """
    request_parts = [CORRECTION_PROMPT, "Here is the text to correct:", chunk_text]

    try:
        reply = model.generate_content(request_parts)
        # The prompt instructs the model to answer with the corrected text only.
        cleaned = reply.text.strip()
    except Exception as e:
        print(f"API call failed for a chunk: {e}")
        # Best effort: fall back to the uncorrected input.
        return chunk_text

    return cleaned

# -----------------------------------------------------------
# 4. Main execution block: Chunk, Process, and Save
# -----------------------------------------------------------
if __name__ == '__main__':
    # Raise instead of calling exit(): in a Jupyter/Colab kernel exit() asks
    # the kernel to shut down, and in a script it would report success.
    if not os.path.exists(INPUT_TEXT_FILE):
        raise FileNotFoundError(
            f"Input file '{INPUT_TEXT_FILE}' not found. Ensure it is uploaded or available."
        )

    print(f"\nStarting OCR correction process on {INPUT_TEXT_FILE}...")

    # Stream: correct one chunk at a time and append it to the output file,
    # so the whole document never has to be held in memory.
    with open(OUTPUT_CORRECTED_FILE, 'w', encoding='utf-8') as f_out:
        for chunk_count, chunk in enumerate(read_in_chunks(INPUT_TEXT_FILE, CHUNK_SIZE), start=1):
            print(f"  -> Processing chunk {chunk_count}...")

            corrected_chunk = correct_ocr_chunk_with_gemini(chunk)

            if corrected_chunk:
                # A blank line separates the output of consecutive chunks.
                f_out.write(corrected_chunk + "\n\n")
            else:
                print(f'  -> Chunk {chunk_count} returned no corrected text.')

    print("\n--- Gemini OCR Correction Pipeline Finished ---")
    print(f"All corrected text has been saved to: {OUTPUT_CORRECTED_FILE}")


Gemini Model 'gemini-2.5-flash' initialized successfully.

Starting OCR correction process on Extracted_OCR_Text.txt...
  -> Processing chunk 1...

--- Gemini OCR Correction Pipeline Finished ---
All corrected text has been saved to: Corrected_OCR_Text.txt


Now let's start from Archive.org's own poor-quality OCR text and see whether Gemini can correct it.

In [None]:
import os
from vertexai.generative_models import GenerativeModel, Part
import vertexai

# --- Configuration Constants (Define these as per your environment) ---
# Replace with your actual project details
PROJECT_ID = 'renaissance-ocr'
LOCATION = 'global'
GEMINI_MODEL = 'gemini-2.5-flash' # A fast model suitable for text processing
JSON_FILE_PATH = 'renaissance-ocr-4aabe5b8dc65.json' # Path to your GCP credentials file

INPUT_TEXT_FILE = 'confuciussinarum00conf_0_djvu.txt'         # Archive.org OCR text to correct
OUTPUT_CORRECTED_FILE = 'Corrected_OCR_Text_from_Archive2.txt'  # where the corrected text is written

# Define the chunk size in characters (adjust as needed to fit the model's context window)
# 30,000 characters is a safe starting point.
CHUNK_SIZE = 30000

# Define the prompt for the AI.
# Rule 3 asks for continuous text without paragraph breaks, so rule 4 must not
# also demand that paragraph breaks be kept; it now covers capitalization only.
CORRECTION_PROMPT = """
You are an expert proofreader of early modern Latin. TASK: Correct ONLY OCR errors in the input text. Do not rewrite or alter authentic early modern usage. RULES: 1. Preserve the original wording, grammar, and syntax. Fix ONLY: - misread letters - broken or split words - incorrect or missing spacing - duplicated characters - corrupted punctuation caused by OCR 2. Apply minimal normalization (OLD conventions): - u = vowel, v = consonant - use i only (never j) - expand æ → ae, œ → oe - replace & or &amp; with et Do NOT modernize vocabulary or regularize historical spellings unless the form is clearly an OCR mistake. 3. This is very important: De hyphenate all words broken across line breaks: - remove hyphens that occur at the end of a line, - join the two fragments into a single continuous word, - and correct any spacing errors created by the line break. Also remove hyphens that belong to real Latin compounds, as hyphens are irrelevant. Do not add new punctuation. Do not retain paragraph breaks; just produce continuous text. 4. Keep capitalization exactly as in the input. 5. Remove page furniture only when clearly not part of the running text: examples: “A ij”, “EPISTOLA.”, catchwords, signature marks. 6. Preserve Chinese or other exotic romanizations exactly as printed once corrected. Do NOT “improve,” standardize, or re interpret them (e.g., keep forms like “Tai Ki Gin”). 7. Output ONLY the corrected Latin text. Do NOT add commentary, explanation, or formatting.
"""

# -----------------------------------------------------------
# 1. Authenticate and Initialize the Vertex AI Client
# -----------------------------------------------------------
# Authenticate with the service-account key and create the Gemini client.
# In Colab the key file usually has to be uploaded first, e.g.
#     from google.colab import files; files.upload()
# after which JSON_FILE_PATH must point at the uploaded file.
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found: {JSON_FILE_PATH}")

    # Expose the key file through the environment before initializing Vertex AI.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully.")

except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # Re-raise instead of calling exit(): in a Jupyter/Colab kernel exit() asks
    # the kernel itself to shut down (losing all notebook state), and in a plain
    # script it would report success (status 0) after a failure.
    raise

# -----------------------------------------------------------
# 2. Define a function to read a file in chunks
# -----------------------------------------------------------
def read_in_chunks(file_path, chunk_size=CHUNK_SIZE):
    """Yield consecutive pieces of a UTF-8 text file, each roughly ``chunk_size`` characters.

    A plain fixed-size read can stop in the middle of a word, so the model
    would receive two meaningless fragments in separate requests. Each chunk
    is therefore extended to the end of the line it stops in, and further
    while that line ends with a hyphen, so a word hyphenated across a line
    break stays inside one chunk. Joining all chunks reproduces the text
    exactly as it was read.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Target number of characters per chunk; chunks may be
            slightly longer because they always end on a line boundary.

    Yields:
        str: The next chunk of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            # Finish the line the fixed-size read stopped in.
            if not chunk.endswith('\n'):
                chunk += f.readline()
            # "verbo-\nrum": keep both halves of a hyphenated word together.
            while chunk.rstrip().endswith('-'):
                continuation = f.readline()
                if not continuation:
                    break
                chunk += continuation
            yield chunk

# -----------------------------------------------------------
# 3. Define a function to correct a chunk using Gemini AI
# -----------------------------------------------------------
def correct_ocr_chunk_with_gemini(chunk_text):
    """Ask the module-level Gemini ``model`` to correct one chunk of OCR text.

    The request consists of CORRECTION_PROMPT followed by the chunk itself.

    Args:
        chunk_text: The raw OCR text to be corrected.

    Returns:
        The model's reply with surrounding whitespace stripped. If the call
        (or reading the reply text) raises, the error is printed and
        ``chunk_text`` is returned unchanged.
    """
    request_parts = [CORRECTION_PROMPT, chunk_text]

    try:
        reply = model.generate_content(request_parts)
        # The prompt instructs the model to answer with the corrected text only.
        cleaned = reply.text.strip()
    except Exception as e:
        print(f"API call failed for a chunk: {e}")
        # Best effort: fall back to the uncorrected input.
        return chunk_text

    return cleaned

# -----------------------------------------------------------
# 4. Main execution block: Chunk, Process, and Save
# -----------------------------------------------------------
if __name__ == '__main__':
    # Raise instead of calling exit(): in a Jupyter/Colab kernel exit() asks
    # the kernel to shut down, and in a script it would report success.
    if not os.path.exists(INPUT_TEXT_FILE):
        raise FileNotFoundError(
            f"Input file '{INPUT_TEXT_FILE}' not found. Ensure it is uploaded or available."
        )

    print(f"\nStarting OCR correction process on {INPUT_TEXT_FILE}...")

    # Stream: correct one chunk at a time and append it to the output file,
    # so the whole document never has to be held in memory.
    with open(OUTPUT_CORRECTED_FILE, 'w', encoding='utf-8') as f_out:
        for chunk_count, chunk in enumerate(read_in_chunks(INPUT_TEXT_FILE, CHUNK_SIZE), start=1):
            print(f"  -> Processing chunk {chunk_count}...")

            corrected_chunk = correct_ocr_chunk_with_gemini(chunk)

            if corrected_chunk:
                # A blank line separates the output of consecutive chunks.
                f_out.write(corrected_chunk + "\n\n")
            else:
                print(f'  -> Chunk {chunk_count} returned no corrected text.')

    print("\n--- Gemini OCR Correction Pipeline Finished ---")
    print(f"All corrected text has been saved to: {OUTPUT_CORRECTED_FILE}")


Gemini Model 'gemini-2.5-flash' initialized successfully.

Starting OCR correction process on confuciussinarum00conf_0_djvu.txt...
  -> Processing chunk 1...


KeyboardInterrupt: 

Can you speed it up by running the requests asynchronously?

In [None]:
import os
import asyncio
import vertexai
from vertexai.generative_models import GenerativeModel, Part

# Use nest_asyncio to run async code easily in environments like Colab
import nest_asyncio
nest_asyncio.apply()

# --- Configuration Constants (Define these as per your environment) ---
PROJECT_ID = 'renaissance-ocr'                      # GCP project passed to vertexai.init
LOCATION = 'global'                                 # location passed to vertexai.init
GEMINI_MODEL = 'gemini-2.5-flash'                   # model name passed to GenerativeModel
JSON_FILE_PATH = 'renaissance-ocr-4aabe5b8dc65.json'  # service-account key file
INPUT_TEXT_FILE = 'confuciussinarum00conf_0_djvu.txt'
OUTPUT_CORRECTED_FILE = 'Corrected_OCR_Text_from_Archive3.txt'
CHUNK_SIZE = 10000                                  # characters per request
# Rule 3 asks for continuous text without paragraph breaks, so rule 4 must not
# also demand that paragraph breaks be kept; it now covers capitalization only.
CORRECTION_PROMPT = """
You are an expert proofreader of early modern Latin. TASK: Correct ONLY OCR errors in the input text. Do not rewrite or alter authentic early modern usage. RULES: 1. Preserve the original wording, grammar, and syntax. Fix ONLY: - misread letters - broken or split words - incorrect or missing spacing - duplicated characters - corrupted punctuation caused by OCR 2. Apply minimal normalization (OLD conventions): - u = vowel, v = consonant - use i only (never j) - expand æ → ae, œ → oe - replace & or &amp; with et Do NOT modernize vocabulary or regularize historical spellings unless the form is clearly an OCR mistake. 3. This is very important: De hyphenate all words broken across line breaks: - remove hyphens that occur at the end of a line, - join the two fragments into a single continuous word, - and correct any spacing errors created by the line break. Also remove hyphens that belong to real Latin compounds, as hyphens are irrelevant. Do not add new punctuation. Do not retain paragraph breaks; just produce continuous text. 4. Keep capitalization exactly as in the input. 5. Remove page furniture only when clearly not part of the running text: examples: “A ij”, “EPISTOLA.”, catchwords, signature marks. 6. Preserve Chinese or other exotic romanizations exactly as printed once corrected. Do NOT “improve,” standardize, or re interpret them (e.g., keep forms like “Tai Ki Gin”). 7. Output ONLY the corrected Latin text. Do NOT add commentary, explanation, or formatting.
"""

# -----------------------------------------------------------
# 1. Authenticate and Initialize the Vertex AI Client
# -----------------------------------------------------------
# Authenticate with the service-account key and create the Gemini client.
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found: {JSON_FILE_PATH}")

    # Expose the key file through the environment before initializing Vertex AI.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    # Initialize the *one* model client we will reuse everywhere
    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully.")

except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # Re-raise instead of calling exit(): in a Jupyter/Colab kernel exit() asks
    # the kernel itself to shut down (losing all notebook state), and in a plain
    # script it would report success (status 0) after a failure.
    raise

# -----------------------------------------------------------
# 2. Define a function to read a file in chunks
# -----------------------------------------------------------
def read_in_chunks(file_path, chunk_size=CHUNK_SIZE):
    """Yield consecutive pieces of a UTF-8 text file, each roughly ``chunk_size`` characters.

    A plain fixed-size read can stop in the middle of a word, so the model
    would receive two meaningless fragments in separate requests. Each chunk
    is therefore extended to the end of the line it stops in, and further
    while that line ends with a hyphen, so a word hyphenated across a line
    break stays inside one chunk. Joining all chunks reproduces the text
    exactly as it was read.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Target number of characters per chunk; chunks may be
            slightly longer because they always end on a line boundary.

    Yields:
        str: The next chunk of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            # Finish the line the fixed-size read stopped in.
            if not chunk.endswith('\n'):
                chunk += f.readline()
            # "verbo-\nrum": keep both halves of a hyphenated word together.
            while chunk.rstrip().endswith('-'):
                continuation = f.readline()
                if not continuation:
                    break
                chunk += continuation
            yield chunk

# -----------------------------------------------------------
# 3. Define an ASYNCHRONOUS function to correct a chunk
# -----------------------------------------------------------
async def correct_ocr_chunk_with_gemini_async(model_client, chunk_text, chunk_id,
                                              max_retries=4, base_delay=2.0):
    """Send one text chunk to Gemini for correction, retrying transient failures.

    Rate-limit responses (HTTP 429 "Resource exhausted") and transient server
    errors are retried with exponential back-off plus jitter instead of
    immediately giving up and passing the uncorrected chunk through.

    Args:
        model_client: Object exposing ``generate_content_async(contents)``.
        chunk_text: The raw OCR text to correct.
        chunk_id: Identifier of the chunk, echoed in the result so the caller
            can restore the original order.
        max_retries: How many times a transient failure is retried.
        base_delay: Delay in seconds before the first retry; doubled on each
            further retry.

    Returns:
        tuple: ``(chunk_id, text)`` where ``text`` is the stripped model reply,
        or the unchanged ``chunk_text`` if the call ultimately failed.
    """
    import random  # local import: only needed for the back-off jitter

    contents = [
        CORRECTION_PROMPT,
        chunk_text
    ]
    max_retries = max(0, max_retries)

    for attempt in range(max_retries + 1):
        try:
            # Await the response using the provided model_client instance
            response = await model_client.generate_content_async(contents)

            corrected_text = response.text.strip()
            print(f"  -> Finished processing chunk {chunk_id}.")
            return chunk_id, corrected_text  # Return ID to sort results later

        except Exception as e:
            # NOTE(review): assumes the client raises google.api_core exceptions,
            # whose integer `code` is the HTTP status; the string check is a
            # fallback matching the "429 Resource exhausted" message format.
            status = getattr(e, 'code', None)
            transient = status in (429, 500, 503, 504) or str(e).startswith('429')
            if transient and attempt < max_retries:
                delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                print(f"  -> Chunk {chunk_id}: transient error, retrying in {delay:.1f}s "
                      f"(attempt {attempt + 1}/{max_retries}).")
                await asyncio.sleep(delay)
                continue

            print(f"API call failed for chunk {chunk_id}: {e}")
            return chunk_id, chunk_text  # Return original text if API fails

# -----------------------------------------------------------
# 4. Main asynchronous execution block
# -----------------------------------------------------------
async def main(max_concurrency=8):
    """Correct INPUT_TEXT_FILE chunk by chunk and write OUTPUT_CORRECTED_FILE.

    Requests run concurrently, but at most ``max_concurrency`` are in flight
    at any time; launching every chunk at once exhausts the API quota (429
    "Resource exhausted"). Results are written in input order, and a chunk
    whose task raised is written uncorrected rather than dropped, so the
    output never has holes.

    Args:
        max_concurrency: Upper bound on simultaneous API requests.
    """
    if not os.path.exists(INPUT_TEXT_FILE):
        print(f"Input file '{INPUT_TEXT_FILE}' not found. Ensure it is uploaded or available.")
        return

    print(f"\nStarting ASYNC OCR correction process on {INPUT_TEXT_FILE}...")

    limiter = asyncio.Semaphore(max_concurrency)

    async def run_limited(chunk_text, chunk_id):
        # Wait for a free slot before issuing the request.
        async with limiter:
            return await correct_ocr_chunk_with_gemini_async(model, chunk_text, chunk_id)

    chunks = list(read_in_chunks(INPUT_TEXT_FILE, CHUNK_SIZE))
    tasks = [run_limited(chunk, chunk_id) for chunk_id, chunk in enumerate(chunks, start=1)]

    print(f"Created {len(tasks)} tasks. Sending requests concurrently...")

    # gather() returns results in the order the awaitables were passed in, so
    # results[i] belongs to chunks[i] and no re-sorting is needed.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    with open(OUTPUT_CORRECTED_FILE, 'w', encoding='utf-8') as f_out:
        for original_chunk, res in zip(chunks, results):
            if isinstance(res, BaseException):
                print(f"An exception occurred during one of the API calls: {res}")
                corrected_chunk = original_chunk  # keep the text rather than lose it
            else:
                _, corrected_chunk = res
            if corrected_chunk:
                f_out.write(corrected_chunk + "\n\n")

    print("\n--- Gemini ASYNC OCR Correction Pipeline Finished ---")

# -----------------------------------------------------------
# 5. Run the asynchronous main function in Colab
# -----------------------------------------------------------
if __name__ == '__main__':
    # asyncio.run drives the main() coroutine to completion.
    # NOTE(review): inside Colab/Jupyter an event loop is presumably already
    # running; this call relies on nest_asyncio.apply() at the top of the cell.
    asyncio.run(main())


Falling back to grpc since no async rest credentials were detected.


Gemini Model 'gemini-2.5-flash' initialized successfully.

Starting ASYNC OCR correction process on confuciussinarum00conf_0_djvu.txt...
Created 53 tasks. Sending requests concurrently...
  -> Finished processing chunk 53.
  -> Finished processing chunk 35.
  -> Finished processing chunk 1.
  -> Finished processing chunk 29.
  -> Finished processing chunk 20.
  -> Finished processing chunk 21.
  -> Finished processing chunk 4.
  -> Finished processing chunk 49.
  -> Finished processing chunk 13.
  -> Finished processing chunk 42.
  -> Finished processing chunk 7.
  -> Finished processing chunk 10.
  -> Finished processing chunk 30.
  -> Finished processing chunk 6.
  -> Finished processing chunk 51.
  -> Finished processing chunk 37.
  -> Finished processing chunk 8.
  -> Finished processing chunk 45.
  -> Finished processing chunk 3.
  -> Finished processing chunk 38.
  -> Finished processing chunk 16.
  -> Finished processing chunk 44.
  -> Finished processing chunk 52.
  -> Finished

Try experimenting with the chunk size: reduce `CHUNK_SIZE` to 3,000 characters.

In [None]:
import os
import asyncio
import vertexai
from vertexai.generative_models import GenerativeModel, Part

# Use nest_asyncio to run async code easily in environments like Colab
import nest_asyncio
nest_asyncio.apply()

# --- Configuration Constants (Define these as per your environment) ---
PROJECT_ID = 'renaissance-ocr'                      # GCP project passed to vertexai.init
LOCATION = 'global'                                 # location passed to vertexai.init
GEMINI_MODEL = 'gemini-2.5-flash'                   # model name passed to GenerativeModel
JSON_FILE_PATH = 'renaissance-ocr-4aabe5b8dc65.json'  # service-account key file
INPUT_TEXT_FILE = 'confuciussinarum00conf_0_djvu.txt'
OUTPUT_CORRECTED_FILE = 'Corrected_OCR_Text_from_Archive4.txt'
CHUNK_SIZE = 3000                                   # characters per request
# Rule 3 asks for continuous text without paragraph breaks, so rule 4 must not
# also demand that paragraph breaks be kept; it now covers capitalization only.
CORRECTION_PROMPT = """
You are an expert proofreader of early modern Latin. TASK: Correct ONLY OCR errors in the input text. Do not rewrite or alter authentic early modern usage. RULES: 1. Preserve the original wording, grammar, and syntax. Fix ONLY: - misread letters - broken or split words - incorrect or missing spacing - duplicated characters - corrupted punctuation caused by OCR 2. Apply minimal normalization (OLD conventions): - u = vowel, v = consonant - use i only (never j) - expand æ → ae, œ → oe - replace & or &amp; with et Do NOT modernize vocabulary or regularize historical spellings unless the form is clearly an OCR mistake. 3. This is very important: De hyphenate all words broken across line breaks: - remove hyphens that occur at the end of a line, - join the two fragments into a single continuous word, - and correct any spacing errors created by the line break. Also remove hyphens that belong to real Latin compounds, as hyphens are irrelevant. Do not add new punctuation. Do not retain paragraph breaks; just produce continuous text. 4. Keep capitalization exactly as in the input. 5. Remove page furniture only when clearly not part of the running text: examples: “A ij”, “EPISTOLA.”, catchwords, signature marks. 6. Preserve Chinese or other exotic romanizations exactly as printed once corrected. Do NOT “improve,” standardize, or re interpret them (e.g., keep forms like “Tai Ki Gin”). 7. Output ONLY the corrected Latin text. Do NOT add commentary, explanation, or formatting.
"""

# -----------------------------------------------------------
# 1. Authenticate and Initialize the Vertex AI Client
# -----------------------------------------------------------
# Authenticate with the service-account key and create the Gemini client.
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found: {JSON_FILE_PATH}")

    # Expose the key file through the environment before initializing Vertex AI.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    # Initialize the *one* model client we will reuse everywhere
    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully.")

except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # Re-raise instead of calling exit(): in a Jupyter/Colab kernel exit() asks
    # the kernel itself to shut down (losing all notebook state), and in a plain
    # script it would report success (status 0) after a failure.
    raise

# -----------------------------------------------------------
# 2. Define a function to read a file in chunks
# -----------------------------------------------------------
def read_in_chunks(file_path, chunk_size=CHUNK_SIZE):
    """Yield consecutive pieces of a UTF-8 text file, each roughly ``chunk_size`` characters.

    A plain fixed-size read can stop in the middle of a word, so the model
    would receive two meaningless fragments in separate requests. Each chunk
    is therefore extended to the end of the line it stops in, and further
    while that line ends with a hyphen, so a word hyphenated across a line
    break stays inside one chunk. Joining all chunks reproduces the text
    exactly as it was read.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Target number of characters per chunk; chunks may be
            slightly longer because they always end on a line boundary.

    Yields:
        str: The next chunk of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            # Finish the line the fixed-size read stopped in.
            if not chunk.endswith('\n'):
                chunk += f.readline()
            # "verbo-\nrum": keep both halves of a hyphenated word together.
            while chunk.rstrip().endswith('-'):
                continuation = f.readline()
                if not continuation:
                    break
                chunk += continuation
            yield chunk

# -----------------------------------------------------------
# 3. Define an ASYNCHRONOUS function to correct a chunk
# -----------------------------------------------------------
async def correct_ocr_chunk_with_gemini_async(model_client, chunk_text, chunk_id,
                                              max_retries=4, base_delay=2.0):
    """Send one text chunk to Gemini for correction, retrying transient failures.

    Rate-limit responses (HTTP 429 "Resource exhausted") and transient server
    errors are retried with exponential back-off plus jitter instead of
    immediately giving up and passing the uncorrected chunk through.

    Args:
        model_client: Object exposing ``generate_content_async(contents)``.
        chunk_text: The raw OCR text to correct.
        chunk_id: Identifier of the chunk, echoed in the result so the caller
            can restore the original order.
        max_retries: How many times a transient failure is retried.
        base_delay: Delay in seconds before the first retry; doubled on each
            further retry.

    Returns:
        tuple: ``(chunk_id, text)`` where ``text`` is the stripped model reply,
        or the unchanged ``chunk_text`` if the call ultimately failed.
    """
    import random  # local import: only needed for the back-off jitter

    contents = [
        CORRECTION_PROMPT,
        chunk_text
    ]
    max_retries = max(0, max_retries)

    for attempt in range(max_retries + 1):
        try:
            # Await the response using the provided model_client instance
            response = await model_client.generate_content_async(contents)

            corrected_text = response.text.strip()
            print(f"  -> Finished processing chunk {chunk_id}.")
            return chunk_id, corrected_text  # Return ID to sort results later

        except Exception as e:
            # NOTE(review): assumes the client raises google.api_core exceptions,
            # whose integer `code` is the HTTP status; the string check is a
            # fallback matching the "429 Resource exhausted" message format.
            status = getattr(e, 'code', None)
            transient = status in (429, 500, 503, 504) or str(e).startswith('429')
            if transient and attempt < max_retries:
                delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                print(f"  -> Chunk {chunk_id}: transient error, retrying in {delay:.1f}s "
                      f"(attempt {attempt + 1}/{max_retries}).")
                await asyncio.sleep(delay)
                continue

            print(f"API call failed for chunk {chunk_id}: {e}")
            return chunk_id, chunk_text  # Return original text if API fails

# -----------------------------------------------------------
# 4. Main asynchronous execution block
# -----------------------------------------------------------
async def main(max_concurrency=8):
    """Correct INPUT_TEXT_FILE chunk by chunk and write OUTPUT_CORRECTED_FILE.

    Requests run concurrently, but at most ``max_concurrency`` are in flight
    at any time; launching every chunk at once exhausts the API quota (429
    "Resource exhausted"). Results are written in input order, and a chunk
    whose task raised is written uncorrected rather than dropped, so the
    output never has holes.

    Args:
        max_concurrency: Upper bound on simultaneous API requests.
    """
    if not os.path.exists(INPUT_TEXT_FILE):
        print(f"Input file '{INPUT_TEXT_FILE}' not found. Ensure it is uploaded or available.")
        return

    print(f"\nStarting ASYNC OCR correction process on {INPUT_TEXT_FILE}...")

    limiter = asyncio.Semaphore(max_concurrency)

    async def run_limited(chunk_text, chunk_id):
        # Wait for a free slot before issuing the request.
        async with limiter:
            return await correct_ocr_chunk_with_gemini_async(model, chunk_text, chunk_id)

    chunks = list(read_in_chunks(INPUT_TEXT_FILE, CHUNK_SIZE))
    tasks = [run_limited(chunk, chunk_id) for chunk_id, chunk in enumerate(chunks, start=1)]

    print(f"Created {len(tasks)} tasks. Sending requests concurrently...")

    # gather() returns results in the order the awaitables were passed in, so
    # results[i] belongs to chunks[i] and no re-sorting is needed.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    with open(OUTPUT_CORRECTED_FILE, 'w', encoding='utf-8') as f_out:
        for original_chunk, res in zip(chunks, results):
            if isinstance(res, BaseException):
                print(f"An exception occurred during one of the API calls: {res}")
                corrected_chunk = original_chunk  # keep the text rather than lose it
            else:
                _, corrected_chunk = res
            if corrected_chunk:
                f_out.write(corrected_chunk + "\n\n")

    print("\n--- Gemini ASYNC OCR Correction Pipeline Finished ---")

# -----------------------------------------------------------
# 5. Run the asynchronous main function in Colab
# -----------------------------------------------------------
if __name__ == '__main__':
    # asyncio.run drives the main() coroutine to completion.
    # NOTE(review): inside Colab/Jupyter an event loop is presumably already
    # running; this call relies on nest_asyncio.apply() at the top of the cell.
    asyncio.run(main())


Falling back to grpc since no async rest credentials were detected.


Gemini Model 'gemini-2.5-flash' initialized successfully.

Starting ASYNC OCR correction process on confuciussinarum00conf_0_djvu.txt...
Created 523 tasks. Sending requests concurrently...
  -> Finished processing chunk 60.
  -> Finished processing chunk 93.
  -> Finished processing chunk 83.
  -> Finished processing chunk 86.
  -> Finished processing chunk 78.
  -> Finished processing chunk 47.
  -> Finished processing chunk 51.
  -> Finished processing chunk 98.
  -> Finished processing chunk 43.
  -> Finished processing chunk 77.
API call failed for chunk 12: 429 Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429 for more details.
  -> Finished processing chunk 75.
  -> Finished processing chunk 72.
  -> Finished processing chunk 97.
  -> Finished processing chunk 90.
API call failed for chunk 91: 429 Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-

Awesome—here’s a compact, production‑ready **Python script** that implements the **minimal, high‑ROI pre‑processing** we discussed for re‑OCR’d historical Latin (e.g., Archive.org). It’s modular, fast, and safe by default (reversible where reasonable). You can run it file‑by‑file or batch a folder.

> What it does (in this order):
>
> 1.  Strip page furniture (headers/footers/page numbers/catchwords)
> 2.  De‑hyphenate line breaks (`word-\nnext` → `wordnext`) with a small safeguard list
> 3.  Unicode normalize + glyph maps: long‑s, ligatures, curly quotes/dashes
> 4.  Optional orthography normalization (**u↔v**, **i↔j**) — *off by default*
> 5.  Whitespace normalization (preserve paragraph breaks)
> 6.  Protected terms pass (names/terms you never want normalized)

> Output: a cleaned `.txt` next to the input (or a chosen output file).  
> Reproducibility: you get a simple JSON log of basic counts (how many furniture lines were dropped, how many hyphenated words were joined, etc.) together with the options that were applied.

***

## `preprocess_early_modern_latin.py`

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Minimal pre-processing for early-modern Latin OCR (Archive.org etc.).
- Removes page furniture (headers/footers/page numbers/catchwords)
- De-hyphenates line-break hyphens safely
- Applies Unicode normalization + glyph maps (ſ, æ/œ, curly quotes/dashes)
- (Optional) orthography normalization (u/v, i/j)  -- OFF by default
- Preserves paragraph boundaries; collapses intra-paragraph spaces
- Respects a protected-terms list (no changes inside those tokens)

Usage:
  python preprocess_early_modern_latin.py input.txt [-o output.txt]
       [--protect protected_terms.txt]
       [--normalize-uv] [--normalize-ij]
       [--keep-ligatures]    # if you DON'T want æ→ae / œ→oe
       [--no-page-furniture] # if the OCR has no headers/footers
       [--log log.json]

"""

import re
import sys
import json
import argparse
import unicodedata
from pathlib import Path

# --------------------------
# Helpers / configuration
# --------------------------

# Length thresholds (in characters) used by the page-furniture heuristics.
HEADER_FOOTER_MAXLEN = 80  # running heads are short, often centered, lines
CATCHWORD_MAXLEN = 14      # catchwords at the foot of a page are very short

# Whole-line patterns for lines that are treated as page furniture.
ROMAN_RE = re.compile(r'^[IVXLCDM.\s]{2,}$')        # only roman-numeral letters, dots, whitespace
PAGE_NUM_RE = re.compile(r'^\s*\d{1,4}\s*$')        # a bare 1-4 digit page number
CENTERED_DOTS_RE = re.compile(r'^\s*[•··•]+?\s*$')  # a row of bullet / middle-dot ornaments

# Substrings that mark a short all-caps line as a running head.
FURNITURE_CUES = (
    'PROEMIALIS DECLARATIO',
    'PROEMIALIS',
    'DECLARATIO',
    'OPERIS',
    'PARAGRAPHUS',
    'CHRONOLOGICA',
    'CUM PRIVILEGIO REGIS',
)

# A line-final hyphen is NOT removed when the fragment before it is exactly
# one of these prefixes (extend the set if your corpus shows false joins).
SAFE_COMPOUND_PREFIXES = {
    'co',
    're',
    'pre',
    'semi',
    'anti',
    'inter',
    'intra',
    'super',
    'sub',
    'trans',
}

# Typographic punctuation and its ASCII replacement.
PUNCT_MAP = {
    '\u2018': "'",  # left single quotation mark
    '\u2019': "'",  # right single quotation mark
    '\u201A': ',',  # single low-9 quotation mark
    '\u201B': "'",  # single high-reversed-9 quotation mark
    '\u201C': '"',  # left double quotation mark
    '\u201D': '"',  # right double quotation mark
    '\u201E': '"',  # double low-9 quotation mark
    '\u2013': '-',  # en dash
    '\u2014': '-',  # em dash
    '\u2212': '-',  # minus sign
    '\u00A0': ' ',  # non-breaking space
}

# Glyph maps (reversible in principle; keep raw elsewhere)
def glyph_normalize(s: str, keep_ligatures: bool) -> str:
    """Replace historical and typographic glyphs, then apply NFKC normalization.

    Args:
        s: Text to normalize.
        keep_ligatures: When True, æ/œ (and their capitals) are left alone;
            otherwise they are expanded to ae/oe (AE/OE).

    Returns:
        The text with long s replaced by ``s``, ligatures optionally expanded,
        every PUNCT_MAP key replaced by its value, and the result
        NFKC-normalized.
    """
    # Substitutions are applied in this order: long s, ligatures, punctuation.
    substitutions = [('\u017F', 's')]
    if not keep_ligatures:
        substitutions += [('æ', 'ae'), ('Æ', 'AE'), ('œ', 'oe'), ('Œ', 'OE')]
    substitutions.extend(PUNCT_MAP.items())

    for glyph, plain in substitutions:
        s = s.replace(glyph, plain)

    # NFKC folds remaining compatibility glyphs into their canonical forms.
    return unicodedata.normalize('NFKC', s)

def load_protected_terms(path: Path):
    """Read the protected-terms file and return its entries as a set.

    The file holds one term per line. Surrounding whitespace is stripped;
    blank lines and lines starting with ``#`` are skipped. Bytes that are not
    valid UTF-8 are ignored while decoding.

    Args:
        path: Location of the terms file; may be None.

    Returns:
        set: The terms, or an empty set when ``path`` is falsy or missing.
    """
    if not (path and path.exists()):
        return set()
    raw_lines = path.read_text(encoding='utf-8', errors='ignore').splitlines()
    candidates = (raw.strip() for raw in raw_lines)
    return {entry for entry in candidates if entry and not entry.startswith('#')}

def protect_terms(text: str, protected: set):
    """Swap every protected term for an opaque placeholder.

    Merely wrapping a term in sentinel characters does not stop later passes
    (glyph maps, u/v and i/j normalization) from rewriting the letters between
    the sentinels. Each match is therefore replaced by
    ``MARK_L + <index> + MARK_R`` and the matched text is stored, to be put
    back verbatim by ``unprotect_terms``.

    Matching is case-insensitive and restricted to whole terms: a term is not
    matched directly after a word character, nor (when the term ends in a word
    character) directly before one, so "Tai" does not match in "Taizong".
    Longer terms take precedence over shorter ones.

    Args:
        text: The text to scan.
        protected: The terms to protect; empty strings are ignored.

    Returns:
        tuple: ``(text, state)``. ``state`` is ``{}`` when there is nothing to
        protect; otherwise it holds ``'MARK_L'``, ``'MARK_R'`` and
        ``'originals'`` (the matched strings, indexed by placeholder number).
    """
    terms = [t for t in (protected or ()) if t]
    if not terms:
        return text, {}
    MARK_L = '\uE000'  # Private Use Area
    MARK_R = '\uE001'
    originals = []

    # A single alternation (longest first) scans the text once, so nothing is
    # ever matched inside an already inserted placeholder.
    pieces = []
    for term in sorted(terms, key=len, reverse=True):
        ends_in_word_char = term[-1].isalnum() or term[-1] == '_'
        pieces.append(re.escape(term) + (r'(?!\w)' if ends_in_word_char else ''))
    pattern = re.compile(rf"(?<!\w)(?:{'|'.join(pieces)})", flags=re.IGNORECASE)

    def repl(m):
        originals.append(m.group(0))  # keep the casing found in the text
        return f"{MARK_L}{len(originals) - 1}{MARK_R}"

    text = pattern.sub(repl, text)
    return text, {'MARK_L': MARK_L, 'MARK_R': MARK_R, 'originals': originals}

def unprotect_terms(text: str, replacements_map: dict):
    """Put the original protected terms back in place of their placeholders.

    Args:
        text: Text containing placeholders produced by ``protect_terms``.
        replacements_map: The state returned by ``protect_terms``. A map with
            only ``'MARK_L'``/``'MARK_R'`` (terms wrapped in sentinels) is
            still accepted: the sentinels are simply removed.

    Returns:
        The text with every placeholder replaced by the stored original and
        any remaining sentinel characters removed.
    """
    if not replacements_map:
        return text
    mark_l = replacements_map['MARK_L']
    mark_r = replacements_map['MARK_R']
    originals = replacements_map.get('originals', [])

    def restore(m):
        idx = int(m.group(1))
        # Unknown index: leave the span alone; its sentinels are stripped below.
        return originals[idx] if idx < len(originals) else m.group(0)

    text = re.sub(rf'{re.escape(mark_l)}([0-9]+){re.escape(mark_r)}', restore, text)
    return text.replace(mark_l, '').replace(mark_r, '')

# --------------------------
# Page furniture removal
# --------------------------

def is_page_furniture(line: str) -> bool:
    """Decide heuristically whether a line is page furniture, not running text.

    After stripping, a non-empty line counts as furniture when it is any of:
    a bare page number (PAGE_NUM_RE); a roman-numeral-only line (ROMAN_RE);
    a purely alphabetic token of at most CATCHWORD_MAXLEN characters starting
    with a lowercase letter (catchword); a row of dots (CENTERED_DOTS_RE); or
    an all-caps line of at most HEADER_FOOTER_MAXLEN characters containing one
    of FURNITURE_CUES.

    Args:
        line: One line of the text.

    Returns:
        bool: True if the line should be dropped.
    """
    stripped = line.strip()
    if not stripped:
        return False

    if PAGE_NUM_RE.match(stripped) or ROMAN_RE.match(stripped):
        return True

    looks_like_catchword = (
        len(stripped) <= CATCHWORD_MAXLEN
        and stripped.isalpha()
        and stripped[0].islower()
    )
    if looks_like_catchword or CENTERED_DOTS_RE.match(stripped):
        return True

    # Running head: a short all-caps line that carries a known cue.
    is_short_caps = len(stripped) <= HEADER_FOOTER_MAXLEN and stripped.isupper()
    return is_short_caps and any(cue in stripped for cue in FURNITURE_CUES)

def remove_page_furniture(text: str):
    """Drop every line that ``is_page_furniture`` flags.

    Args:
        text: The full text.

    Returns:
        tuple: ``(cleaned_text, removed)`` where ``cleaned_text`` is the
        remaining lines joined with ``\\n`` and ``removed`` is the number of
        lines dropped.
    """
    source_lines = text.splitlines()
    body_lines = [ln for ln in source_lines if not is_page_furniture(ln)]
    return "\n".join(body_lines), len(source_lines) - len(body_lines)

# --------------------------
# De-hyphenation at line breaks
# --------------------------

def should_join(left_token: str, right_token: str) -> bool:
    """Tell whether a word broken as ``left_token-`` / ``right_token`` may be joined.

    Joining is refused only when the whole left fragment, lowercased, is one
    of SAFE_COMPOUND_PREFIXES (e.g. "co-"); ``right_token`` is not consulted.
    """
    is_standalone_prefix = left_token.lower() in SAFE_COMPOUND_PREFIXES
    return not is_standalone_prefix

def dehyphenate(text: str):
    """Join words that were hyphenated across a line break.

        "... verbo-\\nrum ..." -> "... verborum ..."

    The join happens when a run of letters is followed by a hyphen at the end
    of a line and the next line starts with a lowercase letter, unless
    ``should_join`` vetoes it (left fragment is a bare prefix such as "co-"),
    in which case the text is left exactly as it was.

    This pass runs before newline and whitespace normalization, so the pattern
    itself tolerates trailing blanks after the hyphen and Windows (CRLF) line
    endings. The long s is accepted as a letter because glyph normalization
    also runs later, and the Latin-1 ranges exclude the × and ÷ signs.

    Args:
        text: The text to process.

    Returns:
        tuple: ``(new_text, count)`` with ``count`` the number of joins made.
    """
    count = 0

    def repl(m):
        nonlocal count
        left, right_first = m.group(1), m.group(2)
        if should_join(left, right_first):
            count += 1
            return left + right_first
        # Safe-prefix case: keep hyphen and line break untouched.
        return m.group(0)

    # A single pass suffices: matches never overlap, because each one consumes
    # only the first letter of the continuation line.
    pattern = re.compile(
        r'([A-Za-zÀ-ÖØ-öø-ÿ\u017F]+)-[ \t]*\r?\n([a-zà-öø-ÿ\u017F])'
    )
    new_text = pattern.sub(repl, text)
    return new_text, count

# --------------------------
# Orthography normalization (optional)
# --------------------------

def normalize_uv(text: str) -> str:
    """Apply a simple, case-preserving u/v normalization heuristic.

    Two rules are applied in order:
      1. ``u``/``U`` at the start of a word and directly before a vowel
         becomes ``v``/``V`` ("uita" -> "vita").
      2. ``v``/``V`` standing between two vowels becomes ``u``/``U``
         ("ave" -> "aue").

    This is deliberately simplistic (serviceable for counts); tune it if your
    corpus needs a different policy.

    Args:
        text: The text to normalize.

    Returns:
        The normalized text.
    """
    vowels = 'aeiouAEIOU'

    def to_v(m):
        return 'v' if m.group(0).islower() else 'V'

    def to_u(m):
        return 'u' if m.group(0).islower() else 'U'

    # 1) u->v at word start before a vowel. The character class and lookahead
    #    matter: a pattern like r'\buU' would only match the literal "uU".
    text = re.sub(rf'\b[uU](?=[{vowels}])', to_v, text)
    # 2) v->u between vowels; lookarounds keep the neighbouring vowels intact.
    text = re.sub(rf'(?<=[{vowels}])[vV](?=[{vowels}])', to_u, text)
    return text

def normalize_ij(text: str) -> str:
    """Replace lowercase ``j`` with ``i``.

    Capital ``J`` is never changed. The only lowercase ``j`` left alone is one
    that directly follows a word-initial capital ``J``.

    Args:
        text: The text to normalize.

    Returns:
        The normalized text.
    """
    lowercase_j = re.compile(r'(?<!\b[J])j')
    return lowercase_j.sub('i', text)

# --------------------------
# Whitespace normalization
# --------------------------

def normalize_whitespace(text: str) -> str:
    """Tidy whitespace while keeping paragraph breaks.

    Steps:
      1. Convert Windows/old-Mac line endings to ``\\n``.
      2. On every line, collapse runs of spaces/tabs to a single space and
         strip the ends.
      3. Collapse runs of three or more newlines to two, i.e. at most one
         blank line between paragraphs.

    Step 3 runs after step 2 on purpose: lines containing only spaces or tabs
    become truly empty first, so they cannot hide a run of blank lines from
    the collapse.

    Args:
        text: The text to normalize.

    Returns:
        The normalized text.
    """
    # 1) Canonicalize newlines
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # 2) Per line: collapse multiple spaces/tabs and trim
    lines = [re.sub(r'[ \t]+', ' ', line).strip() for line in text.split('\n')]
    # 3) Compress excess blank lines down to a single paragraph separator
    return re.sub(r'\n{3,}', '\n\n', "\n".join(lines))

# --------------------------
# Main pipeline
# --------------------------

def preprocess(
    raw_text: str,
    keep_ligatures: bool = False,
    drop_furniture: bool = True,
    do_uv: bool = False,
    do_ij: bool = False,
    protected_terms: set = None
):
    """Run the cleaning pipeline over ``raw_text`` and report what was done.

    Stage order: protect terms (only when ``protected_terms`` is non-empty),
    remove page furniture (only when ``drop_furniture``), dehyphenate, glyph
    normalization, optional u/v and i/j normalization, whitespace
    normalization, then unprotect terms.

    Returns:
        A ``(text, log)`` tuple. ``log`` is a dict with the keys
        ``removed_furniture_lines``, ``dehyphen_joins``, ``applied_uv``,
        ``applied_ij``, ``keep_ligatures`` and ``protected_terms`` (a count).
    """
    # Count the terms up front, before any helper gets to see the collection.
    protected_count = len(protected_terms or [])

    text = raw_text

    # 0) Protect terms
    placeholder_map = {}
    if protected_terms:
        text, placeholder_map = protect_terms(text, protected_terms)

    # 1) Page furniture
    furniture_removed = 0
    if drop_furniture:
        text, furniture_removed = remove_page_furniture(text)

    # 2) Dehyphenate, then 3) glyph normalization
    text, join_count = dehyphenate(text)
    text = glyph_normalize(text, keep_ligatures=keep_ligatures)

    # 4) Optional orthography normalization, u/v before i/j
    for enabled, step in ((do_uv, normalize_uv), (do_ij, normalize_ij)):
        if enabled:
            text = step(text)

    # 5) Whitespace normalization, then 6) restore protected terms
    text = normalize_whitespace(text)
    text = unprotect_terms(text, placeholder_map)

    summary = {
        'removed_furniture_lines': furniture_removed,
        'dehyphen_joins': join_count,
        'applied_uv': do_uv,
        'applied_ij': do_ij,
        'keep_ligatures': keep_ligatures,
        'protected_terms': protected_count,
    }
    return text, summary

def main():
    """Command-line entry point.

    Parses the arguments, reads the input file as UTF-8 (undecodable bytes are
    ignored), runs ``preprocess`` on it, writes the cleaned text, and
    optionally writes a JSON summary log. Exits with status 1 when the input
    path does not exist.
    """
    parser = argparse.ArgumentParser(description="Minimal pre-processing for early-modern Latin OCR.")
    parser.add_argument("input", type=str, help="Path to input .txt")
    parser.add_argument("-o", "--output", type=str, help="Path to output .txt (default: alongside input)")
    parser.add_argument("--protect", type=str, help="Optional protected-terms file (one term per line)")
    boolean_flags = (
        ("--normalize-uv", "Apply u/v normalization (conservative heuristic)"),
        ("--normalize-ij", "Apply i/j normalization (conservative heuristic)"),
        ("--keep-ligatures", "Keep æ/œ ligatures instead of expanding to ae/oe"),
        ("--no-page-furniture", "Do not strip page furniture"),
    )
    for flag, help_text in boolean_flags:
        parser.add_argument(flag, action="store_true", help=help_text)
    parser.add_argument("--log", type=str, help="Write summary log JSON")
    opts = parser.parse_args()

    source = Path(opts.input)
    if not source.exists():
        print(f"ERROR: Input not found: {source}", file=sys.stderr)
        sys.exit(1)

    if opts.output:
        target = Path(opts.output)
    else:
        target = source.with_suffix(".clean.txt")

    if opts.protect:
        terms = load_protected_terms(Path(opts.protect))
    else:
        terms = set()

    cleaned, summary = preprocess(
        raw_text=source.read_text(encoding="utf-8", errors="ignore"),
        keep_ligatures=opts.keep_ligatures,
        drop_furniture=not opts.no_page_furniture,
        do_uv=opts.normalize_uv,
        do_ij=opts.normalize_ij,
        protected_terms=terms
    )
    target.write_text(cleaned, encoding="utf-8")

    if opts.log:
        payload = {'input': str(source), 'output': str(target), **summary}
        Path(opts.log).write_text(
            json.dumps(payload, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    print(f"✓ Wrote: {target}")
    if opts.log:
        print(f"✓ Log:   {opts.log}")

if __name__ == "__main__":
    main()
```

***

## How to use

```bash
# 1) Basic cleaning (recommended defaults)
python preprocess_early_modern_latin.py Corrected_OCR_Text_from_Archive4.txt \
  -o Corrected_OCR_Text_from_Archive4.clean.txt \
  --log Corrected_OCR_Text_from_Archive4.log.json

# 2) With protected terms (names, Latinized Sinological terms you never want changed)
python preprocess_early_modern_latin.py Corrected_OCR_Text_from_Archive4.txt \
  -o Corrected_OCR_Text_from_Archive4.clean.txt \
  --protect protected_terms.txt \
  --log Corrected_OCR_Text_from_Archive4.log.json

# 3) If you want orthography normalization for analytics tracks
python preprocess_early_modern_latin.py Corrected_OCR_Text_from_Archive4.txt \
  -o Corrected_OCR_Text_from_Archive4.norm.txt \
  --normalize-uv --normalize-ij \
  --protect protected_terms.txt \
  --log Corrected_OCR_Text_from_Archive4.norm.log.json
```

**`protected_terms.txt` example**

    Confucius
    Confucium
    Philippus Couplet
    Intorcetta
    Herdtrich
    Rougemont
    Siamensium
    Sinensium
    Xam ti
    Tai Kie

> Tip: keep two parallel versions:  
> **(A)** “clean” (glyphs normalized; paragraphing preserved; no u/v or i/j changes)  
> **(B)** “norm” (with u/v and/or i/j normalization)  
> Use **(B)** for aggregation (n‑grams, topics); use **(A)** to quote or cross‑check.

***

## What to expect

*   This minimal pass typically lowers normalized WER a few points “for free,” especially by removing page furniture and fixing line‑break hyphens.
*   It’s safe: no paraphrasing, and the optional orthography step is **off by default**.
*   If you later want to add a **confusion‑model post‑correction** layer or **short‑span LLM cleanups**, we can hook those right after this script.

If you’d like, I can **drop in a confusion‑map booster** (learned from your aligned epistle) as a small, optional post‑pass and re‑run WER/CER so you can see before/after deltas.


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import unicodedata
from pathlib import Path
import os
from collections import Counter, defaultdict

# --- Configuration (Kept simple) ---
INPUT_TEXT_FILE  = 'confuciussinarum00conf_0_djvu.txt'
OUTPUT_CLEANED_FILE = 'OCR_Error_Cleaned.txt'

# Map curly quotes/dashes/ligatures/OCR errors
PUNCT_MAP = {
    '\u017F': 's',         # long s (ſ)
    'æ': 'ae', 'Æ': 'AE',
    'œ': 'oe', 'Œ': 'OE',
    '\u2018': "'", '\u2019': "'", '\u201A': ',', '\u201B': "'",
    '\u201C': '"', '\u201D': '"', '\u201E': '"',
    '\u2013': '-', '\u2014': '-', '\u2212': '-',   # en/em/minus → hyphen
    '\u00AD': '-',                                 # soft hyphen → hyphen
    '\u00A0': ' ', '\u2002': ' ', '\u2003': ' ', '\u2009': ' ', # spaces
}

# --- Patterns for Removal (Page Numbers Only) ---

# Roman-only line (common for front matter page numbers)
# Requires at least one character and optional surrounding whitespace/periods.
ROMAN_LINE = re.compile(r'^\s*[IVXLCDM]+\.*\s*$')

# A page-number line: bare arabic up to 4 digits
ARABIC_PAGE = re.compile(r'^\s*\d{1,4}\s*$')

# Centered dot filler or dot leaders which often appear with page numbers in TOCs
FILLER_LINE = re.compile(r'^\s*[•·.\u2022]{3,}\s*$')


# --- Patterns for Fixing OCR Errors ---

# A token of single-letter small-caps separated by spaces: "L U D O V I C O"
SPACED_SMALLCAPS = re.compile(r'\b(?:[A-Z]\s){2,}[A-Z]\b')


# --- Utilities ---

def normalize_glyphs(s: str) -> str:
    """Apply every PUNCT_MAP substitution in map order, then NFKC-normalize the result."""
    mapped = s
    for source_glyph in PUNCT_MAP:
        mapped = mapped.replace(source_glyph, PUNCT_MAP[source_glyph])
    return unicodedata.normalize('NFKC', mapped)

def strip_scanner_artifacts(s: str) -> str:
    """Unescape backslash-escaped parentheses, then squeeze backslash runs.

    ``\\(`` and ``\\)`` become plain parentheses (in that order); afterwards
    any run of two or more backslashes is reduced to a single backslash.
    """
    unescaped = s
    for escaped, plain in (('\\(', '('), ('\\)', ')')):
        unescaped = unescaped.replace(escaped, plain)
    # A callable replacement returns the literal single backslash directly.
    return re.sub(r'\\{2,}', lambda _run: '\\', unescaped)

def fix_spaced_smallcaps(s: str) -> str:
    """Remove the spaces inside every SPACED_SMALLCAPS match ("L U D O V I C O" -> "LUDOVICO")."""
    return SPACED_SMALLCAPS.sub(lambda hit: ''.join(hit.group(0).split(' ')), s)

def remove_page_numbers_and_fillers(text: str) -> str:
    """
    Drop lines whose stripped content matches ARABIC_PAGE, ROMAN_LINE or FILLER_LINE.

    Blank lines and every other line (including headings) are kept unchanged.
    The surviving lines are re-joined with a single newline.
    """
    furniture_patterns = (ARABIC_PAGE, ROMAN_LINE, FILLER_LINE)

    def _is_furniture(line):
        stripped = line.strip()
        # Blank lines are never furniture; they carry the paragraph structure.
        return bool(stripped) and any(p.match(stripped) for p in furniture_patterns)

    return "\n".join(line for line in text.splitlines() if not _is_furniture(line))

def smart_dehyphenate(s: str) -> str:
    """
    Join words that were hyphenated across a line break.

    A join happens only when a fragment of at least two letters is followed by
    ``-`` and a newline, and the next line (after optional leading spaces/tabs)
    starts with a lowercase letter. The hyphen, the newline and that leading
    indentation are all removed, so ``contin-\\n  uation`` becomes
    ``continuation``.

    Hyphens inside a line (``well-known``) and breaks followed by an uppercase
    letter are left untouched, so legitimate compounds and structural breaks
    survive.

    Args:
        s: Input text using ``\\n`` line breaks.

    Returns:
        The text with line-break hyphenation removed.
    """
    # The indentation of the continuation line sits outside the capture groups
    # so it is dropped instead of being glued into the middle of the word.
    broken_word = re.compile(r'([A-Za-zÀ-ÿ]{2,})-\n[^\S\r\n]*([a-zà-ÿ]+)')
    # One pass consumes the right-hand fragment, so a word split over several
    # lines needs another pass; repeat until nothing is left to join.
    while True:
        s, joins = broken_word.subn(r'\1\2', s)
        if not joins:
            break
    return s

def normalize_whitespace_preserve_structure(s: str) -> str:
    """
    Tidy spacing paragraph by paragraph.

    The text is split on runs of two or more newlines. Each piece is stripped,
    has its space/tab runs collapsed to one space, and empty pieces are
    discarded. The pieces are re-joined with exactly one blank line, and
    trailing spaces/tabs are removed from every line.
    """
    def _tidy(block):
        squeezed = re.sub(r'[ \t]+', ' ', block.strip())
        return re.sub(r'\n{3,}', '\n\n', squeezed)

    blocks = filter(None, map(_tidy, re.split(r'\n{2,}', s)))
    joined = "\n\n".join(blocks)
    return re.sub(r'[ \t]+$', '', joined, flags=re.MULTILINE)

def process(text: str) -> str:
    """Run the cleaning stages over ``text`` in their fixed order and return the result."""
    stages = (
        normalize_glyphs,                         # 1) Unicode/glyphs first
        strip_scanner_artifacts,                  # 2) OCR noise artifacts
        remove_page_numbers_and_fillers,          # 3) page numbers/fillers only
        smart_dehyphenate,                        # 4) join hyphenated words
        fix_spaced_smallcaps,                     # 5) "L U D O V I C O" -> "LUDOVICO"
        normalize_whitespace_preserve_structure,  # 6) spacing/newlines
    )
    result = text
    for stage in stages:
        result = stage(result)
    return result

# --- Execution ---

def main():
    """Read INPUT_TEXT_FILE, clean it with ``process`` and write OUTPUT_CLEANED_FILE.

    Read and write failures are reported with a printed message instead of a
    traceback; a failed read stops the run before any processing.
    """
    try:
        raw_text = Path(INPUT_TEXT_FILE).read_text(encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_TEXT_FILE}' not found.")
        return
    except Exception as read_error:
        print(f"Error reading input file: {read_error}")
        return

    cleaned_text = process(raw_text)

    try:
        Path(OUTPUT_CLEANED_FILE).write_text(cleaned_text, encoding='utf-8')
        print(f"Successfully cleaned text written to '{OUTPUT_CLEANED_FILE}'")
    except Exception as write_error:
        print(f"Error writing output file: {write_error}")

if __name__ == "__main__":
    main()


Successfully cleaned text written to 'OCR_Error_Cleaned.txt'


OK, so now let's clean this pre-processed text with Gemini

In [None]:
import os
import asyncio
import vertexai
from vertexai.generative_models import GenerativeModel, Part

# Use nest_asyncio to run async code easily in environments like Colab
import nest_asyncio
nest_asyncio.apply()

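# --- Configuration Constants (Define these as per your environment) ---
# NOTE(review): CORRECTION_PROMPT below contradicts itself. Rule 3 ends with "Do not retain
# paragraph breaks; just produce continuous text", while rule 4 says "Keep all paragraph breaks
# and capitalization exactly as in the input". Decide which behaviour is wanted and delete the
# other sentence, otherwise the model's handling of paragraph breaks is unpredictable.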
PROJECT_ID = 'renaissance-ocr'
LOCATION = 'global'
GEMINI_MODEL = 'gemini-2.5-flash'
JSON_FILE_PATH = 'renaissance-ocr-4aabe5b8dc65.json'
INPUT_TEXT_FILE = 'OCR_Error_Cleaned.txt'
OUTPUT_CORRECTED_FILE = 'Corrected_Pre-Processed_OCR_Text_from_Archive3.txt'
CHUNK_SIZE = 10000
CORRECTION_PROMPT = """
You are an expert proofreader of early modern Latin. TASK: Correct ONLY OCR errors in the input text. Do not rewrite or alter authentic early modern usage. RULES: 1. Preserve the original wording, grammar, and syntax. Fix ONLY: - misread letters - broken or split words - incorrect or missing spacing - duplicated characters - corrupted punctuation caused by OCR 2. Apply minimal normalization (OLD conventions): - u = vowel, v = consonant - use i only (never j) - expand æ → ae, œ → oe - replace & or &amp; with et Do NOT modernize vocabulary or regularize historical spellings unless the form is clearly an OCR mistake. 3. This is very important: De hyphenate all words broken across line breaks: - remove hyphens that occur at the end of a line, - join the two fragments into a single continuous word, - and correct any spacing errors created by the line break. Also remove hyphens that belong to real Latin compounds, as hyphens are irrelevant. Do not add new punctuation. Do not retain paragraph breaks; just produce continuous text. 4. Keep all paragraph breaks and capitalization exactly as in the input. 5. Remove page furniture only when clearly not part of the running text: examples: “A ij”, “EPISTOLA.”, catchwords, signature marks. 6. Preserve Chinese or other exotic romanizations exactly as printed once corrected. Do NOT “improve,” standardize, or re interpret them (e.g., keep forms like “Tai Ki Gin”). 7. Output ONLY the corrected Latin text. Do NOT add commentary, explanation, or formatting.
"""

# -----------------------------------------------------------
# 1. Authenticate and Initialize the Vertex AI Client
# -----------------------------------------------------------
# Point the Google client libraries at the service-account key, initialise
# Vertex AI, and build the single model client reused by the rest of the cell.
try:
    if not os.path.exists(JSON_FILE_PATH):
        print(f"Credentials file not found at: {JSON_FILE_PATH}")
        raise FileNotFoundError(f"Credentials file not found: {JSON_FILE_PATH}")

    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    # Initialize the *one* model client we will reuse everywhere
    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully.")

except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down and throws away all state, whereas re-raising stops this cell
    # (nothing below can work without `model`) and keeps the traceback.
    raise

# -----------------------------------------------------------
# 2. Define a function to read a file in chunks
# -----------------------------------------------------------
def read_in_chunks(file_path, chunk_size=CHUNK_SIZE):
    """Yield the file's text in pieces of at most ``chunk_size`` characters.

    Chunks end on whitespace whenever possible so that no word is cut in half
    at a chunk boundary (each chunk is corrected independently, so a split word
    could never be repaired). A newline in the second half of the buffer is
    preferred as the cut point; otherwise the last newline or space is used.
    If a buffer contains no usable whitespace it is yielded as-is.

    Concatenating all yielded chunks reproduces the file content exactly.

    Args:
        file_path: Path of the UTF-8 text file to read.
        chunk_size: Maximum number of characters per chunk.

    Yields:
        Consecutive, non-empty ``str`` chunks.
    """
    carry = ''  # tail of the previous buffer that followed its last whitespace
    with open(file_path, 'r', encoding='utf-8') as f:
        while True:
            wanted = chunk_size - len(carry)
            piece = f.read(wanted)
            if not piece:
                # End of file: flush whatever is still pending.
                if carry:
                    yield carry
                return

            buffer = carry + piece
            if len(piece) < wanted:
                # Short read, normally the end of the file: emit it whole
                # rather than splitting off a tiny trailing chunk.
                yield buffer
                carry = ''
                continue

            cut = buffer.rfind('\n')
            if cut < len(buffer) // 2:
                # No line break late enough; fall back to the last space.
                cut = max(cut, buffer.rfind(' '))

            if cut <= 0:
                # No whitespace to cut at (or only at position 0).
                yield buffer
                carry = ''
            else:
                yield buffer[:cut + 1]
                carry = buffer[cut + 1:]

# -----------------------------------------------------------
# 3. Define an ASYNCHRONOUS function to correct a chunk
# -----------------------------------------------------------
async def correct_ocr_chunk_with_gemini_async(model_client, chunk_text, chunk_id):
    """Ask the model to correct one chunk and return ``(chunk_id, text)``.

    The request consists of CORRECTION_PROMPT followed by ``chunk_text``. On
    success the stripped response text is returned. If the call or reading the
    response raises, the error is printed and the original ``chunk_text`` is
    returned unchanged, so the caller always gets a result for every chunk id.
    """
    request_parts = [CORRECTION_PROMPT, chunk_text]

    try:
        reply = await model_client.generate_content_async(request_parts)
        stripped = reply.text.strip()
        corrected = stripped if stripped else ""
        print(f"  -> Finished processing chunk {chunk_id}.")
        return chunk_id, corrected
    except Exception as api_error:
        print(f"API call failed for chunk {chunk_id}: {api_error}")
        # Fall back to the uncorrected text.
        return chunk_id, chunk_text

# -----------------------------------------------------------
# 4. Main asynchronous execution block
# -----------------------------------------------------------
async def main(max_concurrency: int = 8):
    """Correct INPUT_TEXT_FILE chunk by chunk and write OUTPUT_CORRECTED_FILE.

    All chunks are scheduled at once, but at most ``max_concurrency`` requests
    are in flight at any time so a large file does not flood the API with
    hundreds of simultaneous calls. Results are written in chunk order, each
    followed by a blank line. If a task ends with an exception, the original
    uncorrected chunk is written in its place so no text goes missing.

    Args:
        max_concurrency: Upper bound on simultaneous API requests (minimum 1).
    """
    if not os.path.exists(INPUT_TEXT_FILE):
        print(f"Input file '{INPUT_TEXT_FILE}' not found. Ensure it is uploaded or available.")
        return

    print(f"\nStarting ASYNC OCR correction process on {INPUT_TEXT_FILE}...")

    chunks = list(read_in_chunks(INPUT_TEXT_FILE, CHUNK_SIZE))
    limiter = asyncio.Semaphore(max(1, max_concurrency))

    async def _correct_bounded(chunk_id, chunk):
        # Hold a semaphore slot for the duration of one API call.
        async with limiter:
            return await correct_ocr_chunk_with_gemini_async(model, chunk, chunk_id)

    tasks = [_correct_bounded(chunk_id, chunk) for chunk_id, chunk in enumerate(chunks, start=1)]

    print(f"Created {len(tasks)} tasks. Sending requests concurrently...")

    # gather() returns results in the order the tasks were passed in, so the
    # results line up with `chunks` and no sorting is needed.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    with open(OUTPUT_CORRECTED_FILE, 'w', encoding='utf-8') as f_out:
        for original_chunk, res in zip(chunks, results):
            if isinstance(res, BaseException):
                # BaseException also covers asyncio.CancelledError.
                print(f"An exception occurred during one of the API calls: {res}")
                corrected_chunk = original_chunk
            else:
                _, corrected_chunk = res
            if corrected_chunk:
                f_out.write(corrected_chunk + "\n\n")

    print(f"\n--- Gemini ASYNC OCR Correction Pipeline Finished ---")

# -----------------------------------------------------------
# 5. Run the asynchronous main function in Colab
# -----------------------------------------------------------
if __name__ == '__main__':
    # asyncio.run handles running the async main loop
    asyncio.run(main())


Falling back to grpc since no async rest credentials were detected.


Gemini Model 'gemini-2.5-flash' initialized successfully.

Starting ASYNC OCR correction process on OCR_Error_Cleaned.txt...
Created 138 tasks. Sending requests concurrently...
  -> Finished processing chunk 131.
  -> Finished processing chunk 23.
  -> Finished processing chunk 52.
  -> Finished processing chunk 5.
  -> Finished processing chunk 6.
  -> Finished processing chunk 106.
  -> Finished processing chunk 123.
  -> Finished processing chunk 46.
  -> Finished processing chunk 44.
  -> Finished processing chunk 111.
  -> Finished processing chunk 42.
  -> Finished processing chunk 21.
  -> Finished processing chunk 80.
  -> Finished processing chunk 98.
  -> Finished processing chunk 110.
  -> Finished processing chunk 41.
  -> Finished processing chunk 102.
  -> Finished processing chunk 95.
  -> Finished processing chunk 75.
  -> Finished processing chunk 121.
  -> Finished processing chunk 2.
  -> Finished processing chunk 7.
  -> Finished processing chunk 39.
  -> Finished p

Let's try it with the medical text

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import unicodedata
from pathlib import Path
import os
from collections import Counter, defaultdict

# --- Configuration (Kept simple) ---
INPUT_TEXT_FILE  = 'BIUSante_154971_djvu.txt'
OUTPUT_CLEANED_FILE = 'Medicinae_OCR_Error_Cleaned.txt'

# Map curly quotes/dashes/ligatures/OCR errors
PUNCT_MAP = {
    '\u017F': 's',         # long s (ſ)
    'æ': 'ae', 'Æ': 'AE',
    'œ': 'oe', 'Œ': 'OE',
    '\u2018': "'", '\u2019': "'", '\u201A': ',', '\u201B': "'",
    '\u201C': '"', '\u201D': '"', '\u201E': '"',
    '\u2013': '-', '\u2014': '-', '\u2212': '-',   # en/em/minus → hyphen
    '\u00AD': '-',                                 # soft hyphen → hyphen
    '\u00A0': ' ', '\u2002': ' ', '\u2003': ' ', '\u2009': ' ', # spaces
}

# --- Patterns for Removal (Page Numbers Only) ---

# Roman-only line (common for front matter page numbers)
# Requires at least one character and optional surrounding whitespace/periods.
ROMAN_LINE = re.compile(r'^\s*[IVXLCDM]+\.*\s*$')

# A page-number line: bare arabic up to 4 digits
ARABIC_PAGE = re.compile(r'^\s*\d{1,4}\s*$')

# Centered dot filler or dot leaders which often appear with page numbers in TOCs
FILLER_LINE = re.compile(r'^\s*[•·.\u2022]{3,}\s*$')


# --- Patterns for Fixing OCR Errors ---

# A token of single-letter small-caps separated by spaces: "L U D O V I C O"
SPACED_SMALLCAPS = re.compile(r'\b(?:[A-Z]\s){2,}[A-Z]\b')


# --- Utilities ---

def normalize_glyphs(s: str) -> str:
    """Apply every PUNCT_MAP substitution in map order, then NFKC-normalize the result."""
    mapped = s
    for source_glyph in PUNCT_MAP:
        mapped = mapped.replace(source_glyph, PUNCT_MAP[source_glyph])
    return unicodedata.normalize('NFKC', mapped)

def strip_scanner_artifacts(s: str) -> str:
    """Unescape backslash-escaped parentheses, then squeeze backslash runs.

    ``\\(`` and ``\\)`` become plain parentheses (in that order); afterwards
    any run of two or more backslashes is reduced to a single backslash.
    """
    unescaped = s
    for escaped, plain in (('\\(', '('), ('\\)', ')')):
        unescaped = unescaped.replace(escaped, plain)
    # A callable replacement returns the literal single backslash directly.
    return re.sub(r'\\{2,}', lambda _run: '\\', unescaped)

def fix_spaced_smallcaps(s: str) -> str:
    """Remove the spaces inside every SPACED_SMALLCAPS match ("L U D O V I C O" -> "LUDOVICO")."""
    return SPACED_SMALLCAPS.sub(lambda hit: ''.join(hit.group(0).split(' ')), s)

def remove_page_numbers_and_fillers(text: str) -> str:
    """
    Drop lines whose stripped content matches ARABIC_PAGE, ROMAN_LINE or FILLER_LINE.

    Blank lines and every other line (including headings) are kept unchanged.
    The surviving lines are re-joined with a single newline.
    """
    furniture_patterns = (ARABIC_PAGE, ROMAN_LINE, FILLER_LINE)

    def _is_furniture(line):
        stripped = line.strip()
        # Blank lines are never furniture; they carry the paragraph structure.
        return bool(stripped) and any(p.match(stripped) for p in furniture_patterns)

    return "\n".join(line for line in text.splitlines() if not _is_furniture(line))

def smart_dehyphenate(s: str) -> str:
    """
    Join words that were hyphenated across a line break.

    A join happens only when a fragment of at least two letters is followed by
    ``-`` and a newline, and the next line (after optional leading spaces/tabs)
    starts with a lowercase letter. The hyphen, the newline and that leading
    indentation are all removed, so ``contin-\\n  uation`` becomes
    ``continuation``.

    Hyphens inside a line (``well-known``) and breaks followed by an uppercase
    letter are left untouched, so legitimate compounds and structural breaks
    survive.

    Args:
        s: Input text using ``\\n`` line breaks.

    Returns:
        The text with line-break hyphenation removed.
    """
    # The indentation of the continuation line sits outside the capture groups
    # so it is dropped instead of being glued into the middle of the word.
    broken_word = re.compile(r'([A-Za-zÀ-ÿ]{2,})-\n[^\S\r\n]*([a-zà-ÿ]+)')
    # One pass consumes the right-hand fragment, so a word split over several
    # lines needs another pass; repeat until nothing is left to join.
    while True:
        s, joins = broken_word.subn(r'\1\2', s)
        if not joins:
            break
    return s

def normalize_whitespace_preserve_structure(s: str) -> str:
    """
    Tidy spacing paragraph by paragraph.

    The text is split on runs of two or more newlines. Each piece is stripped,
    has its space/tab runs collapsed to one space, and empty pieces are
    discarded. The pieces are re-joined with exactly one blank line, and
    trailing spaces/tabs are removed from every line.
    """
    def _tidy(block):
        squeezed = re.sub(r'[ \t]+', ' ', block.strip())
        return re.sub(r'\n{3,}', '\n\n', squeezed)

    blocks = filter(None, map(_tidy, re.split(r'\n{2,}', s)))
    joined = "\n\n".join(blocks)
    return re.sub(r'[ \t]+$', '', joined, flags=re.MULTILINE)

def process(text: str) -> str:
    """Run the cleaning stages over ``text`` in their fixed order and return the result."""
    stages = (
        normalize_glyphs,                         # 1) Unicode/glyphs first
        strip_scanner_artifacts,                  # 2) OCR noise artifacts
        remove_page_numbers_and_fillers,          # 3) page numbers/fillers only
        smart_dehyphenate,                        # 4) join hyphenated words
        fix_spaced_smallcaps,                     # 5) "L U D O V I C O" -> "LUDOVICO"
        normalize_whitespace_preserve_structure,  # 6) spacing/newlines
    )
    result = text
    for stage in stages:
        result = stage(result)
    return result

# --- Execution ---

def main():
    """Read INPUT_TEXT_FILE, clean it with ``process`` and write OUTPUT_CLEANED_FILE.

    Read and write failures are reported with a printed message instead of a
    traceback; a failed read stops the run before any processing.
    """
    try:
        raw_text = Path(INPUT_TEXT_FILE).read_text(encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_TEXT_FILE}' not found.")
        return
    except Exception as read_error:
        print(f"Error reading input file: {read_error}")
        return

    cleaned_text = process(raw_text)

    try:
        Path(OUTPUT_CLEANED_FILE).write_text(cleaned_text, encoding='utf-8')
        print(f"Successfully cleaned text written to '{OUTPUT_CLEANED_FILE}'")
    except Exception as write_error:
        print(f"Error writing output file: {write_error}")

if __name__ == "__main__":
    main()


Successfully cleaned text written to 'Medicinae_OCR_Error_Cleaned.txt'


And now let's clean up the medical text with Gemini

In [None]:
import os
import asyncio
import vertexai
from vertexai.generative_models import GenerativeModel, Part

# Use nest_asyncio to run async code easily in environments like Colab
import nest_asyncio
nest_asyncio.apply()

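# --- Configuration Constants (Define these as per your environment) ---
# NOTE(review): CORRECTION_PROMPT below contradicts itself. Rule 3 ends with "Do not retain
# paragraph breaks; just produce continuous text", while rule 4 says "Keep all paragraph breaks
# and capitalization exactly as in the input". Decide which behaviour is wanted and delete the
# other sentence, otherwise the model's handling of paragraph breaks is unpredictable.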
PROJECT_ID = 'renaissance-ocr'
LOCATION = 'global'
GEMINI_MODEL = 'gemini-2.5-flash'
JSON_FILE_PATH = 'renaissance-ocr-4aabe5b8dc65.json'
INPUT_TEXT_FILE = 'Medicinae_OCR_Error_Cleaned.txt'
OUTPUT_CORRECTED_FILE = 'Gemini_Medicinae_OCR_Error_Cleaned.txt'
CHUNK_SIZE = 10000
CORRECTION_PROMPT = """
You are an expert proofreader of early modern Latin. TASK: Correct ONLY OCR errors in the input text. Do not rewrite or alter authentic early modern usage. RULES: 1. Preserve the original wording, grammar, and syntax. Fix ONLY: - misread letters - broken or split words - incorrect or missing spacing - duplicated characters - corrupted punctuation caused by OCR 2. Apply minimal normalization (OLD conventions): - u = vowel, v = consonant - use i only (never j) - expand æ → ae, œ → oe - replace & or &amp; with et Do NOT modernize vocabulary or regularize historical spellings unless the form is clearly an OCR mistake. 3. This is very important: De hyphenate all words broken across line breaks: - remove hyphens that occur at the end of a line, - join the two fragments into a single continuous word, - and correct any spacing errors created by the line break. Also remove hyphens that belong to real Latin compounds, as hyphens are irrelevant. Do not add new punctuation. Do not retain paragraph breaks; just produce continuous text. 4. Keep all paragraph breaks and capitalization exactly as in the input. 5. Remove page furniture only when clearly not part of the running text: examples: “A ij”, “EPISTOLA.”, catchwords, signature marks. 6. Preserve Chinese or other exotic romanizations exactly as printed once corrected. Do NOT “improve,” standardize, or re interpret them (e.g., keep forms like “Tai Ki Gin”). 7. Output ONLY the corrected Latin text. Do NOT add commentary, explanation, or formatting.
"""

# -----------------------------------------------------------
# 1. Authenticate and Initialize the Vertex AI Client
# -----------------------------------------------------------
# Point the Google client libraries at the service-account key, initialise
# Vertex AI, and build the single model client reused by the rest of the cell.
try:
    if not os.path.exists(JSON_FILE_PATH):
        print(f"Credentials file not found at: {JSON_FILE_PATH}")
        raise FileNotFoundError(f"Credentials file not found: {JSON_FILE_PATH}")

    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    # Initialize the *one* model client we will reuse everywhere
    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully.")

except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down and throws away all state, whereas re-raising stops this cell
    # (nothing below can work without `model`) and keeps the traceback.
    raise

# -----------------------------------------------------------
# 2. Define a function to read a file in chunks
# -----------------------------------------------------------
def read_in_chunks(file_path, chunk_size=CHUNK_SIZE):
    """Yield the file's text in pieces of at most ``chunk_size`` characters.

    Chunks end on whitespace whenever possible so that no word is cut in half
    at a chunk boundary (each chunk is corrected independently, so a split word
    could never be repaired). A newline in the second half of the buffer is
    preferred as the cut point; otherwise the last newline or space is used.
    If a buffer contains no usable whitespace it is yielded as-is.

    Concatenating all yielded chunks reproduces the file content exactly.

    Args:
        file_path: Path of the UTF-8 text file to read.
        chunk_size: Maximum number of characters per chunk.

    Yields:
        Consecutive, non-empty ``str`` chunks.
    """
    carry = ''  # tail of the previous buffer that followed its last whitespace
    with open(file_path, 'r', encoding='utf-8') as f:
        while True:
            wanted = chunk_size - len(carry)
            piece = f.read(wanted)
            if not piece:
                # End of file: flush whatever is still pending.
                if carry:
                    yield carry
                return

            buffer = carry + piece
            if len(piece) < wanted:
                # Short read, normally the end of the file: emit it whole
                # rather than splitting off a tiny trailing chunk.
                yield buffer
                carry = ''
                continue

            cut = buffer.rfind('\n')
            if cut < len(buffer) // 2:
                # No line break late enough; fall back to the last space.
                cut = max(cut, buffer.rfind(' '))

            if cut <= 0:
                # No whitespace to cut at (or only at position 0).
                yield buffer
                carry = ''
            else:
                yield buffer[:cut + 1]
                carry = buffer[cut + 1:]

# -----------------------------------------------------------
# 3. Define an ASYNCHRONOUS function to correct a chunk
# -----------------------------------------------------------
async def correct_ocr_chunk_with_gemini_async(model_client, chunk_text, chunk_id):
    """Ask the model to correct one chunk and return ``(chunk_id, text)``.

    The request consists of CORRECTION_PROMPT followed by ``chunk_text``. On
    success the stripped response text is returned. If the call or reading the
    response raises, the error is printed and the original ``chunk_text`` is
    returned unchanged, so the caller always gets a result for every chunk id.
    """
    request_parts = [CORRECTION_PROMPT, chunk_text]

    try:
        reply = await model_client.generate_content_async(request_parts)
        stripped = reply.text.strip()
        corrected = stripped if stripped else ""
        print(f"  -> Finished processing chunk {chunk_id}.")
        return chunk_id, corrected
    except Exception as api_error:
        print(f"API call failed for chunk {chunk_id}: {api_error}")
        # Fall back to the uncorrected text.
        return chunk_id, chunk_text

# -----------------------------------------------------------
# 4. Main asynchronous execution block
# -----------------------------------------------------------
async def main(max_concurrency: int = 8):
    """Correct INPUT_TEXT_FILE chunk by chunk and write OUTPUT_CORRECTED_FILE.

    All chunks are scheduled at once, but at most ``max_concurrency`` requests
    are in flight at any time so a large file does not flood the API with
    hundreds of simultaneous calls. Results are written in chunk order, each
    followed by a blank line. If a task ends with an exception, the original
    uncorrected chunk is written in its place so no text goes missing.

    Args:
        max_concurrency: Upper bound on simultaneous API requests (minimum 1).
    """
    if not os.path.exists(INPUT_TEXT_FILE):
        print(f"Input file '{INPUT_TEXT_FILE}' not found. Ensure it is uploaded or available.")
        return

    print(f"\nStarting ASYNC OCR correction process on {INPUT_TEXT_FILE}...")

    chunks = list(read_in_chunks(INPUT_TEXT_FILE, CHUNK_SIZE))
    limiter = asyncio.Semaphore(max(1, max_concurrency))

    async def _correct_bounded(chunk_id, chunk):
        # Hold a semaphore slot for the duration of one API call.
        async with limiter:
            return await correct_ocr_chunk_with_gemini_async(model, chunk, chunk_id)

    tasks = [_correct_bounded(chunk_id, chunk) for chunk_id, chunk in enumerate(chunks, start=1)]

    print(f"Created {len(tasks)} tasks. Sending requests concurrently...")

    # gather() returns results in the order the tasks were passed in, so the
    # results line up with `chunks` and no sorting is needed.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    with open(OUTPUT_CORRECTED_FILE, 'w', encoding='utf-8') as f_out:
        for original_chunk, res in zip(chunks, results):
            if isinstance(res, BaseException):
                # BaseException also covers asyncio.CancelledError.
                print(f"An exception occurred during one of the API calls: {res}")
                corrected_chunk = original_chunk
            else:
                _, corrected_chunk = res
            if corrected_chunk:
                f_out.write(corrected_chunk + "\n\n")

    print(f"\n--- Gemini ASYNC OCR Correction Pipeline Finished ---")

# -----------------------------------------------------------
# 5. Run the asynchronous main function in Colab
# -----------------------------------------------------------
if __name__ == '__main__':
    # asyncio.run handles running the async main loop
    asyncio.run(main())


Falling back to grpc since no async rest credentials were detected.


Gemini Model 'gemini-2.5-flash' initialized successfully.

Starting ASYNC OCR correction process on Medicinae_OCR_Error_Cleaned.txt...
Created 46 tasks. Sending requests concurrently...
  -> Finished processing chunk 46.
  -> Finished processing chunk 10.
  -> Finished processing chunk 39.
  -> Finished processing chunk 35.
  -> Finished processing chunk 41.
  -> Finished processing chunk 45.
  -> Finished processing chunk 5.
  -> Finished processing chunk 6.
  -> Finished processing chunk 21.
  -> Finished processing chunk 11.
  -> Finished processing chunk 25.
  -> Finished processing chunk 40.
  -> Finished processing chunk 26.
  -> Finished processing chunk 32.
  -> Finished processing chunk 37.
  -> Finished processing chunk 8.
  -> Finished processing chunk 7.
  -> Finished processing chunk 44.
  -> Finished processing chunk 2.
  -> Finished processing chunk 22.
  -> Finished processing chunk 34.
  -> Finished processing chunk 42.
  -> Finished processing chunk 23.
  -> Finished 

OK, now let's try a Spanish text, Mendoza's Historia del gran reyno de la China.

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import unicodedata
from pathlib import Path
import os
from collections import Counter, defaultdict

# --- Configuration (Kept simple) ---
# Raw OCR text to clean, and the file the cleaned text is written to.
INPUT_TEXT_FILE  = 'bub_gb_6QiHhHwp8MMC_djvu.txt'
OUTPUT_CLEANED_FILE = 'OCR_Error_Cleaned_Spanish.txt'

# Character class treated as "a letter": ASCII letters plus the contiguous
# range U+00C0-U+017F (accented letters and ñ). The range is deliberately
# broad; it also contains the two non-letters U+00D7 (×) and U+00F7 (÷).
SPANISH_ALPHA_PATTERN = r'[A-Za-z\u00C0-\u017F]'

# Literal replacements applied one after another, in insertion order.
# Non-ASCII keys that are easy to confuse visually are written as escapes.
PUNCT_MAP = {
    '\u017F': 's',         # long s (ſ)
    'æ': 'ae', 'Æ': 'AE',
    'œ': 'oe', 'Œ': 'OE',
    '\u2018': "'", '\u2019': "'", '\u201A': ',', '\u201B': "'", # quotes
    '\u201C': '"', '\u201D': '"', '\u201E': '"',
    '\u2013': '-', '\u2014': '-', '\u2212': '-',   # en/em/minus → hyphen
    '\u00AD': '-',                                 # soft hyphen → hyphen
    '\u00A0': ' ', '\u2002': ' ', '\u2003': ' ', '\u2009': ' ', # spaces
    # OCR sometimes emits ñ as two code points: the base letter followed by
    # U+0303 COMBINING TILDE. Spell the keys with escapes so the decomposed
    # sequence is unmistakable (a literal 'ñ' key looks identical to its value).
    'n\u0303': '\u00F1',   # n + combining tilde → ñ
    'N\u0303': '\u00D1',   # N + combining tilde → Ñ
}

# --- Patterns for Removal (Page Numbers Only) ---
# Each pattern below is anchored with ^...$ and is meant to be matched against
# one whole line at a time.

# Roman-only line (common for front matter page numbers)
# One or more of the letters I V X L C D M (any case), optionally followed by dots.
# NOTE(review): the pattern does not check numeral validity, so with IGNORECASE it
# also matches a line consisting only of a word built from those letters
# (e.g. "mi", "di", "vi", "mil") — confirm this is acceptable for the corpus.
ROMAN_LINE = re.compile(r'^\s*[IVXLCDM]+\.*\s*$', re.IGNORECASE)

# A page-number line: bare arabic up to 4 digits
# NOTE(review): a line holding only a four-digit year matches as well.
ARABIC_PAGE = re.compile(r'^\s*\d{1,4}\s*$')

# Centered dot filler or dot leaders which often appear with page numbers in TOCs
# Three or more bullets / middle dots / full stops and nothing else on the line.
# NOTE(review): the literal bullet and the \u2022 escape appear to be the same
# character listed twice — harmless inside a character class.
FILLER_LINE = re.compile(r'^\s*[•·.\u2022]{3,}\s*$')


# --- Patterns for Fixing OCR Errors ---

# A token of single-letter small-caps separated by spaces: "L U D O V I C O"
# At least three ASCII capitals, each separated from the next by exactly one
# whitespace character. \s also matches tabs and newlines, so a run may span a
# line break; accented capitals and Ñ are not covered by [A-Z].
SPACED_SMALLCAPS = re.compile(r'\b(?:[A-Z]\s){2,}[A-Z]\b')


# --- Utilities ---

def normalize_glyphs_spanish(s: str) -> str:
    """Apply the PUNCT_MAP replacements, then NFKC-normalize the result.

    The literal replacements run first, in the map's insertion order; NFKC then
    folds any remaining compatibility forms (for example ligatures that are not
    listed in PUNCT_MAP) into their canonical equivalents.
    """
    normalized = s
    for source_glyph in PUNCT_MAP:
        normalized = normalized.replace(source_glyph, PUNCT_MAP[source_glyph])
    return unicodedata.normalize('NFKC', normalized)

def strip_scanner_artifacts(s: str) -> str:
    """Unescape backslash-escaped parentheses and squeeze runs of backslashes.

    A backslash directly before "(" or ")" is dropped first; afterwards any run
    of two or more backslashes is reduced to a single backslash.
    """
    for escaped, plain in (('\\(', '('), ('\\)', ')')):
        s = s.replace(escaped, plain)
    # A callable replacement sidesteps escape handling in the replacement string.
    return re.sub(r'\\{2,}', lambda _run: '\\', s)

def fix_spaced_smallcaps(s: str) -> str:
    """Close up letter-spaced capitals, e.g. "L U D O V I C O" -> "LUDOVICO".

    Only space characters inside each SPACED_SMALLCAPS match are removed; any
    other whitespace the pattern matched is left where it was.
    """
    return SPACED_SMALLCAPS.sub(lambda hit: hit.group(0).replace(' ', ''), s)

def remove_page_numbers_and_fillers(text: str) -> str:
    """Drop lines that are nothing but a page number or a dot-leader filler.

    The text is split with str.splitlines(); blank lines are kept verbatim, a
    non-blank line is discarded when its stripped form matches ARABIC_PAGE,
    ROMAN_LINE or FILLER_LINE, and the surviving lines are re-joined with "\n".
    """
    furniture_patterns = (ARABIC_PAGE, ROMAN_LINE, FILLER_LINE)

    def _is_page_furniture(line: str) -> bool:
        # Blank lines never count as furniture; they carry paragraph structure.
        stripped = line.strip()
        if not stripped:
            return False
        return any(pattern.match(stripped) for pattern in furniture_patterns)

    return "\n".join(
        line for line in text.splitlines() if not _is_page_furniture(line)
    )

def smart_dehyphenate_spanish(s: str) -> str:
    """Re-join words that were hyphenated across a line break.

    A hyphen is treated as a line-break hyphen when it directly follows at
    least two letters (SPANISH_ALPHA_PATTERN), is the last visible character on
    its line, and the next line starts, after optional indentation, with a
    lowercase letter (a-z or U+00E0-U+00FF). The hyphen, the line break and the
    surrounding blanks are removed so the two halves form one word.

    Hyphens in the middle of a line are left untouched: there is no reliable
    way to tell an OCR artifact from a legitimate compound at that point.

    Args:
        s: Text whose line breaks are "\n".

    Returns:
        The text with line-break hyphenation undone.
    """
    lowercase_alpha_range = r'[a-z\u00E0-\u00FF]'

    # The continuation letter is only looked at (lookahead), never consumed.
    # That keeps the next line's indentation out of the replacement and lets a
    # word that is broken over several consecutive lines be joined in one pass.
    line_break_hyphen = re.compile(
        rf'({SPANISH_ALPHA_PATTERN}{{2,}})-[^\S\r\n]*\n[^\S\r\n]*(?={lowercase_alpha_range})'
    )
    return line_break_hyphen.sub(r'\1', s)

def normalize_whitespace_preserve_structure(s: str) -> str:
    """Collapse runs of blanks while keeping paragraph breaks.

    The text is cut into blocks at every run of two or more newlines. Each block
    is stripped, has its space/tab runs reduced to one space, and keeps its
    single newlines. Empty blocks are discarded, the rest are joined with one
    blank line, and trailing blanks are removed from every line.
    """
    def _tidy(block: str) -> str:
        squeezed = re.sub(r'[ \t]+', ' ', block.strip())
        return re.sub(r'\n{3,}', '\n\n', squeezed)

    tidied_blocks = (_tidy(block) for block in re.split(r'\n{2,}', s))
    rejoined = "\n\n".join(block for block in tidied_blocks if block)
    return re.sub(r'[ \t]+$', '', rejoined, flags=re.MULTILINE)

def process(text: str) -> str:
    """Run the Early Modern Spanish OCR clean-up stages in their fixed order."""
    pipeline = (
        normalize_glyphs_spanish,                 # 1) unicode / glyph fixes first
        strip_scanner_artifacts,                  # 2) stray OCR escape noise
        remove_page_numbers_and_fillers,          # 3) page numbers and dot leaders only
        smart_dehyphenate_spanish,                # 4) undo line-break hyphenation
        fix_spaced_smallcaps,                     # 5) "L U D O V I C O" -> "LUDOVICO"
        normalize_whitespace_preserve_structure,  # 6) spacing / newline clean-up
    )
    cleaned = text
    for stage in pipeline:
        cleaned = stage(cleaned)
    return cleaned

# --- Execution ---

def main():
    """Clean INPUT_TEXT_FILE with process() and save it as OUTPUT_CLEANED_FILE.

    Failures while reading or writing are reported with print() instead of
    being raised; an exception raised by process() itself is not caught.
    """
    def _load_input():
        # Returns the file contents, or None after reporting why reading failed.
        try:
            with open(INPUT_TEXT_FILE, 'r', encoding='utf-8') as source:
                return source.read()
        except FileNotFoundError:
            print(f"Error: Input file '{INPUT_TEXT_FILE}' not found.")
        except Exception as read_error:
            print(f"Error reading input file: {read_error}")
        return None

    raw_text = _load_input()
    if raw_text is None:
        return

    cleaned_text = process(raw_text)

    try:
        with open(OUTPUT_CLEANED_FILE, 'w', encoding='utf-8') as sink:
            sink.write(cleaned_text)
        print(f"Successfully cleaned text written to '{OUTPUT_CLEANED_FILE}'")
    except Exception as write_error:
        print(f"Error writing output file: {write_error}")

# Run the cleaning pipeline when this code is executed directly. Inside a
# notebook cell __name__ is "__main__", so the call happens on every run.
if __name__ == "__main__":
    main()


Successfully cleaned text written to 'OCR_Error_Cleaned_Spanish.txt'


**Now let's clean it with Gemini**

In [None]:
import os
import asyncio
import vertexai
from vertexai.generative_models import GenerativeModel, Part

# Use nest_asyncio to run async code easily in environments like Colab
import nest_asyncio
nest_asyncio.apply()

# --- Configuration Constants (Define these as per your environment) ---
# Project, region and model name handed to vertexai.init() / GenerativeModel() below.
PROJECT_ID = 'renaissance-ocr'
LOCATION = 'global'
GEMINI_MODEL = 'gemini-2.5-flash'
# Service-account key file, resolved relative to the working directory. Its path
# is exported through GOOGLE_APPLICATION_CREDENTIALS further down.
# NOTE(review): keep this file out of version control and shared notebooks.
JSON_FILE_PATH = 'renaissance-ocr-4aabe5b8dc65.json'
# Input is the file written by the rule-based cleaning cell; the corrected text
# goes to OUTPUT_CORRECTED_FILE.
INPUT_TEXT_FILE = 'OCR_Error_Cleaned_Spanish.txt'
OUTPUT_CORRECTED_FILE = 'Gemini_OCR_Error_Cleaned_Spanish.txt'
# Chunk size, in characters, handed to read_in_chunks().
CHUNK_SIZE = 10000
# Instruction text (Spanish) sent ahead of every chunk. It asks the model to fix
# OCR errors only, keep historical spelling, undo line-break hyphenation, drop
# page furniture and return nothing but the corrected text.
CORRECTION_PROMPT = """Eres un corrector experto de textos en español moderno temprano. TU TAREA es corregir ÚNICAMENTE los errores de OCR en el texto de entrada. No reescribas ni alteres el uso auténtico del período.

REGLAS ESPECÍFICAS PARA ESPAÑOL MODERNO TEMPRANO:

1.  **Preservar el texto original (uso auténtico):**
    *   Mantén la redacción, gramática, sintaxis, y la ortografía histórica original (e.g., `hazienda`, `quando`, `scrito`, `vuestra merced`).
    *   Corrige SOLAMENTE errores evidentes de OCR:
        *   Letras mal leídas por el OCR.
        *   Palabras rotas o divididas incorrectamente.
        *   Espacios incorrectos o faltantes.
        *   Caracteres duplicados o puntuación corrompida.

2.  **Normalización mínima (convenciones de la época):**
    *   Mantén el uso original de `u` (vocal) y `v` (consonante), `i` (vocal) e `j` (consonante), si la fuente original los diferencia. Si la fuente usa `i` para ambos sonidos, mantén `i`.
    *   Reemplazar `&` o `&amp;` con `y`.
    *   Asegurar que la `ñ` (e.g., `n~o` -> `ño`) esté correctamente formada.
    *   NO modernices el vocabulario ni regularices la ortografía histórica a menos que la forma sea claramente un error de OCR.

3.  **Desguionizado e Hiphenación:**
    *   Elimina todos los guiones de final de línea que unan una palabra partida: retira el guion y une las dos partes en una sola palabra continua.
    *   Corrige cualquier error de espaciado resultante de la rotura de línea.

4.  **Formato:**
    *   Conserva todas las pausas de párrafo y la capitalización exactamente igual que en la entrada.
    *   Elimina el mobiliario de página (elementos de imprenta) solo cuando esté claro que no forma parte del texto corrido: ejemplos: "A ij", "EPISTOLA.", signaturas, reclamos tipográficos.

5.  **Salida:**
    *   Produce ÚNICAMENTE el texto en español corregido. NO añadas comentarios, explicaciones, ni formato adicional (como negritas o cursivas).

    """

# -----------------------------------------------------------
# 1. Authenticate and Initialize the Vertex AI Client
# -----------------------------------------------------------
# Authenticate with the service-account key and build the shared model client.
# Any failure is reported and then re-raised so the cell stops with the original
# traceback; later cells must not run without `model`.
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found: {JSON_FILE_PATH}")

    # Google client libraries read the key file location from this variable.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    # Initialize the *one* model client we will reuse everywhere
    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully.")

except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # exit() is an interactive-shell helper and shuts the notebook kernel down,
    # discarding all state. Re-raising halts this cell (and "Run all") instead.
    raise

# -----------------------------------------------------------
# 2. Define a function to read a file in chunks
# -----------------------------------------------------------
def read_in_chunks(file_path, chunk_size=CHUNK_SIZE):
    """Yield the text of a UTF-8 file in pieces of at most `chunk_size` characters.

    Pieces end on a natural boundary whenever one exists in the second half of
    the window: preferably a blank line, otherwise a line break, otherwise a
    space. Only when none is found is the text cut at exactly `chunk_size`.
    Cutting on boundaries keeps words intact, which matters because each piece
    is corrected independently and the pieces are later written out as separate
    paragraphs. Concatenating the yielded pieces reproduces the file exactly.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Maximum number of characters per piece; must be >= 1.

    Yields:
        Consecutive, non-empty slices of the file's text.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Never cut before the midpoint, so pieces stay reasonably large.
    earliest_cut = chunk_size // 2
    pending = ''
    with open(file_path, 'r', encoding='utf-8') as f:
        while True:
            data = f.read(chunk_size)
            if not data:
                break
            pending += data
            while len(pending) >= chunk_size:
                window = pending[:chunk_size]
                cut = chunk_size
                for separator in ('\n\n', '\n', ' '):
                    position = window.rfind(separator, earliest_cut)
                    if position != -1:
                        # Keep the separator with the piece that precedes it.
                        cut = position + len(separator)
                        break
                yield pending[:cut]
                pending = pending[cut:]
    if pending:
        yield pending

# -----------------------------------------------------------
# 3. Define an ASYNCHRONOUS function to correct a chunk
# -----------------------------------------------------------
async def correct_ocr_chunk_with_gemini_async(model_client, chunk_text, chunk_id):
    """Ask the model to correct one chunk of OCR text.

    Args:
        model_client: Object exposing an awaitable `generate_content_async(contents)`
            whose result has a `.text` attribute.
        chunk_text: The text to correct.
        chunk_id: Identifier returned unchanged so results can be ordered.

    Returns:
        A `(chunk_id, text)` tuple. `text` is the stripped model reply, or the
        original `chunk_text` when the call fails or the reply is empty, so a
        chunk is never lost from the output.
    """
    contents = [
        CORRECTION_PROMPT,
        chunk_text
    ]

    try:
        # Await the response using the provided model_client instance
        response = await model_client.generate_content_async(contents)
        corrected_text = (response.text or "").strip()
    except Exception as e:
        print(f"API call failed for chunk {chunk_id}: {e}")
        return chunk_id, chunk_text # Return original text if API fails

    if not corrected_text:
        # An empty reply would otherwise make the caller skip this chunk entirely.
        print(f"Empty response for chunk {chunk_id}; keeping the original text.")
        return chunk_id, chunk_text

    print(f"  -> Finished processing chunk {chunk_id}.")
    return chunk_id, corrected_text # Return ID to sort results later

# -----------------------------------------------------------
# 4. Main asynchronous execution block
# -----------------------------------------------------------
async def main(max_concurrency=8):
    """Correct INPUT_TEXT_FILE chunk by chunk and write OUTPUT_CORRECTED_FILE.

    The file is split with read_in_chunks(); every chunk is sent through
    correct_ocr_chunk_with_gemini_async() using the shared `model`, with at most
    `max_concurrency` requests in flight at a time. Results are written in
    document order, each followed by a blank line. If a task raises, the
    original chunk is written in its place so no text is lost.

    Args:
        max_concurrency: Upper bound on simultaneous API calls (minimum 1).
    """
    if not os.path.exists(INPUT_TEXT_FILE):
        print(f"Input file '{INPUT_TEXT_FILE}' not found. Ensure it is uploaded or available.")
        return

    print(f"\nStarting ASYNC OCR correction process on {INPUT_TEXT_FILE}...")

    chunks = list(read_in_chunks(INPUT_TEXT_FILE, CHUNK_SIZE))

    # Firing every request at once invites rate-limit errors; gate them instead.
    limiter = asyncio.Semaphore(max(1, max_concurrency))

    async def _correct_with_limit(chunk_id, chunk):
        async with limiter:
            # Pass the initialized 'model' instance to the async function
            return await correct_ocr_chunk_with_gemini_async(model, chunk, chunk_id)

    tasks = [
        _correct_with_limit(chunk_id, chunk)
        for chunk_id, chunk in enumerate(chunks, start=1)
    ]

    print(f"Created {len(tasks)} tasks. Sending requests concurrently...")

    # gather() returns results in submission order, so no re-sorting is needed.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    with open(OUTPUT_CORRECTED_FILE, 'w', encoding='utf-8') as f_out:
        for chunk, res in zip(chunks, results):
            if isinstance(res, BaseException):
                print(f"An exception occurred during one of the API calls: {res}")
                corrected_chunk = chunk  # fall back to the uncorrected text
            else:
                corrected_chunk = res[1]
            if corrected_chunk:
                f_out.write(corrected_chunk + "\n\n")

    print(f"\n--- Gemini ASYNC OCR Correction Pipeline Finished ---")

# -----------------------------------------------------------
# 5. Run the asynchronous main function in Colab
# -----------------------------------------------------------
if __name__ == '__main__':
    # asyncio.run() drives main() to completion. The notebook kernel already has
    # a running event loop; nest_asyncio.apply() at the top of this cell is what
    # is relied on to make this nested call work.
    asyncio.run(main())


Falling back to grpc since no async rest credentials were detected.


Gemini Model 'gemini-2.5-flash' initialized successfully.

Starting ASYNC OCR correction process on OCR_Error_Cleaned_Spanish.txt...
Created 75 tasks. Sending requests concurrently...
  -> Finished processing chunk 70.
  -> Finished processing chunk 62.
  -> Finished processing chunk 36.
  -> Finished processing chunk 9.
  -> Finished processing chunk 24.
  -> Finished processing chunk 64.
  -> Finished processing chunk 72.
  -> Finished processing chunk 66.
  -> Finished processing chunk 21.
  -> Finished processing chunk 61.
  -> Finished processing chunk 13.
  -> Finished processing chunk 27.
  -> Finished processing chunk 6.
  -> Finished processing chunk 5.
  -> Finished processing chunk 8.
  -> Finished processing chunk 19.
  -> Finished processing chunk 58.
  -> Finished processing chunk 67.
  -> Finished processing chunk 14.
  -> Finished processing chunk 47.
  -> Finished processing chunk 26.
  -> Finished processing chunk 45.
  -> Finished processing chunk 42.
  -> Finished p

OK, now let's try OCR'ing using Gemini 2.5

In [None]:
# --- Single Cell Code to Verify Installation and Import ---

print("Starting installation of system dependencies and python libraries...")

# 1. Install necessary libraries
# apt-get installs the 'poppler-utils' system dependency required by pdf2image
# "> /dev/null" hides only standard output; error messages remain visible.
!apt-get install -y poppler-utils > /dev/null
!pip install pdf2image Pillow requests > /dev/null

print("Installation steps finished. Attempting import statements now.")

# 2. Attempt imports to verify success
# On ImportError only a message is printed: the names imported after the failing
# statement stay undefined for the rest of the notebook.
try:
    import os
    import requests
    from pdf2image import convert_from_path
    from PIL import Image
    print("\nSUCCESS: All modules were imported correctly!")
    print("You can now proceed with the full OCR pipeline code.")
except ImportError as e:
    print(f"\nERROR: Failed to import modules. Traceback: {e}")

# 3. Quick verification function
def verify_pdf2image():
    """Print whether pdf2image's `convert_from_path` is available in this session."""
    # Look the name up dynamically: if the import earlier in this cell failed,
    # the name does not exist and a direct reference would raise NameError
    # instead of reporting False.
    available = callable(globals().get("convert_from_path"))
    print(f"pdf2image found: {available}")

verify_pdf2image()


Starting installation of system dependencies and python libraries...
Installation steps finished. Attempting import statements now.

SUCCESS: All modules were imported correctly!
You can now proceed with the full OCR pipeline code.
pdf2image found: True


In [None]:
# -----------------------------------------------------------
# --- Full Google Colab OCR Pipeline (Asynchronous/Fastest Way) ---
# -----------------------------------------------------------

import os
import io
import json
import vertexai
import asyncio
import concurrent.futures
from pdf2image import convert_from_path, pdfinfo_from_path
from PIL import Image
from vertexai.generative_models import GenerativeModel, Part, Image as VertexImage
from vertexai.preview.generative_models import Image as PreviewImage

In [None]:


# -----------------------------------------------------------
# --- 1. Configuration Variables (Same as before) ---
# -----------------------------------------------------------

# Path to your uploaded GCP JSON key file
# NOTE(review): keep this key file out of version control and shared notebooks.
JSON_FILE_PATH = '/content/renaissance-ocr-4aabe5b8dc65.json'

# Path to your uploaded PDF file
LOCAL_PDF_PATH = '/content/bub_gb_6QiHhHwp8MMC.pdf'

# GCP Project Details
# PROJECT_ID / LOCATION go to vertexai.init(); GEMINI_MODEL names the model to load.
PROJECT_ID = 'renaissance-ocr'
LOCATION = 'global'
GEMINI_MODEL = 'gemini-2.5-flash'
# Folder that receives one PNG per PDF page, and the file the OCR text is saved to.
OUTPUT_IMAGE_FOLDER = 'extracted_images'
OUTPUT_TEXT_FILE = 'Extracted_OCR_Text_Spanish_Full_Async.txt'

# Specific prompt tailored for Early Modern Spanish (based on your rules)
# Sent together with each page image.
# NOTE(review): the wording asks the model to correct OCR errors "in the input
# text", but the request carries only the prompt and an image — consider
# rephrasing it as a transcription task.
OCR_PROMPT = """
Eres un corrector experto de textos en español moderno temprano. TU TAREA es corregir ÚNICAMENTE los errores de OCR en el texto de entrada basándose en la imagen proporcionada. No reescribas ni alteres el uso auténtico del período.

REGLAS ESPECÍFICAS PARA ESPAÑOL MODERNO TEMPRANO:

1.  **Preservar el texto original (uso auténtico):** Mantén la redacción, gramática, sintaxis, y la ortografía histórica original. Corrige SOLAMENTE errores evidentes de OCR.
2.  **Normalización mínima:** Mantén el uso original de `u`/`v`, `i`/`j`. Reemplazar `&` con `y`. Asegurar que la `ñ` esté correcta.
3.  **Desguionizado e Hiphenación:** Elimina todos los guiones de final de línea que unan una palabra partida.
4.  **Formato:** Conserva todas las pausas de párrafo y la capitalización exactamente igual que en la entrada. Elimina mobiliario de página (e.g., "A ij", signaturas).
5.  **Salida:** Produce ÚNICAMENTE el texto en español corregido. NO añadas comentarios, explicaciones, ni formato adicional.
"""
# -----------------------------------------------------------

# -----------------------------------------------------------
# --- 2. Install System Dependencies & Python Libraries (Safety Net) ---
# -----------------------------------------------------------
# "> /dev/null" hides only standard output; error messages remain visible.
# NOTE(review): the import cell above runs before these installs, so this step
# cannot rescue a package that is missing on the first run. PyMuPDF and requests
# do not appear to be used by this pipeline — confirm before keeping them.
print("Ensuring dependencies are installed...")
!apt-get install -y poppler-utils > /dev/null
!pip install google-cloud-aiplatform PyMuPDF Pillow requests pdf2image --upgrade > /dev/null
print("Dependencies check complete.")

# -----------------------------------------------------------
# --- 3. Authenticate and Initialize the Vertex AI Client ---
# -----------------------------------------------------------
# Authenticate with the service-account key and build the shared model client.
# Failures are reported and re-raised so the cell stops with the real traceback.
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found at: {JSON_FILE_PATH}.")
    # Google client libraries read the key file location from this variable.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully for project '{PROJECT_ID}'.")
except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # exit() is an interactive-shell helper and shuts the notebook kernel down,
    # discarding all state. Re-raising halts this cell (and "Run all") instead.
    raise

# -----------------------------------------------------------
# --- 4. Define Functions (With automatic page count detection and ASYNC processing) ---
# -----------------------------------------------------------

def extract_pdf_pages_from_local_file(local_pdf_path, output_folder, batch_size=20):
    """Render every page of a PDF to a 300 DPI PNG file.

    Pages are converted `batch_size` at a time, so only one batch of rendered
    images is held in memory at once instead of the whole document.

    Args:
        local_pdf_path: Path of the PDF to convert.
        output_folder: Folder for the PNG files; created when missing.
        batch_size: Number of pages rendered per conversion call (minimum 1).

    Returns:
        Sorted list of the written image paths, named `page_0001.png`,
        `page_0002.png`, ... by page number. An empty list is returned when the
        PDF is missing or the conversion fails.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder, exist_ok=True)
        print(f"Created folder: {output_folder}")
    if not os.path.exists(local_pdf_path):
        print(f"Error: Local PDF file not found at {local_pdf_path}")
        return []
    try:
        info = pdfinfo_from_path(local_pdf_path, userpw=None, poppler_path=None)
        total_pages = info["Pages"]
        print(f"PDF detected to have a total of {total_pages} pages.")
    except Exception as e:
        print(f"Could not determine page count automatically: {e}. Defaulting to 1000 max pages.")
        total_pages = 1000
    print(f"Converting pages 1 to {total_pages} into high-quality images (300 DPI)...")

    pages_per_call = max(1, batch_size)
    image_paths = []
    try:
        for first_page in range(1, total_pages + 1, pages_per_call):
            last_page = min(first_page + pages_per_call - 1, total_pages)
            images = convert_from_path(
                local_pdf_path, dpi=300, fmt='png',
                first_page=first_page, last_page=last_page, use_cropbox=True,
            )
            if not images:
                # Reached when the fallback page count overshoots the real one.
                break
            for page_number, img in enumerate(images, start=first_page):
                img_path = os.path.join(output_folder, f"page_{page_number:04d}.png")
                img.save(img_path)
                image_paths.append(img_path)
        print(f"Image extraction complete for {len(image_paths)} pages.")
        return sorted(image_paths)
    except Exception as e:
        print(f"An error occurred during PDF conversion: {e}")
        return []

# ASYNC function to call the API
async def process_image_with_gemini_async(image_path):
    """Send one page image plus OCR_PROMPT to the shared model.

    Returns a `(text, image_path)` tuple. `text` is the model's reply, the
    placeholder "No text detected" when the reply is empty, or a message
    describing the failure when loading the image or calling the API raised.
    """
    try:
        page_image = VertexImage.load_from_file(image_path)
    except Exception as load_error:
        return f"Error loading image {image_path}: {load_error}", image_path

    try:
        # generate_content_async keeps the event loop free while waiting.
        reply = await model.generate_content_async([OCR_PROMPT, page_image])
        return (reply.text or "No text detected"), image_path
    except Exception as api_error:
        return f"API call failed for {image_path}: {api_error}", image_path

# -----------------------------------------------------------
# --- 5. Main Execution Pipeline (Updated for Async Execution) ---
# -----------------------------------------------------------

async def main_async_pipeline(max_concurrency=8):
    """Extract the PDF pages, OCR them with Gemini and save the combined text.

    Steps: render LOCAL_PDF_PATH into OUTPUT_IMAGE_FOLDER, run
    process_image_with_gemini_async() on every page with at most
    `max_concurrency` requests in flight, write all page sections in page order
    to OUTPUT_TEXT_FILE, then print the first 200 words as a sanity check.

    Args:
        max_concurrency: Upper bound on simultaneous API calls (minimum 1).
    """
    # Step A: Extract ALL images from the local PDF file
    image_files_list = extract_pdf_pages_from_local_file(
        LOCAL_PDF_PATH, OUTPUT_IMAGE_FOLDER
    )

    if not image_files_list:
        print("Pipeline halted because no images were extracted.")
        return

    # Step B: Process ALL images using Gemini OCR asynchronously
    print(f"\nStarting ASYNC Gemini OCR process on {len(image_files_list)} images...")

    # One request per page fired all at once invites rate-limit errors and keeps
    # every page image in memory together; the semaphore caps both.
    limiter = asyncio.Semaphore(max(1, max_concurrency))

    async def _ocr_with_limit(image_path):
        async with limiter:
            return await process_image_with_gemini_async(image_path)

    # gather() returns results in submission order, i.e. page order.
    results = await asyncio.gather(
        *(_ocr_with_limit(image_path) for image_path in image_files_list)
    )

    # Step C: Combine results and save to file
    print(f"\n--- Gemini OCR Pipeline Finished ---")

    header = f"Gemini OCR Results for the full document ({len(image_files_list)} pages) [ASYNC PROCESSING]\n" + "=" * 60 + "\n"
    sections = [header]
    for text, image_path in results:
        page_name = os.path.basename(image_path)
        sections.append(
            f"\n\n--- Start of OCR from {page_name} ---\n\n{text}\n\n--- End of OCR from {page_name} ---\n\n"
        )
        print(f"  -> Finished processing {page_name}")
    full_document_text = "".join(sections)

    with open(OUTPUT_TEXT_FILE, 'w', encoding='utf-8') as f_out:
        f_out.write(full_document_text)

    print(f"All extracted text has been saved to: {OUTPUT_TEXT_FILE}")

    # Step D: Print First 200 Words for verification
    print("\n" + "=" * 60)
    print("FIRST 200 WORDS OF EXTRACTED TEXT (Gemini):")
    print("=" * 60)
    # The first three lines are the header, its rule and a blank line.
    words = ' '.join(full_document_text.splitlines()[3:]).split()
    print(' '.join(words[:200]))
    if len(words) > 200:
        print("\n[... Text truncated for display purposes ...]")
    print("=" * 60)

# -----------------------------------------------------------
# --- Execute the Async Main Function ---
# -----------------------------------------------------------
if __name__ == '__main__':
    # Running the async main function in the notebook environment
    # A bare top-level `await` is accepted by IPython/Jupyter cells only; in a
    # plain .py script this line is a SyntaxError.
    await main_async_pipeline()


Ensuring dependencies are installed...
Dependencies check complete.
Gemini Model 'gemini-2.5-flash' initialized successfully for project 'renaissance-ocr'.
PDF detected to have a total of 452 pages.
Converting pages 1 to 452 into high-quality images (300 DPI)...


Falling back to grpc since no async rest credentials were detected.


Image extraction complete for 452 pages.

Starting ASYNC Gemini OCR process on 452 images...

--- Gemini OCR Pipeline Finished ---
  -> Finished processing page_0001.png
  -> Finished processing page_0002.png
  -> Finished processing page_0003.png
  -> Finished processing page_0004.png
  -> Finished processing page_0005.png
  -> Finished processing page_0006.png
  -> Finished processing page_0007.png
  -> Finished processing page_0008.png
  -> Finished processing page_0009.png
  -> Finished processing page_0010.png
  -> Finished processing page_0011.png
  -> Finished processing page_0012.png
  -> Finished processing page_0013.png
  -> Finished processing page_0014.png
  -> Finished processing page_0015.png
  -> Finished processing page_0016.png
  -> Finished processing page_0017.png
  -> Finished processing page_0018.png
  -> Finished processing page_0019.png
  -> Finished processing page_0020.png
  -> Finished processing page_0021.png
  -> Finished processing page_0022.png
  -> Finish

Let's split the pipeline into separate cells (configuration, image extraction, OCR run) so that each stage can be re-run on its own.

In [None]:
# Block 1: Configuration, Installs, and Authentication

import os
import io
import json
import vertexai
import asyncio
from pdf2image import convert_from_path, pdfinfo_from_path
from PIL import Image
from vertexai.generative_models import GenerativeModel, Part, Image as VertexImage
from vertexai.preview.generative_models import Image as PreviewImage # Fallback

# -----------------------------------------------------------
# --- 1. Configuration Variables: UPDATE THESE ---
# -----------------------------------------------------------

# Path to your uploaded GCP JSON key file
# NOTE(review): keep this key file out of version control and shared notebooks.
JSON_FILE_PATH = '/content/renaissance-ocr-4aabe5b8dc65.json'

# Path to your uploaded PDF file
LOCAL_PDF_PATH = '/content/bub_gb_6QiHhHwp8MMC.pdf'

# GCP Project Details
# PROJECT_ID / LOCATION go to vertexai.init(); GEMINI_MODEL names the model to load.
PROJECT_ID = 'renaissance-ocr'
LOCATION = 'global'
GEMINI_MODEL = 'gemini-2.5-flash' # Can change to 'gemini-2.5-pro' if needed
# Folder that receives one PNG per PDF page, and the file the OCR text is saved to.
OUTPUT_IMAGE_FOLDER = 'extracted_images'
OUTPUT_TEXT_FILE = 'Extracted_OCR_Text_Spanish_Full_Async.txt'

# Specific prompt tailored for Early Modern Spanish
# Sent together with each page image. Rules: keep period spelling, minimal
# normalization (including long s and ß), undo line-break hyphenation, keep
# paragraphing, drop page furniture and page numbers, output text only.
# NOTE(review): the opening sentence asks the model to correct OCR errors "in
# the input text", but the request carries only this prompt and an image —
# consider rephrasing it as a transcription task.
OCR_PROMPT = """
Eres un corrector experto de textos en español moderno temprano. TU TAREA es corregir ÚNICAMENTE los errores de OCR en el texto de entrada basándose en la imagen proporcionada. No reescribas ni alteres el uso auténtico del período.

REGLAS ESPECÍFICAS PARA ESPAÑOL MODERNO TEMPRANO:
1.  **Preservar el texto original (uso auténtico):** Mantén la redacción, gramática, sintaxis, y la ortografía histórica original. Corrige SOLAMENTE errores evidentes de OCR.
2.  **Normalización mínima:** Mantén el uso original de `u`/`v`, `i`/`j`. Reemplazar `&` con `y`. Asegurar que la `ñ` esté correcta.  ſ = s, ß = ss.
3.  **Desguionizado e Hiphenación:** Elimina todos los guiones de final de línea que unan una palabra partida.
4.  **Formato:** Conserva todas las pausas de párrafo y la capitalización exactamente igual que en la entrada. Elimina mobiliario de página (e.g., "A ij", signaturas).
5.  **Salida:** Produce ÚNICAMENTE el texto en español corregido. NO añadas comentarios, explicaciones, ni formato adicional.
6. **Elimina números de página de OCR** eliminar cualquier mención de números de página de OCR
"""
# -----------------------------------------------------------

# -----------------------------------------------------------
# --- 2. Install System Dependencies & Python Libraries ---
# -----------------------------------------------------------
# "> /dev/null" hides only standard output; error messages remain visible.
# NOTE(review): the import statements at the top of this cell run before these
# installs, so this step cannot rescue a package that is missing on the first
# run. PyMuPDF and requests do not appear to be used here — confirm.
print("Ensuring dependencies are installed...")
!apt-get install -y poppler-utils > /dev/null
!pip install google-cloud-aiplatform PyMuPDF Pillow requests pdf2image --upgrade > /dev/null
print("Dependencies check complete.")

# -----------------------------------------------------------
# --- 3. Authenticate and Initialize the Vertex AI Client ---
# -----------------------------------------------------------
# Authenticate with the service-account key and build the shared model client.
# Failures are reported and re-raised so the cell stops with the real traceback.
try:
    if not os.path.exists(JSON_FILE_PATH):
        raise FileNotFoundError(f"Credentials file not found at: {JSON_FILE_PATH}.")
    # Google client libraries read the key file location from this variable.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    # Initialize the model globally
    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully for project '{PROJECT_ID}'.")
except Exception as e:
    print(f"Authentication or client initialization failed: {e}")
    # exit() is an interactive-shell helper and shuts the notebook kernel down,
    # discarding all state. Re-raising halts this cell (and "Run all") instead.
    raise


Ensuring dependencies are installed...
Dependencies check complete.
Gemini Model 'gemini-2.5-flash' initialized successfully for project 'renaissance-ocr'.




In [None]:
# Block 2: Define Functions and Extract Images (This might take a few minutes)

def extract_pdf_pages_from_local_file(local_pdf_path, output_folder, batch_size=20):
    """Render every page of a PDF to a 300 DPI PNG file.

    Pages are converted `batch_size` at a time, so only one batch of rendered
    images is held in memory at once instead of the whole document.

    Args:
        local_pdf_path: Path of the PDF to convert.
        output_folder: Folder for the PNG files; created when missing.
        batch_size: Number of pages rendered per conversion call (minimum 1).

    Returns:
        Sorted list of the written image paths, named `page_0001.png`,
        `page_0002.png`, ... by page number. An empty list is returned when the
        PDF is missing or the conversion fails.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder, exist_ok=True)
        print(f"Created folder: {output_folder}")
    if not os.path.exists(local_pdf_path):
        print(f"Error: Local PDF file not found at {local_pdf_path}")
        return []
    try:
        info = pdfinfo_from_path(local_pdf_path, userpw=None, poppler_path=None)
        total_pages = info["Pages"]
        print(f"PDF detected to have a total of {total_pages} pages.")
    except Exception as e:
        print(f"Could not determine page count automatically: {e}. Defaulting to 452 max pages.")
        total_pages = 452 # Fallback to known page count

    print(f"Converting pages 1 to {total_pages} into high-quality images (300 DPI)...")

    pages_per_call = max(1, batch_size)
    image_paths = []
    try:
        for first_page in range(1, total_pages + 1, pages_per_call):
            last_page = min(first_page + pages_per_call - 1, total_pages)
            images = convert_from_path(
                local_pdf_path, dpi=300, fmt='png',
                first_page=first_page, last_page=last_page, use_cropbox=True,
            )
            if not images:
                # Reached when the fallback page count overshoots the real one.
                break
            for page_number, img in enumerate(images, start=first_page):
                img_path = os.path.join(output_folder, f"page_{page_number:04d}.png")
                img.save(img_path)
                image_paths.append(img_path)
        print(f"Image extraction complete for {len(image_paths)} pages.")
        return sorted(image_paths)
    except Exception as e:
        print(f"An error occurred during PDF conversion: {e}")
        return []

# ASYNC function to call the API (needs 'model' from Block 1)
async def process_image_with_gemini_async(image_path):
    """Send one page image plus OCR_PROMPT to the shared model.

    Returns a `(text, image_path)` tuple. `text` is the model's reply, the
    placeholder "No text detected" when the reply is empty, or a message
    describing the failure when loading the image or calling the API raised.
    """
    try:
        page_image = VertexImage.load_from_file(image_path)
    except Exception as load_error:
        return f"Error loading image {image_path}: {load_error}", image_path

    try:
        reply = await model.generate_content_async([OCR_PROMPT, page_image])
        return (reply.text or "No text detected"), image_path
    except Exception as api_error:
        return f"API call failed for {image_path}: {api_error}", image_path

# Execute image extraction immediately
# The resulting list of PNG paths is kept in a module-level variable; the
# main_async_pipeline() defined in the next cell reads it from there.
image_files_list = extract_pdf_pages_from_local_file(
    LOCAL_PDF_PATH, OUTPUT_IMAGE_FOLDER
)


In [None]:
# Block 3: Run Asynchronous OCR and Save Results

async def main_async_pipeline(max_concurrency=8):
    """OCR the already-extracted page images with Gemini and save the text.

    Reads the module-level `image_files_list`, runs
    process_image_with_gemini_async() on every page with at most
    `max_concurrency` requests in flight, writes all page sections in page
    order to OUTPUT_TEXT_FILE, then prints the first 200 words as a sanity check.

    Args:
        max_concurrency: Upper bound on simultaneous API calls (minimum 1).
    """
    if not image_files_list:
        print("Pipeline halted because no images were extracted in Block 2.")
        return

    # Process ALL images using Gemini OCR asynchronously
    print(f"\nStarting ASYNC Gemini OCR process using {GEMINI_MODEL} on {len(image_files_list)} images...")

    # One request per page fired all at once invites rate-limit errors and keeps
    # every page image in memory together; the semaphore caps both.
    limiter = asyncio.Semaphore(max(1, max_concurrency))

    async def _ocr_with_limit(image_path):
        async with limiter:
            return await process_image_with_gemini_async(image_path)

    # gather() returns results in submission order, i.e. page order.
    results = await asyncio.gather(
        *(_ocr_with_limit(image_path) for image_path in image_files_list)
    )

    # Combine results and save to file
    print(f"\n--- Gemini OCR Pipeline Finished ---")

    header = f"Gemini OCR Results for the full document ({len(image_files_list)} pages) [ASYNC PROCESSING]\n" + "=" * 60 + "\n"
    sections = [header]
    for text, image_path in results:
        page_name = os.path.basename(image_path)
        sections.append(
            f"\n\n--- Start of OCR from {page_name} ---\n\n{text}\n\n--- End of OCR from {page_name} ---\n\n"
        )
    full_document_text = "".join(sections)

    with open(OUTPUT_TEXT_FILE, 'w', encoding='utf-8') as f_out:
        f_out.write(full_document_text)

    print(f"All extracted text has been saved to: {OUTPUT_TEXT_FILE}")

    # Print First 200 Words for verification
    print("\n" + "=" * 60)
    print("FIRST 200 WORDS OF EXTRACTED TEXT (Gemini):")
    print("=" * 60)
    # The first three lines are the header, its rule and a blank line.
    words = ' '.join(full_document_text.splitlines()[3:]).split()
    print(' '.join(words[:200]))
    if len(words) > 200:
        print("\n[... Text truncated for display purposes ...]")
    print("=" * 60)

# Execute the async main function in the notebook environment
# A bare top-level `await` is accepted by IPython/Jupyter cells only; in a plain
# .py script this line is a SyntaxError.
await main_async_pipeline()
