<a href="https://colab.research.google.com/github/CUHK-DH-Lab/CUHK-DH-Lab.github.io/blob/main/More_texts_for_CRF_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Section 1: The Local Processing Loop**

This first block of code acts as your "Digital Scrubbing Station." Since OCR from early modern texts is often "noisy"—full of long 's' (ſ), weird ligatures, and broken lines—this section uses your local CPU to handle the heavy lifting before we involve any AI.
The Directory Loop: The main() function uses Path.glob('*.txt') to grab every .txt file in the folder configured as INPUT_DIR (currently /content/Sinensis_Texts1). It iterates through them one by one, ensuring that no matter how many documents you upload, they all get processed automatically.
The Pipeline: For every file, it runs a process() function. This is a sequence of regex (Regular Expression) "surgeries" that:
Normalize Glyphs: Swaps historical characters (like æ) for modern equivalents.
De-hyphenate: Intelligently joins words split across lines—crucial for making the text readable for the LLM later.
Structural Cleanup: Strips out "page furniture" like headers and page numbers that would otherwise confuse a translation or analysis model.

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import unicodedata
from pathlib import Path
import os
from collections import Counter, defaultdict

# --- Configuration ---
# Raw OCR .txt files are read from INPUT_DIR; cleaned copies are written to
# OUTPUT_DIR. Edit INPUT_DIR to match the folder name in your Colab file pane.
INPUT_DIR = Path('/content/Sinensis_Texts1')
OUTPUT_DIR = Path('/content/Cleaned_texts')

# Create output directory if it doesn't exist
# (runs at import/cell-execution time; parent directories are not created).
OUTPUT_DIR.mkdir(exist_ok=True)

# Map curly quotes/dashes/ligatures/OCR errors to plain equivalents.
# Every key is replaced by its value in normalize_glyphs() before NFKC
# normalization is applied.
PUNCT_MAP = {
    '\u017F': 's',         # long s (ſ)
    'æ': 'ae', 'Æ': 'AE',
    'œ': 'oe', 'Œ': 'OE',
    '\u2018': "'", '\u2019': "'", '\u201A': ',', '\u201B': "'",
    '\u201C': '"', '\u201D': '"', '\u201E': '"',
    '\u2013': '-', '\u2014': '-', '\u2212': '-',   # en/em/minus → hyphen
    '\u00AD': '-',                                 # soft hyphen → hyphen
    '\u00A0': ' ', '\u2002': ' ', '\u2003': ' ', '\u2009': ' ', # spaces
}

# --- Patterns for Removal ---
# Each pattern is matched against a single, already-stripped line.

# A line made only of the letters I V X L C D M, optionally followed by dots.
# NOTE(review): this also matches all-caps words spelled solely with those
# letters (e.g. "DIXI", "VIM"), so such lines are dropped as well.
ROMAN_LINE = re.compile(r'^\s*[IVXLCDM]+\.*\s*$')
# A line made only of one to four digits (a bare page number).
ARABIC_PAGE = re.compile(r'^\s*\d{1,4}\s*$')
# A line made only of three or more bullets / middle dots / full stops.
# ('•' and '\u2022' are the same character, so the class lists it twice.)
FILLER_LINE = re.compile(r'^\s*[•·.\u2022]{3,}\s*$')
# Three or more single capital letters, each separated by exactly one
# whitespace character (e.g. "L U D O V I C O").
SPACED_SMALLCAPS = re.compile(r'\b(?:[A-Z]\s){2,}[A-Z]\b')

# --- OCR Fix Utilities ---

def normalize_glyphs(s: str) -> str:
    """Apply the PUNCT_MAP substitutions, then NFKC-normalize the result.

    Args:
        s: Raw text as read from the OCR file.

    Returns:
        The text with every PUNCT_MAP key replaced by its mapped value and
        Unicode compatibility normalization (NFKC) applied afterwards.
    """
    normalized = s
    for source_char, replacement in PUNCT_MAP.items():
        normalized = normalized.replace(source_char, replacement)
    return unicodedata.normalize('NFKC', normalized)

def strip_scanner_artifacts(s: str) -> str:
    """Undo backslash-escaped parentheses and squeeze runs of backslashes.

    Args:
        s: Text that may contain ``\\(`` / ``\\)`` escapes or repeated
            backslashes.

    Returns:
        The text with escaped parentheses replaced by plain ones and every
        run of two or more backslashes reduced to a single backslash.
    """
    unescaped = s
    for escaped, plain in (('\\(', '('), ('\\)', ')')):
        unescaped = unescaped.replace(escaped, plain)
    # Collapse each run of 2+ backslashes to exactly one.
    return re.sub(r'\\{2,}', r'\\', unescaped)

def fix_spaced_smallcaps(s: str) -> str:
    """Close up letter-spaced capitals, e.g. "L U D O V I C O" -> "LUDOVICO".

    Every SPACED_SMALLCAPS match has its literal space characters removed.
    Other whitespace inside a match (the pattern uses ``\\s``) is left as is.

    Args:
        s: Text possibly containing letter-spaced capitals.

    Returns:
        The text with spaces removed inside each matched run.
    """
    return SPACED_SMALLCAPS.sub(lambda hit: hit.group(0).replace(' ', ''), s)

def remove_page_numbers_and_fillers(text: str) -> str:
    """Drop lines that are only a page number, Roman numeral, or filler dots.

    Args:
        text: Multi-line text.

    Returns:
        The remaining lines joined with "\\n". Blank and whitespace-only lines
        are always kept so paragraph breaks survive.
    """
    removal_patterns = (ARABIC_PAGE, ROMAN_LINE, FILLER_LINE)

    def _is_furniture(line: str) -> bool:
        # Patterns are tested against the stripped line; an empty stripped
        # line is never furniture.
        stripped = line.strip()
        return bool(stripped) and any(p.match(stripped) for p in removal_patterns)

    return "\n".join(line for line in text.splitlines() if not _is_furniture(line))

def smart_dehyphenate(s: str) -> str:
    """Join words hyphenated across a line break when the next line starts lowercase.

    A hyphen is removed, together with the newline and any indentation that
    follows it, when:
      * at least two letters immediately precede the hyphen, and
      * the first non-blank character of the next line is a lowercase letter.

    Hyphens inside a line ("well-known") and breaks followed by an uppercase
    letter are left untouched.

    Args:
        s: Text using "\\n" line endings.

    Returns:
        The text with end-of-line hyphenation undone.
    """
    # Look-arounds keep the surrounding letters out of the match, so a word
    # split over several consecutive lines is rejoined in a single pass and
    # the continuation line's leading spaces/tabs are discarded rather than
    # being re-inserted into the middle of the word.
    return re.sub(r'(?<=[A-Za-zÀ-ÿ]{2})-\n[^\S\r\n]*(?=[a-zà-ÿ])', '', s)

def normalize_whitespace_preserve_structure(s: str) -> str:
    """Collapse runs of spaces/tabs while keeping paragraph breaks.

    Args:
        s: Text using "\\n" line endings.

    Returns:
        The text with trailing spaces/tabs removed from every line, runs of
        spaces/tabs reduced to one space, and paragraphs separated by exactly
        one blank line. Empty paragraphs are dropped. Single newlines inside
        a paragraph are preserved.
    """
    # Strip trailing blanks first: a "blank" line that still holds spaces or
    # tabs would otherwise not be recognised as a paragraph separator and
    # would leave three or more consecutive newlines in the output.
    without_trailing = re.sub(r'[ \t]+$', '', s, flags=re.MULTILINE)
    paragraphs = re.split(r'\n{2,}', without_trailing)
    cleaned = (re.sub(r'[ \t]+', ' ', p.strip()) for p in paragraphs)
    return "\n\n".join(p for p in cleaned if p)

def process(text: str) -> str:
    """Run the full local cleanup pipeline over one document.

    The stages run in a fixed order: glyph normalization, scanner-artifact
    removal, page-furniture removal, de-hyphenation, small-caps joining, and
    finally whitespace normalization.

    Args:
        text: Raw OCR text.

    Returns:
        The cleaned text.
    """
    pipeline = (
        normalize_glyphs,
        strip_scanner_artifacts,
        remove_page_numbers_and_fillers,
        smart_dehyphenate,
        fix_spaced_smallcaps,
        normalize_whitespace_preserve_structure,
    )
    cleaned = text
    for stage in pipeline:
        cleaned = stage(cleaned)
    return cleaned

# --- Main Execution Loop ---

def main():
    """Clean every .txt file in INPUT_DIR and write the result to OUTPUT_DIR.

    Each output file is named "<original stem>_Cleaned.txt". A failure on one
    file is reported and does not stop the remaining files.
    """
    if not INPUT_DIR.exists():
        print(f"!!! Error: Folder '{INPUT_DIR}' not found. Please create it and upload your files.")
        return

    # Find all .txt files
    files = list(INPUT_DIR.glob('*.txt'))
    if not files:
        print(f"No .txt files found in {INPUT_DIR}")
        return

    print(f"Starting cleanup on {len(files)} files...")

    for file_path in files:
        try:
            raw_content = file_path.read_text(encoding='utf-8')
            cleaned_content = process(raw_content)

            # Save to the output folder with a "_Cleaned" suffix
            output_path = OUTPUT_DIR / f"{file_path.stem}_Cleaned.txt"
            output_path.write_text(cleaned_content, encoding='utf-8')

            print(f"Successfully processed: {file_path.name}")
        except Exception as e:
            # Best-effort batch: report the failure and move on.
            print(f"Error processing {file_path.name}: {e}")

    print(f"\nFinished! Cleaned files are in: {OUTPUT_DIR}")

if __name__ == "__main__":
    main()


Starting cleanup on 1 files...
Successfully processed: bub_gb_vrNA0Xt0g3UC_djvu.txt

Finished! Cleaned files are in: /content/Cleaned_texts


**Section 2: The Async AI Correction Engine**

After text cleanup, the code sends the text to Gemini 2.5 Flash for linguistic correction. To improve the speed of processing large documents, the code uses Asynchronous (Async) processing.
smart_chunk_text: This function divides the text into manageable chunks. It splits the text into pieces of roughly TARGET_CHUNK_SIZE characters (5,000 in the configuration below), breaking only at paragraph boundaries; a single paragraph longer than the target is kept whole.
The Power of asyncio:
Concurrency: Instead of sending chunks sequentially, async allows multiple requests to be sent concurrently, as defined by MAX_CONCURRENCY.
nest_asyncio: nest_asyncio.apply() patches asyncio so that asyncio.run() can be called from inside a Colab/Jupyter notebook, where an event loop is already running.
Robustness (Retries & Backoff): The call_with_retries function handles potential API call failures. If the server indicates rate limits, the code pauses, waits, and retries automatically.

In [None]:
!pip install google-cloud-aiplatform vertexai



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import asyncio
import vertexai
from vertexai.generative_models import GenerativeModel
from pathlib import Path
from typing import List, Tuple
import nest_asyncio

# Enable async support for Colab
nest_asyncio.apply()

# --- Configuration ---
# Google Cloud project, region and model passed to vertexai.init() /
# GenerativeModel() in init_vertex().
PROJECT_ID = 'renaissance-ocr'
LOCATION = 'us-central1'
GEMINI_MODEL = 'gemini-2.5-flash'
# Service-account key file. The path is relative, so it is resolved against
# the notebook's current working directory.
JSON_FILE_PATH = 'renaissance-ocr-4aabe5b8dc65.json'

# Input is the output folder of the local cleanup cell above.
INPUT_DIR = Path('/content/Cleaned_texts')
OUTPUT_DIR = Path('/content/Gemini_Corrected_texts')
OUTPUT_DIR.mkdir(exist_ok=True)

TARGET_CHUNK_SIZE = 5000  # Soft target, in characters, for each chunk sent to the model
MAX_CONCURRENCY = 5        # Number of simultaneous chunks (semaphore size)
MAX_RETRIES = 5            # Retries after the first failed attempt
INITIAL_BACKOFF = 2.0      # Seconds before the first retry; doubled after each retry

from vertexai.generative_models import SafetySetting, HarmCategory, HarmBlockThreshold

# Define loose safety settings
SAFETY_SETTINGS = [
    SafetySetting(category=HarmCategory.HARM_CATEGORY_HATE_SPEECH, threshold=HarmBlockThreshold.BLOCK_NONE),
    SafetySetting(category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, threshold=HarmBlockThreshold.BLOCK_NONE),
    SafetySetting(category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, threshold=HarmBlockThreshold.BLOCK_NONE),
    SafetySetting(category=HarmCategory.HARM_CATEGORY_HARASSMENT, threshold=HarmBlockThreshold.BLOCK_NONE),
]

# --- Prompts ---
SYSTEM_STEER = "You are a specialized OCR correction engine for early modern Latin. Output ONLY the corrected text. Do not provide lists of changes, explanations, or commentary."

CORRECTION_PROMPT = """
TASK: Correct ONLY OCR errors in the input text. Do not rewrite or alter authentic early modern usage.

RULES:
1. Preserve original wording, grammar, and syntax. Fix ONLY misreads, broken words, or OCR punctuation noise.
2. Minimal normalization: u=vowel, v=consonant; use 'i' only (no 'j'); ae/oe ligatures; replace & with 'et'.
3. DE-HYPHENATE: Join all words broken across line breaks. Remove the hyphen and merge the fragments.
4. Keep paragraph breaks and capitalization exactly as in the input.
5. Remove page furniture (headers, catchwords, signature marks like 'A ij').
6. Preserve exotic/Chinese romanizations (e.g., 'Tai Ki Gin') exactly as printed.
7. CRITICAL: Output ONLY the corrected text. No explanations. No bullet points.
"""

# -----------------------------------------------------------
# Utils
# -----------------------------------------------------------

def init_vertex() -> GenerativeModel:
    """Point the environment at the service-account key and build the model.

    Returns:
        A GenerativeModel for GEMINI_MODEL, created after vertexai.init() has
        been called with PROJECT_ID and LOCATION.

    Raises:
        FileNotFoundError: If JSON_FILE_PATH does not exist.
    """
    credentials_present = os.path.exists(JSON_FILE_PATH)
    if not credentials_present:
        raise FileNotFoundError(f"Credentials file not found: {JSON_FILE_PATH}")

    # Exported as an environment variable rather than passed to init().
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_FILE_PATH
    vertexai.init(project=PROJECT_ID, location=LOCATION)

    model = GenerativeModel(GEMINI_MODEL)
    print(f"Gemini Model '{GEMINI_MODEL}' initialized successfully.")
    return model

def smart_chunk_text(text: str, target_size: int) -> List[str]:
    """Split text into chunks of about ``target_size`` characters at paragraph breaks.

    Paragraphs are the pieces between double newlines. They are packed
    greedily into chunks; a chunk is closed as soon as adding the next
    paragraph (plus its separator) would push it past ``target_size``.
    Paragraphs are never split, so one longer than ``target_size`` becomes an
    oversized chunk of its own.

    Args:
        text: The document to split.
        target_size: Soft upper bound, in characters, for each chunk.

    Returns:
        The chunks in document order. Joining them with "\\n\\n" reproduces
        ``text``. Empty or whitespace-only input yields an empty list so that
        no blank request is sent to the model.
    """
    if not text.strip():
        return []

    separator = '\n\n'
    chunks: List[str] = []
    current: List[str] = []
    current_length = 0
    for para in text.split(separator):
        # Length the chunk would have after appending, counting the separator
        # that join() inserts; ignoring it lets chunks built from many short
        # paragraphs grow far beyond the target.
        projected = current_length + len(separator) + len(para) if current else len(para)
        if current and projected > target_size:
            chunks.append(separator.join(current))
            current = [para]
            current_length = len(para)
        else:
            current.append(para)
            current_length = projected
    if current:
        chunks.append(separator.join(current))
    return chunks

async def call_with_retries(coro_factory, max_retries=None, initial_backoff=None):
    """Await ``coro_factory()`` and retry transient failures with exponential backoff.

    Args:
        coro_factory: Zero-argument callable returning a fresh awaitable for
            each attempt.
        max_retries: Retries allowed after the first attempt. Defaults to the
            module-level MAX_RETRIES.
        initial_backoff: Seconds to sleep before the first retry; doubled
            after every retry. Defaults to the module-level INITIAL_BACKOFF.

    Returns:
        Whatever the awaited call returns.

    Raises:
        Exception: The original exception, re-raised unchanged, when it is
            not transient or when the retries are exhausted.
    """
    import re  # local import: this cell does not import `re` at the top

    retries = MAX_RETRIES if max_retries is None else max_retries
    backoff = INITIAL_BACKOFF if initial_backoff is None else initial_backoff
    transient_codes = (429, 500, 503)

    def _is_transient(exc: BaseException) -> bool:
        # If the exception carries an integer `code`, it is taken to be the
        # HTTP status. NOTE(review): confirm this holds for the SDK in use.
        code = getattr(exc, "code", None)
        if isinstance(code, int) and not isinstance(code, bool):
            return code in transient_codes
        # Otherwise look for the status in the message, but only as a
        # standalone number: a plain substring test would treat "5000" or
        # "15003" as a 500/503 and retry errors that can never succeed.
        return re.search(r'(?<!\d)(?:429|500|503)(?!\d)', str(exc)) is not None

    attempt = 0
    while True:
        try:
            return await coro_factory()
        except Exception as exc:
            if attempt >= retries or not _is_transient(exc):
                raise
            attempt += 1
            await asyncio.sleep(backoff)
            backoff *= 2

# -----------------------------------------------------------
# Core Correction Logic
# -----------------------------------------------------------

async def correct_ocr_chunk_async(model_client: GenerativeModel,
                                  chunk_text: str,
                                  chunk_id: int,
                                  filename: str,
                                  semaphore: asyncio.Semaphore) -> Tuple[int, str]:
    """Send one chunk to the model and return its corrected text.

    The system steer, the correction rules and the chunk are sent as a single
    prompt. All text parts of the first candidate are concatenated.

    Args:
        model_client: Model used for the request.
        chunk_text: The text to correct.
        chunk_id: Position of the chunk in the file; returned unchanged so
            the caller can restore document order.
        filename: Source file name, used only in log lines.
        semaphore: Limits how many requests are in flight at once.

    Returns:
        ``(chunk_id, text)``. ``text`` is the model output, or the original
        ``chunk_text`` when the request fails or the response has no text.
        This function never raises for a failed request.
    """
    full_prompt = f"{SYSTEM_STEER}\n\n{CORRECTION_PROMPT}\n\nTEXT TO CORRECT:\n{chunk_text}"

    async def single_call():
        # The semaphore is held only for the request itself, so a chunk that
        # is sleeping between retries does not occupy a slot.
        async with semaphore:
            return await model_client.generate_content_async(
                full_prompt,
                generation_config={"temperature": 0.1, "max_output_tokens": 8192},
                # SAFETY_SETTINGS has no effect unless it is passed here.
                safety_settings=SAFETY_SETTINGS,
            )

    try:
        response = await call_with_retries(single_call)

        # Join all parts to avoid the 'Multiple content parts' error raised
        # by the single-part convenience accessor.
        candidates = response.candidates
        parts = (candidates[0].content.parts or []) if candidates else []
        corrected = "".join(part.text for part in parts).strip()
    except Exception as e:
        print(f"  [ERROR] {filename} Chunk {chunk_id}: {e} -> returning original")
        return chunk_id, chunk_text

    if not corrected:
        # Report the fallback explicitly instead of logging it as a success.
        print(f"  [WARN] {filename} Chunk {chunk_id}: empty response -> returning original")
        return chunk_id, chunk_text

    print(f"  [Chunk {chunk_id:03d}] ✓ {filename}")
    return chunk_id, corrected

async def process_file(file_path: Path, model_client: GenerativeModel, semaphore: asyncio.Semaphore):
    """Correct one file chunk-by-chunk and write the reassembled result.

    Args:
        file_path: Cleaned input file to read.
        model_client: Model handed to each chunk request.
        semaphore: Shared concurrency limiter.

    The output is written to OUTPUT_DIR as "Gemini_<input file name>", with
    the corrected chunks joined by a blank line in their original order.
    """
    print(f"\nSTARTING FILE: {file_path.name}")
    content = file_path.read_text(encoding='utf-8')

    pending = [
        correct_ocr_chunk_async(model_client, chunk, index, file_path.name, semaphore)
        for index, chunk in enumerate(smart_chunk_text(content, TARGET_CHUNK_SIZE))
    ]
    results = await asyncio.gather(*pending)

    # Order by chunk id before stitching the text back together.
    ordered = sorted(results, key=lambda pair: pair[0])
    final_text = "\n\n".join(text for _, text in ordered)

    destination = OUTPUT_DIR / f"Gemini_{file_path.name}"
    destination.write_text(final_text, encoding='utf-8')
    print(f"DONE: {file_path.name}")

# -----------------------------------------------------------
# Main Execution
# -----------------------------------------------------------

async def main():
    """Initialize the model, then correct every .txt file in INPUT_DIR.

    Files are handled one after another in sorted order; the chunks of each
    file are requested concurrently, limited by MAX_CONCURRENCY.
    """
    model = init_vertex()
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY)

    if not INPUT_DIR.exists():
        print(f"Input directory {INPUT_DIR} not found.")
        return

    files = sorted(INPUT_DIR.glob('*.txt'))
    if not files:
        print("No .txt files found.")
        return

    print(f"Found {len(files)} files. Processing chunks in parallel...")

    # One file at a time; parallelism happens inside process_file().
    for current_file in files:
        await process_file(current_file, model, semaphore)

    print(f"\n--- ALL TASKS COMPLETE. Check folder: {OUTPUT_DIR} ---")

if __name__ == "__main__":
    asyncio.run(main())


ValueError: Unsupported region for Vertex AI, select from frozenset({'europe-west9', 'europe-central2', 'northamerica-northeast1', 'us-east5', 'europe-west4', 'us-west2', 'asia-northeast3', 'southamerica-west1', 'us-west1', 'us-west4', 'europe-west12', 'asia-southeast1', 'asia-east1', 'northamerica-northeast2', 'australia-southeast2', 'asia-east2', 'europe-west1', 'southamerica-east1', 'asia-northeast2', 'asia-south1', 'asia-southeast2', 'us-east4', 'asia-northeast1', 'us-south1', 'europe-west2', 'me-west1', 'europe-north1', 'us-central1', 'europe-southwest1', 'us-east1', 'me-central2', 'me-central1', 'europe-west6', 'europe-west8', 'europe-west3', 'us-west3', 'australia-southeast1', 'africa-south1'})

In [None]:
pip install -U google-genai

Collecting google-genai
  Downloading google_genai-1.64.0-py3-none-any.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.2/53.2 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Downloading google_genai-1.64.0-py3-none-any.whl (728 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m728.8/728.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-genai
  Attempting uninstall: google-genai
    Found existing installation: google-genai 1.63.0
    Uninstalling google-genai-1.63.0:
      Successfully uninstalled google-genai-1.63.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-adk 1.25.0 requires google-cloud-aiplatform[agent-engines]<2.0.0,>=1.132.0, but you have google-cloud-aiplatform 1.71.1 which is incompatible.[0m[31m
[0mSuccessfully installed google-genai-1.

In [None]:
!python -c "import google.genai, sys; print(google.genai.__version__)"

1.64.0


In [None]:
# -*- coding: utf-8 -*-
"""
OCR correction with Gemini 3 Flash (preview) using the Google Gen AI SDK (async).
Backward-compatible with SDKs that do NOT support `system_instruction=` in generate_content().
"""

import os
import asyncio
from pathlib import Path
from typing import List, Tuple
import nest_asyncio

# Unified Google Gen AI SDK
from google import genai

# Enable async support for Colab / notebooks
nest_asyncio.apply()

# --- Backend selection ---
# Vertex AI (service account): USE_VERTEX=True, requires PROJECT_ID, LOCATION, and GOOGLE_APPLICATION_CREDENTIALS
# Developer API (API key):     USE_VERTEX=False, requires GOOGLE_API_KEY or GEMINI_API_KEY
USE_VERTEX = True

# Vertex AI settings (used only when USE_VERTEX=True)
PROJECT_ID = "renaissance-ocr"
LOCATION = "us-central1"
SERVICE_ACCOUNT_JSON = "renaissance-ocr-4aabe5b8dc65.json"

# Model (Gemini 3 Flash, preview)
MODEL_ID = "gemini-3-flash-preview"

# I/O
INPUT_DIR = Path("/content/Cleaned_texts")
OUTPUT_DIR = Path("/content/Gemini_Corrected_texts")
OUTPUT_DIR.mkdir(exist_ok=True)

# Processing controls
TARGET_CHUNK_SIZE = 5000
MAX_CONCURRENCY = 5
MAX_RETRIES = 5
INITIAL_BACKOFF = 2.0  # seconds

# Safety settings (loose)
SAFETY_SETTINGS = [
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
]

# --- Prompts ---
SYSTEM_STEER = (
    "You are a specialized OCR correction engine for early modern Latin. "
    "Output ONLY the corrected text. Do not provide lists of changes, explanations, or commentary."
)

# Rules sent to the model ahead of every chunk. The text ends with the
# "TEXT TO CORRECT:" header, so the chunk is appended directly after it.
# Rule 2 must contain a literal ampersand: the HTML entity "&amp;" would tell
# the model to look for a string that never occurs in the OCR text.
CORRECTION_PROMPT = """
TASK: Correct ONLY OCR errors in the input text. Do not rewrite or alter authentic early modern usage.

RULES:
1. Preserve original wording, grammar, and syntax. Fix ONLY misreads, broken words, or OCR punctuation noise.
2. Minimal normalization: u=vowel, v=consonant; use 'i' only (no 'j'); ae/oe ligatures; replace & with 'et'.
3. DE-HYPHENATE: Join all words broken across line breaks. Remove the hyphen and merge the fragments.
4. Keep paragraph breaks and capitalization exactly as in the input.
5. Remove page furniture (headers, catchwords, signature marks like 'A ij').
6. Preserve exotic/Chinese romanizations (e.g., 'Tai Ki Gin') exactly as printed.
7. CRITICAL: Output ONLY the corrected text. No explanations. No bullet points.

TEXT TO CORRECT:
""".strip()

# -----------------------------
# Client init
# -----------------------------
def init_client():
    """Build the async Gen AI client for the backend selected by USE_VERTEX.

    Returns:
        The ``.aio`` (async) interface of a ``genai.Client``.

    Raises:
        FileNotFoundError: Vertex AI mode and SERVICE_ACCOUNT_JSON is missing.
        RuntimeError: Developer API mode and neither GOOGLE_API_KEY nor
            GEMINI_API_KEY is set.
    """
    if not USE_VERTEX:
        # Developer API: authenticate with an API key from the environment.
        api_key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise RuntimeError("Set GOOGLE_API_KEY or GEMINI_API_KEY for Developer API mode.")
        aclient = genai.Client(api_key=api_key).aio
        print(f"Gemini model '{MODEL_ID}' via Developer API initialized.")
        return aclient

    # Vertex AI: authenticate with the service-account key file.
    if not os.path.exists(SERVICE_ACCOUNT_JSON):
        raise FileNotFoundError(f"Credentials file not found: {SERVICE_ACCOUNT_JSON}")
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = SERVICE_ACCOUNT_JSON
    aclient = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION).aio
    print(f"Gemini model '{MODEL_ID}' via Vertex AI initialized.")
    return aclient

# -----------------------------
# Utils
# -----------------------------
def smart_chunk_text(text: str, target_size: int) -> List[str]:
    """Split text into chunks of about ``target_size`` characters at paragraph breaks.

    Paragraphs are the pieces between double newlines. They are packed
    greedily into chunks; a chunk is closed as soon as adding the next
    paragraph (plus its separator) would push it past ``target_size``.
    Paragraphs are never split, so one longer than ``target_size`` becomes an
    oversized chunk of its own.

    Args:
        text: The document to split.
        target_size: Soft upper bound, in characters, for each chunk.

    Returns:
        The chunks in document order. Joining them with "\\n\\n" reproduces
        ``text``. Empty or whitespace-only input yields an empty list so that
        no blank request is sent to the model.
    """
    if not text.strip():
        return []

    separator = "\n\n"
    chunks: List[str] = []
    current: List[str] = []
    clen = 0
    for p in text.split(separator):
        # Length the chunk would have after appending, counting the separator
        # that join() inserts; ignoring it lets chunks built from many short
        # paragraphs grow far beyond the target.
        projected = clen + len(separator) + len(p) if current else len(p)
        if current and projected > target_size:
            chunks.append(separator.join(current))
            current, clen = [p], len(p)
        else:
            current.append(p)
            clen = projected
    if current:
        chunks.append(separator.join(current))
    return chunks

async def call_with_retries(coro_factory, max_retries=None, initial_backoff=None):
    """Await ``coro_factory()`` and retry transient failures with exponential backoff.

    Args:
        coro_factory: Zero-argument callable returning a fresh awaitable for
            each attempt.
        max_retries: Retries allowed after the first attempt. Defaults to the
            module-level MAX_RETRIES.
        initial_backoff: Seconds to sleep before the first retry; doubled
            after every retry. Defaults to the module-level INITIAL_BACKOFF.

    Returns:
        Whatever the awaited call returns.

    Raises:
        Exception: The original exception, re-raised unchanged, when it is
            not transient or when the retries are exhausted.
    """
    import re  # local import: this cell does not import `re` at the top

    retries = MAX_RETRIES if max_retries is None else max_retries
    backoff = INITIAL_BACKOFF if initial_backoff is None else initial_backoff
    transient_codes = (429, 500, 503)

    def _is_transient(exc: BaseException) -> bool:
        # If the exception carries an integer `code`, it is taken to be the
        # HTTP status. NOTE(review): confirm this holds for the SDK in use.
        code = getattr(exc, "code", None)
        if isinstance(code, int) and not isinstance(code, bool):
            return code in transient_codes
        # Otherwise look for the status in the message, but only as a
        # standalone number: a plain substring test would treat "5000" or
        # "15003" as a 500/503 and retry errors that can never succeed.
        return re.search(r"(?<!\d)(?:429|500|503)(?!\d)", str(exc)) is not None

    attempt = 0
    while True:
        try:
            return await coro_factory()
        except Exception as exc:
            if attempt >= retries or not _is_transient(exc):
                raise
            attempt += 1
            await asyncio.sleep(backoff)
            backoff *= 2

# -----------------------------
# Core correction
# -----------------------------
async def correct_ocr_chunk_async(
    aclient: genai.Client,
    chunk_text: str,
    chunk_id: int,
    filename: str,
    semaphore: asyncio.Semaphore,
) -> Tuple[int, str]:
    """Send one chunk to the model and return its corrected text.

    SYSTEM_STEER and CORRECTION_PROMPT are inlined ahead of the chunk in a
    single user message; no ``system_instruction`` keyword is used.

    Args:
        aclient: The async client returned by ``init_client()`` (the ``.aio``
            interface of a ``genai.Client``).
        chunk_text: The text to correct.
        chunk_id: Position of the chunk in the file; returned unchanged so
            the caller can restore document order.
        filename: Source file name, used only in log lines.
        semaphore: Limits how many requests are in flight at once.

    Returns:
        ``(chunk_id, text)``. ``text`` is the model output, or the original
        ``chunk_text`` when the request fails or the response has no text.
        This function never raises for a failed request.
    """
    # Inline steer + rules + chunk into the single user message
    full_text = f"{SYSTEM_STEER}\n\n{CORRECTION_PROMPT}\n{chunk_text}"

    async def single_call():
        # The semaphore is held only for the request itself, so a chunk that
        # is sleeping between retries does not occupy a slot.
        async with semaphore:
            return await aclient.models.generate_content(
                model=MODEL_ID,
                contents=full_text,
                # Use a plain dict for config to maximize compatibility across SDK versions.
                # Safety settings belong inside the config: generate_content()
                # rejects a top-level `safety_settings` keyword, which made
                # every chunk fail and fall back to its original text.
                config={
                    "temperature": 0.1,
                    "max_output_tokens": 8192,
                    "response_mime_type": "text/plain",
                    "safety_settings": SAFETY_SETTINGS,
                },
            )

    try:
        response = await call_with_retries(single_call)

        # Prefer the convenience accessor; fall back to stitching parts if needed
        corrected = (getattr(response, "text", "") or "").strip()
        if not corrected and getattr(response, "candidates", None):
            try:
                parts = response.candidates[0].content.parts or []
                # A part's `text` may be None; treat it as empty.
                corrected = "".join(getattr(p, "text", None) or "" for p in parts).strip()
            except Exception:
                corrected = ""
    except Exception as e:
        print(f"  [ERROR] {filename} Chunk {chunk_id}: {e} -> returning original")
        return chunk_id, chunk_text

    if not corrected:
        # Report the fallback explicitly instead of logging it as a success.
        print(f"  [WARN] {filename} Chunk {chunk_id}: empty response -> returning original")
        return chunk_id, chunk_text

    print(f"  [Chunk {chunk_id:03d}] ✓ {filename}")
    return chunk_id, corrected

async def process_file(file_path: Path, aclient, semaphore: asyncio.Semaphore):
    """Correct one file chunk-by-chunk and write the reassembled result.

    Args:
        file_path: Cleaned input file to read.
        aclient: Async client handed to each chunk request.
        semaphore: Shared concurrency limiter.

    The output is written to OUTPUT_DIR as "Gemini_<input file name>", with
    the corrected chunks joined by a blank line in their original order.
    """
    print(f"\nSTARTING FILE: {file_path.name}")
    with open(file_path, "r", encoding="utf-8") as src:
        content = src.read()

    pending = []
    for index, chunk in enumerate(smart_chunk_text(content, TARGET_CHUNK_SIZE)):
        pending.append(correct_ocr_chunk_async(aclient, chunk, index, file_path.name, semaphore))

    results = await asyncio.gather(*pending)

    # Order by chunk id before stitching the text back together.
    ordered = sorted(results, key=lambda pair: pair[0])
    final_text = "\n\n".join(text for _, text in ordered)

    with open(OUTPUT_DIR / f"Gemini_{file_path.name}", "w", encoding="utf-8") as dst:
        dst.write(final_text)
    print(f"DONE: {file_path.name}")

# -----------------------------
# Main
# -----------------------------
async def main():
    """Create the client, correct every .txt file in INPUT_DIR, then close the client.

    Files are handled one after another in sorted order; the chunks of each
    file are requested concurrently, limited by MAX_CONCURRENCY. The client
    is closed even when processing stops early or raises.
    """
    aclient = init_client()

    async def _run_all() -> None:
        # Everything that needs the open client lives here so the caller can
        # wrap it in a single try/finally.
        if not INPUT_DIR.exists():
            print(f"Input directory {INPUT_DIR} not found.")
            return

        files = sorted(INPUT_DIR.glob("*.txt"))
        if not files:
            print("No .txt files found.")
            return

        print(f"Found {len(files)} files. Processing chunks in parallel...")
        semaphore = asyncio.Semaphore(MAX_CONCURRENCY)

        for current_file in files:
            await process_file(current_file, aclient, semaphore)

        print(f"\n--- ALL TASKS COMPLETE. Check folder: {OUTPUT_DIR} ---")

    try:
        await _run_all()
    finally:
        await aclient.aclose()

if __name__ == "__main__":
    asyncio.run(main())

Gemini model 'gemini-3-flash-preview' via Vertex AI initialized.
Found 1 files. Processing chunks in parallel...

STARTING FILE: bub_gb_vrNA0Xt0g3UC_djvu_Cleaned.txt
  [ERROR] bub_gb_vrNA0Xt0g3UC_djvu_Cleaned.txt Chunk 0: AsyncModels.generate_content() got an unexpected keyword argument 'safety_settings' -> returning original
  [ERROR] bub_gb_vrNA0Xt0g3UC_djvu_Cleaned.txt Chunk 1: AsyncModels.generate_content() got an unexpected keyword argument 'safety_settings' -> returning original
  [ERROR] bub_gb_vrNA0Xt0g3UC_djvu_Cleaned.txt Chunk 2: AsyncModels.generate_content() got an unexpected keyword argument 'safety_settings' -> returning original
  [ERROR] bub_gb_vrNA0Xt0g3UC_djvu_Cleaned.txt Chunk 3: AsyncModels.generate_content() got an unexpected keyword argument 'safety_settings' -> returning original
  [ERROR] bub_gb_vrNA0Xt0g3UC_djvu_Cleaned.txt Chunk 4: AsyncModels.generate_content() got an unexpected keyword argument 'safety_settings' -> returning original
  [ERROR] bub_gb_vrNA0Xt