In [25]:
# --- Cell 1: Setup and Imports ---
import os
import re
import time
import sys
from openai import OpenAI
from IPython.display import display, Markdown # For better display in Jupyter
import tiktoken # Added for token counting
from tqdm.notebook import tqdm # Added for progress bar

print("Libraries imported: os, re, time, sys, openai, IPython.display, tiktoken, tqdm.notebook")

Libraries imported: os, re, time, sys, openai, IPython.display, tiktoken, tqdm.notebook


In [26]:
# --- Cell 2: Configuration ---
# Defines retry/chunking constants, builds the OpenAI-compatible client that
# talks to the xAI endpoint, and initializes a tiktoken encoder used only for
# rough token estimates.

# --- Constants ---
API_RETRY_DELAY = 5 # API call retry delay in seconds
MAX_RETRIES = 3     # Maximum API call retries
XAI_BASE_URL = "https://api.x.ai/v1" # Grok API Base URL

# --- Parameters (Modify as needed) ---
CHUNK_SIZE = 3       # Number of paragraphs per API call batch
XAI_MODEL = "grok-3-mini" # Model to use for translation

# --- API Key and Client Initialization ---
# SECURITY: the key is read from the environment and must never be written
# into the notebook. A key that was previously committed in this cell should be
# treated as compromised and revoked/rotated with the provider.
XAI_API_KEY = os.environ.get("XAI_API_KEY", "").strip()
client = None

if XAI_API_KEY:
    try:
        client = OpenAI(
            api_key=XAI_API_KEY,
            base_url=XAI_BASE_URL,
        )
        print(f"✅ OpenAI client initialized successfully for Base URL: {XAI_BASE_URL}")
    except Exception as e:
        # Leave `client` as None; later cells check it before doing any work.
        print(f"⚠️ Error initializing OpenAI client: {e}", file=sys.stderr)
else:
    print("⚠️ Error: Environment variable 'XAI_API_KEY' not set.", file=sys.stderr)

# --- Tokenizer Initialization (Added) ---
tokenizer = None
try:
    # Using cl100k_base as a common default. THIS MAY NOT BE ACCURATE FOR GROK.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # Or try: tokenizer = tiktoken.encoding_for_model("gpt-4") # Might be closer? Needs testing.
    print("✅ tiktoken tokenizer ('cl100k_base') initialized for token estimation.")
    print("   ℹ️ Note: Token counts are estimates; Grok's internal counting may differ.")
except Exception as e:
    # Token estimation is optional: with `tokenizer` left as None, counts are reported as 0.
    print(f"⚠️ Warning: Failed to initialize tiktoken tokenizer: {e}. Token counting will be skipped.", file=sys.stderr)


# Display warning if client initialization failed
if not client:
     display(Markdown("<font color='red'>**Warning:** API Client not initialized. Translation will not work. Ensure `XAI_API_KEY` is set correctly in your environment.</font>"))

✅ OpenAI client initialized successfully for Base URL: https://api.x.ai/v1
✅ tiktoken tokenizer ('cl100k_base') initialized for token estimation.
   ℹ️ Note: Token counts are estimates; Grok's internal counting may differ.


In [27]:
# --- Cell 3: Helper Functions ---
# Includes updated system prompt in translate_via_grok_api

def count_tokens(text):
    """Return an estimated token count for `text` using the module-level tokenizer.

    Yields 0 when no tokenizer is available, when `text` is empty/None, or when
    encoding raises (in which case a warning is written to stderr).
    """
    if tokenizer and text:
        try:
            encoded = tokenizer.encode(text)
            return len(encoded)
        except Exception as e:
            # Encoding problems must not abort the translation run; report and fall through to 0.
            print(f"⚠️ Warning: tiktoken failed to encode text snippet: {e}", file=sys.stderr)
    return 0

def translate_via_grok_api(text_to_translate):
    """
    Translate one Markdown chunk into Simplified Chinese through the chat-completions API.

    The request is a system prompt (translation rules: output only the
    translation, preserve Markdown, leave citations/references untranslated)
    followed by the chunk as the user message. Token figures are local
    estimates from `count_tokens` (each message gets a flat +4 overhead), not
    values reported by the API.

    Args:
        text_to_translate (str): The text chunk to translate.

    Returns:
        tuple: (translated_text, prompt_tokens, completion_tokens)
               (None, 0, 0) when the client is not initialized.
               (None, estimated_prompt_tokens, 0) when the API call raises.
               translated_text is also None when the API returns empty content.
    """
    if not client:
        print("❌ Error: API client is not initialized.", file=sys.stderr)
        return None, 0, 0

    # --- System prompt (sent verbatim with every chunk) ---
    system_prompt_content = """Act as a direct translator. Your task is to translate the provided Markdown text into Simplified Chinese according to these rules:
1.  **Direct Output:** Provide ONLY the translated Simplified Chinese text. Do NOT include any introductory phrases, explanations, apologies, or concluding remarks like "Here is the translation:", "以下是翻译内容：", etc. Just the translation itself.
2.  **Preserve Formatting:** Maintain all original Markdown formatting (headings `##`, lists `* - 1.`, bold `**`, italics `*`, code blocks ```` ``` ``, inline code `` ` ``, etc.) precisely.
3.  **Do Not Translate Citations/References:** Identify text that functions as a literature citation or bibliography/reference list entry. Keep the original text and formatting for these parts *without translating them*. This applies to:
    * In-text citations (e.g., `(Author, 2023)`, `[Author et al., 2023]`, `[1]`, `Author (2023)`).
    * Items in reference lists (often lines starting with `*`, `-`, or `1.` followed by author names, year, title, journal/publisher, DOI, URL, etc.). Example: `* Author, A. B. (2023). Title of work. *Journal Name*, 10(2), 123-145. https://doi.org/xxxx` should remain exactly as is.
4.  **Translate Everything Else:** All other text content should be translated accurately into Simplified Chinese.
Ensure the final output flows naturally, integrating the untranslated citations correctly within the translated surrounding text."""

    messages = [
        {"role": "system", "content": system_prompt_content},
        {"role": "user", "content": text_to_translate}
    ]

    # Prompt estimate: content tokens of each message plus a flat 4-token overhead per message.
    if tokenizer:
        prompt_tokens = sum(count_tokens(message['content']) + 4 for message in messages)
    else:
        prompt_tokens = 0

    print(f"   ➡️ Calling Grok API (Model: {XAI_MODEL}, Est. Prompt Tokens: {prompt_tokens})...")

    try:
        # Low temperature to keep the model close to the instructions.
        response = client.chat.completions.create(
            model=XAI_MODEL,
            messages=messages,
            temperature=0.2,
        )
        raw_translation = response.choices[0].message.content

        # Completion estimate is only computed when there is content and a tokenizer.
        if raw_translation and tokenizer:
            completion_tokens = count_tokens(raw_translation)
        else:
            completion_tokens = 0

        print(f"   ✅ API call successful (Est. Completion Tokens: {completion_tokens}).")

        # Empty/None content is reported to the caller as None; otherwise trim surrounding whitespace.
        if not raw_translation:
            return None, prompt_tokens, completion_tokens
        return raw_translation.strip(), prompt_tokens, completion_tokens

    except Exception as e:
        print(f"   ❌ Error calling Grok API: {e}", file=sys.stderr)
        # The prompt estimate was computed before the call, so it is still returned.
        return None, prompt_tokens, 0


# --- File-level helpers: split_into_paragraphs and process_markdown_file ---
# process_markdown_file calls translate_via_grok_api once per chunk (with retries)
# and unpacks its return tuple (translation_result, p_tokens, c_tokens).

def split_into_paragraphs(text):
    """Break `text` into paragraphs separated by one or more blank lines.

    The input is stripped first; separators are a newline, optional whitespace,
    and another newline. Pieces that are empty or whitespace-only are dropped.
    """
    blank_line_separator = re.compile(r'\n\s*\n')
    pieces = blank_line_separator.split(text.strip())
    return list(filter(lambda piece: piece.strip(), pieces))

def process_markdown_file(input_filepath, output_filepath):
    """
    Reads, translates (in chunks with progress bar), estimates tokens, and writes the Markdown file.

    The input is split into paragraphs, grouped into batches of CHUNK_SIZE, and
    each batch is sent to `translate_via_grok_api` with up to MAX_RETRIES
    attempts (sleeping API_RETRY_DELAY seconds between attempts). A batch that
    never succeeds is written to the output untranslated, prefixed with a
    "[Grok API Translation Failed - Chunk N]" marker, and is reported in the
    final status message instead of an unconditional success.

    Args:
        input_filepath (str): Path of the UTF-8 Markdown file to translate.
        output_filepath (str): Path the translated Markdown is written to (overwritten).

    Returns:
        tuple: (total_prompt_tokens, total_completion_tokens)
               Both are local estimates. The prompt estimate of a chunk is
               counted once (from the first attempt), regardless of retries.
               (0, 0) is returned when the client is missing, the input cannot
               be read, or the input has no paragraphs.
    """
    total_prompt_tokens = 0
    total_completion_tokens = 0

    if not client:
        print("❌ Error: Cannot process file because API client is not initialized.", file=sys.stderr)
        return total_prompt_tokens, total_completion_tokens # Return zero tokens

    print(f"Reading input file: {input_filepath}")
    try:
        with open(input_filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"❌ Error: Input file not found: {input_filepath}", file=sys.stderr)
        return total_prompt_tokens, total_completion_tokens
    except Exception as e:
        print(f"❌ Error reading file '{input_filepath}': {e}", file=sys.stderr)
        return total_prompt_tokens, total_completion_tokens

    paragraphs = split_into_paragraphs(content)
    if not paragraphs:
        # Nothing to translate: still produce an (empty) output file so downstream steps find it.
        print(f"⚠️ Warning: Input file '{input_filepath}' is empty or has no valid paragraphs.")
        try:
            with open(output_filepath, 'w', encoding='utf-8') as f:
                f.write("")
            print(f"   Empty output file created: {output_filepath}")
        except Exception as e:
            print(f"❌ Error writing empty output file '{output_filepath}': {e}", file=sys.stderr)
        return total_prompt_tokens, total_completion_tokens

    translated_chunks = []
    failed_chunk_indexes = []  # 1-based indexes of chunks kept untranslated
    total_paragraphs = len(paragraphs)
    num_chunks = (total_paragraphs + CHUNK_SIZE - 1) // CHUNK_SIZE  # ceiling division

    print(f"Starting translation of {total_paragraphs} paragraphs in {num_chunks} chunks...")

    # --- Initialize Progress Bar ---
    pbar = tqdm(total=total_paragraphs, unit="paragraph", desc="Translating")

    # try/finally guarantees the progress bar is closed even if the loop is
    # interrupted (e.g. kernel interrupt) or an unexpected error propagates.
    try:
        for i in range(0, total_paragraphs, CHUNK_SIZE):
            chunk_paragraphs = paragraphs[i:i + CHUNK_SIZE]
            text_chunk_to_translate = "\n\n".join(chunk_paragraphs)
            chunk_index = (i // CHUNK_SIZE) + 1
            start_para = i + 1
            end_para = min(i + CHUNK_SIZE, total_paragraphs)

            pbar.set_description(f"Chunk {chunk_index}/{num_chunks} (Paras {start_para}-{end_para})")

            translated_chunk_text = None
            completion_tokens_chunk = 0
            final_prompt_tokens_for_chunk = 0

            for attempt in range(MAX_RETRIES):
                translation_result, p_tokens, c_tokens = translate_via_grok_api(text_chunk_to_translate)

                # Only the first attempt's prompt estimate is recorded for the chunk.
                if attempt == 0:
                    final_prompt_tokens_for_chunk = p_tokens

                if translation_result is not None:
                    translated_chunk_text = translation_result
                    completion_tokens_chunk = c_tokens
                    break

                # No sleep after the last attempt; there is nothing left to wait for.
                if attempt < MAX_RETRIES - 1:
                    print(f"      Waiting {API_RETRY_DELAY}s before retrying...")
                    time.sleep(API_RETRY_DELAY)

            if translated_chunk_text is not None:
                translated_chunks.append(translated_chunk_text)
            else:
                # Keep the original text in place so the document stays complete and the gap is searchable.
                error_marker = f"[Grok API Translation Failed - Chunk {chunk_index}]"
                print(f"   ❌ Error: Chunk {chunk_index} failed after {MAX_RETRIES} attempts. Original text kept.", file=sys.stderr)
                translated_chunks.append(f"{error_marker}\n\n{text_chunk_to_translate}")
                failed_chunk_indexes.append(chunk_index)
                completion_tokens_chunk = 0

            total_prompt_tokens += final_prompt_tokens_for_chunk
            total_completion_tokens += completion_tokens_chunk
            pbar.update(len(chunk_paragraphs))
    finally:
        pbar.close()

    print("-" * 50)

    # --- Write Output File ---
    final_translated_content = "\n\n".join(translated_chunks)
    print(f"Writing translated content to: {output_filepath}")
    try:
        with open(output_filepath, 'w', encoding='utf-8') as f:
            f.write(final_translated_content)
    except Exception as e:
        print(f"❌ Error writing output file '{output_filepath}': {e}", file=sys.stderr)
        display(Markdown(f"<font color='red'>**Error:** Failed to write output file: `{output_filepath}`</font>"))
    else:
        if failed_chunk_indexes:
            # Do not claim success when part of the document is still untranslated.
            failed_list = ", ".join(str(index) for index in failed_chunk_indexes)
            print(
                f"⚠️ Translation finished with {len(failed_chunk_indexes)} of {num_chunks} chunks untranslated "
                f"(chunks: {failed_list}).",
                file=sys.stderr,
            )
            display(Markdown(
                f"<font color='orange'>**Completed with errors:** {len(failed_chunk_indexes)} of {num_chunks} "
                f"chunks could not be translated and were kept in the original language "
                f"(search the output for `[Grok API Translation Failed`). "
                f"File saved to: `{output_filepath}`</font>"
            ))
        else:
            print("✅ Translation process complete!")
            display(Markdown(f"**Success!** Translated file saved to: `{output_filepath}`"))

    return total_prompt_tokens, total_completion_tokens


print("Helper functions defined with updated translation prompt.")

Helper functions defined with updated translation prompt.


In [28]:
# --- Cell 4: Specify File Paths ---
# Sets `input_filepath` / `output_filepath` for the run. When no output path is
# given, one is derived from the input path by inserting "_zh_translated"
# before the extension.

# <<<---------------------------------------------------------->>>
# <<<--- !!! DEFINE YOUR INPUT AND OUTPUT FILE PATHS HERE !!! --->>>
# <<<---------------------------------------------------------->>>

# Option 1: Specify both input and output paths directly
input_filepath = "D:\\book-CO2_enhanced_gas\\document\\2021-adts.202100127.pdf-e77c6731-95f5-40e9-a7f5-fba1ff52e33a\\full.md"  # <--- REQUIRED: Set path to your input Markdown file
output_filepath = "D:\\book-CO2_enhanced_gas\\document\\2021-adts.202100127.pdf-e77c6731-95f5-40e9-a7f5-fba1ff52e33a\\full_translated_zh.md" # <--- OPTIONAL: Set desired output path

# Option 2: Specify only input, let output be derived automatically
# input_filepath = "another_doc.md" # <--- REQUIRED
# output_filepath = "" # <--- Leave empty or comment out to auto-derive

# --- Auto-generate output path if not specified ---
if not output_filepath and input_filepath:
    # Derive "<name>_zh_translated<ext>" next to the input file.
    stem, suffix = os.path.splitext(input_filepath)
    output_filepath = f"{stem}_zh_translated{suffix}"
    print(f"Output filepath not specified, auto-generated: {output_filepath}")
elif not output_filepath:
    # Neither path is set: warn and fall back to a fixed file name.
    print("⚠️ Warning: Input filepath is not set.", file=sys.stderr)
    output_filepath = "default_output_translated.md" # Default fallback

# --- Display configured paths ---
print(
    "-" * 50,
    f"Input File:  '{input_filepath}'",
    f"Output File: '{output_filepath}'",
    "-" * 50,
    sep="\n",
)

--------------------------------------------------
Input File:  'D:\book-CO2_enhanced_gas\document\2021-adts.202100127.pdf-e77c6731-95f5-40e9-a7f5-fba1ff52e33a\full.md'
Output File: 'D:\book-CO2_enhanced_gas\document\2021-adts.202100127.pdf-e77c6731-95f5-40e9-a7f5-fba1ff52e33a\full_translated_zh.md'
--------------------------------------------------


In [29]:
# --- Cell 5: Execute Translation ---
# Runs three independent pre-flight checks (client ready, input file exists,
# input and output paths differ), reports every failing check, and only then
# calls process_markdown_file and prints the estimated token usage.

print("Starting translation process...")
print(f"Using Model: {XAI_MODEL}, Chunk Size: {CHUNK_SIZE} paragraphs")
print("-" * 50)

# --- Pre-run Checks ---
# Check 1: the API client must have been created in Cell 2.
ready_to_run = bool(client)
if not ready_to_run:
    print("❌ Error: API Client is not initialized. Cannot run.", file=sys.stderr)
    display(Markdown("<font color='red'>**Execution Halted:** API Client not ready. Check Cell 2 output and `XAI_API_KEY` environment variable.</font>"))

# Check 2: an input path must be set and must exist on disk.
if not (input_filepath and os.path.exists(input_filepath)):
    print(f"❌ Error: Input file not found or not specified: '{input_filepath}'", file=sys.stderr)
    display(Markdown(f"<font color='red'>**Execution Halted:** Input file path is invalid. Check Cell 4.</font>"))
    ready_to_run = False

# Check 3: writing the translation must not overwrite the source file.
if input_filepath and output_filepath:
    if os.path.abspath(input_filepath) == os.path.abspath(output_filepath):
        print("❌ Error: Input and output file paths cannot be the same.", file=sys.stderr)
        display(Markdown(f"<font color='red'>**Execution Halted:** Input and Output paths point to the same file. Specify a different output path in Cell 4.</font>"))
        ready_to_run = False

# --- Run Translation if Checks Pass ---
total_prompt_tokens = total_completion_tokens = 0

if not ready_to_run:
    print("\n--- Execution Halted due to errors ---")
else:
    print("Checks passed. Executing translation...")
    start_time = time.time()

    # process_markdown_file returns the (prompt, completion) token estimates for the whole file.
    total_prompt_tokens, total_completion_tokens = process_markdown_file(input_filepath, output_filepath)

    end_time = time.time()
    print("-" * 50) # Separator after process function finishes

    # --- Display Token Usage Summary ---
    print("\n--- Token Usage Estimation Summary ---")
    print(f"Estimated Prompt Tokens:     {total_prompt_tokens}")
    print(f"Estimated Completion Tokens: {total_completion_tokens}")
    print(f"Estimated Total Tokens Used: {total_prompt_tokens + total_completion_tokens}")
    if tokenizer:
        print("ℹ️ Note: Token counts are estimates using the 'cl100k_base' tokenizer via tiktoken.")
        print("   Actual token usage by Grok may differ.")
    else:
        print("⚠️ Token counting was skipped due to tokenizer initialization failure.")

    print(f"\nTotal execution time: {end_time - start_time:.2f} seconds")

print("\n--- Cell Execution Finished ---")

Starting translation process...
Using Model: grok-3-mini, Chunk Size: 3 paragraphs
--------------------------------------------------
Checks passed. Executing translation...
Reading input file: D:\book-CO2_enhanced_gas\document\2021-adts.202100127.pdf-e77c6731-95f5-40e9-a7f5-fba1ff52e33a\full.md
Starting translation of 128 paragraphs in 43 chunks...


Translating:   0%|          | 0/128 [00:00<?, ?paragraph/s]

   ➡️ Calling Grok API (Model: grok-3-mini, Est. Prompt Tokens: 1042)...
   ✅ API call successful (Est. Completion Tokens: 941).
   ➡️ Calling Grok API (Model: grok-3-mini, Est. Prompt Tokens: 792)...
   ✅ API call successful (Est. Completion Tokens: 584).
   ➡️ Calling Grok API (Model: grok-3-mini, Est. Prompt Tokens: 1086)...
   ✅ API call successful (Est. Completion Tokens: 995).
   ➡️ Calling Grok API (Model: grok-3-mini, Est. Prompt Tokens: 868)...
   ✅ API call successful (Est. Completion Tokens: 742).
   ➡️ Calling Grok API (Model: grok-3-mini, Est. Prompt Tokens: 658)...
   ✅ API call successful (Est. Completion Tokens: 422).
   ➡️ Calling Grok API (Model: grok-3-mini, Est. Prompt Tokens: 874)...
   ✅ API call successful (Est. Completion Tokens: 388).
   ➡️ Calling Grok API (Model: grok-3-mini, Est. Prompt Tokens: 802)...
   ✅ API call successful (Est. Completion Tokens: 540).
   ➡️ Calling Grok API (Model: grok-3-mini, Est. Prompt Tokens: 514)...
   ✅ API call successful (Est.

**Success!** Translated file saved to: `D:\book-CO2_enhanced_gas\document\2021-adts.202100127.pdf-e77c6731-95f5-40e9-a7f5-fba1ff52e33a\full_translated_zh.md`

--------------------------------------------------

--- Token Usage Estimation Summary ---
Estimated Prompt Tokens:     33615
Estimated Completion Tokens: 22634
Estimated Total Tokens Used: 56249
ℹ️ Note: Token counts are estimates using the 'cl100k_base' tokenizer via tiktoken.
   Actual token usage by Grok may differ.

Total execution time: 618.61 seconds

--- Cell Execution Finished ---
