# In [None]:
# Jupyter Notebook: Turkish-to-English Batch Translator
# %%
# Cell 1: Imports and Setup
import os
import logging
import re
from itertools import islice

import pandas as pd
from tqdm.notebook import tqdm
from openai import OpenAI

# Module-level logger; records at INFO and above are emitted as
# "<LEVEL>: <message>" so they show up in the notebook output.
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.INFO,
)

# %%
# Cell 2: API Key Configuration
# Option 1 (preferred): set the OPENAI_API_KEY environment variable before
# starting the notebook. The key is read from the environment here so that it
# never has to be written into the notebook itself.
API_KEY = os.environ.get("OPENAI_API_KEY", "").strip()

# Option 2: Uncomment to enter manually (not recommended for production)
# from getpass import getpass
# API_KEY = getpass("Enter your OpenAI API key: ")

# Fail fast with a clear message instead of letting the first API call fail.
if not API_KEY:
    raise EnvironmentError(
        "Please set the OPENAI_API_KEY environment variable or enter it manually."
    )

client = OpenAI(api_key=API_KEY)

# %%
# Cell 3: Helper Functions

def chunked(iterable, size):
    """Split *iterable* into consecutive lists of at most *size* items.

    The input is consumed lazily; the final list may be shorter than *size*.
    Nothing is yielded for an empty input.
    """
    source = iter(iterable)
    while True:
        group = list(islice(source, size))
        if not group:
            # Source exhausted (or size is 0): stop the generator.
            return
        yield group


def build_prompt(batch: list[str]) -> str:
    """Build the translation prompt for one batch of Turkish sentences.

    Args:
        batch: Turkish sentences to translate, in the order they should be
            returned.

    Returns:
        The translation instructions followed by a blank line and the
        sentences, numbered from 1, one per line.
    """
    # Collapse every whitespace run (including line breaks that come from
    # multi-line spreadsheet cells) to a single space. Each item must occupy
    # exactly one line, otherwise the "numbered, one per line" contract that
    # the response parser relies on is already broken in the request.
    numbered = "\n".join(
        f"{number}. {' '.join(sentence.split())}"
        for number, sentence in enumerate(batch, start=1)
    )
    return (
        "You are a professional translator fluent in Turkish and English. "
        "Translate the following Turkish sentences into fluent, natural English. "
        "Preserve the original meaning, tone, and context. "
        "Return only the translations, numbered, one per line, in the same order.\n\n"
        f"{numbered}"
    )


def parse_translations(response_text: str, expected_count: int) -> list[str]:
    """Extract numbered translations from the model output.

    Each line of the form ``<n>. <text>`` is stored at position ``n - 1`` of
    the result, so a skipped or reordered number cannot shift the remaining
    translations onto the wrong source rows.

    Args:
        response_text: Raw text returned by the model; ``None`` or an empty
            string is treated as "no translations".
        expected_count: Number of sentences that were sent in the batch.

    Returns:
        A list of exactly ``expected_count`` strings. Positions for which no
        numbered line was found are left as empty strings.
    """
    translations = [""] * expected_count
    seen: set[int] = set()

    # Work line by line with an anchored pattern:
    #  * the number must start the line, so "... in 1990. He ..." inside an
    #    unnumbered continuation line is not mistaken for item 1990;
    #  * the separator is spaces/tabs only, so an empty item ("3. ") cannot
    #    swallow the following line as its text.
    for line in (response_text or "").splitlines():
        match = re.match(r"\s*(\d+)\.(?:[ \t]+(.*))?$", line)
        if match is None:
            continue
        position = int(match.group(1)) - 1
        # Ignore numbers outside the batch and repeated numbers (first wins).
        if not 0 <= position < expected_count or position in seen:
            continue
        seen.add(position)
        translations[position] = (match.group(2) or "").strip()

    if len(seen) != expected_count:
        logger.warning(
            "Expected %d translations but got %d; missing entries are left blank.",
            expected_count, len(seen)
        )
    return translations


def translate_batch(batch: list[str]) -> list[str]:
    """Translate a batch of Turkish sentences to English using OpenAI.

    This is a best-effort call: any failure while requesting or parsing the
    translation is logged and the batch is returned as blanks, so a long run
    is not aborted by a single bad request.

    Args:
        batch: Turkish sentences to translate in one API call.

    Returns:
        A list with exactly ``len(batch)`` strings; entries are empty when
        the translation is missing or the request failed.
    """
    # Nothing to translate: do not spend an API call on an empty prompt.
    if not batch:
        return []

    prompt = build_prompt(batch)
    logger.debug("Prompt:\n%s", prompt)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0
        )
        # The message content may be None; treat that as an empty reply.
        raw_output = response.choices[0].message.content or ""
        return parse_translations(raw_output, len(batch))

    except Exception:
        # Deliberately broad: this is the per-batch failure boundary.
        # logger.exception records the traceback so the cause is not lost.
        logger.exception(
            "Translation failed for a batch of %d sentences; returning blanks.",
            len(batch)
        )
        return [""] * len(batch)

# %%
# Cell 4: Translation Workflow Function (for single-file use)

def translate_file(
    input_file: str,
    output_file: str,
    column_name: str = "turkish",
    row_limit: int = None,
    batch_size: int = 50
) -> None:
    """Read an Excel file, translate one column, and save the result.

    The translations are written to a new ``definition_in_english`` column.
    Rows whose source cell is empty are not sent for translation and receive
    an empty string, so every translation stays on the row it belongs to.

    Args:
        input_file: Path of the Excel file to read.
        output_file: Path of the Excel file to write.
        column_name: Name of the column holding the Turkish text.
        row_limit: If set (and non-zero), only the first ``row_limit`` rows
            are processed and written.
        batch_size: Number of sentences sent per API call; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    logger.info(
        "Reading %s (column=%s)%s",
        input_file,
        column_name,
        f", up to {row_limit} rows" if row_limit else ""
    )
    df = pd.read_excel(input_file)
    if row_limit:
        # Copy so that adding a column below does not write into a slice.
        df = df.head(row_limit).copy()

    # Remember which rows actually have text; only those are translated.
    has_text = df[column_name].notna()
    texts = df.loc[has_text, column_name].astype(str).tolist()
    all_translations: list[str] = []

    # chunked() is a generator, so give tqdm the batch count explicitly.
    total_batches = (len(texts) + batch_size - 1) // batch_size
    for batch in tqdm(chunked(texts, batch_size),
                      total=total_batches, desc="Translating"):
        all_translations.extend(translate_batch(batch))

    # Re-expand to one value per row: consume the next translation for rows
    # that had text, and use a blank for rows that were skipped.
    remaining = iter(all_translations)
    df["definition_in_english"] = [
        next(remaining, "") if row_has_text else ""
        for row_has_text in has_text
    ]
    df.to_excel(output_file, index=False)
    logger.info("Done! Translations saved to %s", output_file)

# %%
# Cell 5: Batch‐splitting and translation driver (for 68k rows → 2k‐row files)

# Parameters
input_file  = "turkish.xlsx"
column_name = "definition_in_turkish"
batch_size  = 50    # sentences per API call
chunk_size  = 2000  # rows per output file

# Read the entire sheet once
logger.info("Reading full file %s...", input_file)
df_full = pd.read_excel(input_file)

total_rows = len(df_full)
n_chunks   = (total_rows + chunk_size - 1) // chunk_size  # ceiling division
logger.info(
    "Total rows: %d, will write %d files of up to %d rows each",
    total_rows, n_chunks, chunk_size
)

for i in range(n_chunks):
    start = i * chunk_size
    end   = min(start + chunk_size, total_rows)
    # Copy so that adding the translation column does not touch df_full.
    df_chunk = df_full.iloc[start:end].copy()

    # Only rows with text are sent for translation; remember which ones so
    # the results can be put back on the correct rows afterwards.
    has_text = df_chunk[column_name].notna()
    texts = df_chunk.loc[has_text, column_name].astype(str).tolist()
    all_translations: list[str] = []

    # chunked() is a generator, so give tqdm the batch count explicitly.
    total_batches = (len(texts) + batch_size - 1) // batch_size
    for batch in tqdm(chunked(texts, batch_size),
                      total=total_batches,
                      desc=f"Chunk {i+1}/{n_chunks} Translating"):
        all_translations.extend(translate_batch(batch))

    # Re-expand to one value per row: rows without source text get a blank
    # instead of shifting (or length-mismatching) the translated column.
    remaining = iter(all_translations)
    df_chunk["definition_in_english"] = [
        next(remaining, "") if row_has_text else ""
        for row_has_text in has_text
    ]
    out_fname = f"turkish_translated_{start+1}_to_{end}.xlsx"
    df_chunk.to_excel(out_fname, index=False)
    logger.info("Saved rows %d–%d to %s", start+1, end, out_fname)
