<a href="https://colab.research.google.com/github/Dhanya-Zac/Multilingual-LLM-hallucination-test/blob/main/translation_using_gpt_turkish.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install dependencies
!pip install openai pandas tqdm



In [None]:
# Install required libraries (if not available)
!pip install pandas tqdm --quiet
!pip install openai==0.28
import openai
import pandas as pd
import os
from tqdm.notebook import tqdm
import time
import re

# --- USER CONFIGURATION ---
# The key is read from the environment so it is never saved inside the
# notebook. Set OPENAI_API_KEY before running this cell (for example with
# os.environ["OPENAI_API_KEY"] = "..." in a scratch cell that is not committed).
API_KEY = os.environ.get("OPENAI_API_KEY", "")
MODEL = "gpt-4o-mini"              # chat model used for every translation request
CHUNK_SIZE = 1000                  # number of rows written to each output CSV
INPUT_CSV = "triviaqa.csv"         # English source data (needs "prompt" and "answer" columns)
OUTPUT_DIR = "translated_chunks"   # folder that receives one CSV per chunk

if not API_KEY:
    print("Warning: OPENAI_API_KEY is not set; translation requests will fail to authenticate.")

openai.api_key = API_KEY

In [3]:


# Create the folder for the per-chunk CSVs. exist_ok=True keeps the cell safe
# to re-run and removes the separate check-then-create step.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the whole English dataset once; the processing loop slices it by row.
df = pd.read_csv(INPUT_CSV)
total_samples = len(df)

print(f"Loaded {total_samples} samples.")

# --- UTILITY: Detect if answer is a number (including decimals, negatives, etc.) ---
def is_number(s):
    """Return True when ``s`` is a plain numeric value.

    Used to decide whether an answer can be copied through unchanged instead
    of being sent for translation.

    Args:
        s: A single cell value (string, int, float, None, NaN, ...).

    Returns:
        bool: False for missing values (None/NaN); True for int/float
        objects; otherwise True only if the stripped string form is an
        optionally signed integer or decimal with an optional exponent.
    """
    # Missing values are checked first: NaN is a float and would otherwise
    # pass the isinstance test below.
    if pd.isna(s):
        return False
    if isinstance(s, (int, float)):
        return True
    s = str(s).strip()
    # Integer / decimal / signed / scientific notation. The exponent marker is
    # accepted in either case so that "1E5" is recognised like "1e5".
    return bool(re.fullmatch(r"[-+]?\d*\.?\d+([eE][-+]?\d+)?", s))


def _translate_text(text, kind, tag, max_attempts=5):
    """Translate one English string to Turkish, retrying on API errors.

    Args:
        text: English text to translate. Missing (None/NaN) or blank values
            are not sent to the API.
        kind: Word inserted into the instruction ("prompt" or "answer").
        tag: Short label shown in error messages ("Q" or "A").
        max_attempts: Number of API attempts before giving up.

    Returns:
        str: The stripped model reply, or "" when the text was missing/blank
        or every attempt raised an exception.
    """
    # A missing cell would otherwise be formatted into the request as the
    # literal text "nan" and translated as if it were real content.
    if pd.isna(text) or not str(text).strip():
        return ""

    user_msg = f"Translate the following English {kind} to Turkish:\n\n{text}\n\nTurkish:"
    for attempt in range(max_attempts):
        try:
            resp = openai.ChatCompletion.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": "You are a helpful translation assistant."},
                    {"role": "user", "content": user_msg}
                ],
                temperature=0.3,
                max_tokens=300,
            )
            return resp['choices'][0]['message']['content'].strip()
        except Exception as e:
            print(f"Error ({tag}): {e}. Retrying ({attempt+1}/{max_attempts})...")
            # Back-off grows by 4 s per attempt (2 s, 6 s, 10 s, ...). There is
            # nothing left to wait for after the final attempt.
            if attempt < max_attempts - 1:
                time.sleep(2 + attempt*4)
    # Every attempt failed: an empty string marks the row as untranslated.
    return ""


def translate_batch(prompts, answers):
    """Translate paired English prompts and answers to Turkish.

    Args:
        prompts: Iterable of English prompt strings.
        answers: Iterable of English answers, paired with ``prompts`` by
            position (extra items in the longer iterable are ignored by zip).

    Returns:
        tuple[list, list]: ``(turkish_prompts, turkish_answers)``, one entry
        per input pair. Numeric answers are copied through unchanged; a
        missing/blank input or a translation that failed on every attempt
        yields "".
    """
    turkish_prompts = []
    turkish_answers = []
    # list(...) lets tqdm know the total so the progress bar shows a percentage.
    for q, a in tqdm(list(zip(prompts, answers)), desc="Translating", leave=False):
        turkish_prompts.append(_translate_text(q, "prompt", "Q"))

        # Answer: translate only if NOT a number
        if is_number(a):
            turkish_answers.append(a)
        else:
            turkish_answers.append(_translate_text(a, "answer", "A"))
    return turkish_prompts, turkish_answers





Loaded 64328 samples.


In [None]:

# Index of the first chunk to process. Chunks before START_CHUNK are never
# visited; from there on, a chunk whose output file already exists is skipped.
START_CHUNK = 51  # resume from chunk 51
for i in range(START_CHUNK * CHUNK_SIZE, total_samples, CHUNK_SIZE):
    chunk_idx = i // CHUNK_SIZE
    chunk_df = df.iloc[i:i+CHUNK_SIZE]
    out_file = f"{OUTPUT_DIR}/translations_chunk_{chunk_idx:03d}.csv"

    if os.path.exists(out_file):
        print(f"Chunk {chunk_idx} already processed. Skipping.")
        continue

    print(f"Processing chunk {chunk_idx} ({i} - {min(i+CHUNK_SIZE, total_samples)})")
    t_prompts, t_answers = translate_batch(chunk_df["prompt"], chunk_df["answer"])
    out_chunk = pd.DataFrame({
        "english_prompt": chunk_df["prompt"].tolist(),
        "english_answer": chunk_df["answer"].tolist(),
        "turkish_prompt": t_prompts,
        "turkish_answer": t_answers
    })
    # Write under a temporary name and rename afterwards, so an interrupted
    # run never leaves a truncated CSV that the exists-check above would
    # treat as a finished chunk.
    tmp_file = out_file + ".tmp"
    out_chunk.to_csv(tmp_file, index=False)
    os.replace(tmp_file, out_file)
    print(f"Saved: {out_file}")

print("All done!")