# Deep Past Challenge — Akkadian → English

Auto-generated submission notebook. Do not edit manually.
Regenerate with: `uv run python scripts/build_notebook.py`

In [None]:
import os
# Diagnostic cell: print every entry under /kaggle/input together with a
# preview (first five names) of what each one contains.
print("Available inputs:")
for entry in sorted(os.listdir("/kaggle/input")):
    print(f"  /kaggle/input/{entry}/")
    try:
        contents = os.listdir(f"/kaggle/input/{entry}")
        total = len(contents)
        for name in contents[:5]:
            print(f"    {name}")
        # Only mention the total when the preview was cut short.
        if total > 5:
            print(f"    ... ({total} total)")
    except Exception as e:
        # Best-effort listing: report the problem and move on to the next entry.
        print(f"    ERROR: {e}")

## Preprocessing (from src/preprocess.py)

In [None]:

import re
import unicodedata



# ── Unicode normalization maps ──────────────────────────────────────────────

# Ḫ / ḫ  →  H / h
_SPECIAL_CHARS = str.maketrans({"Ḫ": "H", "ḫ": "h"})

# Unicode subscript / superscript digits → ASCII
_SUB_DIGITS = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")
_SUP_DIGITS = str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹", "0123456789")

# Half-brackets (damaged text markers)
_HALF_BRACKETS = str.maketrans({"˹": "", "˺": ""})


def clean_transliteration(text: str) -> str:
    """Clean a single transliteration string per competition instructions.

    The steps run in a fixed order because later steps depend on earlier
    ones (e.g. editorial ``<...>`` brackets are stripped *before* the
    ``<gap>`` / ``<big_gap>`` tokens are introduced, so those tokens survive).

    Args:
        text: Raw transliteration. Non-string or blank input is tolerated.

    Returns:
        The cleaned transliteration with single spaces between tokens, or
        ``""`` when ``text`` is not a string or contains only whitespace.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    s = text

    # 1. Normalize Unicode (NFC) for consistent handling
    s = unicodedata.normalize("NFC", s)

    # 2. Ḫ / ḫ  →  H / h
    s = s.translate(_SPECIAL_CHARS)

    # 3. Remove half-brackets ˹ ˺ (damaged but readable signs)
    s = s.translate(_HALF_BRACKETS)

    # 4. Handle double angle brackets << >> — remove entirely
    s = re.sub(r"<<.*?>>", "", s)

    # 5. Handle single angle brackets < > — keep text, remove brackets
    # NOTE(review): input that already contains literal <gap>/<big_gap>
    # tokens would lose its brackets here — confirm the raw data never does.
    s = re.sub(r"<(.*?)>", r"\1", s)

    # 6. Handle square brackets
    #    [...] or [... ...]  →  <big_gap>
    #    The second dot group is optional so a lone "[...]" is recognised;
    #    previously it required at least five dots and "[...]" degraded to
    #    a bare "..." via the generic bracket rule below.
    s = re.sub(r"\[\.\.\.(?:\s*\.\.\.?)?\]", "<big_gap>", s)
    #    [x] or [x x] etc  →  <gap>
    s = re.sub(r"\[x(?:\s+x)*\]", "<gap>", s)
    #    [text]  →  text  (keep content, remove brackets)
    s = re.sub(r"\[(.*?)\]", r"\1", s)

    # 7. Strip scribal notations: ! ? (certainty markers)
    s = re.sub(r"[!?]", "", s)

    # 8. Normalize line dividers: / and : used as line breaks → space
    s = re.sub(r"\s*/\s*", " ", s)
    s = re.sub(r"\s*:\s*", " ", s)

    # 9. Subscript / superscript digits → ASCII
    s = s.translate(_SUB_DIGITS)
    s = s.translate(_SUP_DIGITS)

    # 10. Strip a leading line number such as "1. ", "1'. " or "r. 1. ".
    #     - ^\s* : whitespace has not been trimmed yet, so allow it here.
    #     - (?!\d): a digit right after the period means a decimal quantity
    #       ("0.5 ma-na"), not a line number — leave it untouched.
    s = re.sub(
        r"^\s*(?:(?:o|r|rev|obv|lo\.?e\.?|u\.?e\.?)\.?\s+)?(?:\d+['′]?\.(?!\d)\s*)",
        "",
        s,
    )

    # 11. Collapse whitespace
    s = re.sub(r"\s+", " ", s).strip()

    return s


def clean_translation(text: str) -> str:
    """Normalize whitespace in an English translation.

    Runs of whitespace become a single space and the ends are trimmed.
    Anything that is not a string, or is blank, yields ``""``.
    """
    is_blank = not isinstance(text, str) or not text.strip()
    return "" if is_blank else re.sub(r"\s+", " ", text).strip()

## Post-processing (from src/postprocess.py)

In [None]:




def postprocess(text: str) -> str:
    """Turn one raw model output into a submission-ready string.

    Removes ``<|...|>`` template markers and ``<think>...</think>`` spans,
    passes the result through ``remove_repetitions``, and normalizes
    whitespace. Returns the placeholder ``"..."`` when the input is not a
    string or nothing is left after cleaning.
    """
    placeholder = "..."
    if not isinstance(text, str):
        return placeholder

    # (pattern, flags) pairs applied in order; only the think-span pattern
    # is allowed to run across newlines.
    artifact_patterns = (
        (r"<\|.*?\|>", 0),
        (r"<think>.*?</think>", re.DOTALL),
    )
    cleaned = text
    for pattern, flags in artifact_patterns:
        cleaned = re.sub(pattern, "", cleaned, flags=flags)

    # Repeated phrases are a common LLM failure mode.
    cleaned = remove_repetitions(cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    # Never hand back an empty translation.
    return cleaned or placeholder


def remove_repetitions(text: str) -> str:
    """Collapse runs of a consecutively repeated phrase to one occurrence.

    A phrase is a sequence of 3 to 10 whitespace-separated words; longer
    phrases are tried first. Whenever the phrase starting at the current
    word is immediately followed by an identical copy, the current copy is
    dropped and scanning resumes at the following copy. Because that
    following copy is examined again, any number of back-to-back repeats
    (A A A A ...) shrinks to a single A, not just every second one.

    Args:
        text: Text to de-duplicate; it is split on whitespace.

    Returns:
        The remaining words joined by single spaces.
    """
    words = text.split()
    total = len(words)
    result = []
    i = 0
    while i < total:
        for n in range(10, 2, -1):
            if i + 2 * n <= total and words[i : i + n] == words[i + n : i + 2 * n]:
                # Skip this copy without emitting it; the identical copy that
                # follows is kept (or collapsed further on the next pass).
                i += n
                break
        else:
            # No repeated phrase starts here: keep the word.
            result.append(words[i])
            i += 1
    return " ".join(result)

## Load Model

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "/kaggle/input/datasets/eisfresser/deep-past-model"

# Find model dir — datasets may nest files in a subdirectory.
if not os.path.isfile(os.path.join(MODEL_PATH, "config.json")):
    # os.listdir() returns entries in arbitrary order, so sort to make the
    # choice reproducible across runs.
    subdirs = sorted(
        os.path.join(MODEL_PATH, d)
        for d in os.listdir(MODEL_PATH)
        if os.path.isdir(os.path.join(MODEL_PATH, d))
    )
    # Prefer a subdirectory that actually holds a config.json; only fall back
    # to the first subdirectory when none does.
    with_config = [
        p for p in subdirs if os.path.isfile(os.path.join(p, "config.json"))
    ]
    if with_config:
        MODEL_PATH = with_config[0]
    elif subdirs:
        MODEL_PATH = subdirs[0]
print(f"Model dir: {MODEL_PATH}")
print(os.listdir(MODEL_PATH))

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Inference only: switch the model to evaluation mode.
model.eval()
print(f"Model loaded from {MODEL_PATH}")

## Inference (from src/inference.py)

In [None]:





# System message sent with every translation request; one entry per sentence,
# joined with single spaces.
SYSTEM_PROMPT = " ".join(
    (
        "You are an expert translator of Old Assyrian Akkadian cuneiform texts into English.",
        (
            "Determinatives in curly brackets classify nouns: "
            "{d} = deity, {ki} = place, {m} = masculine name, {mi} = feminine name."
        ),
        "Words in ALL CAPS are Sumerian logograms.",
        "Words with a capitalized first letter are proper nouns.",
        "Translate the transliterated Akkadian into fluent English.",
    )
)


def translate_batch(
    model, tokenizer, texts: list[str], cfg: dict
) -> list[str]:
    """Translate ``texts`` in length-sorted batches using greedy decoding.

    Args:
        model: Object exposing ``generate(...)`` and ``device``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``decode`` and
            the ``padding_side`` / ``pad_token`` / ``eos_token`` attributes.
            It is modified in place: padding side is set to ``"left"`` and a
            missing pad token is set to the EOS token.
        texts: Source strings; each is sent as ``"Translate: <text>"``.
        cfg: Settings read with ``.get``: ``inference_batch_size``
            (default 8), ``max_seq_length`` (default 2048) and
            ``max_new_tokens`` (default 256).

    Returns:
        One decoded string per input, in the same order as ``texts``.
    """
    # Left padding lines every prompt up against the same end position, so
    # the generated part can be sliced off at one fixed offset per batch.
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    batch_size = cfg.get("inference_batch_size", 8)
    max_prompt_tokens = cfg.get("max_seq_length", 2048)
    max_new_tokens = cfg.get("max_new_tokens", 256)

    # Visit inputs shortest-first (stable sort) so batches hold similar lengths.
    order = sorted(range(len(texts)), key=lambda pos: len(texts[pos]))
    total = len(order)
    by_position = {}

    for start in range(0, total, batch_size):
        positions = order[start : start + batch_size]

        # enable_thinking=False disables Qwen3 thinking mode.
        prompts = [
            tokenizer.apply_chat_template(
                [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": f"Translate: {texts[pos]}"},
                ],
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=False,
            )
            for pos in positions
        ]

        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_prompt_tokens,
        ).to(model.device)
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                temperature=None,
                top_p=None,
                top_k=None,
            )

        # Keep only the tokens after the (padded) prompt for each row.
        for row, pos in enumerate(positions):
            by_position[pos] = tokenizer.decode(
                outputs[row][prompt_len:],
                skip_special_tokens=True,
            )

        done = min(start + batch_size, total)
        print(f"  Translated {done}/{total} examples", end="\r")

    print()

    # Hand results back in the caller's original order.
    return [by_position[pos] for pos in range(len(texts))]

## Load & Preprocess Test Data

In [None]:
import pandas as pd

# Read the competition test set and show a quick preview.
test_df = pd.read_csv(
    "/kaggle/input/competitions/deep-past-initiative-machine-translation/test.csv"
)
print(f"Test set: {len(test_df)} rows")
print(test_df.head())

# Case-insensitive lookup: lower-cased column name -> actual column name.
cols = dict((name.lower(), name) for name in test_df.columns)

# The source text lives in "transliteration", or failing that in "source".
if "transliteration" in cols:
    trans_col = cols["transliteration"]
else:
    trans_col = cols.get("source", "")
if not trans_col:
    raise ValueError(f"No transliteration column found: {list(test_df.columns)}")

test_df["transliteration_clean"] = test_df[trans_col].apply(clean_transliteration)
print(f"Cleaned {len(test_df)} transliterations")

## Generate Translations

In [None]:
# Settings read by translate_batch.
cfg = dict(
    inference_batch_size=8,
    max_new_tokens=256,
    max_seq_length=2048,
)

# Translate every cleaned transliteration, keeping row order.
texts = test_df["transliteration_clean"].tolist()
predictions = translate_batch(model, tokenizer, texts, cfg)
print(f"Generated {len(predictions)} translations")

## Post-process

In [None]:
# Clean each raw generation, then preview up to five results
# (each preview is cut to its first 100 characters).
predictions_clean = list(map(postprocess, predictions))
print("Sample predictions:")
for i, cleaned in enumerate(predictions_clean[:5]):
    print(f"  {i}: {cleaned[:100]}...")

## Write Submission

In [None]:
# Locate the identifier column ("id", else "text_id"); when neither exists,
# create an "id" column holding the row position.
if "id" in cols:
    id_col = cols["id"]
else:
    id_col = cols.get("text_id", "")
if not id_col:
    id_col = "id"
    test_df[id_col] = range(len(test_df))

# Two-column submission: identifier plus cleaned translation.
submission = pd.DataFrame(
    {"id": test_df[id_col], "translation": predictions_clean}
)

submission.to_csv("submission.csv", index=False)
print(f"Submission saved: {len(submission)} rows")
print(submission.head())