In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input mount and echo the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for basename in files:
        full_path = os.path.join(folder, basename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/train-gap-all-2/__huggingface_repos__.json
/kaggle/input/train-gap-all-2/byt5-base-akkadian_gap_setence2/config.json
/kaggle/input/train-gap-all-2/byt5-base-akkadian_gap_setence2/training_args.bin
/kaggle/input/train-gap-all-2/byt5-base-akkadian_gap_setence2/tokenizer_config.json
/kaggle/input/train-gap-all-2/byt5-base-akkadian_gap_setence2/model.safetensors
/kaggle/input/train-gap-all-2/byt5-base-akkadian_gap_setence2/special_tokens_map.json
/kaggle/input/train-gap-all-2/byt5-base-akkadian_gap_setence2/added_tokens.json
/kaggle/input/train-gap-all-2/byt5-base-akkadian_gap_setence2/generation_config.json
/kaggle/input/akkadian-byt5/pytorch/default/1/__huggingface_repos__.json
/kaggle/input/akkadian-byt5/pytorch/default/1/akkadian_byt5_v1/config.json
/kaggle/input/akkadian-byt5/pytorch/default/1/akkadian_byt5_v1/training_args.bin
/kaggle/input/akkadian-byt5/pytorch/default/1/akkadian_byt5_v1/tokenizer_config.json
/kaggle/input/akkadian-byt5/pytorch/default/1/akkadian_byt5_

In [2]:
import os
import gc
import math
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
from collections import Counter
import warnings
# Silence library warnings so the cell output stays readable.
warnings.filterwarnings('ignore')

# Banner
print("üè∫ Akkadian Translation - 4 Model Ensemble")
print("=" * 50)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")
if device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

üè∫ Akkadian Translation - 4 Model Ensemble
Device: cuda
GPU: Tesla T4


In [3]:
# Read the competition's train/test splits and preview the first training pair.
train_df = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")
test_df = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/test.csv")

print("üìä Dataset Statistics:")
print(f"   Training samples: {len(train_df):,}")
print(f"   Test samples: {len(test_df):,}")

# Row 0 of the training frame supplies both halves of the preview.
first_pair = train_df.iloc[0]
print("\nüìù Sample Akkadian Text:")
print(f"   {first_pair['transliteration']}...")
print("\nüìù English Translation:")
print(f"   {first_pair['translation']}...")

üìä Dataset Statistics:
   Training samples: 1,561
   Test samples: 4

üìù Sample Akkadian Text:
   KI≈†IB ma-nu-ba-l√∫m-a-≈°ur DUMU ·π£√≠-l√°-(d)IM KI≈†IB ≈°u-(d)EN.L√çL DUMU ma-nu-ki-a-≈°ur KI≈†IB MAN-a-≈°ur DUMU a-ta-a 0.33333 ma-na 2 G√çN K√ô.BABBAR SIG‚ÇÖ i-·π£√©-er PUZUR‚ÇÑ-a-≈°ur DUMU a-ta-a a-l√°-·∏´u-um i-≈°u i≈°-t√π ·∏´a-mu≈°-tim ≈°a √¨-l√≠-dan ITU.KAM ≈°a ke-na-tim li-mu-um e-na-s√∫-in a-na ITU 14 ·∏´a-am-≈°a-tim i-≈°a-qal ≈°u-ma l√° i≈°-q√∫-ul 1.5 G√çN.TA a-na 1 ma-na-im i-na ITU.1.KAM ·π£√≠-ib-t√°m √∫-·π£a-√°b...

üìù English Translation:
   Seal of Mannum-balum-A≈°≈°ur son of ·π¢illi-Adad, seal of ≈†u-Illil son of Mannum-kƒ´-A≈°≈°ur, seal of Puzur-A≈°≈°ur son of Ataya. Puzur-A≈°≈°ur son of Ataya owes 22 shekels of good silver to Ali-ahum. Reckoned from the week of Ilƒ´-dan, month of ≈†a-kƒìnƒÅtim, in the eponymy of Enna-Suen, he will pay in 14 weeks. If he has not paid in time, he will add interest at the rate 1.5 shekel per mina per month....


In [4]:
import re

def preprocess_akkadian(text):
    """Normalise a raw transliteration value before it is handed to a model.

    Anything ``pd.isna`` reports as missing becomes ``""``. Otherwise the
    value is stringified, runs of three or more dots become ``<big_gap>``,
    runs of two or more ``x`` become ``<gap>``, and every whitespace run is
    collapsed to a single space with both ends trimmed.
    """
    if pd.isna(text):
        return ""

    # Applied in order; whitespace cleanup runs last so it also tidies
    # whatever spacing surrounded the gap markers.
    substitutions = (
        (r'\.{3,}', '<big_gap>'),
        (r'x{2,}', '<gap>'),
        (r'\s+', ' '),
    )

    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Smoke-test the normaliser on a short line that contains a "..." gap marker.
sample = "um-ma k√†-ru-um... a-na a-bi-ia qi2-bi2-ma"
for label, shown in (("Original", sample), ("Processed", preprocess_akkadian(sample))):
    print(f"{label}: {shown}")

Original: um-ma k√†-ru-um... a-na a-bi-ia qi2-bi2-ma
Processed: um-ma k√†-ru-um<big_gap> a-na a-bi-ia qi2-bi2-ma


In [5]:
# Ensemble members as (name, path, weight, description). The weights are the
# per-model values the original author labelled "validation-based".
_MODEL_SPECS = [
    ("byt5-rag-akkadian-v1",
     "/kaggle/input/byt5-rag-akkadian-v1",
     50.93,
     "RAG-enhanced ByT5"),
    ("train-gap-all-2",
     "/kaggle/input/train-gap-all-2/byt5-base-akkadian_gap_setence2",
     50.40,
     "Gap-aware training"),
    ("byt5-akkadian-model",
     "/kaggle/input/akkadian-byt5/pytorch/default/1/akkadian_byt5_v1",
     48.70,
     "Standard fine-tuned"),
    ("byt5-base-big-data2",
     "/kaggle/input/byt5-base-big-data2",
     42.85,
     "Extended data"),
]

MODELS = {
    model_name: {"path": model_path, "weight": model_weight, "description": model_desc}
    for model_name, model_path, model_weight, model_desc in _MODEL_SPECS
}

# Keep only the models whose path exists on disk, reporting each one as we go.
print("üîç Checking model availability:")
available_models = {}
for model_name, spec in MODELS.items():
    if os.path.exists(spec["path"]):
        available_models[model_name] = spec
        marker = "‚úÖ"
    else:
        marker = "‚ùå"
    print(f"   {marker} {model_name} ({spec['description']})")

print(f"\nüìä {len(available_models)}/{len(MODELS)} models available")

üîç Checking model availability:
   ‚úÖ byt5-rag-akkadian-v1 (RAG-enhanced ByT5)
   ‚úÖ train-gap-all-2 (Gap-aware training)
   ‚úÖ byt5-akkadian-model (Standard fine-tuned)
   ‚úÖ byt5-base-big-data2 (Extended data)

üìä 4/4 models available


In [6]:
def generate_predictions(model_path, test_df, device, batch_size=1):
    """
    Generate translations from a single model.

    Args:
        model_path: Path to the fine-tuned model directory, loaded with
            ``AutoTokenizer`` and ``AutoModelForSeq2SeqLM``.
        test_df: DataFrame with a 'transliteration' column.
        device: 'cuda' or 'cpu'.
        batch_size: Number of rows tokenized and generated together. The
            default of 1 keeps the one-row-at-a-time behaviour; larger
            values pad each batch to its longest member.

    Returns:
        List of English translations, one per row of ``test_df``, in row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Build every prompt up front so the generation loop only slices a list.
    prefix = "translate Akkadian to English: "
    sources = [prefix + preprocess_akkadian(text) for text in test_df['transliteration']]

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    predictions = []
    try:
        model = model.to(device)
        model.eval()

        for start in tqdm(range(0, len(sources), batch_size), desc="Translating"):
            batch = sources[start:start + batch_size]

            # Tokenize; padding is a no-op for a batch of one.
            inputs = tokenizer(
                batch,
                max_length=512,
                truncation=True,
                padding=True,
                return_tensors="pt"
            ).to(device)

            # Generate
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    num_beams=8,
                    repetition_penalty=1.2,
                    no_repeat_ngram_size=3,
                    early_stopping=True
                )

            # Decode the whole batch, preserving row order.
            predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    finally:
        # Free memory even when generation fails, so the next model in the
        # ensemble does not inherit this one's allocations.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return predictions

In [7]:
# Run every available model in turn; predictions are keyed by model name.
all_predictions = {}
banner = '=' * 60

for model_name, spec in available_models.items():
    print(f"\n{banner}")
    print(f"ü§ñ Model: {model_name}")
    print(f"   {spec['description']}")
    print(f"   Weight: {spec['weight']}")
    print(f"{banner}")

    model_preds = generate_predictions(spec["path"], test_df, device)
    all_predictions[model_name] = model_preds

    print(f"\n‚úÖ Generated {len(model_preds)} predictions")
    print(f"   Sample: {model_preds[0]}...")

print(f"\nüéâ All {len(all_predictions)} models completed!")


ü§ñ Model: byt5-rag-akkadian-v1
   RAG-enhanced ByT5
   Weight: 50.93


2026-01-29 14:26:53.490668: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769696813.674587      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769696813.728088      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769696814.165180      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769696814.165214      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769696814.165216      24 computation_placer.cc:177] computation placer alr

Translating:   0%|          | 0/4 [00:00<?, ?it/s]


‚úÖ Generated 4 predictions
   Sample: Thus Kanesh colony, say to Aqil-.. dƒÅtum, our messengers, and Wabarratim: A tablet came from the City. As for you, we have not settled accounts with Ali-ahum."...

ü§ñ Model: train-gap-all-2
   Gap-aware training
   Weight: 50.4


Translating:   0%|          | 0/4 [00:00<?, ?it/s]


‚úÖ Generated 4 predictions
   Sample: Thus the Kanesh colony, say to our messenger A≈°≈°ur-imittƒ´, every single or twice as well....

ü§ñ Model: byt5-akkadian-model
   Standard fine-tuned
   Weight: 48.7


Translating:   0%|          | 0/4 [00:00<?, ?it/s]


‚úÖ Generated 4 predictions
   Sample: From the Kanesh colony to Aqil... datu-payers and wabarrƒÅtum: You have written me as follows: 'There is no transport tarif...

ü§ñ Model: byt5-base-big-data2
   Extended data
   Weight: 42.85


Translating:   0%|          | 0/4 [00:00<?, ?it/s]


‚úÖ Generated 4 predictions
   Sample: Thus Kanesh colony, say to the payment of our messengers, wherever pleases and locations:...

üéâ All 4 models completed!


In [8]:
import nltk
from nltk.translate.chrf_score import sentence_chrf

def ensemble_mbr(all_preds, weights):
    """
    Minimum Bayes Risk ensemble over per-model candidate translations.

    For every sample, each model's output is scored by its weighted
    similarity to every model's output for that sample. The candidate's own
    output is included in that sum, so a model's own weight acts as a prior
    in its favour. Similarity is NLTK's ``sentence_chrf``. The
    highest-scoring candidate is kept; ties go to the model that comes first
    in ``all_preds``.

    Args:
        all_preds: Dict mapping model name -> list of predictions. Every
            list must have the same length.
        weights: Dict mapping model name -> weight. Must contain every key
            of ``all_preds``; weights are normalised to sum to 1.

    Returns:
        List with one selected prediction per sample. Empty when
        ``all_preds`` is empty.

    Raises:
        ValueError: If the prediction lists differ in length, or the weights
            of the participating models do not sum to a positive number.
        KeyError: If a model in ``all_preds`` has no entry in ``weights``.
    """
    model_names = list(all_preds.keys())
    if not model_names:
        return []

    # Fail up front on ragged input instead of with an IndexError mid-loop.
    lengths = {m: len(all_preds[m]) for m in model_names}
    if len(set(lengths.values())) != 1:
        raise ValueError(f"All models must provide the same number of predictions, got {lengths}")
    n_samples = lengths[model_names[0]]

    # Normalize weights
    total_weight = sum(weights[m] for m in model_names)
    if total_weight <= 0:
        raise ValueError(f"Model weights must sum to a positive number, got {total_weight}")
    norm_weights = {m: weights[m] / total_weight for m in model_names}

    final_predictions = []

    for i in tqdm(range(n_samples), desc="MBR Ensembling"):
        # Get candidate sentences for this sample
        candidates = {m: all_preds[m][i] for m in model_names}

        # Expected gain of each candidate: similarity to each reference,
        # weighted by the reference model's normalised weight.
        candidate_scores = {}
        for target_name, target_text in candidates.items():
            candidate_scores[target_name] = sum(
                sentence_chrf([ref_text], target_text) * norm_weights[ref_name]
                for ref_name, ref_text in candidates.items()
            )

        # max() returns the first maximal key, so ties follow dict order.
        best_model = max(candidate_scores, key=candidate_scores.get)
        final_predictions.append(candidates[best_model])

    return final_predictions

# Combine the per-model outputs, weighting each model by its configured "weight".
weights = {}
for model_name, spec in available_models.items():
    weights[model_name] = spec["weight"]
final_predictions = ensemble_mbr(all_predictions, weights)

# Candidate similarity inside ensemble_mbr comes from
# nltk.translate.chrf_score.sentence_chrf.
print(f"\n‚úÖ Ensemble complete: {len(final_predictions)} predictions")

MBR Ensembling:   0%|          | 0/4 [00:00<?, ?it/s]


‚úÖ Ensemble complete: 4 predictions


In [9]:
import pandas as pd
import re

def build_robust_verifier(lexicon_path, dictionary_path):
    """Build an English-word -> set-of-forms map from the lexicon and dictionary CSVs.

    The dictionary CSV supplies 'word' and 'definition'; the lexicon CSV
    supplies 'form' and 'lexeme'. A lexicon row is linked to a dictionary
    entry when its lexeme equals the entry's word, compared lower-cased and
    stripped on both sides.

    Args:
        lexicon_path: Path to a CSV with 'form' and 'lexeme' columns.
        dictionary_path: Path to a CSV with 'word' and 'definition' columns.

    Returns:
        Dict mapping each English word longer than three characters found in
        a cleaned definition to the set of cleaned, lower-cased forms whose
        lexeme carries that definition. Empty when the lexicon lacks a
        'form' or 'lexeme' column.

    Raises:
        KeyError: If the dictionary CSV lacks a 'word' or 'definition' column.
    """
    lex_df = pd.read_csv(lexicon_path)
    dict_df = pd.read_csv(dictionary_path)

    # 1. Dictionary: word -> core English meaning.
    # Keys are lower-cased because lexemes are lower-cased before the lookup;
    # without this, any dictionary headword containing a capital never matches.
    meaning_lookup = {}
    for word, definition in zip(dict_df['word'], dict_df['definition']):
        # Skip empty cells so they are not stringified into the text 'nan'.
        if pd.isna(word) or pd.isna(definition):
            continue
        # Drop (parenthetical) grammatical notes and quote marks.
        meaning = re.sub(r'\(.*?\)', '', str(definition)).replace('"', '').strip().lower()
        meaning_lookup[str(word).strip().lower()] = meaning

    # 2. Lexicon: English meaning -> forms as they appear in transliterations.
    verification_map = {}
    if 'form' not in lex_df.columns or 'lexeme' not in lex_df.columns:
        # Nothing can be linked; keep the best-effort empty result.
        return verification_map

    for raw_form, raw_lexeme in zip(lex_df['form'], lex_df['lexeme']):
        if pd.isna(raw_form) or pd.isna(raw_lexeme):
            continue

        form = str(raw_form).lower()
        english_definition = meaning_lookup.get(str(raw_lexeme).strip().lower(), "")
        if not (english_definition and form):
            continue

        # Clean the form once per row (remove brackets, digits, '+', and marks).
        clean_form = re.sub(r'[\(\)\[\]\{\}\d+¬´¬ª]', '', form)

        # Split definitions like "gold, silver" into individually checkable words.
        for part in re.split(r'[,; ]+', english_definition):
            clean_eng = part.strip()
            if len(clean_eng) > 3:  # Only track significant words
                verification_map.setdefault(clean_eng, set()).add(clean_form)

    return verification_map

def verify_and_clean_final(transliteration, translation, v_map):
    """Replace translation words that the source transliteration does not support.

    Each whitespace-separated word of ``translation`` is stripped of
    punctuation and lower-cased, then looked up in ``v_map``. A word is kept
    when it is not a key of ``v_map``, or when at least one hyphen- or
    space-separated piece (longer than one character) of any mapped form
    occurs as a substring of the lower-cased transliteration. Otherwise the
    whole token, attached punctuation included, becomes
    ``[unverified_entity]``.

    Args:
        transliteration: Source text. A missing value (None/NaN) is treated
            as an empty string, so no mapped word can be verified.
        translation: Candidate English text. A missing value yields ``""``.
        v_map: Dict mapping lower-case English words to an iterable of forms.

    Returns:
        The translation with unsupported words replaced, joined by single spaces.
    """
    # Missing cells arrive as float NaN from pandas; guard before .lower()/.split().
    if pd.isna(translation):
        return ""
    source_lower = "" if pd.isna(transliteration) else str(transliteration).lower()

    cleaned_words = []
    for word in str(translation).split():
        # Clean word for matching (remove punctuation)
        clean_word = re.sub(r'[^\w\s]', '', word).lower()

        forms = v_map.get(clean_word)
        if forms is None:
            # Not a tracked word: nothing to verify.
            cleaned_words.append(word)
            continue

        # Loose phonetic check: any multi-character piece of any form found
        # anywhere in the source counts as support.
        found = any(
            len(syl) > 1 and syl in source_lower
            for form_variant in forms
            for syl in re.split(r'[- ]', form_variant)
        )
        cleaned_words.append(word if found else "[unverified_entity]")

    return " ".join(cleaned_words)

# --- EXECUTION ---

# 1. Build the English-word -> forms map from the lexicon and dictionary CSVs
lexicon_map = build_robust_verifier(
    "/kaggle/input/deep-past-initiative-machine-translation/OA_Lexicon_eBL.csv",
    "/kaggle/input/deep-past-initiative-machine-translation/eBL_Dictionary.csv"
)

# 2. Run each ensemble prediction through the verifier, pairing it with the
#    test row at the same position
final_safe_predictions = [
    verify_and_clean_final(
        test_df.iloc[position]['transliteration'],
        prediction,
        lexicon_map
    )
    for position, prediction in enumerate(final_predictions)
]

In [10]:
# --- Sumerogram Leakage Check ---
# This checks whether build_robust_verifier captured high-value
# Sumerograms from the lexicon, and whether they match the source text format.

def check_sumerogram_mapping(v_map):
    """Print a coverage table for four hand-picked English terms.

    For each term the table reports whether it is a key of ``v_map`` and,
    if so, whether any of its expected variants is among the mapped values.
    Prints only; returns None.
    """
    # High-value markers in Old Assyrian merchant archives
    expected_variants = {
        "iron": ["k√π.an", "am≈´tum"],
        "silver": ["k√π.babbar", "kaspum"],
        "merchant": ["dam.g√†r", "tamkƒÅrum"],
        "palace": ["√©.gal", "ekallum"]
    }

    header = f"{'English Term':<15} | {'Status':<12} | {'Akkadian Lemmas in Map'}"
    print(header)
    print("-" * 60)

    for eng, variants in expected_variants.items():
        # Terms absent from the map get their own fixed row.
        if eng not in v_map:
            print(f"{eng:<15} | ‚ùå NOT IN MAP | N/A")
            continue

        mapped_lemmas = v_map[eng]
        # Look for any overlap between the expected variants and the map.
        hit = False
        for variant in variants:
            if variant in mapped_lemmas:
                hit = True
                break

        if hit:
            status = "‚úÖ FOUND"
        else:
            status = "‚ö†Ô∏è MISSING"
        print(f"{eng:<15} | {status:<12} | {mapped_lemmas}")

# Execute Diagnostic: print the coverage table for lexicon_map, the map that
# verify_and_clean_final uses when filtering the ensemble predictions.
check_sumerogram_mapping(lexicon_map)

English Term    | Status       | Akkadian Lemmas in Map
------------------------------------------------------------
iron            | ‚ùå NOT IN MAP | N/A
silver          | ‚ùå NOT IN MAP | N/A
merchant        | ‚ùå NOT IN MAP | N/A
palace          | ‚ùå NOT IN MAP | N/A


In [11]:
# Create submission DataFrame from the verified predictions
submission = pd.DataFrame({
    'id': test_df['id'],
    'translation': final_safe_predictions
})

# Save
submission.to_csv('submission.csv', index=False)

print("="*60)
print("‚úÖ SUBMISSION CREATED!")
print("="*60)
print(f"File: submission.csv")
print(f"Rows: {len(submission):,}")

# Show samples. The preview reads from `submission` so it shows exactly what
# was written to submission.csv; printing final_predictions here would show
# the text from before verify_and_clean_final ran.
print("\nüìù Sample Predictions:")
for i in range(min(5, len(submission))):
    print(f"\n[{i+1}]")
    print(f"   Input: {test_df.iloc[i]['transliteration']}...")
    print(f"   Output: {submission.iloc[i]['translation']}...")

‚úÖ SUBMISSION CREATED!
File: submission.csv
Rows: 4

üìù Sample Predictions:

[1]
   Input: um-ma k√†-ru-um k√†-ni-ia-ma a-na aa-q√≠-il‚Ä¶ da-tim a√≠-ip-ri-ni k√†-ar k√†-ar-ma √∫ wa-bar-ra-tim q√≠-bi‚Äû-ma mup-pu-um aa a-lim(ki) i-li-kam...
   Output: Thus Kanesh colony, say to Aqil-.. dƒÅtum, our messengers, and Wabarratim: A tablet came from the City. As for you, we have not settled accounts with Ali-ahum."...

[2]
   Input: i-na mup-p√¨-im aa a-lim(ki) ia-t√π u‚Äû-m√¨-im a-nim ma-ma-an K√ô.AN i-aa-√∫-mu-ni i-na n√©-m√¨-lim da-a√πr √∫-l√° e-WA ia-ra-t√≠-au k√†-ru-um k√†-ni-ia i-l√°-q√©...
   Output: Concerning my tablet from the City Center, whoever leaves me to seek a lawsuit (or) litigation against A≈°≈°ur or DadƒÅya will not speak. My Kanesh colony has received it....

[3]
   Input: ki-ma mup-p√¨-ni ta-√°a-me-a-ni a-ma-kam lu a-na a√≠-m√¨-im a-na √â.GAL-lim i-d√≠-in lu t√©-ra-at √â.GAL-lim √∫-k√†-lim lu na-a√≠-ma a-d√≠-ni l√° i-d√≠-in ma-l√° K√ô.AN na-√°a-√∫ ni-bi‚Äû-it a-a√≠-im

In [12]:
# Side-by-side view of one test row: the source, each model's output, and the
# candidate the ensemble selected (before the verifier pass).
sample_idx = 0
print(f"üìä Comparison for sample #{sample_idx}:")
print("\nInput:")
print(f"   {test_df.iloc[sample_idx]['transliteration']}...")

print("\nIndividual Model Outputs:")
for model_name, model_preds in all_predictions.items():
    print(f"\n   [{model_name}]:")
    print(f"   {model_preds[sample_idx]}...")

print("\nüèÜ ENSEMBLE Output:")
print(f"   {final_predictions[sample_idx]}...")

üìä Comparison for sample #0:

Input:
   um-ma k√†-ru-um k√†-ni-ia-ma a-na aa-q√≠-il‚Ä¶ da-tim a√≠-ip-ri-ni k√†-ar k√†-ar-ma √∫ wa-bar-ra-tim q√≠-bi‚Äû-ma mup-pu-um aa a-lim(ki) i-li-kam...

Individual Model Outputs:

   [byt5-rag-akkadian-v1]:
   Thus Kanesh colony, say to Aqil-.. dƒÅtum, our messengers, and Wabarratim: A tablet came from the City. As for you, we have not settled accounts with Ali-ahum."...

   [train-gap-all-2]:
   Thus the Kanesh colony, say to our messenger A≈°≈°ur-imittƒ´, every single or twice as well....

   [byt5-akkadian-model]:
   From the Kanesh colony to Aqil... datu-payers and wabarrƒÅtum: You have written me as follows: 'There is no transport tarif...

   [byt5-base-big-data2]:
   Thus Kanesh colony, say to the payment of our messengers, wherever pleases and locations:...

üèÜ ENSEMBLE Output:
   Thus Kanesh colony, say to Aqil-.. dƒÅtum, our messengers, and Wabarratim: A tablet came from the City. As for you, we have not settled accounts with Ali-ahum.