In [1]:
"""
FAST VOCABULARY EXTRACTOR
=========================
Training-only version for quick iteration.
Skips unlabeled data to run in seconds instead of minutes.

Goal: Extract Akkadian→English word mappings from training pairs.
"""

import pandas as pd
import re
from collections import defaultdict, Counter
import json

# ============================================================
# CONFIGURATION
# ============================================================

# Logograms we know - used as anchors
LOGO_TO_ENGLISH = {
    'KÙ.BABBAR': 'silver', 'KÙ.GI': 'gold', 'URUDU': 'copper',
    'AN.NA': 'tin', 'GÍN': 'shekel', 'GÍN.TA': 'shekel',
    'MA.NA': 'mina', 'GÚ': 'talent', 'DUMU': 'son',
    'DUMU.MUNUS': 'daughter', 'DAM': 'wife', 'DAM.GÀR': 'merchant',
    'KIŠIB': 'seal', 'É': 'house', 'ANŠE': 'donkey',
    'TÚG': 'textile', 'TÚG.ḪI.A': 'textiles', 'UDU': 'sheep',
    'SÍG': 'wool', 'SÍG.ḪI.A': 'wool', 'ITU': 'month',
    'ITU.KAM': 'month', 'IGI': 'witness',
}

# Stop words to ignore
AKK_STOP = {'ša', 'a-na', 'ù', 'ú', 'i-na', 'x', 'xx', 'xxx', '…'}
ENG_STOP = {
    'the', 'a', 'an', 'of', 'and', 'to', 'in', 'for', 'is', 'are',
    'was', 'were', 'be', 'have', 'has', 'had', 'he', 'she', 'it',
    'they', 'we', 'i', 'you', 'my', 'your', 'his', 'her', 'its',
    'their', 'our', 'from', 'with', 'by', 'at', 'on', 'as', 'or',
    'if', 'but', 'not', 'no', 'that', 'this', 'which', 'who',
}

# ============================================================
# HELPERS
# ============================================================

def is_number(w):
    return bool(re.match(r'^[\d.]+$', w))

def is_logogram(w):
    if w in LOGO_TO_ENGLISH:
        return True
    return re.sub(r'[₀-₉]', '', w) in LOGO_TO_ENGLISH

def tokenize_akk(text):
    return text.split()

def tokenize_eng(text):
    return re.findall(r"[a-z]+(?:'[a-z]+)?|\d+", text.lower())

# ============================================================
# METHOD 1: SENTENCE CO-OCCURRENCE
# ============================================================

def extract_cooccurrence(train_df):
    """
    Find Akkadian words that consistently appear with specific English words.
    Simple but effective: if 'ṣa-ru-pá-am' appears in 80% of sentences 
    containing 'refined', they're probably related.
    """
    print("\n--- Method 1: Sentence Co-occurrence ---")
    
    cooccur = defaultdict(Counter)
    akk_counts = Counter()
    
    for _, row in train_df.iterrows():
        akk_words = set(tokenize_akk(row['transliteration']))
        eng_words = set(tokenize_eng(row['translation']))
        
        # Filter
        akk_words = {w for w in akk_words if w not in AKK_STOP 
                     and not is_number(w) and not is_logogram(w) and len(w) > 1}
        eng_words = {w for w in eng_words if w not in ENG_STOP and len(w) > 2}
        
        for akk in akk_words:
            akk_counts[akk] += 1
            for eng in eng_words:
                cooccur[akk][eng] += 1
    
    # Build vocabulary with confidence scores
    vocab = {}
    for akk, eng_counts in cooccur.items():
        if akk_counts[akk] < 3:  # Need at least 3 occurrences
            continue
        
        total = akk_counts[akk]
        top_eng, top_count = eng_counts.most_common(1)[0]
        confidence = top_count / total
        
        if confidence >= 0.3 and top_count >= 3:
            vocab[akk] = {
                'translation': top_eng,
                'confidence': round(confidence, 2),
                'count': top_count,
                'total': total,
                'alternatives': eng_counts.most_common(5)[1:],
            }
    
    # Sort by count
    vocab = dict(sorted(vocab.items(), key=lambda x: -x[1]['count']))
    
    print(f"Found {len(vocab)} words with >=30% confidence")
    return vocab

# ============================================================
# METHOD 2: POSITIONAL EXTRACTION (Flipped)
# ============================================================

def extract_positional(train_df):
    """
    Extract vocabulary based on position relative to logograms.
    Key insight: Akkadian post-LOGO maps to English pre-anchor (word order flip)
    """
    print("\n--- Method 2: Positional Extraction ---")
    
    # Track what appears at each position
    akk_positions = defaultdict(lambda: defaultdict(Counter))
    eng_positions = defaultdict(lambda: defaultdict(Counter))
    
    for _, row in train_df.iterrows():
        akk_words = tokenize_akk(row['transliteration'])
        eng_words = tokenize_eng(row['translation'])
        
        # Akkadian: find logograms and record neighbors
        for i, w in enumerate(akk_words):
            logo = None
            if w in LOGO_TO_ENGLISH:
                logo = w
            elif re.sub(r'[₀-₉]', '', w) in LOGO_TO_ENGLISH:
                logo = re.sub(r'[₀-₉]', '', w)
            
            if logo:
                for offset in [-2, -1, 1, 2]:
                    j = i + offset
                    if 0 <= j < len(akk_words):
                        neighbor = akk_words[j]
                        if neighbor not in AKK_STOP and not is_number(neighbor) and not is_logogram(neighbor):
                            pos = f"post{offset}" if offset > 0 else f"pre{offset}"
                            akk_positions[logo][pos][neighbor] += 1
        
        # English: find anchors and record neighbors
        for i, w in enumerate(eng_words):
            for logo, anchor in LOGO_TO_ENGLISH.items():
                if w == anchor or w == anchor + 's':  # Handle plurals
                    for offset in [-2, -1, 1, 2]:
                        j = i + offset
                        if 0 <= j < len(eng_words):
                            neighbor = eng_words[j]
                            if neighbor not in ENG_STOP:
                                pos = f"post{offset}" if offset > 0 else f"pre{offset}"
                                eng_positions[anchor][pos][neighbor] += 1
    
    # Cross-reference with FLIPPED positions
    # Akkadian post1 -> English pre-1 (descriptors come after in Akk, before in Eng)
    position_map = {'post1': 'pre-1', 'post2': 'pre-2', 'pre-1': 'post1', 'pre-2': 'post2'}
    
    vocab = {}
    
    for logo, eng_anchor in LOGO_TO_ENGLISH.items():
        if logo not in akk_positions or eng_anchor not in eng_positions:
            continue
        
        for akk_pos, eng_pos in position_map.items():
            if akk_pos not in akk_positions[logo] or eng_pos not in eng_positions[eng_anchor]:
                continue
            
            akk_words = akk_positions[logo][akk_pos].most_common(10)
            eng_words = eng_positions[eng_anchor][eng_pos].most_common(5)
            
            if not eng_words:
                continue
            
            top_eng = eng_words[0][0]
            
            for akk_word, akk_count in akk_words:
                if akk_count >= 3:
                    key = f"{logo}_{akk_pos}"
                    if akk_word not in vocab:
                        vocab[akk_word] = {
                            'translation': top_eng,
                            'confidence': 0.5,  # Positional has medium confidence
                            'count': akk_count,
                            'context': key,
                            'eng_candidates': [w for w, c in eng_words],
                        }
    
    print(f"Found {len(vocab)} words from positional analysis")
    return vocab

# ============================================================
# METHOD 3: HIGH-FREQUENCY UNKNOWN DETECTION
# ============================================================

def find_high_frequency_unknowns(train_df, existing_vocab):
    """
    Find Akkadian words that appear frequently but aren't in our vocabulary.
    These are prime targets for manual review or additional extraction.
    """
    print("\n--- Method 3: High-Frequency Unknowns ---")
    
    word_counts = Counter()
    
    for _, row in train_df.iterrows():
        akk_words = tokenize_akk(row['transliteration'])
        for w in akk_words:
            if w not in AKK_STOP and not is_number(w) and not is_logogram(w):
                word_counts[w] += 1
    
    unknowns = []
    for word, count in word_counts.most_common(200):
        if word not in existing_vocab and count >= 5:
            unknowns.append((word, count))
    
    print(f"Found {len(unknowns)} high-frequency unknowns (>=5 occurrences)")
    return unknowns[:50]  # Top 50

# ============================================================
# MAIN
# ============================================================

if __name__ == '__main__':
    print("="*60)
    print("FAST VOCABULARY EXTRACTOR")
    print("="*60)
    
    # Load training data only
    train = pd.read_csv('/kaggle/input/deep-past-initiative-machine-translation/train.csv')
    print(f"Loaded {len(train)} training sentences")
    
    # Method 1: Co-occurrence
    cooccur_vocab = extract_cooccurrence(train)
    
    # Method 2: Positional
    positional_vocab = extract_positional(train)
    
    # Merge vocabularies
    merged = {}
    for word, data in cooccur_vocab.items():
        merged[word] = data
    for word, data in positional_vocab.items():
        if word not in merged or data.get('confidence', 0) > merged[word].get('confidence', 0):
            merged[word] = data
    
    print(f"\n{'='*60}")
    print(f"MERGED VOCABULARY: {len(merged)} words")
    print("="*60)
    
    # Method 3: Find what we're missing
    unknowns = find_high_frequency_unknowns(train, merged)
    
    # ============================================================
    # OUTPUT: Formatted for copy-paste into translator
    # ============================================================
    
    print("\n" + "="*60)
    print("HIGH-CONFIDENCE VOCABULARY (>=40%)")
    print("Copy this into VOCABULARY dict in mycelial_translator.py")
    print("="*60)
    
    high_conf = [(k, v) for k, v in merged.items() if v['confidence'] >= 0.4]
    high_conf.sort(key=lambda x: -x[1]['count'])
    
    print("\n# HIGH CONFIDENCE (>=40%)")
    for akk, data in high_conf[:50]:
        eng = data['translation']
        eng_escaped = eng.replace("'", "\\'")  # ← ADD THIS
        conf = data['confidence']
        count = data['count']
        # Guess semantic type based on English word
        if eng in ['refined', 'good', 'fine', 'bad', 'washed', 'black', 'white']:
            stype = 'quality'
        elif eng in ['gave', 'received', 'paid', 'brought', 'sent', 'said', 'returned']:
            stype = 'verb'
        elif eng in ['silver', 'gold', 'copper', 'tin', 'wool', 'textile']:
            stype = 'material'
        elif eng in ['brother', 'father', 'mother', 'lord', 'slave']:
            stype = 'relation'
        else:
            stype = 'unknown'
        print(f"    '{akk}': {{'meaning': '{eng_escaped}', 'type': '{stype}', 'confidence': {conf}}},  # {count}x")
    
    print("\n" + "="*60)
    print("MEDIUM-CONFIDENCE VOCABULARY (30-40%)")
    print("="*60)
    
    med_conf = [(k, v) for k, v in merged.items() if 0.3 <= v['confidence'] < 0.4]
    med_conf.sort(key=lambda x: -x[1]['count'])
    
    print("\n# MEDIUM CONFIDENCE (30-40%) - review before adding")
    for akk, data in med_conf[:30]:
        eng = data['translation']
        eng_escaped = eng.replace("'", "\\'")  # ← ADD THIS

        conf = data['confidence']
        count = data['count']
        alts = data.get('alternatives', [])[:3]
        alt_str = ', '.join([f"{w}({c})" for w, c in alts]) if alts else ''
        print(f"    '{akk}': {{'meaning': '{eng_escaped}', 'type': '{stype}', 'confidence': {conf}}},  # {count}x")
    
    print("\n" + "="*60)
    print("TOP UNKNOWNS (high frequency, not in vocab)")
    print("These need manual investigation")
    print("="*60)
    
    for word, count in unknowns[:20]:
        print(f"    '{word}': ???  # {count}x")
    
    # Save JSON for reference
    with open('extracted_vocab.json', 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"\nSaved full vocabulary to extracted_vocab.json")
    
    print("\n" + "="*60)
    print("DONE")
    print("="*60)

FAST VOCABULARY EXTRACTOR
Loaded 1561 training sentences

--- Method 1: Sentence Co-occurrence ---
Found 2532 words with >=30% confidence

--- Method 2: Positional Extraction ---
Found 192 words from positional analysis

MERGED VOCABULARY: 2536 words

--- Method 3: High-Frequency Unknowns ---
Found 1 high-frequency unknowns (>=5 occurrences)

HIGH-CONFIDENCE VOCABULARY (>=40%)
Copy this into VOCABULARY dict in mycelial_translator.py

# HIGH CONFIDENCE (>=40%)
    'ma-na': {'meaning': 'silver', 'type': 'material', 'confidence': 0.84},  # 757x
    'um-ma': {'meaning': 'silver', 'type': 'material', 'confidence': 0.79},  # 541x
    'lá': {'meaning': 'silver', 'type': 'material', 'confidence': 0.78},  # 415x
    'qí-bi-ma': {'meaning': 'silver', 'type': 'material', 'confidence': 0.81},  # 309x
    'šu-ma': {'meaning': 'silver', 'type': 'material', 'confidence': 0.79},  # 301x
    'ki-ma': {'meaning': 'silver', 'type': 'material', 'confidence': 0.79},  # 262x
    'a-lá-ḫi-im': {'meaning': 'a

In [2]:
import pandas as pd
import re
from collections import defaultdict, Counter
import json

# ============================================================
# CONFIGURATION & NEUTRALIZATION MAP
# ============================================================
LOGO_TO_ENGLISH = {
    'KÙ.BABBAR': 'silver', 'KÙ.GI': 'gold', 'URUDU': 'copper',
    'AN.NA': 'tin', 'GÍN': 'shekel', 'MA.NA': 'mina',
    'DUMU': 'son', 'KIŠIB': 'seal', 'É': 'house', 'ANŠE': 'donkey',
    'TÚG': 'textile', 'ITU': 'month', 'IGI': 'witness',
}
NAME_ANCHORS = {'DUMU', 'KIŠIB', 'PÀD'}
AKK_STOP = {'ša', 'a-na', 'ù', 'ú', 'i-na', 'x', 'xx', '…'}
ENG_STOP = {'the', 'a', 'an', 'of', 'and', 'to', 'in', 'for', 'is', 'his', 'her', 'my'}

# ============================================================
# IMPROVED HELPERS
# ============================================================
def neutralize_line(akk_text, eng_text):
    clean_akk = akk_text
    clean_eng = eng_text.lower()
    
    for logo, eng in LOGO_TO_ENGLISH.items():
        if logo in clean_akk and eng in clean_eng:
            clean_akk = clean_akk.replace(logo, "[ANCHOR]")
            # FIX 1: Regex escaping to handle periods/parens in English
            clean_eng = re.sub(rf'\b{re.escape(eng)}\b', "[ANCHOR]", clean_eng)
            
    return clean_akk, clean_eng

def get_name_mask(train_df):
    """
    FIX 2: Captures two tokens after anchors to handle multi-word
    patronyms (e.g., 'DUMU I-din-A-šur').
    """
    names = set()
    for _, row in train_df.iterrows():
        tokens = row['transliteration'].split()
        for i, t in enumerate(tokens):
            if t in NAME_ANCHORS:
                if i + 1 < len(tokens): names.add(tokens[i+1])
                if i + 2 < len(tokens): names.add(tokens[i+2])
    return names

# ============================================================
# DISTRIBUTIONAL TYPE CLASSIFICATION (Honeycomb Methodology)
# ============================================================
def classify_by_distribution(word, meaning, context_positions):
    """
    Uses the Chaveste methodology: Words aren't defined by surface form,
    but by their 'structural neighborhood.'
    """
    # If a word consistently follows a number -> it's likely a UNIT
    if any('pre' in pos and 'number' in pos for pos in context_positions):
        return 'unit'
    # If it appears at the end of a sentence following a list -> likely a VERB
    if meaning in ['gave', 'paid', 'received', 'sent', 'took']:
        return 'verb'
    # Defaulting to the English semantic mapping for now
    return 'unknown'

# ============================================================
# MAIN EXECUTION
# ============================================================
if __name__ == '__main__':
    train = pd.read_csv('/kaggle/input/deep-past-initiative-machine-translation/train.csv')
    name_mask = get_name_mask(train)
    
    # Extraction logic
    vocab_candidates = defaultdict(Counter)
    context_pos = defaultdict(list)  # For type classification, stub for now
    
    for _, row in train.iterrows():
        akk_text = row['transliteration']
        eng_text = row['translation']
        clean_akk, clean_eng = neutralize_line(akk_text, eng_text)
        akk_tokens = [t for t in re.split(r'\s+', clean_akk) if t and t not in AKK_STOP and t != '[ANCHOR]']
        eng_tokens = [t for t in re.split(r'\s+', clean_eng) if t and t not in ENG_STOP and t != '[ANCHOR]']
        
        # Assume rough alignment by position, use ±1 window for co-occurrence
        min_len = min(len(akk_tokens), len(eng_tokens))
        for i in range(min_len):
            akk_word = akk_tokens[i]
            if akk_word in name_mask or akk_word.isupper():  # Skip names and potential logograms
                continue
            for j in range(max(0, i-1), min(len(eng_tokens), i+2)):
                eng_word = eng_tokens[j]
                vocab_candidates[akk_word][eng_word] += 1
                # Stub for context (expand as needed): track relative positions
                if j < i: context_pos[akk_word].append('pre')
                if eng_word.isdigit(): context_pos[akk_word].append('number')
    
    # Compute vocab with confidence
    extracted_vocab = {}
    for akk_word, eng_counts in vocab_candidates.items():
        total = sum(eng_counts.values())
        if total < 3: continue  # Min occurrences to reduce noise
        top_eng, top_count = eng_counts.most_common(1)[0]
        confidence = top_count / total
        if confidence >= 0.3:  # Keep for JSON
            word_type = classify_by_distribution(akk_word, top_eng, context_pos.get(akk_word, []))
            extracted_vocab[akk_word] = {
                'meaning': top_eng,
                'type': word_type,
                'confidence': round(confidence, 2)
            }
    
    # Display >= 0.4
    print("Extracted Vocabulary (confidence >= 0.4):")
    for word, data in sorted(extracted_vocab.items(), key=lambda x: x[1]['confidence'], reverse=True):
        if data['confidence'] >= 0.4:
            print(f"'{word}': {{'meaning': '{data['meaning']}', 'type': '{data['type']}', 'confidence': {data['confidence']}}},")
    
    # Save all >=0.3 to JSON
    with open('/kaggle/working/extracted_vocab.json', 'w') as f:
        json.dump(extracted_vocab, f, indent=4)
    print("\nFull extracted vocab (>=0.3) saved to /kaggle/working/extracted_vocab.json")

Extracted Vocabulary (confidence >= 0.4):
'šu-ma-bi-a-ma': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'(d)IŠKUR.DÙL-ma': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'zu-ú-lu': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'ší-it-ar-ša-a': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'eṭ-ra-šu': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'ra-da-x': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'tù-ša-ak-ší-dí': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'a-ra-ší-a-ku-um': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'ta-ta-wu-ni': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'a-ṭá-ra-da-ku-nu-tí': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'…-mu': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'…-nam': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'ta-ṭá-ba-am': {'meaning': '...', 'type': 'unknown', 'confidence': 1.0},
'sà-ra-tí-

In [3]:
"""
VOCABULARY EXTRACTOR v5 - NAME BLACKLISTING
============================================
Key fix: Words immediately after DUMU, KIŠIB, IGI are NAMES.
Pass 1: Identify all names
Pass 2: Extract vocabulary, excluding names

This prevents names from being incorrectly mapped to 'son', 'seal', etc.
"""

import pandas as pd
import re
from collections import defaultdict, Counter

# ============================================================
# CONFIG
# ============================================================

LOGO_TO_ENGLISH = {
    'KÙ.BABBAR': 'silver', 'KÙ.GI': 'gold', 'URUDU': 'copper',
    'AN.NA': 'tin', 'GÍN': 'shekel', 'GÍN.TA': 'shekel',
    'MA.NA': 'mina', 'GÚ': 'talent', 'DUMU': 'son',
    'DUMU.MUNUS': 'daughter', 'DAM': 'wife', 'DAM.GÀR': 'merchant',
    'KIŠIB': 'seal', 'É': 'house', 'ANŠE': 'donkey',
    'TÚG': 'textile', 'TÚG.ḪI.A': 'textiles', 'UDU': 'sheep',
    'SÍG': 'wool', 'ITU': 'month', 'IGI': 'witness',
    'SIG₅': 'fine', 'GÍR': 'dagger', 'ŠU.NÍGIN': 'total',
}

# Markers that indicate next word is a NAME
NAME_MARKERS = {'DUMU', 'KIŠIB', 'IGI', 'DAM', 'DUMU.MUNUS'}

AKK_STOP = {'ša', 'a-na', 'ù', 'ú', 'i-na', 'x', 'xx', 'xxx', '…', '-', '', 'ma'}
ENG_STOP = {
    'the', 'a', 'an', 'of', 'and', 'to', 'in', 'for', 'is', 'are',
    'was', 'were', 'be', 'have', 'has', 'had', 'he', 'she', 'it',
    'they', 'we', 'i', 'you', 'my', 'your', 'his', 'her', 'its',
    'their', 'our', 'from', 'with', 'by', 'at', 'on', 'as', 'or',
    'if', 'but', 'not', 'no', 'that', 'this', 'which', 'who',
    'him', 'me', 'us', 'them', 'so', 'then', 'when', 'will',
    'said', 'says',  # Common in translations but not useful mappings
}

# English words that indicate the Akkadian is probably a name
NAME_INDICATORS = {'son', 'daughter', 'wife', 'seal', 'witness'}

def tokenize_akk(text):
    return text.split()

def tokenize_eng(text):
    return re.findall(r"[a-z]+(?:'[a-z]+)?|\d+", text.lower())

def is_number(w):
    return bool(re.match(r'^[\d.]+$', w))

def is_logogram(w):
    if w in LOGO_TO_ENGLISH:
        return True
    clean = re.sub(r'[₀-₉]', '', w)
    return clean in LOGO_TO_ENGLISH

def guess_type(eng):
    if eng in ['refined', 'good', 'fine', 'bad', 'washed', 'black', 'white', 'pure', 'ordinary', 'quality']:
        return 'quality'
    elif eng in ['gave', 'received', 'paid', 'brought', 'sent', 'returned', 'pay', 'bring', 'send', 
                 'give', 'take', 'owe', 'owes', 'owed', 'taken', 'placed', 'went', 'came', 'settled']:
        return 'verb'
    elif eng in ['silver', 'gold', 'copper', 'tin', 'wool', 'textile', 'textiles', 'iron', 'lapis']:
        return 'material'
    elif eng in ['brother', 'father', 'mother', 'lord', 'slave', 'sister', 'uncle', 'servant']:
        return 'relation'
    elif eng in ['tablet', 'seal', 'document', 'letter', 'contract', 'certified', 'verdict']:
        return 'document'
    elif eng in ['house', 'city', 'colony', 'there', 'here', 'palace', 'gate', 'market']:
        return 'place'
    elif eng in ['shekel', 'shekels', 'mina', 'minas', 'talent', 'talents']:
        return 'unit'
    elif eng in ['interest', 'debt', 'price', 'profit', 'loss', 'capital', 'expenses']:
        return 'financial'
    elif eng in ['month', 'year', 'day', 'week', 'eponymy', 'time']:
        return 'time'
    else:
        return 'unknown'

# ============================================================
# PASS 1: IDENTIFY NAMES
# ============================================================

def identify_names(train_df):
    """
    Find all words that appear immediately after NAME_MARKERS.
    These are personal names and should NOT be in vocabulary.
    """
    name_candidates = Counter()
    
    for _, row in train_df.iterrows():
        akk_words = tokenize_akk(row['transliteration'])
        
        for i, word in enumerate(akk_words):
            # Check if this word is a name marker
            marker = None
            if word in NAME_MARKERS:
                marker = word
            elif re.sub(r'[₀-₉]', '', word) in NAME_MARKERS:
                marker = re.sub(r'[₀-₉]', '', word)
            
            if marker:
                # Next word is a name
                if i + 1 < len(akk_words):
                    potential_name = akk_words[i + 1]
                    if potential_name not in AKK_STOP and not is_number(potential_name):
                        name_candidates[potential_name] += 1
    
    # Words that appear after name markers more than once are definitely names
    names = {word for word, count in name_candidates.items() if count >= 1}
    
    return names

# ============================================================
# PASS 2: EXTRACT VOCABULARY (EXCLUDING NAMES)
# ============================================================

def extract_vocab(train_df, name_blacklist):
    """
    Extract vocabulary using ±1 positional window.
    Skip any word in name_blacklist.
    """
    cooccur = defaultdict(Counter)
    akk_counts = Counter()
    
    for _, row in train_df.iterrows():
        akk_words = tokenize_akk(row['transliteration'])
        eng_words = tokenize_eng(row['translation'])
        
        if len(akk_words) < 2 or len(eng_words) < 2:
            continue
        
        for akk_idx, akk_word in enumerate(akk_words):
            # SKIP IF NAME
            if akk_word in name_blacklist:
                continue
            
            # Skip stop words, numbers, logograms
            if akk_word in AKK_STOP or is_number(akk_word) or is_logogram(akk_word):
                continue
            if len(akk_word) < 2:
                continue
            
            akk_counts[akk_word] += 1
            
            # Positional alignment (±1 window)
            ratio = akk_idx / len(akk_words)
            eng_center = int(ratio * len(eng_words))
            
            for eng_idx in range(max(0, eng_center - 1), min(len(eng_words), eng_center + 2)):
                eng_word = eng_words[eng_idx]
                if eng_word not in ENG_STOP and len(eng_word) > 2:
                    # Also skip if English word indicates this is probably a name context
                    if eng_word not in NAME_INDICATORS:
                        cooccur[akk_word][eng_word] += 1
    
    # Build vocab
    vocab = {}
    for akk, eng_counts in cooccur.items():
        if akk_counts[akk] < 3:
            continue
        
        top_eng, top_count = eng_counts.most_common(1)[0]
        confidence = top_count / akk_counts[akk]
        
        if confidence >= 0.3 and top_count >= 3:
            vocab[akk] = {
                'translation': top_eng,
                'confidence': round(confidence, 2),
                'count': top_count,
                'total': akk_counts[akk],
                'alternatives': [(w, c) for w, c in eng_counts.most_common(5)[1:]],
                'type': guess_type(top_eng),
            }
    
    return vocab

# ============================================================
# MAIN
# ============================================================

if __name__ == '__main__':
    print("="*60)
    print("VOCAB EXTRACTOR v5 - NAME BLACKLISTING")
    print("="*60)
    
    train = pd.read_csv('/kaggle/input/deep-past-initiative-machine-translation/train.csv')
    print(f"Loaded {len(train)} training sentences")
    
    # PASS 1: Identify names
    print("\n--- Pass 1: Identifying names ---")
    names = identify_names(train)
    print(f"Found {len(names)} unique names (blacklisted from vocabulary)")
    
    # Show sample names
    print("\nSample names (first 20):")
    for name in list(names)[:20]:
        print(f"  {name}")
    
    # PASS 2: Extract vocabulary
    print("\n--- Pass 2: Extracting vocabulary ---")
    vocab = extract_vocab(train, names)
    print(f"Extracted {len(vocab)} vocabulary words")
    
    # Group by type
    by_type = defaultdict(list)
    for word, data in vocab.items():
        by_type[data['type']].append((word, data))
    
    for t in by_type:
        by_type[t].sort(key=lambda x: -x[1]['count'])
    
    # Print by type
    print("\n" + "="*60)
    print("VOCABULARY BY TYPE (copy to mycelial_translator.py)")
    print("="*60)
    
    for stype in ['verb', 'quality', 'relation', 'place', 'document', 'time', 'financial', 'material', 'unit', 'unknown']:
        if stype not in by_type:
            continue
        words = by_type[stype]
        if not words:
            continue
        
        print(f"\n# === {stype.upper()} ({len(words)} words) ===")
        for akk, data in words[:25]:  # Top 25 per type
            eng = data['translation']
            conf = data['confidence']
            count = data['count']
            alts = data.get('alternatives', [])[:2]
            alt_str = f" | alts: {[a[0] for a in alts]}" if alts else ""
            print(f"    '{akk}': {{'meaning': '{eng}', 'type': '{stype}', 'confidence': {conf}}},  # {count}x{alt_str}")
    
    # Print names for reference
    print("\n" + "="*60)
    print(f"NAMES BLACKLIST ({len(names)} total)")
    print("These are kept as transliteration, not translated")
    print("="*60)
    
    # Sort by frequency if we tracked it
    print("\nFirst 50 names:")
    for name in sorted(list(names))[:50]:
        print(f"  {name}")
    
    print("\n" + "="*60)
    print("DONE")
    print("="*60)

VOCAB EXTRACTOR v5 - NAME BLACKLISTING
Loaded 1561 training sentences

--- Pass 1: Identifying names ---
Found 1116 unique names (blacklisted from vocabulary)

Sample names (first 20):
  ma-sà-a-a
  dan-a-a
  a-gu-a
  ku-ta-a
  a-šùr-bé-el-a-wa-tim
  la-qé-pì-im
  šu-(d)EN.LÍL
  mu-mu-lá-nim
  ša-ar-ni-ga-a-šu
  la-li-im
  a-šùr-be-el-a-wa-tim
  a-nu-li
  a-šur
  en-nam-(d)IŠKUR
  en-num-a-šur
  da-da-nim
  a-šur-ṣú-lu-li
  ku-ṣí-im
  i-di-a-bi₄-im
  in-ba-a-šùr

--- Pass 2: Extracting vocabulary ---
Extracted 210 vocabulary words

VOCABULARY BY TYPE (copy to mycelial_translator.py)

# === VERB (13 words) ===
    'šé-bi₄-lam': {'meaning': 'send', 'type': 'verb', 'confidence': 0.37},  # 16x | alts: ['silver', 'good']
    'ša-bu-ú': {'meaning': 'paid', 'type': 'verb', 'confidence': 0.33},  # 8x | alts: ['tariff', 'transport']
    'iḫ-da': {'meaning': 'take', 'type': 'verb', 'confidence': 0.44},  # 7x | alts: ['care', 'brothers']
    'a-dí-na-ku-ni': {'meaning': 'gave', 'type': 'verb', 'c

In [4]:
"""
MYCELIALNET AKKADIAN TRANSLATOR - COMPLETE VERSION
==================================================
Inspired by Paolo Dell'Aversana's MycelialNet architecture.

Instead of static positional lookup:
  LOGOGRAM → check position ±1,2,3 → assign meaning

We use distributed propagation:
  Each word is a NODE that:
  - Has initial confidence about its meaning
  - Communicates with neighboring nodes
  - Updates belief based on neighbor signals
  - Converges to final interpretation

Logograms are HIGH-CONFIDENCE SEED NODES that inject
certainty into the network. Meaning propagates outward
like nutrients through a mycelial network.

Key principles from Paolo's work:
1. Local rules → emergent global intelligence
2. No central processor
3. Adaptive connectivity based on context
4. Resonance between compatible meanings
"""

import pandas as pd
import numpy as np
import re
from collections import defaultdict
import json

# ============================================================
# KNOWLEDGE BASE (Seeds for the network)
# ============================================================

# Logograms: 100% confidence anchor nodes
LOGOGRAMS = {
    'KÙ.BABBAR': {'meaning': 'silver', 'type': 'material', 'confidence': 1.0},
    'KÙ.GI': {'meaning': 'gold', 'type': 'material', 'confidence': 1.0},
    'URUDU': {'meaning': 'copper', 'type': 'material', 'confidence': 1.0},
    'AN.NA': {'meaning': 'tin', 'type': 'material', 'confidence': 1.0},
    'ZABAR': {'meaning': 'bronze', 'type': 'material', 'confidence': 1.0},
    'GÍN': {'meaning': 'shekel', 'type': 'unit', 'confidence': 1.0},
    'GÍN.TA': {'meaning': 'shekel each', 'type': 'unit', 'confidence': 1.0},
    'MA.NA': {'meaning': 'mina', 'type': 'unit', 'confidence': 1.0},
    'GÚ': {'meaning': 'talent', 'type': 'unit', 'confidence': 1.0},
    'DUMU': {'meaning': 'son of', 'type': 'relation', 'confidence': 1.0},
    'DUMU.MUNUS': {'meaning': 'daughter of', 'type': 'relation', 'confidence': 1.0},
    'DAM': {'meaning': 'wife of', 'type': 'relation', 'confidence': 1.0},
    'DAM.GÀR': {'meaning': 'merchant', 'type': 'profession', 'confidence': 1.0},
    'KIŠIB': {'meaning': 'seal of', 'type': 'document', 'confidence': 1.0},
    'É': {'meaning': 'house', 'type': 'structure', 'confidence': 1.0},
    'ANŠE': {'meaning': 'donkey', 'type': 'animal', 'confidence': 1.0},
    'TÚG': {'meaning': 'textile', 'type': 'goods', 'confidence': 1.0},
    'TÚG.ḪI.A': {'meaning': 'textiles', 'type': 'goods', 'confidence': 1.0},
    'UDU': {'meaning': 'sheep', 'type': 'animal', 'confidence': 1.0},
    'SÍG': {'meaning': 'wool', 'type': 'material', 'confidence': 1.0},
    'SÍG.ḪI.A': {'meaning': 'wool', 'type': 'material', 'confidence': 1.0},
    'ITU': {'meaning': 'month', 'type': 'time', 'confidence': 1.0},
    'ITU.KAM': {'meaning': 'month', 'type': 'time', 'confidence': 1.0},
    'MU': {'meaning': 'year', 'type': 'time', 'confidence': 1.0},
    'IGI': {'meaning': 'before/witnessed by', 'type': 'legal', 'confidence': 1.0},
    'ŠU.NÍGIN': {'meaning': 'total', 'type': 'accounting', 'confidence': 1.0},
    'SIG₅': {'meaning': 'fine/good quality', 'type': 'quality', 'confidence': 1.0},
}

# Semantic type compatibility matrix
TYPE_COMPATIBILITY = {
    'material': {'unit': 0.9, 'quality': 0.9, 'number': 0.8, 'verb': 0.5},
    'unit': {'material': 0.9, 'number': 0.95, 'quality': 0.3},
    'relation': {'name': 0.95, 'profession': 0.6},
    'name': {'relation': 0.9, 'name': 0.7, 'legal': 0.6},
    'number': {'unit': 0.95, 'material': 0.7, 'goods': 0.8, 'animal': 0.8},
    'quality': {'material': 0.9, 'goods': 0.8, 'animal': 0.7},
    'animal': {'quality': 0.8, 'number': 0.8, 'color': 0.9},
    'color': {'animal': 0.9, 'goods': 0.7},
    'goods': {'number': 0.8, 'quality': 0.8},
    'time': {'name': 0.7, 'number': 0.8},
    'legal': {'name': 0.9},
    'document': {'name': 0.95},
    'profession': {'name': 0.7, 'relation': 0.5},
    'verb': {'name': 0.6, 'material': 0.6, 'number': 0.5},
    'particle': {'any': 0.3},
    'speech': {'name': 0.7, 'particle': 0.5},
    'financial': {'material': 0.8, 'number': 0.7, 'unit': 0.6},
    'accounting': {'number': 0.9, 'material': 0.7},
    'structure': {'name': 0.6, 'place': 0.5},
}

# Known vocabulary with semantic types
VOCABULARY = {
    # Quality descriptors
    'ṣa-ru-pá-am': {'meaning': 'refined', 'type': 'quality', 'confidence': 0.9},
    'ṣa-ru-pu-um': {'meaning': 'refined', 'type': 'quality', 'confidence': 0.9},
    'ṣa-ru-pí-im': {'meaning': 'refined', 'type': 'quality', 'confidence': 0.9},
    
    # Donkey colors
    'ṣa-lá-mu': {'meaning': 'black', 'type': 'color', 'confidence': 0.9},
    'ṣa-lá-ma-am': {'meaning': 'black', 'type': 'color', 'confidence': 0.9},
    'ṣa-lá-me': {'meaning': 'black', 'type': 'color', 'confidence': 0.9},
    'ṣa-la-mu': {'meaning': 'black', 'type': 'color', 'confidence': 0.9},
    
    # Copper qualities
    'ší-kam': {'meaning': 'good (sikku)', 'type': 'quality', 'confidence': 0.85},
    'ší-ku-um': {'meaning': 'good (sikku)', 'type': 'quality', 'confidence': 0.85},
    'ma-sí-um': {'meaning': 'washed', 'type': 'quality', 'confidence': 0.85},
    'ma-sí-im': {'meaning': 'washed', 'type': 'quality', 'confidence': 0.85},
    'ma-sí-a-am': {'meaning': 'washed', 'type': 'quality', 'confidence': 0.85},
    'lá-mu-num': {'meaning': 'refined', 'type': 'quality', 'confidence': 0.85},
    'lá-mu-nam': {'meaning': 'refined', 'type': 'quality', 'confidence': 0.85},
    
    # Common verbs
    'i-dí-in': {'meaning': 'he gave', 'type': 'verb', 'confidence': 0.8},
    'a-dí-in': {'meaning': 'I gave', 'type': 'verb', 'confidence': 0.8},
    'ni-dí-in': {'meaning': 'we gave', 'type': 'verb', 'confidence': 0.8},
    'il₅-qé': {'meaning': 'he received', 'type': 'verb', 'confidence': 0.8},
    'al-qé': {'meaning': 'I received', 'type': 'verb', 'confidence': 0.8},
    'i-ša-qal': {'meaning': 'he will pay', 'type': 'verb', 'confidence': 0.8},
    'iš-qúl': {'meaning': 'he paid', 'type': 'verb', 'confidence': 0.8},
    'iš-qú-ul': {'meaning': 'he paid', 'type': 'verb', 'confidence': 0.8},
    
    # Particles
    'ša': {'meaning': 'of/which', 'type': 'particle', 'confidence': 0.7},
    'a-na': {'meaning': 'to/for', 'type': 'particle', 'confidence': 0.7},
    'ù': {'meaning': 'and', 'type': 'particle', 'confidence': 0.7},
    'ú': {'meaning': 'and', 'type': 'particle', 'confidence': 0.7},
    'i-na': {'meaning': 'in/from', 'type': 'particle', 'confidence': 0.7},
    'iš-tù': {'meaning': 'from/since', 'type': 'particle', 'confidence': 0.7},
    'iš-tí': {'meaning': 'with', 'type': 'particle', 'confidence': 0.7},
    'ki-ma': {'meaning': 'like/as', 'type': 'particle', 'confidence': 0.7},
    'šu-ma': {'meaning': 'if', 'type': 'particle', 'confidence': 0.7},
    
    # Speech/communication
    'um-ma': {'meaning': 'thus (says)', 'type': 'speech', 'confidence': 0.85},
    'qí-bi-ma': {'meaning': 'speak!', 'type': 'speech', 'confidence': 0.85},
    'qí-bi₄-ma': {'meaning': 'speak!', 'type': 'speech', 'confidence': 0.85},
    
    # Financial terms
    'ṣí-ib-tám': {'meaning': 'interest', 'type': 'financial', 'confidence': 0.85},
    'ṣí-ba-sú': {'meaning': 'his interest', 'type': 'financial', 'confidence': 0.85},
    'qá-qá-ad': {'meaning': 'principal', 'type': 'financial', 'confidence': 0.85},
    'ḫu-bu-ul': {'meaning': 'debt of', 'type': 'financial', 'confidence': 0.85},
    
    # Time/Calendar
    'li-mu-um': {'meaning': 'eponymy (year)', 'type': 'time', 'confidence': 0.85},
    'ḫa-muš-tim': {'meaning': 'week', 'type': 'time', 'confidence': 0.85},
    'ḫa-mu-uš-tim': {'meaning': 'week', 'type': 'time', 'confidence': 0.85},
    
    # Places
    'kà-ni-iš': {'meaning': 'Kanesh', 'type': 'place', 'confidence': 0.9},
    'kà-ru-um': {'meaning': 'kārum (colony)', 'type': 'place', 'confidence': 0.85},
    'a-lim': {'meaning': 'the City (Assur)', 'type': 'place', 'confidence': 0.85},
    
    # People terms
    'a-ḫi': {'meaning': 'my brother', 'type': 'relation', 'confidence': 0.8},
    'a-bi': {'meaning': 'my father', 'type': 'relation', 'confidence': 0.8},
    'be-lí': {'meaning': 'my lord', 'type': 'relation', 'confidence': 0.8},
    'um-me-a-nim': {'meaning': 'investor', 'type': 'profession', 'confidence': 0.8},
}
# Load extracted vocabulary
import json
try:
    with open('extracted_vocab.json', 'r', encoding='utf-8') as f:
        extracted = json.load(f)
    
    # Merge into VOCABULARY
    for word, data in extracted.items():
        if word not in VOCABULARY:
            VOCABULARY[word] = {
                'meaning': data['meaning'],
                'type': data.get('type', 'unknown'),
                'confidence': data['confidence']
            }
    
    print(f"Loaded {len(extracted)} words from extracted_vocab.json")
except:
    print("No extracted vocab found, using base vocabulary only")
# ============================================================
# WORD NODE CLASS
# ============================================================

class WordNode:
    """
    A node in the mycelial network.
    Each word is a node that:
    - Holds beliefs about its meaning
    - Communicates with neighbors
    - Updates based on network signals
    """
    
    def __init__(self, word, position):
        self.word = word
        self.position = position
        self.neighbors = []
        
        # Initialize beliefs
        self.beliefs = {}
        self.semantic_type = None
        self.is_anchor = False
        
        # Check if it's a known word
        self._initialize_beliefs()
    
    def _initialize_beliefs(self):
        """Set initial beliefs based on knowledge base"""
        
        # Check if logogram
        if self.word in LOGOGRAMS:
            data = LOGOGRAMS[self.word]
            self.beliefs = {data['meaning']: data['confidence']}
            self.semantic_type = data['type']
            self.is_anchor = True
            return
        
        # Check subscript variants
        clean = re.sub(r'[₀-₉]', '', self.word)
        if clean in LOGOGRAMS:
            data = LOGOGRAMS[clean]
            self.beliefs = {data['meaning']: data['confidence']}
            self.semantic_type = data['type']
            self.is_anchor = True
            return
        
        # Check vocabulary
        if self.word in VOCABULARY:
            data = VOCABULARY[self.word]
            self.beliefs = {data['meaning']: data['confidence']}
            self.semantic_type = data['type']
            return
        
        # Check if number
        if re.match(r'^[\d.]+$', self.word):
            self.beliefs = {self.word: 0.95}
            self.semantic_type = 'number'
            return
        
        # Unknown word - start with uncertainty
        self.beliefs = {self.word: 0.3}
        self.semantic_type = 'unknown'
    
    def get_best_belief(self):
        """Return highest confidence meaning"""
        if not self.beliefs:
            return self.word, 0.0
        
        # Filter out inference tracking keys
        actual_beliefs = {k: v for k, v in self.beliefs.items() 
                          if not k.endswith('_inference') and not k.endswith('_candidate')}
        
        if not actual_beliefs:
            return self.word, 0.3
        
        best = max(actual_beliefs.items(), key=lambda x: x[1])
        return best
    
    def receive_signal(self, sender_type, sender_meaning, signal_strength):
        """
        Receive a signal from a neighbor node.
        Update beliefs based on semantic compatibility.
        """
        if self.is_anchor:
            return
        
        if self.semantic_type == 'unknown':
            self._infer_from_context(sender_type, sender_meaning, signal_strength)
    
    def _infer_from_context(self, neighbor_type, neighbor_meaning, strength):
        """
        Systematic inference using TYPE_COMPATIBILITY matrix.
        Geological facies correlation engine.
        """
        if self.is_anchor:
            return
        
        # Get compatible types for this neighbor
        if neighbor_type not in TYPE_COMPATIBILITY:
            return
        
        compatible_types = TYPE_COMPATIBILITY[neighbor_type]
        
        # Propagate belief to compatible semantic types
        for possible_type, compatibility_score in compatible_types.items():
            if possible_type == 'any':
                continue
            
            # Calculate belief boost
            boost = strength * compatibility_score * 0.4
            
            # Create or update belief for this type
            belief_key = f"{possible_type}_inference"
            
            if belief_key not in self.beliefs:
                self.beliefs[belief_key] = boost
            else:
                self.beliefs[belief_key] = min(0.95, self.beliefs[belief_key] + boost)
            
            # If confidence crosses threshold, assign the semantic type
            if self.beliefs[belief_key] > 0.6 and self.semantic_type == 'unknown':
                self.semantic_type = possible_type
                self.beliefs[self.word] = self.beliefs[belief_key]
    
    def propagate(self):
        """
        Send signals to all neighbors.
        Returns list of (neighbor, signal) tuples.
        """
        if not self.beliefs:
            return []
        
        meaning, confidence = self.get_best_belief()
        
        # Signal strength decays with uncertainty
        signal_strength = confidence * 0.8
        
        signals = []
        for neighbor in self.neighbors:
            signals.append((neighbor, self.semantic_type, meaning, signal_strength))
        
        return signals


# ============================================================
# MYCELIAL NETWORK CLASS
# ============================================================

class MycelialNetwork:
    """
    A network of word nodes that propagate meaning.
    """
    
    def __init__(self, words):
        """Build network from word list"""
        self.nodes = []
        
        # Create nodes
        for i, word in enumerate(words):
            node = WordNode(word, i)
            self.nodes.append(node)
        
        # Connect neighbors (bidirectional, within window)
        window_size = 3
        for i, node in enumerate(self.nodes):
            for j in range(max(0, i - window_size), min(len(self.nodes), i + window_size + 1)):
                if i != j:
                    node.neighbors.append(self.nodes[j])
    
    def propagate_iteration(self):
        """
        One iteration of signal propagation.
        All nodes send signals simultaneously, then update.
        """
        # Collect all signals
        all_signals = []
        for node in self.nodes:
            signals = node.propagate()
            all_signals.extend(signals)
        
        # Apply signals
        for receiver, sender_type, sender_meaning, strength in all_signals:
            receiver.receive_signal(sender_type, sender_meaning, strength)
    
    def converge(self, max_iterations=5):
        """
        Run propagation until convergence or max iterations.
        """
        for i in range(max_iterations):
            self.propagate_iteration()
    
    def get_translation(self):
        """
        Extract final translation from converged network.
        """
        result = []
        for node in self.nodes:
            meaning, confidence = node.get_best_belief()
            
            # Clean up candidate markers
            if meaning.endswith('_candidate'):
                meaning = node.word
            
            result.append({
                'original': node.word,
                'translation': meaning,
                'confidence': confidence,
                'type': node.semantic_type,
            })
        
        return result


# ============================================================
# TRANSLATOR CLASS
# ============================================================

class MycelialTranslator:
    """
    Full translator using mycelial network approach.
    """
    
    def __init__(self):
        self.stats = defaultdict(int)
    
    def translate(self, akkadian_text):
        """Translate a single sentence"""
        
        # Tokenize
        words = akkadian_text.split()
        
        # Build network
        network = MycelialNetwork(words)
        
        # Propagate until convergence
        network.converge(max_iterations=5)
        
        # Extract results
        node_results = network.get_translation()
        
        # Build output string
        output_words = []
        for res in node_results:
            self.stats[res['type']] += 1
            output_words.append(res['translation'])
        
        return ' '.join(output_words)
    
    def translate_batch(self, texts):
        """Translate multiple sentences"""
        return [self.translate(t) for t in texts]
    
    def get_stats(self):
        return dict(self.stats)


# ============================================================
# MAIN EXECUTION
# ============================================================

if __name__ == '__main__':
    print("="*60)
    print("MYCELIALNET AKKADIAN TRANSLATOR")
    print("="*60)
    
    # Test with sample sentences
    print("\n" + "="*60)
    print("SAMPLE TRANSLATIONS")
    print("="*60)
    
    # Sample Akkadian sentences
    samples = [
        "KIŠIB ma-nu-ba-lúm-a-šur DUMU ḫu-za-lúm",
        "10 MA.NA KÙ.BABBAR ṣa-ru-pá-am",
        "1 ANŠE ṣa-lá-mu SIG₅",
    ]
    
    translator = MycelialTranslator()
    
    for idx, akkadian in enumerate(samples):
        print(f"\n--- Example {idx + 1} ---")
        print(f"Akkadian: {akkadian}")
        
        # Build network for visualization
        words = akkadian.split()
        network = MycelialNetwork(words)
        network.converge()
        
        result = network.get_translation()
        trans = ' '.join([r['translation'] for r in result])
        print(f"Translation: {trans}")
        
        # Show node details
        print("Node breakdown:")
        for r in result:
            print(f"  {r['original']:20} -> {r['translation']:20} ({r['type']}, {r['confidence']:.0%})")
    
    print("\n" + "="*60)
    print("DONE")
    print("="*60)

Loaded 6648 words from extracted_vocab.json
MYCELIALNET AKKADIAN TRANSLATOR

SAMPLE TRANSLATIONS

--- Example 1 ---
Akkadian: KIŠIB ma-nu-ba-lúm-a-šur DUMU ḫu-za-lúm
Translation: seal of ma-nu-ba-lúm-a-šur son of ḫu-za-lúm
Node breakdown:
  KIŠIB                -> seal of              (document, 100%)
  ma-nu-ba-lúm-a-šur   -> ma-nu-ba-lúm-a-šur   (name, 61%)
  DUMU                 -> son of               (relation, 100%)
  ḫu-za-lúm            -> ḫu-za-lúm            (name, 61%)

--- Example 2 ---
Akkadian: 10 MA.NA KÙ.BABBAR ṣa-ru-pá-am
Translation: 10 mina silver refined
Node breakdown:
  10                   -> 10                   (number, 95%)
  MA.NA                -> mina                 (unit, 100%)
  KÙ.BABBAR            -> silver               (material, 100%)
  ṣa-ru-pá-am          -> refined              (quality, 90%)

--- Example 3 ---
Akkadian: 1 ANŠE ṣa-lá-mu SIG₅
Translation: 1 donkey black fine/good quality
Node breakdown:
  1                    -> 1                 