In [1]:
import xml.etree.ElementTree as ET
import json

def _segment_text(elem):
    """Return the translatable text of a TMX ``<seg>`` (or nested ``<hi>``).

    ``elem.text`` alone stops at the first inline child element, which
    truncates any segment containing inline markup.  This walks the children
    instead: the content of ``<hi>`` is kept (it wraps ordinary text), while
    the content of every other inline element (``<bpt>``, ``<ept>``, ``<ph>``,
    ``<it>``, ``<ut>`` hold native formatting codes) is skipped.  The text
    following each child (its ``tail``) is always kept.
    """
    parts = [elem.text or ""]
    for child in elem:
        if child.tag == 'hi':
            parts.append(_segment_text(child))
        parts.append(child.tail or "")
    return "".join(parts)


def parse_tmx_no_to_en(file_path):
    """Extract (Norwegian Bokmål, English) sentence pairs from a TMX file.

    Args:
        file_path: Path of the TMX file to read.

    Returns:
        A list of ``(nb_text, en_text)`` tuples, one per ``<tu>`` that has a
        non-empty segment for both languages.  Segment text is stripped of
        leading/trailing whitespace.  If a ``<tu>`` holds several variants
        for the same language, the last non-empty one wins.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'

    # Let the XML parser decode the bytes itself so the encoding named in the
    # XML declaration (or a BOM) is honoured; forcing UTF-8 fails on UTF-16
    # TMX exports.
    root = ET.parse(file_path).getroot()
    pairs = []

    for tu in root.findall('.//tu'):
        en_text = None
        nb_text = None

        for tuv in tu.findall('tuv'):
            # 'xml:lang' is the attribute this script originally read; the
            # plain 'lang' fallback is for older TMX flavours (assumption --
            # harmless when absent).
            lang = tuv.get(xml_lang) or tuv.get('lang') or ''
            # Compare on the primary subtag, case-insensitively, so that
            # 'EN', 'en-GB' or 'nb-NO' are recognised as well.
            primary = lang.replace('_', '-').split('-', 1)[0].lower()

            seg = tuv.find('seg')
            text = _segment_text(seg).strip() if seg is not None else ""
            if not text:
                continue

            if primary == 'en':
                en_text = text
            elif primary == 'nb':
                nb_text = text

        if nb_text and en_text:
            pairs.append((nb_text, en_text))

    return pairs

def save_parallel_files(pairs, output_prefix):
    """Write sentence pairs as two line-aligned plain-text files.

    Creates ``<output_prefix>.no`` and ``<output_prefix>.en`` (UTF-8); line
    *i* of one file is the translation of line *i* of the other.

    Args:
        pairs: Sequence of ``(norwegian_text, english_text)`` tuples.
        output_prefix: Path prefix shared by the two output files.
    """
    def one_line(text):
        # A line break inside a sentence would add an extra line to only one
        # of the two files and shift every following pair out of alignment,
        # so each break is replaced by a single space.
        return ' '.join(text.splitlines())

    with open(f"{output_prefix}.no", 'w', encoding='utf-8') as no_file, \
         open(f"{output_prefix}.en", 'w', encoding='utf-8') as en_file:
        for no_text, en_text in pairs:
            no_file.write(one_line(no_text) + '\n')
            en_file.write(one_line(en_text) + '\n')

    print(f"Saved: {output_prefix}.no ({len(pairs)} lines)")
    print(f"Saved: {output_prefix}.en ({len(pairs)} lines)")

def save_tsv(pairs, output_file):
    """Write sentence pairs to a two-column, tab-separated UTF-8 file.

    The first line is the header ``norwegian<TAB>english``; each following
    line holds one pair.

    Args:
        pairs: Sequence of ``(norwegian_text, english_text)`` tuples.
        output_file: Path of the TSV file to create.
    """
    def clean(text):
        # A tab inside a field would create a third column and a line break
        # would split the row, so both are replaced by single spaces.
        return ' '.join(text.replace('\t', ' ').splitlines())

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("norwegian\tenglish\n")
        for no_text, en_text in pairs:
            f.write(f"{clean(no_text)}\t{clean(en_text)}\n")
    print(f"Saved: {output_file} ({len(pairs)} pairs)")

def save_jsonl(pairs, output_file):
    """Write sentence pairs as JSON Lines (one JSON object per line, UTF-8).

    Each object carries the Norwegian text under ``source``, the English text
    under ``target`` and the fixed language codes ``nb`` / ``en``.

    Args:
        pairs: Sequence of ``(norwegian_text, english_text)`` tuples.
        output_file: Path of the ``.jsonl`` file to create.
    """
    def as_record(nb_sentence, en_sentence):
        return {
            "source": nb_sentence,
            "target": en_sentence,
            "source_lang": "nb",
            "target_lang": "en",
        }

    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            json.dumps(as_record(nb_sentence, en_sentence), ensure_ascii=False) + '\n'
            for nb_sentence, en_sentence in pairs
        )
    print(f"Saved: {output_file} ({len(pairs)} pairs)")

def save_huggingface_format(pairs, output_file):
    """Write all pairs into one JSON document under a ``translation`` key.

    The document has the shape ``{"translation": [{"nb": ..., "en": ...}, ...]}``
    and is written as UTF-8 with a two-space indent.

    Args:
        pairs: Sequence of ``(norwegian_text, english_text)`` tuples.
        output_file: Path of the JSON file to create.
    """
    records = []
    for nb_sentence, en_sentence in pairs:
        records.append({"nb": nb_sentence, "en": en_sentence})
    document = {"translation": records}

    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(document, out, ensure_ascii=False, indent=2)
    print(f"Saved: {output_file} ({len(pairs)} pairs)")

def save_fairseq_format(pairs, output_prefix):
    """Write the pairs via ``save_parallel_files`` and print a fairseq notice.

    Args:
        pairs: Sequence of ``(norwegian_text, english_text)`` tuples.
        output_prefix: Path prefix forwarded unchanged to ``save_parallel_files``.
    """
    # The on-disk output is exactly what save_parallel_files produces.
    save_parallel_files(pairs=pairs, output_prefix=output_prefix)
    print("Fairseq format saved and ready for fairseq-preprocess")

def convert_tmx_no_to_en(input_file, format_type="jsonl", output_name=None):
    """Parse a TMX file and save its Norwegian-English pairs in one format.

    Progress, a preview of the first three pairs and any error are reported
    on stdout; errors are printed rather than raised.

    Args:
        input_file: Path of the TMX file to convert.
        format_type: One of ``"parallel"``, ``"tsv"``, ``"jsonl"``,
            ``"huggingface"`` or ``"fairseq"``.  Any other value prints an
            "Unknown format" message and nothing is written.
        output_name: Output path without extension.  When omitted it is
            derived from ``input_file``: a trailing ``.tmx`` extension is
            removed and the remaining dots in the *file name* become
            underscores; the directory part is kept as is.

    Returns:
        None.
    """
    import os  # local import so this function does not rely on file-level imports

    try:
        pairs = parse_tmx_no_to_en(input_file)
        print(f"\nExtracted {len(pairs)} translation pairs from {input_file}\n")

        print("First 3 examples:")
        print("-" * 80)
        for i, (no_text, en_text) in enumerate(pairs[:3], 1):
            print(f"{i}. Norwegian: {no_text}")
            print(f"   English: {en_text}\n")

    except Exception as e:
        print(f"Error parsing TMX file: {e}")
        return

    if not output_name:
        # Only the file name is rewritten.  Applying str.replace to the whole
        # path would also turn the dots of './' or '../' (or of a dotted
        # directory name) into underscores and point the output elsewhere.
        directory, filename = os.path.split(input_file)
        stem, extension = os.path.splitext(filename)
        if extension.lower() != '.tmx':
            stem = filename
        output_name = os.path.join(directory, stem.replace('.', '_'))

    print("=" * 80)
    try:
        if format_type == "parallel":
            save_parallel_files(pairs, output_name)
        elif format_type == "tsv":
            save_tsv(pairs, f"{output_name}.tsv")
        elif format_type == "jsonl":
            save_jsonl(pairs, f"{output_name}.jsonl")
        elif format_type == "huggingface":
            save_huggingface_format(pairs, f"{output_name}_hf.json")
        elif format_type == "fairseq":
            save_fairseq_format(pairs, output_name)
        else:
            print(f"Unknown format: {format_type}")
            return

        print("\nConversion complete!")

    except Exception as e:
        print(f"Error saving file: {e}")

if __name__ == "__main__":
    # Convert the NPD corpus to JSON Lines using an explicit output name.
    convert_tmx_no_to_en(
        "npd.no.en-nb.tmx",
        format_type="jsonl",
        output_name="npd_no_en_train",
    )



Extracted 26324 translation pairs from npd.no.en-nb.tmx

First 3 examples:
--------------------------------------------------------------------------------
1. Norwegian: Brønn 16/1-23 S skal borast frå boreinnretninga Rowan Viking i posisjon 58°49’47,04’’ nord 02°16’56,00’’ aust i utvinningsløyve 338.
   English: Well 16/1-23 S will be drilled from the Rowan Viking drilling facility at position 58°49’47.04’’ north 02°16’56.00’’ east in production licence 338.

2. Norwegian: Boreprogrammet for brønn 16/1-23 S gjeld boring av avgrensingsbrønn i utvinningsløyve 338.
   English: The drilling programme for well 16/1-23 S relates to the drilling of an appraisal well in production licence 338.

3. Norwegian: Lundin er operatør med 50 prosent del.
   English: Lundin is the operator with a 50 per cent ownership interest.

Saved: npd_no_en_train.jsonl (26324 pairs)

Conversion complete!
