In [4]:
# install if needed:
# !pip install python-docx lxml

from docx import Document
from lxml import etree
import re, os

In [6]:
doc = Document('Original/Paiwan Ch2 Preprocessed.docx')
raw_lines = [] # will hold list of all lines in word doc
for para in doc.paragraphs:
    for ln in para.text.split('\n'): # split each paragraph on \n
        ln = ln.strip() # remove leading and trailing spaces/tabs
        raw_lines.append(ln)

In [7]:
def is_header(line):
    # headers start with 3 digits followed by a space and do not contain tabs
    return bool(re.match(r'^\d{3}\s+', line)) and '\t' not in line

In [11]:
def is_sentence_start(line):
    # 3 digits + tab
    return bool(re.match(r'^\d{3}\t', line))

In [12]:
dialect_map = {
    # Eastern
    "Tjauvałi":   "Eastern",
    "Tjavuałi":   "Eastern",
    "Patjavał":   "Eastern",
    # Southern
    "Qatsiłay":     "Southern",
    "Tjakuvukuvuł": "Southern",
    "Łaleklek":     "Southern",
    # Central
    "Kułałau":    "Central",
    "Tjałakavus": "Central",
    "Kaviangan":  "Central",
    # Northern
    "Kapayuanan": "Northern",
    "Vałulu":     "Northern",
    "Tjukuvuł":   "Northern",
}

In [13]:
stories = {} # nested dict holding content of stories mapped to sid
i = 0 # iterate through raw_lines

while i < len(raw_lines):
    if not is_header(raw_lines[i]):
        i += 1
        continue

    # split on any whitespace once --> sid is the 3 digit number and title is rest of line
    sid, title = raw_lines[i].split(None, 1)
    
    # subtitle is the next line
    subtitle = raw_lines[i+1]

    # skip 2 lines to first sentence
    i += 2

    # collect all lines for current story
    story_lines = []
    while i < len(raw_lines) and not is_header(raw_lines[i]):
        story_lines.append(raw_lines[i])
        i += 1

    # remove compiled english translation at the end of every story
    # find every index in story_lines where a sentence starts
    starts = []
    for idx, line in enumerate(story_lines):
        if is_sentence_start(line):
            starts.append(idx)

    if starts:
        last_start = starts[-1] # where last sentence begins
        # scan from that last_start forward to find the free‐translation line
        for j in range(last_start, len(story_lines)):
            if '\t' not in story_lines[j]:
                # keep everything up to and including that line
                story_lines = story_lines[:j+1]
                break

    # list for the parsed sentences
    sentences = []

    # process each sentence block
    for s_idx in range(len(starts)):
        start_idx = starts[s_idx]
        if s_idx + 1 < len(starts):
            end_idx = starts[s_idx + 1]
        else:
            end_idx = len(story_lines)
        block = story_lines[start_idx:end_idx]

        # remove any footnotes (starts with open bracket)
        filtered_block = []
        for line in block:
            if not line.startswith('['):
                filtered_block.append(line)

        block = filtered_block
        if len(block) == 0:
            continue

        free_tr = block[-1]
        tier_lines = block[:-1]

        # TODO: lines containing one word are being counted as the free translation because they contain no tabs
            # lines dropped --> change these lines during QC

        # check that each block has 3 tiers (orig, morph, gloss), if not then drop lines
        total = len(tier_lines)
        triplets = total // 3
        if triplets * 3 != total:
            print(f"story {sid}, sentence {tier_lines[0]} has {total} lines, dropping {total-triplets*3}")
        tier_lines = tier_lines[:triplets*3]

        # chunk into orig, morph, gloss
        orig_parts, morph_parts, gloss_parts = [], [], []
        for k in range(0, len(tier_lines), 3):
            orig_line = tier_lines[k]
            orig_line_clean = re.sub(r'^\d{3}\t+', '', orig_line)  # remove sentence number and tab
            orig_parts.append(orig_line_clean)
            morph_parts.append(tier_lines[k+1])
            gloss_parts.append(tier_lines[k+2])

        # join sentences that wrapped more than once
        full_orig = ' '.join(orig_parts)
        full_morph = ' '.join(morph_parts)
        full_gloss = ' '.join(gloss_parts)

        sentences.append({
            'orig': full_orig,
            'morph': full_morph,
            'gloss': full_gloss,
            'free_tr': free_tr
        })

    stories[sid] = {
        'title':     title,
        'subtitle':  subtitle,
        'sentences': sentences
    }

story 061, sentence 034	amin. has 1 lines, dropping 1
story 062, sentence 022	amin. has 1 lines, dropping 1
story 071, sentence 064	amin. has 1 lines, dropping 1
story 072, sentence 045	amin. has 1 lines, dropping 1
story 074, sentence 070	amin. has 1 lines, dropping 1
story 075, sentence 075	amin. has 1 lines, dropping 1
story 095, sentence 008	?aming. has 1 lines, dropping 1
story 096, sentence 049	?aming. has 1 lines, dropping 1
story 097, sentence 022	amin. has 1 lines, dropping 1


In [14]:
os.makedirs('xml_output', exist_ok=True)
for sid, data in stories.items():
    # <TEXT>
    TEXT = etree.Element('TEXT')
    TEXT.set('id', f"PaiwanCh2_{sid}_{data['title']}")
    TEXT.set('{http://www.w3.org/XML/1998/namespace}lang', 'pwn')
    TEXT.set('copyright', 'public domain')
    TEXT.set('citation',
                'Early, R. J., & Whitehorn, J. (2003). One hundred Paiwan texts. Pacific Linguistics, Research School of Pacific and Asian Studies, The Australian National University.')
    TEXT.set('BibTeX_citation',
                '@book{100paiwantexts, author = {Robert Early and John Whitehorn}, title = {One Hundred Paiwan Texts}, year = {2003}, publisher = {Pacific Linguistics, Research School of Pacific and Asian Studies, The Australian National University}}')
    TEXT.set('source', data['subtitle'])

    ## dialect attribute
    dialect = "Unknown"
    subtitle = data['subtitle']
    for village_name, region in dialect_map.items():
        if village_name in subtitle:
            dialect = region
            break
    if dialect != "Unknown":
        TEXT.set('dialect', dialect)

    # sentences
    for sidx, sent in enumerate(data['sentences'], start=1):
        S = etree.SubElement(TEXT, 'S', id=f'{sid}S{sidx}')
        # FORM original
        fo = etree.SubElement(S, 'FORM', kindOf='original')
        fo.text = sent['orig']

        # free translation
        tr = etree.SubElement(S, 'TRANSL')
        tr.set('{http://www.w3.org/XML/1998/namespace}lang', 'en')
        tr.text = sent['free_tr']

        # word‐level breakdown
        delims = r'\t| '
        orig_toks = re.split(delims, sent['orig'])
        morph_toks = re.split(delims, sent['morph'])
        gloss_toks = re.split(delims, sent['gloss'])

        for widx, (wform, mform, gform) in enumerate(zip(orig_toks, morph_toks, gloss_toks), start=1):
            W = etree.SubElement(S, 'W', id=f'{sid}S{sidx}W{widx}')
            # word FORM
            wo = etree.SubElement(W, 'FORM')
            wo.text = wform
            # split the morph and gloss on hyphens
            morph_pieces = mform.split('-')
            gloss_pieces = gform.split('-')

            # if they match up, create one <M> per morpheme
            if len(morph_pieces) == len(gloss_pieces):
                for mid, (mp, gp) in enumerate(zip(morph_pieces, gloss_pieces)):
                    M = etree.SubElement(W, 'M', id=f'{sid}S{sidx}W{widx}M{mid}')
                    mo = etree.SubElement(M, 'FORM', kindOf='original')
                    mo.text = mp
                    mg = etree.SubElement(M, 'TRANSL')
                    mg.text = gp
            else:
                # if counts don't match, keep together
                # TODO: check this during QC
                print(f"check story {sid}, sentence {sidx}, word {widx}")
                M = etree.SubElement(W, 'M', id=f'{sid}S{sidx}W{widx}M{mid}')
                mo = etree.SubElement(M, 'FORM', kindOf='original')
                mo.text = mform
                mg = etree.SubElement(M, 'TRANSL')
                mg.text = gform


    tree = etree.ElementTree(TEXT)
    output = f'xml_output/PaiwanCh2_{sid}.xml'
    tree.write(output, encoding='UTF-8', pretty_print=True)

print(f"wrote {len(stories)} XML files to ./xml_output/")

check story 002, sentence 3, word 8
check story 078, sentence 4, word 19
wrote 100 XML files to ./xml_output/
