# Build Version 1.0 NENA from 0.01

We currently have all NENA texts stored in an older version (0.01) of the
NENA text format, which still contains, among other things, some deprecated markdown strings.
In this notebook, we'll build the code to convert these version 0.01 texts
into the new version 1.0.

In [1]:
import re
import collections
import unicodedata
from pathlib import Path

In [2]:
# Root directory of the version 0.01 corpus: one sub-directory per dialect,
# each holding that dialect's `*.nena` text files.
dialects = Path('/Users/cody/github/CambridgeSemiticsLab/nena_corpus/texts/0.01')

# Mapping of dialect name -> {text title (file stem) -> raw file contents}.
dialects2texts = collections.defaultdict(dict)

# sorted() makes the load order reproducible; glob order is filesystem-dependent.
for dialect_file in sorted(dialects.glob('*')):
    # Only directories are dialects; skip any stray files in the corpus root.
    if not dialect_file.is_dir():
        continue
    dialect = dialect_file.name
    for text_file in sorted(dialect_file.glob('*.nena')):
        title = text_file.stem
        # Decode explicitly instead of relying on the platform default encoding
        # (assumes the 0.01 source files are UTF-8 -- TODO confirm).
        text = text_file.read_text(encoding='utf-8')
        dialects2texts[dialect][title] = text

In [3]:
# Show which dialects (keys of dialects2texts) were loaded.
dialects2texts.keys()

dict_keys(['Barwar', 'Urmi_C'])

In [4]:
# Pick one raw 0.01 text and print its first 500 characters
# to inspect the source format we are converting from.
test = dialects2texts['Barwar']['A Hundred Gold Coins']

print(test[:500])

# A Hundred Gold Coins

source: bar text a1-A7.html
text_id: A6
informant: Yuwəl Yuḥanna
place: Dure

(1) xá-ga xèta,ˈ mállah Naṣràdin,ˈ xázəx mòdi wíða.ˈ gu-bɛ̀θa wéwa,ˈ har-zála-w
θàya.ˈ zála-w θàya,ˈ mára ya-ʾàlaha,ˈ yawə̀tliˈ ʾə́mma dàwe.ˈ ʾən-hàwaˈ ʾə́č̣č̣i-u
ʾə́č̣č̣a maqəlbə̀nna.ˈ ʾu-ʾən-hàwaˈ ʾə́mma-w-xà-ži,ˈ la-băyə̀nna.ˈ de-šùqla.ˈ ʾə̀mma
gắrəg háwa drə́st.ˈ (2) b-álaha hóle zála-w θàya,ˈ ʾíθwale xá-šwawa huðàya,ˈ
maṣóθe ʾə́lle dìye.ˈ mə́re xázəx ʾáwwa dū̀s-ile.ˈ qɛ́mən mjarbə̀nne.ˈ síq


## Make Modifications

As much as possible, we'll use regex to clean out old markdown standards.
We also have to do some surgery on the metadata section of each text.

In some cases, we also need to correct the underlying text itself, fixing problems
(such as bad characters or encodings) that were left behind previously.

In [5]:
# Split a raw 0.01 file into three groups: the title (after the leading "# "),
# the metadata section (everything up to the next blank line), and the text
# body (the remainder; DOTALL lets `.` span newlines).
text_re = re.compile(r'^# ([^\n]*)\n\n(.*?)\n\n(.*)', re.DOTALL)

# Match one "key: value" metadata line. The key is matched lazily so the split
# happens at the FIRST ": " on the line; a greedy key would swallow part of any
# value that itself contains ": ". The value may be empty.
attributes = re.compile(r'(.*?): (.*)')

def norm(text):
    """Return `text` in Unicode NFD (canonical decomposition) form.

    Patterns below that contain accented characters are passed through this
    function so their code-point sequence is in the same NFD form.
    """
    return unicodedata.normalize('NFD', text)

# order of replacement patterns matters
# we make a number of different edits, including 
# edits of existing standards as well as correcting
# bad characters / encodings
#
# Each entry is a (compiled pattern, replacement template) pair that is
# applied with `pattern.sub(template, text)`. Templates that contain
# group references are raw strings so that `\g<n>` reaches the re module
# verbatim instead of relying on Python keeping an invalid string escape.
replacements = [
    
    # replace speaker tags
    (re.compile(r'\[([A-Za-z]+):(.*?)\]', re.DOTALL), r'«\g<1>: \g<2>»'),
    
    # replace emphases and language tags
    (re.compile(r'<([A-Za-z]+)>\*(.*?)\*([.,]?)<[A-Za-z]+>', re.DOTALL), r'<\g<1>:\g<2>>\g<3>'),
    
    # add missing lang tags for these elements
    (
        re.compile(norm(r'(ʾArmanəs-⁺tɑ̄̀n|⁺Hayə̀st[ɑa]n|ʾArmanə̀s-⁺tɑn|Téhrɑn|Šɑ̄h Abbɑ̄̀s)')),
        r'<?:\g<1>>',
    ),
    
    # replace blank emphases with unknown language tags
    (re.compile(r'\*(.*?)\*', re.DOTALL), r'<?:\g<1>>'),
    
    # add English language tag
    (re.compile(norm(r'<\?:(sýrop)>')), r'<E:\g<1>>'),
    
    # remove one interruption note
    # (matches any parenthesised span that contains no digits)
    (re.compile(r'\([^\d]*?\)\s?\n?', re.DOTALL), ''),
    
    # remove stylistic linebreak markers
    (re.compile(r'\/'), ''),
     
    # remove any footnotes at bottom of text
    # (with DOTALL this deletes from the first footnote definition to the end)
    (re.compile(r'\[\^\d*\]:.*', re.DOTALL), ''),
    
    # remove any footnotes in body of text
    (re.compile(r'\[\^\d*\]'), ''),
    
    # replace or correct idiosyncratic characters
    (re.compile(r'\u0248|\u0249'), 'ɟ'),
    (re.compile(r'\u02b8'), 'y'),
    (re.compile(r'ı'), 'i'),
    (re.compile(r'ɑ'), 'a'),
    (re.compile(norm(r'ĉ')),  norm('č')), 
    (re.compile(norm(r'p̂')), norm('p̭')),
    (re.compile(norm('bŕatan')), norm('brátan')),
    (re.compile(r'\n—ˈ'), '\nˈ —'),
    (re.compile(r'ʾ⁺'), '⁺ʾ'), # switch places between ayin and +
    # NOTE(review): inside the class, ` -:` is a character RANGE (space through
    # colon), not the three literal characters -- confirm this is intended.
    (re.compile(r'([^ -:])(⁺)'), r'\g<1> \g<2>'), # fix missing spaces 
    (re.compile(r' \.\. '), ' ... '),
    (re.compile(r'\.\.\.\.+'), '...'),
    (re.compile(r'  +'), ' '),
    (re.compile(r'!\.'), '!'), 
    
    # add trailing space to any ellipses
    # The following character is only looked at (lookahead), never consumed;
    # matching it directly would delete it from the text.
    (re.compile(r'(\.\.\.)(?=[^ \n])'), r'\g<1> '),
    
    # merge a few side-by-side lang tags
    (re.compile(norm(r'(⁺ʾávun <P:Nādər-Šā́h)> <P:(ʾafšā̀r>)')), r'\g<1> \g<2>'),
    (re.compile(norm(r'(<P:dādgā̀h)>ˈ <P:(dādgā́h>)')), r'\g<1>ˈ \g<2>'),
    (re.compile(norm(r'(<P:măʿammà)>ˈ <P:(măʿammà>)')), r'\g<1>ˈ \g<2>'),
    
]

def apply_replacements(text):
    """Run every (pattern, replacement) pair of `replacements` over `text`.

    The pairs are applied one after another in list order, each on the
    result of the previous one. The final string is returned with leading
    and trailing whitespace stripped.
    """
    result = text
    for pattern, substitute in replacements:
        result = pattern.sub(substitute, result)
    result = result.strip()
    return result
        
def metadata_block(dialect, title, meta_string):
    """Assemble the version 1.0 metadata header as a single string.

    The header starts with three fixed entries (dialect, title, encoding),
    followed by every key/value pair that the `attributes` pattern finds in
    `meta_string`. Each entry is rendered as "key = value" with surrounding
    whitespace stripped, one entry per line.
    """
    leading = [
        ('dialect', dialect),
        ('title', title),
        ('encoding', 'UTF8'),
    ]
    parsed = attributes.findall(meta_string)
    lines = []
    for key, value in leading + parsed:
        lines.append(f'{key.strip()} = {value.strip()}')
    return '\n'.join(lines)
def adjust_title(title_str):
    """Return `title_str` with its first character upper-cased.

    The rest of the string is left untouched; an empty string is
    returned unchanged.
    """
    head, tail = title_str[:1], title_str[1:]
    return head.upper() + tail

In [6]:
# #test patterns

# patt, repl = (re.compile(r'([^ -:])(⁺)'), '\g<1> \g<2>')
# test = 'P:ʾəhtiyā̀j>⁺ʾallux.'
# patt.sub(repl, test)

Make the replacements...

In [8]:
# Every converted text, in the order it was written to disk.
transformed_texts = []

# Output root for the converted corpus; one sub-directory per dialect.
new_dir = Path('/Users/cody/github/CambridgeSemiticsLab/nena_corpus/texts/1.0/')
for dialect, texts in dialects2texts.items():
    dialect_dir = new_dir.joinpath(f'{dialect}')
    # parents=True so a first run also creates the output root itself.
    dialect_dir.mkdir(parents=True, exist_ok=True)
    for file_title, raw_text in texts.items():
        # Split the raw file into title line, metadata section and body.
        match = text_re.search(raw_text)
        if match is None:
            # Name the offending file instead of failing with an opaque
            # AttributeError on `None.groups()`.
            raise ValueError(
                f'{dialect}/{file_title}: text does not match the expected '
                'title / metadata / body layout'
            )
        # The title is taken from the "# ..." header line, not the file name.
        title, metadata, text = match.groups()
        # NFD-normalise the body so it lines up with the norm()-ed patterns.
        text = norm(text)
        title = adjust_title(title)
        meta_block = metadata_block(dialect, title, metadata)
        text_block = apply_replacements(text)
        new_text = meta_block + '\n\n' + text_block
        filename = dialect_dir.joinpath(f'{title}.nena')
        # The metadata block declares the encoding as UTF8, so write UTF-8
        # explicitly rather than relying on the platform default.
        filename.write_text(new_text, encoding='utf-8')
        transformed_texts.append(new_text)