In [26]:
import re
import collections
import unicodedata
from pathlib import Path
from pprint import pprint

In [2]:
def grapheme_len(s):
    """Count the characters of ``s`` that are not nonspacing marks.

    Every character whose Unicode general category is ``Mn`` (e.g. a
    combining accent) is left out of the count; any other character
    counts as one.
    """
    total = 0
    for char in s:
        if unicodedata.category(char) == 'Mn':
            continue  # combining mark: takes no width of its own
        total += 1
    return total

def split_string(s, maxlen=80):
    """Yield the words of ``s`` regrouped into lines of at most ``maxlen``.

    Line length is measured with ``grapheme_len``, so combining marks do
    not count towards the width. Words are never split: a single word
    longer than ``maxlen`` is emitted on a line of its own, which then
    exceeds ``maxlen``. Runs of whitespace in ``s`` are collapsed to single
    spaces.

    Args:
        s: the string to wrap.
        maxlen: maximum line width, in spacing characters.

    Yields:
        One string per output line, without a trailing newline. At least
        one line is always yielded; for empty or whitespace-only input it
        is the empty string.
    """
    pos = 0    # width of the current line, including one trailing space
    line = []  # words collected for the current line
    for w in s.split():
        w_len = grapheme_len(w)
        # Only flush when there is something to flush; otherwise an overlong
        # first word would produce a spurious empty line ahead of it.
        if line and pos + w_len > maxlen:
            yield ' '.join(line)
            pos = 0
            line = []
        pos += w_len + 1  # +1 for the space that will follow this word
        line.append(w)
    yield ' '.join(line)

In [3]:
# NOTE(review): absolute, machine-specific path; adjust it to the local
# checkout of the corpus before running this notebook.
dialects = Path('/Users/cody/github/CambridgeSemiticsLab/nena_corpus/texts/1.0')

# dialect name -> {text title -> raw version 1 markup}
dialects2texts = collections.defaultdict(dict)

# sorted() makes the load (and later export) order reproducible,
# since glob order is otherwise arbitrary.
for dialect_file in sorted(dialects.glob('*')):
    # skip stray non-directory entries in the corpus root
    if not dialect_file.is_dir():
        continue
    dialect = dialect_file.name
    for text_file in sorted(dialect_file.glob('*.nena')):
        title = text_file.stem
        # the corpus files declare UTF8; do not rely on the locale default
        text = text_file.read_text(encoding='utf-8')
        dialects2texts[dialect][title] = text

In [4]:
# Show which dialects were loaded (the keys of the outer mapping).
dialects2texts.keys()

dict_keys(['Barwar', 'Urmi_C'])

In [36]:
# Pick one loaded text (dialect 'Urmi_C', title 'Village Life (3)') as a sample.
test_text = dialects2texts['Urmi_C']['Village Life (3)']

# Show the raw markup of the sample: metadata lines, a blank line, then the text.
print(test_text)

dialect = Urmi_C
title = Village Life (3)
encoding = UTF8
source = cu vol 4 texts.html
text_id = B10
informant = Alice Bet-Yosəp
place = Zumallan, N

(1) sə́tva k̭arbúnələ bí čiriyyavàtəˈ <?:Octòber>.ˈ k̭a-⁺ṱùsa,ˈ bá ṱúsa b-<?:Octóber>
cúllə mə́ndi ⁺mavùruna,ˈ pìcə,ˈ ⁺ṱùnta.ˈ ʾíta ʾánnə ⁺ʾànvəvaˈ cúllə
ci-⁺zak̭rívalun b-xá ⁺xòlaˈ tandəllívalun ɟu-čùxta.ˈ ʾə́tvalan čuxyátə
ʾax-⁺ʾambàrə.ˈ cəšmìšə ʾə́tvalan,ˈ cmá jùrrə cəšmíšə.ˈ ʾánnə ɟarúysə ⁺ʾànvə
ʾátxa,ˈ ɟabùšta ci-⁺k̭aràxvalə.ˈ xá sáma hár ⁺ʾal-⁺k̭unṱòpa ci-pešíva brìzə.ˈ
lḕn-⁺bəddaˈ k̭àm ʾátxəva,ˈ buš šap̭ìrə váyəva.ˈ hár b-⁺k̭unṱópa bət-mattə̀tvaˈ
ɟu-mánux ʾíta b-ɟabə́tvalə hár b-⁺k̭unṱòpa.ˈ lè sardívalun ʾátxa xína
mən-⁺ʾúydalə.ˈ (2) cəšmìšəva,ˈ sənjìyyəva yemíšanˈ ʾú ⁺camútrə ʾu-sparə̀ɟləva
 ⁺rába.ˈ ci-⁺zak̭rìvalunˈ mə́drə b-⁺xólə ci-tandəllìvalun.ˈ ci-pešíva k̭a-trè-yarxəˈ
xáč̭č̭a péša ⁺ṱlà yárxə ʾátxaˈ ci-pešíva ɟu-dé ⁺ʾàmbar.ˈ ʾánnə ⁺hasílan=iva k̭át
ʾə̀tva.ˈ nipúxta ci-bašlàxvala.ˈ xína ⁺marč̭áxvalun ⁺ʾànvə.ˈ mə́drə ⁺badəmjánə

In [59]:
# Speakers that may appear in «XX: ...» quotations, keyed by their initials.
known_speakers = {'GK': 'Geoffrey Khan'}
# Raw string: '\(' and '\d' are regex escapes, not string escapes; in a plain
# literal they trigger an invalid-escape warning on newer Python versions.
# The capture group keeps quotations in the split result, while line markers
# such as "(12)" are dropped (they show up as None for the group).
line_splitter = re.compile(r'\(\d+\)|(«.*?»)') # split on line marker or quotation

def convert_version2(text):
    """Convert version 1 text to version 2.

    The input consists of a metadata block of ``key = value`` lines, a blank
    line, and then the text body with ``(N)`` line markers and optional
    ``«XX: ...»`` quotations by another speaker.

    The output consists of ``key:: value`` metadata lines (``informant`` is
    replaced by a ``speakers`` entry listing ``initials=name`` pairs), a
    blank line, and the body. Every segment produced by ``line_splitter``
    gets a running number, followed by the speaker's initials whenever the
    speaker changes (and on the very first segment), four spaces, and the
    segment text wrapped with ``split_string``.

    Args:
        text: the complete version 1 markup of one text.

    Returns:
        The version 2 markup as a single string.

    Raises:
        ValueError: if there is no blank line separating metadata from the
            body, or a quotation lacks the ``XX: `` speaker prefix.
        KeyError: if the metadata has no ``informant`` entry, or a quotation
            uses initials that are missing from ``known_speakers``.
        IndexError: if the ``informant`` value is empty.
    """
    # split markup into metadata block and text body
    attrib, body = text.split('\n\n', 1)
    # Non-greedy key: a value that itself contains ' = ' must stay intact
    # instead of leaking into the key.
    attribs = dict(re.findall(r'^(.*?) = (.*)$', attrib, flags=re.MULTILINE))

    # configure informant initials (first letters of the first and last name)
    informant = attribs.pop('informant')
    informant_names = informant.split()
    inform_initials = informant_names[0][0].upper() + informant_names[-1][0].upper()
    attribs['speakers'] = {inform_initials: informant}

    # configure text block and update the speakers with any quoted speaker
    speaker = inform_initials
    i_line = 1  # running number across all paragraphs
    new_paragraphs = []
    for i_para, paragraph in enumerate(body.split('\n\n')):
        para_text = paragraph.replace('\n', ' ')
        # the splitter yields None / whitespace-only pieces around markers
        segments = [seg.strip() for seg in line_splitter.split(para_text)
                        if seg and seg.strip()]

        para_lines = []
        for i, line in enumerate(segments):

            line_tag = f'{i_line}'
            i_line += 1

            # trigger new speaker
            if line.startswith('«'):
                line = line.replace('«', '').replace('»', '') # remove quote tags
                # maxsplit=1: the quoted text may itself contain ': '
                s_initial, line = line.split(': ', 1)
                try:
                    s_name = known_speakers[s_initial]
                except KeyError:
                    raise KeyError(
                        f'unknown speaker initials {s_initial!r}; '
                        'add them to known_speakers'
                    ) from None
                attribs['speakers'][s_initial] = s_name
                speaker = s_initial
                line_tag += f' {s_initial}'

            # the very first segment always names the informant
            elif i == 0 and i_para == 0:
                line_tag += f' {inform_initials}'

            # return speaker to informant
            elif speaker != inform_initials:
                speaker = inform_initials
                line_tag += f' {inform_initials}'

            wrapped = '\n'.join(split_string(line))
            para_lines.append(f'{line_tag}    {wrapped}\n')

        new_paragraphs.append(''.join(para_lines) + '\n\n')

    new_body = ''.join(new_paragraphs)

    # configure metadata block
    new_attribs = []
    for att, val in attribs.items():
        if att == 'speakers':
            val = ', '.join(f'{initial}={name}' for initial, name in val.items())
        new_attribs.append(f'{att}:: {val}')
    new_meta = '\n'.join(new_attribs)

    return f'{new_meta}\n\n{new_body.strip()}'

# Export

In [60]:
# NOTE(review): absolute, machine-specific path; adjust it to the local
# checkout of the corpus before running this notebook.
target_dir = Path('/Users/cody/github/CambridgeSemiticsLab/nena_corpus/texts/2.0')

# Write every converted text to <target_dir>/<dialect>/<title>.nena
for dialect, texts in dialects2texts.items():
    dialect_dir = target_dir.joinpath(f'{dialect}')
    # parents=True: the version directory itself may not exist yet
    dialect_dir.mkdir(parents=True, exist_ok=True)
    for title, text in texts.items():
        new_text = convert_version2(text)
        title_path = dialect_dir.joinpath(title+'.nena')
        # explicit encoding: the texts are full of non-ASCII characters and
        # must not depend on the platform's default encoding
        title_path.write_text(new_text, encoding='utf-8')

### Scratch Code

In [58]:
# Preview the converted markup for the sample text.
print(convert_version2(test_text))

dialect:: Urmi_C
title:: Village Life (3)
encoding:: UTF8
source:: cu vol 4 texts.html
text_id:: B10
place:: Zumallan, N
speakers:: AB=Alice Bet-Yosəp, GK=Geoffrey Khan

1 AB    sə́tva k̭arbúnələ bí čiriyyavàtəˈ <?:Octòber>.ˈ k̭a-⁺ṱùsa,ˈ bá ṱúsa b-<?:Octóber>
cúllə mə́ndi ⁺mavùruna,ˈ pìcə,ˈ ⁺ṱùnta.ˈ ʾíta ʾánnə ⁺ʾànvəvaˈ cúllə
ci-⁺zak̭rívalun b-xá ⁺xòlaˈ tandəllívalun ɟu-čùxta.ˈ ʾə́tvalan čuxyátə
ʾax-⁺ʾambàrə.ˈ cəšmìšə ʾə́tvalan,ˈ cmá jùrrə cəšmíšə.ˈ ʾánnə ɟarúysə ⁺ʾànvə
ʾátxa,ˈ ɟabùšta ci-⁺k̭aràxvalə.ˈ xá sáma hár ⁺ʾal-⁺k̭unṱòpa ci-pešíva brìzə.ˈ
lḕn-⁺bəddaˈ k̭àm ʾátxəva,ˈ buš šap̭ìrə váyəva.ˈ hár b-⁺k̭unṱópa bət-mattə̀tvaˈ
ɟu-mánux ʾíta b-ɟabə́tvalə hár b-⁺k̭unṱòpa.ˈ lè sardívalun ʾátxa xína
mən-⁺ʾúydalə.ˈ
2    cəšmìšəva,ˈ sənjìyyəva yemíšanˈ ʾú ⁺camútrə ʾu-sparə̀ɟləva ⁺rába.ˈ
ci-⁺zak̭rìvalunˈ mə́drə b-⁺xólə ci-tandəllìvalun.ˈ ci-pešíva k̭a-trè-yarxəˈ xáč̭č̭a
péša ⁺ṱlà yárxə ʾátxaˈ ci-pešíva ɟu-dé ⁺ʾàmbar.ˈ ʾánnə ⁺hasílan=iva k̭át ʾə̀tva.ˈ
nipúxta ci-bašlàxvala.ˈ xína ⁺marč̭áxvalun ⁺ʾ