In [1]:
# Larger sample in the text markup: a '# ' title line, 'key: value' attribute
# lines, then paragraphs of numbered lines '(n) ...'.
# It also contains the '*' marker and the '/' and '//' breaks; the parser in
# this notebook has no grammar rules for those, and the visible cells only
# tokenize/parse `text`, not `data`.
data = """
# Gozáli and Nozali

text_id: A8
informant: Nanəs Bənyamən
place: ʾƐn-Nune

(0) bla bla /
bla bla bla //
bla.

(1) ë *ʾána* šmíyənwa xa-tunìθa| y-amríwala Gozáli ʾu-Nozàli,| šə́mma díya yáʿni
tuníθət Gozáli ʾu-Nozàli.| dáx ʾiwáwa ʾàyya?| ʾi-mšárəxwa tuníθa y-amrìwa|
(2) ʾíθwa-w lìθwa| bíš m-álaha gòṛa| líθwa gòṛa| hič-nàša.| bas-ʾíθwa xá malkà|
ʾíθwale xa-bròna.| ʾáw malkà| rába bắyewale ʾaw-bróne dìye.| ʾu-ṭábʿan màlka|
bróne díye páyəš šàwpe díye.| ʾíman-t málka ʾi-màyəθ,| ʾíman-t páyəš gòṛa| bróna
díye šáqəl šáwpe dìye| bar-d-àw.|

(3) ʾɛ́-ga xa-yóma málka dmíxa-wewa b-lɛ̀le,| dmìxa wéwa,| ʾu-xzéle b-xə́lme dìye|
ʾíθ xa-náša gu-d-áy mðìta| biš-făqíra m-kúlla nàša.| rába făqìra-wewa
ʾawwa-náša.| málka xzéle b-xə́lme dìye| mə́ra ʾáwwa náša făqìra| ṱ-awéle xá bronà|
ʾáw páyəš málka šàwpət brònux.| lɛ́-y-awe brónux màlka.|"""

# Minimal sample that the lexer and parser demonstrations in this notebook
# run on. It exercises a hyphenated word, repeated punctuation, a plain
# comment '(a-comment)', a speaker interruption '(GK: ...)', and a paragraph
# break (empty line) between line (2) and line (4).
text = """
# Gozáli and Nozali

text_id: A8
informant: Nanəs Bənyamən
place: ʾƐn-Nune

(1) a-word... (a-comment) (GK: lalala) bla
(2) also words.

(4) new paragraph."""

In [2]:
import pprint

from sly import Lexer

class NenaLexer(Lexer):
    """SLY lexer that splits the text markup into tokens for NenaParser.

    Token values are the matched strings, except for TITLE and ATTRIBUTE,
    whose values are replaced by (key, value) 2-tuples.

    The order of the pattern definitions below is significant: where two
    patterns can match at the same position (e.g. NEWLINES and SPACE), the
    one defined first is tried first.
    """

    # set of token names
    tokens = {
        TITLE, ATTRIBUTE, LETTERS, NEWLINES, SPACE,
        #LINE_ID,
        PUNCTUATION, HYPHEN, #ASTERISK,
        #LPAREN, LBRACKET, RPAREN, RBRACKET,
        LPAREN_COMMENT, LBRACKET_COMMENT, DIGITS,
        LANG_MARKER, #LINE_BREAK, VERSE_BREAK,
        COMMENT,
    }

    # Single characters passed through as their own token type.
    literals = {'*', '(', ')', '{', '}', '[', ']', '/'}

    # Title starts with pound sign. Returns 2-tuple (key, value).
    # The multiline flag is written as a scoped group '(?m:...)' instead of
    # a bare '(?m)': the pattern gets embedded inside a larger expression,
    # and Python 3.11+ rejects global inline flags that are not at the very
    # start of the expression.
    @_(r'(?m:^\# .*$)')
    def TITLE(self, t):
        # Drop the leading '# ' and tag the remainder with the key 'title'.
        t.value = ('title', t.value[2:])
        return t

    # Attribute starts with key and colon. Returns 2-tuple (key, value).
    @_(r'(?m:^[a-z][a-z0-9_]+: .*$)')
    def ATTRIBUTE(self, t):
        # Split on the first ': ' only, so that a value which itself
        # contains ': ' still yields exactly (key, value).
        t.value = tuple(t.value.split(': ', 1))
        return t

    # How to get combined Unicode characters to be recognized?
    # Matching only Unicode points of letters with pre-combined
    # marks can be done with the 'word' class '\w', but it
    # includes digits and underscore. To remove those, negate
    # the inverted word class along with digits and underscore:
    # '[^\W\d_]'. But that does not include separate combining
    # marks, or the '+' sign.
    # One solution would be unicodedata.normalize('NFC', data),
    # except that not all combinations have pre-combined Unicode
    # points.
    # Another solution is to use an external regex engine such as
    # `regex` (`pip install regex`), which has better Unicode
    # support. However, I would like to avoid extra dependencies.
    # Another (less elegant) solution is to make the '+' symbol
    # and the combining characters [\u0300-\u036F] each its own
    # token, which the parser will have to parse into morphemes
    # and words.
    # What is used here: an optional leading '+' (some dialects
    # have a '+' sign at the beginning of some words), then one
    # obligatory letter '[^\W\d_]' (a combining mark can never
    # appear before the first letter), then any run of letters,
    # combining marks [\u0300-\u036F] and '+' signs. Each position
    # is tested against the letter class explicitly, so digits and
    # underscores never become part of a LETTERS token.
    LETTERS = r'[+]?[^\W\d_](?:[^\W\d_]|[\u0300-\u036F+])*'
    # Newlines: boundaries of paragraphs and metadata are marked
    # with two newlines (meaning an empty line). The empty line
    # may contain whitespace. Must be defined before SPACE, which
    # would otherwise also match here.
    NEWLINES = r'\n\s*\n'
    # Space is any successive number of whitespace symbols
    # (this includes a single newline).
    SPACE = r'\s+'
    # One or more digits, not starting with zero
    DIGITS = r'[1-9][0-9]*'
    # Line id is any number of digits surrounded by round brackets
#     LINE_ID = r'\([0-9]+\)'  # TODO convert to int?
    # Punctuation is any normal punctuation symbol and vertical bar.
    PUNCTUATION = r'[.,?!:;|–]'
    # There are two different hyphens, a single one and a double one.
    # The double one is the 'equals' sign.
    HYPHEN = r'[-=]'
    # Language markers are ASCII letter strings surrounded by
    # angle brackets.
    LANG_MARKER = r'<[A-Za-z]+>'
    # A special comment starts with an opening bracket, ASCII letters
    # (typically capital initials), a colon and a space.
    LPAREN_COMMENT = r'\([A-Za-z]+: '
    LBRACKET_COMMENT = r'\[[A-Za-z]+: '
    # A regular comment is parenthesized text containing at least one
    # character that is not a digit (so '(12)' is not a comment) and no
    # colon (otherwise it becomes a special comment/interruption).
    COMMENT = r'\([^:)]*[^:)\d]+[^:)]*\)'

# Echo the sample input so the token list can be compared against it.
pprint.pprint(text)

# Show what the lexer produces: one (token type, token value) pair per
# token. This is the stream the parser below consumes.
lexer = NenaLexer()
[(token.type, token.value) for token in lexer.tokenize(text)]

('\n'
 '# Gozáli and Nozali\n'
 '\n'
 'text_id: A8\n'
 'informant: Nanəs Bənyamən\n'
 'place: ʾƐn-Nune\n'
 '\n'
 '(1) a-word... (a-comment) (GK: lalala) bla\n'
 '(2) also words.\n'
 '\n'
 '(4) new paragraph.')


[('SPACE', '\n'),
 ('TITLE', ('title', 'Gozáli and Nozali')),
 ('NEWLINES', '\n\n'),
 ('ATTRIBUTE', ('text_id', 'A8')),
 ('SPACE', '\n'),
 ('ATTRIBUTE', ('informant', 'Nanəs Bənyamən')),
 ('SPACE', '\n'),
 ('ATTRIBUTE', ('place', 'ʾƐn-Nune')),
 ('NEWLINES', '\n\n'),
 ('(', '('),
 ('DIGITS', '1'),
 (')', ')'),
 ('SPACE', ' '),
 ('LETTERS', 'a'),
 ('HYPHEN', '-'),
 ('LETTERS', 'word'),
 ('PUNCTUATION', '.'),
 ('PUNCTUATION', '.'),
 ('PUNCTUATION', '.'),
 ('SPACE', ' '),
 ('COMMENT', '(a-comment)'),
 ('SPACE', ' '),
 ('LPAREN_COMMENT', '(GK: '),
 ('LETTERS', 'lalala'),
 (')', ')'),
 ('SPACE', ' '),
 ('LETTERS', 'bla'),
 ('SPACE', '\n'),
 ('(', '('),
 ('DIGITS', '2'),
 (')', ')'),
 ('SPACE', ' '),
 ('LETTERS', 'also'),
 ('SPACE', ' '),
 ('LETTERS', 'words'),
 ('PUNCTUATION', '.'),
 ('NEWLINES', '\n\n'),
 ('(', '('),
 ('DIGITS', '4'),
 (')', ')'),
 ('SPACE', ' '),
 ('LETTERS', 'new'),
 ('SPACE', ' '),
 ('LETTERS', 'paragraph'),
 ('PUNCTUATION', '.')]

In [3]:
from sly import Parser

class NenaParser(Parser):
    """SLY parser turning the NenaLexer token stream into nested Python data.

    The start rule `text` returns a 2-tuple (heading, paragraphs):

    - heading: list of (key, value) tuples, the title first, then attributes.
    - paragraphs: list of paragraphs; a paragraph is a list of lines; a line
      is a tuple (line number, list of line elements).
    - line element: either a word dict {'word': [...], 'trailer': str}, with
      an extra 'speaker' key inside an interruption, or a comment dict
      {'comment': str}.
    """
#     debugfile = 'parser.out'

    # Get the token list from the lexer (required)
    tokens = NenaLexer.tokens

    # text (start rule): heading, empty line, paragraphs
    @_('heading NEWLINES paragraphs')
    def text(self, p):
        return (p.heading, p.paragraphs)

    # heading: title (optionally preceded by whitespace), empty line,
    # attributes
    @_('SPACE TITLE NEWLINES attributes',
       'TITLE NEWLINES attributes')
    def heading(self, p):
        return [p.TITLE] + p.attributes

    # attributes: one attribute per line
    @_('ATTRIBUTE SPACE attributes')
    def attributes(self, p):
        return [p.ATTRIBUTE] + p.attributes

    @_('ATTRIBUTE')
    def attributes(self, p):
        return [p.ATTRIBUTE]

    # paragraphs: paragraphs are separated by an empty line
    @_('paragraphs NEWLINES paragraph')
    def paragraphs(self, p):
        return p.paragraphs + [p.paragraph]

    @_('paragraph')
    def paragraphs(self, p):
        return [p.paragraph]

    # paragraph: one or more consecutive lines
    @_('paragraph line')
    def paragraph(self, p):
        return p.paragraph + [p.line]

    @_('line')
    def paragraph(self, p):
        return [p.line]

    # line: a line id followed by its elements
    @_('line_id line_elements')
    def line(self, p):
        return (p.line_id, p.line_elements)

    # line_id: '(' number ')' followed by whitespace; value is the int
    @_('"(" DIGITS ")" SPACE')
    def line_id(self, p):
        return int(p.DIGITS)

    # line_elements: flat list; every line_element is itself a list, so
    # the lists are concatenated rather than nested
    @_('line_elements line_element')
    def line_elements(self, p):
        return p.line_elements + p.line_element

    @_('line_element')
    def line_elements(self, p):
        return p.line_element

    # line_element: always a list (an interruption may hold several words)
    @_('full_word')
    def line_element(self, p):
        return [p.full_word]

    @_('comment')
    def line_element(self, p):
        return [p.comment]

    @_('interruption')
    def line_element(self, p):
        return p.interruption

    # comment: the text between the parentheses; whitespace following the
    # comment is discarded
    @_('COMMENT SPACE',
       'COMMENT')
    def comment(self, p):
        return {'comment': p.COMMENT[1:-1]}

    # interruption followed by whitespace: words of another speaker.
    # Every word gets a 'speaker' key, and the whitespace after the
    # closing bracket is recorded in the trailer of the last word.
    @_('LPAREN_COMMENT full_words ")" SPACE',
       'LBRACKET_COMMENT full_words "]" SPACE')
    def interruption(self, p):
        # p[0] looks like '(GK: ' or '[GK: ': strip the opening bracket
        # and the trailing ': '.
        speaker = p[0][1:-2]
        for fw in p.full_words:
            fw.update({'speaker': speaker})
        if not p.full_words[-1]['trailer'].endswith(' '):
            p.full_words[-1]['trailer'] += ' '
        return p.full_words

    # interruption not followed by whitespace (directly before an empty
    # line or at the end of the input), parallel to the bare 'COMMENT'
    # alternative of `comment`. No whitespace follows, so the trailer of
    # the last word is left as it is.
    @_('LPAREN_COMMENT full_words ")"',
       'LBRACKET_COMMENT full_words "]"')
    def interruption(self, p):
        speaker = p[0][1:-2]
        for fw in p.full_words:
            fw.update({'speaker': speaker})
        return p.full_words

    # full_words: list of word dicts
    @_('full_words full_word')
    def full_words(self, p):
        # p.full_word is a single dict; wrap it so that list + list works.
        return p.full_words + [p.full_word]

    @_('full_word')
    def full_words(self, p):
        return [p.full_word]

    # full_word: a word with its trailer (following punctuation/whitespace)
    @_('word trailer')
    def full_word(self, p):
        return {'word': p.word, 'trailer': p.trailer}

    @_('word')
    def full_word(self, p):
        return {'word': p.word, 'trailer': ''}

    # word: morphemes joined by hyphens; the hyphen characters are kept as
    # separate items between the morphemes
    @_('word HYPHEN morpheme')
    def word(self, p):
        return p.word + [p.HYPHEN, p.morpheme]

    @_('morpheme')
    def word(self, p):
        return [p.morpheme]

    # morpheme
    @_('LETTERS')
    def morpheme(self, p):
        return p.LETTERS

    # trailer: run of punctuation and whitespace after a word; any
    # whitespace token (including a newline) is recorded as a single ' '
    @_('trailer PUNCTUATION')
    def trailer(self, p):
        return p.trailer + p.PUNCTUATION

    @_('trailer SPACE')
    def trailer(self, p):
        return p.trailer + ' '

    @_('PUNCTUATION')
    def trailer(self, p):
        return p.PUNCTUATION

    @_('SPACE')
    def trailer(self, p):
        return ' '

# Echo the sample input so the parse result can be compared against it.
pprint.pprint(text)

# demonstration of output results of parser, to be used by generate_TF loop
# NOTE: reuses the `lexer` instance created in the lexer cell, so that cell
# must have been run first. The parse call is the last expression so that
# the notebook displays its result, the (heading, paragraphs) tuple.
parser = NenaParser()
parser.parse(lexer.tokenize(text))

('\n'
 '# Gozáli and Nozali\n'
 '\n'
 'text_id: A8\n'
 'informant: Nanəs Bənyamən\n'
 'place: ʾƐn-Nune\n'
 '\n'
 '(1) a-word... (a-comment) (GK: lalala) bla\n'
 '(2) also words.\n'
 '\n'
 '(4) new paragraph.')




([('title', 'Gozáli and Nozali'),
  ('text_id', 'A8'),
  ('informant', 'Nanəs Bənyamən'),
  ('place', 'ʾƐn-Nune')],
 [[(1,
    [{'word': ['a', '-', 'word'], 'trailer': '... '},
     {'comment': 'a-comment'},
     {'word': ['lalala'], 'trailer': ' ', 'speaker': 'GK'},
     {'word': ['bla'], 'trailer': ' '}]),
   (2,
    [{'word': ['also'], 'trailer': ' '},
     {'word': ['words'], 'trailer': '.'}])],
  [(4,
    [{'word': ['new'], 'trailer': ' '},
     {'word': ['paragraph'], 'trailer': '.'}])]])

TODO
- implement 'foreign' marker `*`
- implement language marker `<Marker>`
- implement line and verse breaks `/` and `//`

QUESTIONS

Some questions require answers for implementation. They need not be definitive answers for now, but they should be motivated somehow (even if the motivation is 'random choice'), so it will be clear later why it is done in one way or another.

- How to store the hyphen? Now it is stored as a character in a word, occurring between morphemes (I think).
  
  Should it be the trailer of the morpheme?


- How to split sentences?

  Now sentences are split on .?! and subsentences on ,
  There are other symbols: ;:– and even .. ... ..., .... ..... (If I recall correctly). Should those split
  sentences or subsentences?


- What to do with poetic line breaks and sentence/paragraph boundaries?

  I think a 'poem' should not be divided into paragraphs. I suggest that a line break '/' is a subsentence division, and a verse break '//' a sentence division (even when in the source it is followed by an empty line). If there is a verse number in between, that automatically starts a new sentence.