# NENA to TF

This notebook will be used to develop code for converting texts from .nena format to Text-Fabric. The parser has principally been written by Hannes Vlaardingerbroek. Many thanks to him for his hard work on it. Updates and refinements have been added by Cody Kingham.

In [1]:
! echo "last updated"; date

last updated
Mon 16 Mar 2020 20:08:54 GMT


In [2]:
import os
import sys
import collections
import re
import csv
import unicodedata
import tabulate
from pathlib import Path
from tf.convert.walker import CV
from tf.fabric import Fabric
from Levenshtein import distance

# path to parser
parserpath = f'../../nena_corpus/parse_nena/'
sys.path.append(parserpath)
from nena_parser import NenaLexer, NenaParser

# paths
corpus = Path('/Users/cody/github/CambridgeSemiticsLab/nena_corpus')
VERSION = '0.01'
OUT_DIR = Path(f'../tf/{VERSION}')
data_dir = corpus.joinpath(f'nena/{VERSION}')
dialect_dirs = list(Path(data_dir).glob('*'))
glossary = corpus.joinpath(f'glossaries/bar glossary general.txt')

# open char tables
# trans_lite_table = Path('../char_tables/trans_lite.tsv')
# with open(trans_lite_table, 'r') as infile:
#     trans_data = list(csv.reader(infile, delimiter='\t'))[1:]
#     trans_lite = {unicodedata.normalize('NFC', td[0]):td[1] for td in trans_data}



In [3]:
TF = Fabric(locations=[str(OUT_DIR)], silent=True)
cv = CV(TF)

## Test NENA Parser

The NENA Parser delivers the text as structured morphemes, which can then be processed into a TF graph. We do that below by opening each source text, retrieving its parsed form, and begin each iteration. 

In [4]:
lexer = NenaLexer()
parser = NenaParser()

In [5]:
# dialect2file2parsed = collections.defaultdict(lambda: collections.defaultdict())

# nparsed = 0

# for dialect in sorted(dialect_dirs):    
#     print()
#     print(dialect.name)
#     for file in sorted(dialect.glob('*.nena')):
#         with open(file, 'r') as infile:
#             text = infile.read()
#             print(f'parsing: {file.name}')
#             parse = parser.parse(lexer.tokenize(text))
#             nparsed += 1
#             dialect2file2parsed[dialect.name][file.name] = parse
            

# print('\n', nparsed, 'texts ready for conversion')

In [6]:
# linenum, elements = dialect2file2parsed['Urmi_C']['Village Life.nena'][1][0]
# eg_morph = elements[0]

In [7]:
# dialect2file2parsed['Barwar']['A Hundred Gold Coins.nena'][1][0][0]

## Glossary

In [8]:
def normalize_entry(text):
    """Normalize text by removing style elements/quirks"""
    # remove trailing spaces
    text = text.strip()
    # remove style asterixes
    text = text.replace('*', '')
    # remove begin/end single quotes
    text = re.sub('^\u0027|\u0027$', '', text)
    text = unicodedata.normalize('NFC', text)
    return text
    
with open(glossary,'r') as infile:
    glosses = infile.read()
    glosses = re.split('\n\n', glosses)
    gloss_data = []
    for lemma in glosses:
        lemma_data = {}
        for dataline in lemma.split('\n'):
            key, value = dataline.split(':', 1)
            key,value = key.strip(), value.strip()
            lemma_data[key] = value
            
        gloss_data.append(lemma_data)
        
# clean up and prep nena gloss data for 
# matching with instances in the text
nena_glosses = {}
for gl_id, gloss in enumerate(gloss_data):
    data = {}
    data['lemmas'] = tuple(
        normalize_entry(ent) for ent in re.split('[,;]', gloss['lemma'])
    )
    data['lemma'] = normalize_entry(gloss['lemma'])
    data['grm_desc'] = normalize_entry(gloss.get('grm_desc', ''))
    forms = eval(gloss.get('forms', '[]'))
    data['forms'] = [
        (fdata, normalize_entry(form)) 
            for fdata, forms in forms
                for form in re.split('[;,]', forms)
    ]
    data['examples'] = unicodedata.normalize('NFC',
        re.sub('^\u0027|\u0027$', '', gloss.get('examples',''))
    )
    data['trans'] = normalize_entry(
        re.sub('\(§[\d.,;\s§]+\)', '', gloss.get('trans',''))
    )
    
    data['lang'] = normalize_entry(gloss.get('lang',''))
    data['ref'] = normalize_entry(gloss.get('ref', ''))
    nena_glosses[gl_id] = data
    
# prepare matchset for matching surface forms
gloss_matchset = []

for lem_id, lemdat in nena_glosses.items():
    for lemma in lemdat['lemmas']:
        lemma = lemma.strip('-')
        gloss_matchset.append(
            (re.compile(f'^{lemma}$'), (lem_id, ''))
        )
    for form in lemdat['forms']:
        form_type = form[0]
        for formstring in form[1:]:
            if not formstring:
                continue
            formstring = formstring.strip('-')
            gloss_matchset.append(
                (re.compile(f'^{formstring}$'), (lem_id, form_type))
            )

In [9]:
print(f'{len(gloss_matchset)} surface forms ready for match attempts...')

6674 surface forms ready for match attempts...


In [10]:
gloss_matchset[:2]

[(re.compile(r'^ʾabaya$', re.UNICODE), (0, '')),
 (re.compile(r'^ʾabbaya$', re.UNICODE), (0, ''))]

In [11]:
# for i in range(1, 10):
#     print(nena_glosses[i])

In [12]:
def match_gloss(string):
    """A function to match strings to glosses
    
    Matches can be made at the word or the morpheme level.
    """
    for pattern, pattern_data in gloss_matchset:
        if pattern.match(string):
            return pattern_data
            break

## Transcriptions

In [13]:
# trans_full = {
#     # non-latin vowels
#     '\u0131': '1',  # 0x0131 ı dotless i
#     '\u0251': '@',  # 0x0251 ɑ alpha
#     '\u0259': '3',  # 0x0259 ə schwa
#     '\u025B': '$',  # 0x025B ɛ open e
#     # vowel accents
#     '\u0300': '`',  # 0x0300 à grave
#     '\u0301': "'",  # 0x0301 á acute
#     '\u0304': '_',  # 0x0304 ā macron
#     '\u0306': '%',  # 0x0306 ă breve
#     '\u0308': '"',  # 0x0308 ä diaeresis
#     '\u0303': '~',  # 0x0303 ã tilde
#     '\u02C8': '', # 0x2c8 ˈ small vertical line
#     # non-latin consonants
#     '\u00F0': '6',  # 0x00F0 ð eth
#     '\u025F': '&',  # 0x025F ɟ small dotless j with stroke
#     '\u0248': '!',  # 0x0248 Ɉ capital J with stroke
#     '\u03B8': '8',  # 0x03B8 θ greek theta
#     '\u02B8': '7',  # 0x02B8 ʸ small superscript y
#     '\u02BE': '}',  # 0x02BE ʾ right half ring (alaph)
#     '\u02BF': '{',  # 0x02BF ʿ left half ring (ayin)
#     # consonant diacritics
#     '\u207A': '+',  # 0x207A ⁺ superscript plus
#     '\u030C': '<',  # 0x030C x̌ caron
#     '\u0302': '^',  # 0x0302 x̂ circumflex
#     '\u0307': ';',  # 0x0307 ẋ dot above
#     '\u0323': '.',  # 0x0323 x̣ dot below
#     '\u032D': '>',  # 0x032D x̭ circumflex below
    
#     # punctuation
#     '\u02C8': '|', # 0x2c8 ˈ small vertical line
# }
# def trans(s, table, mark_punct=True):
#     '''
#     Transcribes a text.
#     '''
#     s = unicodedata.normalize('NFD', s)
#     # mark punctuation 
#     if mark_punct:
#         s = re.sub('([\n,.!?:;/])', r'/\g<1>', s, 1)
#     return ''.join([table.get(c, c) for c in s])

In [14]:
class Transcriber:
    """Transcribe a string according to transcription rules.
    
    This transcription class is essentially a filter
    which determines which characters make it into a new,
    transcribed string. The filter is applied on a letter-by-letter 
    basis. A "letter" (token) is defined by the `tokens` argument 
    and can include diacritics/accents. The filter is applied 
    in one of three methods:
        1. replacements on a unicode composed letter (NFC)
        2. or replacements on punctuation if letter is punctuation
        3. or replacements on a unicode decomposed letter (NFD)
    The changes are added to a new string which is then returned.
    
    __init__(tokens, replacements, punctuation, keep):
        string: a string to transcribe
        tokens: regex for splitting letters (tokens)
            to be used with findall
        replacements: dict with find:replace mappings
        keep: regex for characters to keep
            
    Returns:
        str in transcribed form
    """
    def __init__(self, tokens='', replace={}, 
                 punctuation='', keep='', keep_case=False):    
        self.tokenize = re.compile(f'{tokens}|{punctuation}').findall
        self.punct = re.compile(punctuation)
        self.keep = re.compile(keep)
        self.keep_case = keep_case
        
        # ensure normalized characters for pattern searches
        self.repl = {
            unicodedata.normalize('NFC',f):r 
                for f,r in replace.items()
        }
        
    def convert(self, string):
        """Convert string to transcription.
        
        Returns:
            str in transcribed form
        """
        
        string = unicodedata.normalize('NFC',string)
        if not self.keep_case:
            string = string.lower()
        transcription = ''

        for token in self.tokenize(string):

            # filter a composed string
            if token in self.repl:
                transcription += self.repl[token]

            # keep punctuation
            elif self.punct.match(token):
                transcription += token

            # filter at 
            else:
                for char in unicodedata.normalize('NFD', token):

                    # attempt second match on char by char basis
                    if char in self.repl:
                        transcription += self.repl[char]

                    # attempt to keep with keep-set
                    elif self.keep.match(char):
                        transcription += char   
                        
        return transcription

In [15]:
trans_full = {
    'tokens': f'[\u207A]?[^\W\d_][\u0300-\u036F]*',
    'replace': {
   
    # char combinations
    'p̌':'p<',
    'ṭ': 't',
    'ð̣': '6',

    # non-latin vowels
    '\u0131': 'i',  # 0x0131 ı dotless i
    '\u0251': 'a',  # 0x0251 ɑ alpha
    '\u0259': '9',  # 0x0259 ə schwa
    '\u025B': '3',  # 0x025B ɛ open e
    
    # vowel accents
    '\u0300': '`',  # 0x0300 à grave
    '\u0301': "'",  # 0x0301 á acute
    '\u0304': '_',  # 0x0304 ā macron
    '\u0306': '%',  # 0x0306 ă breve
    '\u0308': '"',  # 0x0308 ä diaeresis
    '\u0303': '~',  # 0x0303 ã tilde
    '\u02C8': '', # 0x2c8 ˈ small vertical line
        
    # non-latin consonants
    '\u00F0': '6',  # 0x00F0 ð eth
    '\u025F': '4',  # 0x025F ɟ small dotless j with stroke
    '\u0248': '4',  # 0x0248 Ɉ capital J with stroke
    '\u03B8': '8',  # 0x03B8 θ greek theta
    '\u02B8': '7',  # 0x02B8 ʸ small superscript y
    '\u02BE': ')',  # 0x02BE ʾ right half ring (alaph)
    '\u02BF': '(',  # 0x02BF ʿ left half ring (ayin)
        
    # consonant diacritics
    '\u207A': '+',  # 0x207A ⁺ superscript plus
    '\u030C': '>',  # 0x030C x̌ caron
    '\u0302': '^',  # 0x0302 x̂ circumflex
    '\u0307': ';',  # 0x0307 ẋ dot above
    '\u0323': '.',  # 0x0323 x̣ dot below
    '\u032D': '<',  # 0x032D x̭ circumflex below
    
    # punctuation
    '\u02C8': '|', # 0x2c8 ˈ small vertical line
    },
    'punctuation': '[\s.,?!:;–\-\u2014]',
    'keep': '[A-Za-z]',
}

trans_lite = {
    'tokens': f'[\u207A]?[^\W\d_][\u0300-\u036F]*', 
    'replace': {
        'ʾ': ')',
        'ʿ': '(',
        'č': '5',
        'č̭': '5',
        'č̣': '%',
        'ḍ': 'D',
        'ð': '6',
        'ð̣': '^',
        'ġ': 'G',
        'ḥ': 'H',
        'ɟ': '4',
        'Ɉ': '4',
        'k̭': '&',
        'ḷ': 'L',
        'ṃ': 'M',    
        'p̣': 'P',
        'ṛ': 'R',
        'ṣ': 'S',
        'š': '$',
        'ṱ': '+',
        'ṭ': 'T',
        'θ': '8',
        'ž': '7',
        'ẓ': 'Z',
        'ā̀': 'A',
        'ā́': 'A',
        'ă': '@',
        'ắ': '@',
        'ằ': '@',
        'ē': 'E',
        'ɛ': '3',
        'ī': 'I',
        'ĭ': '9',
        'ə': '9',
        'o': 'o',
        'ō': 'O',
        'ū': 'U',
        'ŭ': '2',
        'ı': 'i',
        'ɑ': 'a',
        'ˈ': '|'
    },
    'punctuation': '[\s.,?!:;–\-\u2014]',
    'keep': '[A-Za-z]',
}

fuzzy_urmi = {
    'tokens': f'[\u207A]?[^\W\d_][\u0300-\u036F]*',
    'replace': {
        'c': 'k',
        'c̭': 'k',
        'č': '5',
        'č̭': '5',
        'č̣': '5',
        'k̭': 'q',
        'ɟ': 'g',
        'Ɉ': 'g',
        'ə': 'i',
        'v': 'w',
    },
    'punctuation': '[\s.,?!:;–\-\u2014]',
    'keep': '[A-Za-z]',
}

fuzzy_barwar = {
    'tokens': f'[\u207A]?[^\W\d_][\u0300-\u036F]*',
    'replace': {
        'č': '5',
        'č̭': '5',
        'č̣': '5',
        'k̭': 'k',
        'θ': 't',
        'ð': 'd',
        'ɛ': 'e',
        'ə': 'i',
    },
    'punctuation': '[\s.,?!:;–\-\u2014]',
    'keep': '[A-Za-z]',
}

trans_text = {
    'tokens': f'[\u207A]?[^\W\d_][\u0300-\u036F]*',
    'replace': {
    },
    'punctuation': '[-–]',
    'keep': '[^\s.,?!:;\u02C8]+',
}

def normalize_nena(word):
    """Strip accents and spaces from NENA text on a node.
    
    Args:
        word: a node number to get normalized text
    """
    accents = '\u0300|\u0301|\u0304|\u0306|\u0308|\u0303'
    norm = unicodedata.normalize('NFD', word) # decompose for accent stripping
    norm = re.sub(accents, '', norm) # strip accents
    return unicodedata.normalize('NFC', norm)

In [16]:
# for char in unicodedata.normalize('NFD','č̣'):
#     print(char)

In [17]:
# trans_test = Transcriber(**fuzzy_barwar)
# trans_test_full = Transcriber(**trans_full)

test = 'ʾə́č̣č̣a dáwe zìle mə́nne'

trans_test = Transcriber(**trans_lite)

In [18]:
trans_test.convert(test)

')9%%a dawe zile m9nne'

In [19]:
test = 'bəṱ-lábən.ˈ'
trans_test = Transcriber(**trans_text)
trans_test.convert(test)

'bəṱ-lábən'

# Metadata

In [20]:
slotType = 'letter'

otext = {
    'sectionTypes': 'dialect,text,line',
    'sectionFeatures': 'dialect,title,number',
    'fmt:text-orig-full': '{text}{end}',
    'fmt:text-trans-full': '{full}{full_end}',
    'fmt:text-trans-lite': '{lite}{lite_end}',
    'fmt:text-trans-fuzzy': '{fuzzy}{fuzzy_end}',
}

description = ''.join("""
The NENA linguistic corpus is derived from decades of 
field work by Prof. Geoffrey Khan and his students.
""".split('\n'))

generic = {
    'origin': 'Cambridge University, Faculty of Asian and Middle Eastern Studies',
    'author': 'Geoffrey Khan et al.',
    'editors': 'Cody Kingham, Paul Noorlander, James Strachan, Hannes Vlaardingerbroek',
    'researchers': 'Dorota Molin, Johan Lundberg',
    'source': description,
    'url': 'https://github.com/CambridgeSemiticsLab/nena_tf',
}

intFeatures = {'number'}

d = 'about'
feature_type = 'feature_type'

featureMeta = {
    'dialect': {d: 'name of a dialect in Northeastern Neo-Aramaic', feature_type:'categorical'},
    'title': {d: 'title of a text (story)', feature_type:'string'},
    'version': {d: 'version of the story if there are multiple instances of the same story', feature_type:'categorical'},
    'number': {d: 'sequential number of a paragraph or line within a text or paragraph, respectively', feature_type:'integer'},
    'text': {d: 'plain text representation of a letter, morpheme, or word', feature_type:'text'},
    'text_norm': {d: 'plain text without accents', feature_type:'text'},
    'full': {d: 'full transcription, one-to-one transcription of a letter, morpheme, or word', feature_type:'text'},
    'lite': {d: 'lite transcription of a letter, morpheme, or word, without vowel accents', feature_type:'text'},
    'fuzzy': {d: 'fuzzy transcription that leaves out most diacritics and maps certain characters in certain dialects to common characters', feature_type:'text'},
    'end': {d: 'space, punctuation, or other stylistic text at the end of a morpheme or word', feature_type:'text'},
    'full_end': {d: 'full transcription of punctuation or other stylistic text at the end of a morpheme or word; see also trans_f', feature_type:'text'},
    'lite_end': {d: 'lite transcription of punctuation or other stylistic text at the end of a morpheme or word, excluding intonation boundary markers; see also trans_l', feature_type:'text'},
    'fuzzy_end': {d: 'fuzzy transcription of punctuation or other stylistic text at the end of a morpheme or word, excluding intonation boundary markers; see also trans_l', feature_type:'text'},
    'speaker': {d: 'name or initials of person speaking a morpheme or word; see also informant', feature_type:'string'},
    'footnotes': {d: 'explanatory footnote on a morpheme or text', feature_type:'string'},
    'lang': {d: 'language of a morpheme foreign to a text', feature_type:'categorical'},
    'foreign': {d: 'indicates whether a morpheme is foreign to a text; see also lang', feature_type:'string'},
    'comment': {d: 'explanatory comment inserted in the text, stored on a morpheme', feature_type:'string'},
    'continued_from': {d: 'text is a follow-up to the named text', feature_type:'string'},
    'informant': {d: 'name of person who spoke these words', feature_type:'categorical'},
    'place': {d: 'place a text was recorded', feature_type:'categorical'},
    'source': {d: 'name of the file from which a text was converted', feature_type:'string'},
    'text_id': {d: 'id of a text within its original publication; can overlap between publications', feature_type:'string'},
    'class': {d: 'class of a letter (consonant or vowel)', feature_type:'categorical'},
    'lemma': {d: 'lemma of a word', feature_type:'string'},
    'lemma_form': {d: 'grammatical form of a word lemma', feature_type:'string'},
    'grm_desc': {d: 'grammatical description of a word lemma', feature_type:'categorical'},
    'gloss': {d: 'English gloss of a word lemma', feature_type:'string'},
}

# Converter

Build a TF Walker class that can walk over the NENA parsed data and fit the text graph.

In [21]:
def director(CV):
    """Walk the source data and produce a TF graph"""
    
    info = TF.tm.info
    
    # transcriptions particular to dialects
    dialect2fuzzy = {
        'Barwar': Transcriber(**fuzzy_barwar),
        'Urmi_C': Transcriber(**fuzzy_urmi),
    }
    
    matched_glosses = set() # track matched glosses from gloss-set
    
    def make_footnotes(fn_dict):
        """Format footnote dict into string"""
        if fn_dict:
            return '; '.join(
                f'[^{num}]: {txt}' for num, txt in fn_dict.items()
            )
        else:
            return None

    t_text = Transcriber(**trans_text) # full text transcription without punctuation
    t_full = Transcriber(**trans_full) # transcription full
    t_lite = Transcriber(**trans_lite) # transcription lite feature

    def make_wordfeats(mfeat_list, ignore={}, dialect=''):
        """Convert a list of morpheme feature dicts into one for a word.

        Features stored on a word must be inherited in special ways
        for words. For example, a word's "end" feature should be 
        the last morpheme, not all of the ends. Those features 
        are specially processed here.
        """

        # gather word features here
        word_fs = collections.defaultdict(set)

        # add features
        for mfeats in mfeat_list:
            for feat,val in mfeats.items():
                if feat in ignore:
                    continue
                else:
                    word_fs[feat].add(val)

        # handle special cases
        word_fs['end'] = mfeat_list[-1]['end']
        word_fs['full_end'] = mfeat_list[-1]['full_end']
        word_fs['lite_end'] = mfeat_list[-1]['lite_end']
        word_fs['fuzzy_end'] = mfeat_list[-1]['fuzzy_end']
        word_fs['text'] = t_text.convert(''.join(
            mf['text']+mf['end'] for mf in mfeat_list
        ))
        # add transcription with end, leaving off the end from the 
        # last morpheme
        trans_parts = [('full', 'full_end'), ('lite', 'lite_end'), ('fuzzy', 'fuzzy_end')]
        for trans, end in trans_parts:
            word_fs[trans] = ''
            for i,mf in enumerate(mfeat_list):
                word_fs[trans] += mf[trans]
                if i+1 != len(mfeat_list):
                    word_fs[trans] += mf[end]

        # convert to strings and handle duplicates
        for feat,val in word_fs.items():
            if type(val) == set:
                val = {v for v in val if v}
                if val:
                    word_fs[feat] = ' '.join(val)
                else:
                    word_fs[feat] = None

        return word_fs
    
    for dialect_dir in sorted(dialect_dirs):  
        
        # make dialect node
        dialect = cv.node('dialect')
        dia = dialect_dir.name
        cv.feature(dialect, dialect=dia)
        
        # retrieve fuzzy transcription particular to dialect
        t_fuzzy = dialect2fuzzy[dia]
        
        # process file into TF graph
        for file in sorted(dialect_dir.glob('*.nena')):
            
            info(f'processing: [{file}]')
            
            with open(file, 'r') as infile:
                nena_text = infile.read()
            
            # parse the .nena format
            header, paragraphs = parser.parse(lexer.tokenize(nena_text))
            
            # -- begin TF node creation --
            
            # cv.node initializes a node object
            # all slots added in between its creation and 
            # termination will be considered embedded within
            # this node; same is true of following cv.node calls
            text = cv.node('text')
            cv.feature(text, **header) # adds features to supplied node
            title = header['title']
            
            for i, para in enumerate(paragraphs):
                
                # TODO: Process footnotes here
                if len(para[0]) != 2:
                    continue
                
                # make paragraph node
                paragraph = cv.node('paragraph')
                cv.feature(paragraph, number=i+1)
                
                for line_num, line_elements in para:
                    
                    # make line nodes
                    line = cv.node('line')
                    cv.feature(line, number=line_num)
                    
                    # Make linguistic nodes by parsing morphemes.
                    # This must be done iteratively and composed
                    # based on characters at the end of each morpheme. 
                    # Punctuation signals intonation/subsentence/sentence 
                    # boundaries; spaces and hyphens signal word bounds. 
                    # This is handled in the loop below.
                    word = cv.node('word')
                    inton = cv.node('inton')
                    subsentence = cv.node('subsentence')
                    sentence = cv.node('sentence')
                    word_features = [] # store morphs feats here for processing

                    for i, elem in enumerate(line_elements):
                        
                        is_end = i+1 == len(line_elements)

                        # add morphemes as slots
                        # 'slot' being the most basic element
                        if elem.__class__.__name__ == 'Morpheme':
                            
                            # make morpheme node
                            morph = cv.node('morpheme')
                            
                            # access/prepare morph features
                            fs = elem.__dict__
                            trailer = elem.trailer.replace('/', '\n')
                            
                            # package & edit morph features for cv
                            # NB: None values are ignored by default
                            m_string = ''.join(elem.value)
                            feats = {
                                #  make string representations
                                'text': t_text.convert(m_string),
                                'text_norm': normalize_nena(t_text.convert(m_string)),
                                'full': t_full.convert(m_string),
                                'lite': t_lite.convert(m_string),
                                'fuzzy': t_fuzzy.convert(m_string),
                                'end': trailer,
                                
                                # make punctuation strings at end of morpheme
                                'full_end': t_full.convert(trailer),
                                'lite_end': t_lite.convert(trailer),
                                'fuzzy_end': t_fuzzy.convert(trailer),
                                
                                # make metadata features
                                'speaker': fs.get('speaker') or header.get('informant'),
                                'footnotes': make_footnotes(fs.get('footnotes', {})),
                                'lang': fs.get('lang'),
                                'foreign': str(fs.get('foreign')) if fs.get('foreign') else None,
                            }
                            
                            # attempt gloss matches
                            if dia == 'Barwar':
                                gloss_match = match_gloss(feats['text_norm'])
                                if gloss_match:
                                    lem_id, lem_form = gloss_match
                                    feats['lemma'] = nena_glosses[lem_id]['lemma']
                                    feats['lemma_form'] = lem_form or None
                                    feats['grm_desc'] = nena_glosses[lem_id].get('grm_desc')
                                    feats['gloss'] = nena_glosses[lem_id].get('trans')
                                
                                    # track which forms are found
                                    matched_glosses.add(gloss_match)
                            
                            # make letter slots
                            # creation of a slot simultaneously 
                            # embeds it within all active nodes
                            
                            for i, let in enumerate(elem.value):
                                # letter features
                                letfs = {
                                    
                                    #  make string representations
                                    'text': let,
                                    'full': t_full.convert(let),
                                    'lite': t_lite.convert(let),
                                    'fuzzy': t_fuzzy.convert(let),
                                    
                                    # make punctuation strings at end of letter
                                    # set it to null by default
                                    'end': '',
                                    'full_end': '',
                                    'lite_end': '', 
                                    'fuzzy_end': '',                        
                                }
                                
                                # make letter class data
                                vowels = {'a','e','i','o','u'}
                                if letfs['fuzzy'] in vowels:
                                    letfs['class'] = 'vowel'
                                else:
                                    letfs['class'] = 'consonant'
                                
                                # keep punctuation after letter at end of a morpheme 
                                if i+1 == len(elem.value):
                                    letfs['end'] = trailer
                                    letfs['full_end'] = feats['full_end']
                                    letfs['lite_end'] = feats['lite_end']
                                    letfs['fuzzy_end'] = feats['fuzzy_end']
                                letter = cv.slot()
                                cv.feature(letter, **letfs)
                                cv.terminate(letter)
                            
                            word_features.append(feats)
                            cv.feature(morph, **feats)
                            cv.terminate(morph)
                                
                            # -- trigger linguistic node endings --
                            
                            # word ending
                            if (not re.match('^$|^[-=]$', trailer)) or is_end:
                                cv.feature(word, **make_wordfeats(word_features))
                                word_features = []
                                cv.terminate(word)
                                if not is_end:
                                    word = cv.node('word')
                                    
                            # intonation group ending
                            if re.search('\u02c8', trailer):
                                cv.terminate(inton)
                                if not is_end:
                                    inton = cv.node('inton')
                            
                            # subsentence ending
                            if re.search('[,;:\u2014\u2013]', trailer):
                                cv.terminate(subsentence)
                                if not is_end:
                                    subsentence = cv.node('subsentence')
                            
                            # sentence ending
                            elif re.search('[.!?]', trailer):
                                cv.terminate(subsentence)
                                cv.terminate(sentence)
                                if not is_end:
                                    subsentence = cv.node('subsentence')
                                    sentence = cv.node('sentence')
                                
                        # add other elements
                        else:
                            kind, data = elem
                            if kind == 'footnote':
                                cv.feature(text, footnote=make_footnotes(data))
                            else:
                                cv.feature(morph, **{kind:str(data)})
                    
                    # sanity check for un-closed words, itons, subsentences, sentences
                    # due either to lack of proper punctuation in the source text (to be fixed later)
                    # or due to non-morpheme elements intervening in the iteration
                    unclosed = {'inton','sentence', 'subsentence', 'word'} & cv.activeTypes()
                    if unclosed:
                        sys.stderr.write(f'force-closing types {unclosed} in {title} ln {line_num}\n')
                        cv.terminate(word)
                        cv.terminate(inton)
                        cv.terminate(subsentence)
                        cv.terminate(sentence)
                        
                    # -- trigger section node endings --
                    cv.terminate(line)
                cv.terminate(paragraph)
            cv.terminate(text)
        cv.terminate(dialect)
        
    info(f'{len(matched_glosses)} glosses matched...')
    info(f'{len(set(gloss_matchset) - matched_glosses)} glosses not matched...')

## Test good


In [None]:
good = cv.walk(
    director,
    slotType,
    otext=otext,
    generic=generic,
    intFeatures=intFeatures,
    featureMeta=featureMeta,
    warn=True,
    force=False,
)

  0.00s Importing data from walking through the source ...
   |     0.00s Preparing metadata... 
   |     0.00s No structure nodes will be set up
   |   SECTION   TYPES:    dialect, text, line
   |   SECTION   FEATURES: dialect, title, number
   |   STRUCTURE TYPES:    
   |   STRUCTURE FEATURES: 
   |   TEXT      FEATURES:
   |      |   text-orig-full       end, text
   |      |   text-trans-full      full, full_end
   |      |   text-trans-fuzzy     fuzzy, fuzzy_end
   |      |   text-trans-lite      lite, lite_end
   |     0.01s OK
   |     0.00s Following director... 
   |     0.00s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/A Hundred Gold Coins.nena]
   |     0.66s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/A Man Called Čuxo.nena]
   |     2.15s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/A Tale of Two Kings.nena]
   |     3.15s processing: [/Users/cody/github/Cambri

force-closing types {'subsentence', 'sentence'} in A Tale of a Prince and a Princess ln 32


   |     7.18s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/Baby Leliθa.nena]
   |     9.23s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/Dəmdəma.nena]
   |       11s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/Gozali and Nozali.nena]


force-closing types {'subsentence', 'sentence'} in Gozali and Nozali ln 1


   |       18s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/I Am Worth the Same as a Blind Wolf.nena]


force-closing types {'subsentence', 'sentence'} in I Am Worth the Same as a Blind Wolf ln 2


   |       19s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/Man Is Treacherous.nena]
   |       20s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/Measure for Measure.nena]
   |       20s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/Nanno and Jəndo.nena]
   |       21s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/Qaṭina Rescues His Nephew From Leliθa.nena]


force-closing types {'subsentence', 'sentence'} in Qaṭina Rescues His Nephew From Leliθa ln 1
force-closing types {'subsentence', 'sentence'} in Qaṭina Rescues His Nephew From Leliθa ln 2
force-closing types {'subsentence', 'sentence'} in Qaṭina Rescues His Nephew From Leliθa ln 3
force-closing types {'subsentence', 'sentence'} in Qaṭina Rescues His Nephew From Leliθa ln 4
force-closing types {'subsentence', 'sentence'} in Qaṭina Rescues His Nephew From Leliθa ln 5
force-closing types {'subsentence', 'sentence'} in Qaṭina Rescues His Nephew From Leliθa ln 7
force-closing types {'subsentence', 'sentence'} in Qaṭina Rescues His Nephew From Leliθa ln 8
force-closing types {'sentence'} in Qaṭina Rescues His Nephew From Leliθa ln 10
force-closing types {'subsentence', 'sentence'} in Qaṭina Rescues His Nephew From Leliθa ln 11
force-closing types {'subsentence', 'sentence'} in Qaṭina Rescues His Nephew From Leliθa ln 12
force-closing types {'subsentence', 'sentence'} in Qaṭina Rescues His Ne

   |       22s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/Sour Grapes.nena]
   |       22s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/Tales From the 1001 Nights.nena]
   |       28s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Battle With Yuwanəs the Armenian.nena]


force-closing types {'subsentence', 'sentence'} in The Battle With Yuwanəs the Armenian ln 4
force-closing types {'subsentence', 'sentence'} in The Battle With Yuwanəs the Armenian ln 5
force-closing types {'subsentence', 'sentence'} in The Battle With Yuwanəs the Armenian ln 20
force-closing types {'subsentence', 'sentence'} in The Battle With Yuwanəs the Armenian ln 21
force-closing types {'subsentence', 'sentence'} in The Battle With Yuwanəs the Armenian ln 22


   |       30s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Bear and the Fox.nena]


force-closing types {'inton', 'sentence'} in The Battle With Yuwanəs the Armenian ln 25
force-closing types {'subsentence', 'sentence'} in The Battle With Yuwanəs the Armenian ln 26


   |       30s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Brother of Giants.nena]
   |       31s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Cat and the Mice.nena]
   |       32s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Cooking Pot.nena]
   |       32s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Crafty Hireling.nena]


force-closing types {'subsentence', 'sentence'} in The Crafty Hireling ln 43


   |       35s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Crow and the Cheese.nena]
   |       35s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Daughter of the King.nena]


force-closing types {'subsentence', 'sentence'} in The Crafty Hireling ln 54
force-closing types {'subsentence', 'sentence'} in The Crow and the Cheese ln 1
force-closing types {'subsentence', 'sentence'} in The Crow and the Cheese ln 2
force-closing types {'subsentence', 'sentence'} in The Crow and the Cheese ln 3
force-closing types {'subsentence', 'sentence'} in The Crow and the Cheese ln 5
force-closing types {'subsentence', 'sentence'} in The Crow and the Cheese ln 6


   |       37s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Fox and the Lion.nena]
   |       38s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Fox and the Miller.nena]
   |       39s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Fox and the Stork.nena]
   |       39s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Giant’s Cave.nena]
   |       40s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Girl and the Seven Brothers.nena]


force-closing types {'sentence'} in The Girl and the Seven Brothers ln 2
force-closing types {'subsentence', 'sentence'} in The Girl and the Seven Brothers ln 3
force-closing types {'sentence'} in The Girl and the Seven Brothers ln 12


   |       41s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The King With Forty Sons.nena]


force-closing types {'sentence'} in The King With Forty Sons ln 40


   |       46s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Leliθa From č̭āl.nena]
   |       46s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Lion King.nena]
   |       47s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Lion With a Swollen Leg.nena]
   |       48s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Man Who Cried Wolf.nena]
   |       48s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Man Who Wanted to Work.nena]
   |       51s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Monk Who Wanted to Know When He Would Die.nena]
   |       51s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Monk and the Angel.nena]
   |       53s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/

force-closing types {'subsentence', 'sentence'} in The Sale of an Ox ln 41


   |       56s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Scorpion and the Snake.nena]
   |       56s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Selfish Neighbour.nena]
   |       57s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Sisisambər Plant.nena]


force-closing types {'subsentence', 'sentence'} in The Sisisambər Plant ln 2
force-closing types {'subsentence', 'sentence'} in The Sisisambər Plant ln 8
force-closing types {'subsentence', 'sentence'} in The Sisisambər Plant ln 9


   |       57s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Story With No End.nena]


force-closing types {'subsentence', 'sentence'} in The Sisisambər Plant ln 14
force-closing types {'subsentence', 'sentence'} in The Sisisambər Plant ln 15


   |       58s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Tale of Farxo and Səttiya.nena]


force-closing types {'inton'} in The Tale of Farxo and Səttiya ln 29


   |    1m 03s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Tale of Mămo and Zine.nena]


force-closing types {'subsentence', 'sentence'} in The Tale of Mămo and Zine ln 22


   |    1m 08s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Tale of Mərza Pămət.nena]


force-closing types {'subsentence', 'sentence'} in The Tale of Mərza Pămət ln 32


   |    1m 11s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Tale of Nasimo.nena]


force-closing types {'sentence'} in The Tale of Nasimo ln 3
force-closing types {'subsentence', 'sentence'} in The Tale of Nasimo ln 4
force-closing types {'subsentence', 'sentence'} in The Tale of Nasimo ln 5
force-closing types {'sentence'} in The Tale of Nasimo ln 6
force-closing types {'subsentence', 'sentence'} in The Tale of Nasimo ln 7


   |    1m 11s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Tale of Parizada, Warda and Nargis.nena]


force-closing types {'inton'} in The Tale of Parizada, Warda and Nargis ln 29


   |    1m 15s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Tale of Rustam (1).nena]


force-closing types {'subsentence', 'sentence'} in The Tale of Parizada, Warda and Nargis ln 55


   |    1m 17s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Tale of Rustam (2).nena]


force-closing types {'subsentence', 'sentence'} in The Tale of Rustam (2) ln 51


   |    1m 21s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Wise Daughter of the King.nena]
   |    1m 22s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Wise Snake.nena]


force-closing types {'sentence'} in The Wise Snake ln 1


   |    1m 23s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/The Wise Young Man.nena]


force-closing types {'subsentence', 'sentence'} in The Wise Young Man ln 25


   |    1m 26s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Barwar/šošət Xere.nena]


force-closing types {'subsentence', 'sentence'} in šošət Xere ln 6
force-closing types {'sentence'} in šošət Xere ln 7
force-closing types {'subsentence', 'inton', 'sentence'} in šošət Xere ln 8
force-closing types {'sentence'} in šošət Xere ln 10
force-closing types {'subsentence', 'sentence'} in šošət Xere ln 11


   |    1m 27s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/A Close Shave.nena]
   |    1m 27s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/A Cure for a Husband’s Madness.nena]


force-closing types {'inton'} in A Cure for a Husband’s Madness ln 1
force-closing types {'inton'} in A Cure for a Husband’s Madness ln 4
force-closing types {'inton'} in A Cure for a Husband’s Madness ln 5
force-closing types {'inton'} in A Cure for a Husband’s Madness ln 6


   |    1m 27s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/A Donkey Knows Best.nena]
   |    1m 27s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/A Dragon in the Well.nena]
   |    1m 27s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/A Dutiful Son.nena]
   |    1m 28s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/A Frog Wants a Husband.nena]
   |    1m 28s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/A Lost Donkey.nena]
   |    1m 28s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/A Lost Ring.nena]
   |    1m 28s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/A Painting of the King of Iran.nena]
   |    1m 29s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/A Pound of Flesh.nena]
   |    1m 29s processin

force-closing types {'inton'} in A Thousand Dinars ln 13


   |    1m 29s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Agriculture and Village Life.nena]
   |    1m 30s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Am I Dead?.nena]
   |    1m 30s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/An Orphan Duckling.nena]
   |    1m 31s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Axiqar.nena]


force-closing types {'word', 'subsentence', 'inton', 'sentence'} in Axiqar ln 28


   |    1m 31s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Events in 1946 on the Urmi Plain.nena]
   |    1m 31s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Games.nena]


force-closing types {'subsentence', 'sentence'} in Axiqar ln 89


   |    1m 32s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Hunting.nena]
   |    1m 32s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/I Have Died.nena]
   |    1m 32s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Ice for Dinner.nena]
   |    1m 32s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Is There a Man With No Worries?.nena]
   |    1m 32s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Kindness to a Donkey.nena]
   |    1m 32s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Lost Money.nena]
   |    1m 32s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Mistaken Identity.nena]
   |    1m 32s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Much Ado About Nothing.nena]
   |    1m 32s processing: [/Users/c

force-closing types {'inton'} in The Adventures of Ashur ln 27


   |    1m 34s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Adventures of Two Brothers.nena]
   |    1m 35s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Adventures of a Princess.nena]
   |    1m 36s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Angel of Death.nena]
   |    1m 36s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Assyrians of Armenia.nena]
   |    1m 36s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Assyrians of Urmi.nena]


force-closing types {'subsentence', 'sentence'} in The Assyrians of Armenia ln 10


   |    1m 38s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Bald Child and the Monsters.nena]
   |    1m 38s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Bald Man and the King.nena]
   |    1m 39s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Bird and the Fox.nena]
   |    1m 39s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Cat’s Dinner.nena]
   |    1m 39s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Cow and the Poor Girl.nena]
   |    1m 39s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Dead Rise and Return.nena]
   |    1m 39s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Fisherman and the Princess.nena]


force-closing types {'inton'} in The Fisherman and the Princess ln 2
force-closing types {'inton'} in The Fisherman and the Princess ln 3


   |    1m 40s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Giant One-Eyed Demon.nena]
   |    1m 40s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Little Prince and the Snake.nena]
   |    1m 40s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Loan of a Cooking Pot.nena]
   |    1m 40s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Man Who Wanted to Complain to God.nena]
   |    1m 40s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Old Man and the Fish.nena]
   |    1m 40s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Purchase of a Donkey.nena]
   |    1m 40s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Snake’s Dilemma.nena]
   |    1m 40s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.0

force-closing types {'sentence'} in The Snake’s Dilemma ln 13


   |    1m 41s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Wife Who Learns How to Work.nena]
   |    1m 41s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Wife’s Condition.nena]
   |    1m 41s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Wise Brother.nena]


force-closing types {'inton'} in The Wife Who Learns How to Work ln 1
force-closing types {'inton'} in The Wife Who Learns How to Work ln 7


   |    1m 42s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/The Wise Young Daughter.nena]
   |    1m 43s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Trickster.nena]
   |    1m 43s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Two Birds Fall in Love.nena]
   |    1m 43s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Two Wicked Daughters-In-Law.nena]
   |    1m 43s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Village Life (2).nena]


force-closing types {'subsentence', 'inton', 'sentence'} in Two Wicked Daughters-In-Law ln 9


   |    1m 43s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Village Life (3).nena]
   |    1m 44s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Village Life (4).nena]
   |    1m 44s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Village Life (5).nena]
   |    1m 44s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Village Life (6).nena]


force-closing types {'inton'} in Village Life (5) ln 1
force-closing types {'inton'} in Village Life (6) ln 34


   |    1m 46s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Village Life.nena]


force-closing types {'inton'} in Village Life ln 1
force-closing types {'inton'} in Village Life ln 5


   |    1m 46s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Vineyards.nena]
   |    1m 46s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Weddings and Festivals.nena]


force-closing types {'inton'} in Village Life ln 18
force-closing types {'inton'} in Village Life ln 20


   |    1m 47s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Weddings.nena]
   |    1m 47s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/When Shall I Die?.nena]
   |    1m 47s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Women Are Stronger Than Men.nena]
   |    1m 48s processing: [/Users/cody/github/CambridgeSemiticsLab/nena_corpus/nena/0.01/Urmi_C/Women Do Things Best.nena]
   |    1m 48s 1429 glosses matched...
   |    1m 48s 6674 glosses not matched...
   |    1m 48s "edge" actions: 0
   |    1m 48s "feature" actions: 756315
   |    1m 48s "node" actions: 294155
   |    1m 48s "resume" actions: 0
   |    1m 48s "slot" actions: 539381
   |    1m 48s "terminate" actions: 833720
   |          2 x "dialect" node 
   |      35985 x "inton" node 
   |     539381 x "letter" node  = slot type
   |       2544 x "line" node 
   |     120148 x "morpheme" node 
   |        351 x "paragr

# Documentation

Automatically construct documentation from the features and arrange by objects in descending order.

In [None]:
docs = Path('../docs')
feat_documentation = docs.joinpath('features.md')

In [None]:
TF = Fabric(locations=str(OUT_DIR))
api = TF.loadAll()
tf_vars = api.makeAvailableIn(globals())
F, L, Fall = api.F, api.L, api.Fall
C = api.C

In [None]:
doctext = """
# NENA Text-Fabric Corpus

The NENA Text-Fabric (TF) corpus contains textual transcriptions and linguistic annotations from the research group under Geoffrey Khan at the University of Cambridge.

## Contents

* [data model](#data-model)
* [features](#features)

## Data Model

For a full description of the Text-Fabric data model, see the [datamodel documentation](https://annotation.github.io/text-fabric/Model/Data-Model/).

One can think about the NENA Text-Fabric resource in two ways. The first is as a **conceptual** model, and the second is as a literal **implementation**. The conceptual model is simply a way of thinking about the text and all its various parts (words, sentences, letters, etc.). The literal implementation is the way that conceptual model is actually stored on a computer. 

The **conceptual** model of the TF NENA corpus is a graph. In mathematics, a [graph](https://en.wikipedia.org/wiki/Graph_theory) is a method of indicating relationships between entities. The entities in a graph are called "nodes", often illustrated visually as circles. Their relationships to one another are called "edges", illustrated with lines drawn between two or more circles. A visual representation can be seen below.

<img src="images/graph_illustration.png" height=30% width=30%>

In the case of a [text graph](https://www.balisage.net/Proceedings/vol19/html/Dekker01/BalisageVol19-Dekker01.html), entities like letters, words, sentences are stored as nodes. These entities also have relationships. A key relationship in Text-Fabric is "containment": a sentence contains a word, a word contains a letter. Other, optional relationships might be syntactic relations or discourse relations between sentences. With the exception of "containment", the graph model of Text-Fabric does not "care" which other relationships are modeled (syntax, discourse, etc.). The user(s) are free to choose whatever relationships they are interested in.

For instance, in the example below we can see a containment relationship being modelled between a given word and its letter:

<img src="images/containment_illustration.png" height=30% width=30%>


*To be continued...*

<hr>

# Features
""".strip()

def maketable(tabledata, headers=[]):
    return tabulate.tabulate(tabledata, headers=headers, tablefmt='pipe')

for otype_data in C.levels.data:
    otype = otype_data[0]
    count = len(list(F.otype.s(otype)))
    feature_counts = collections.Counter()
    
    # add object to the doc
    doctext += '\n\n'
    doctext += f'## {otype} ({count}x)'
    
    for feat, fdata in featureMeta.items(): 
        
        # see if otype ever used with given feature
        uses = list(Fs(feat).freqList(nodeTypes=otype))
        values = [fl[0] for fl in uses]
        total = sum(fl[-1] for fl in uses)
        if not uses:
            continue
        uses.append(('TOTAL', total))
        
        # add feature section to the document
        doctext += '\n\n'
        doctext += f'### {feat}'
        doctext += '\n\n'
        doctext += fdata['about']
        doctext += '\n'
        
        # add data about the feature
        if fdata['feature_type'] == 'categorical':
            doctext += '\n'
            doctext += maketable(uses, (feat, 'frequency'))
        elif fdata['feature_type'] == 'text':
            doctext += '\n'
            doctext += 'See the [transcription tables](transcription.md).\n\n'
            doctext += maketable([uses[-1]])
        else:
            doctext += '\n'
            doctext += 'Arbitrary string.\n\n'
            doctext += 'examples:\n'
            doctext += '```' + '\n' + '\n'.join(str(v) for v in values[:5]) + '\n' + '```'
            doctext += '\n\n'
            doctext += maketable([uses[-1]])

In [None]:
with open(feat_documentation, 'w') as outfile:
    outfile.write(doctext)