# NENA 2 TF 2.0

This version of the NENA conversion is the first to use the new 
text markup and parsing model.

In [3]:
import collections
import json
import tabulate
import unicodedata as ud
from tf.fabric import Fabric
from tf.convert.walker import CV
from pathlib import Path

# configure paths to data input and output
VERSION = 'alpha'
CSL_DIR = Path.home().joinpath('github/CambridgeSemiticsLab/')
PROJECT_DIR = CSL_DIR.joinpath('nena_tf')
CORPUS_DIR = CSL_DIR.joinpath('nena_corpus')
INPUT_DIR = CORPUS_DIR.joinpath(f'parsed_texts/{VERSION}')
CORPUS_METADATA = CORPUS_DIR.joinpath('standards/metadata.json')
OUTPUT_DIR = PROJECT_DIR.joinpath(f'tf/{VERSION}')
TF_methods = {'Fabric': Fabric, 'CV': CV}

## Configure Feature and Object Metadata

In [4]:
corpus_meta = json.loads(CORPUS_METADATA.read_text())

In [5]:
corpus_meta['NENA_corpus']

{'name': 'North Eastern Neo-Aramaic Text Corpus',
 'origin': 'The NENA text corpus is derived from decades of fieldwork by Professor Geoffrey Khan and his colleagues at the University of Cambridge, Faculty of Asian and Middle Eastern Studies',
 'license': 'Creative Commons Attribution 4.0 International Public License',
 'contributors': 'Geoffrey Khan, Eleanor Coghill, Roberta Borghero, Lidia Napiorkowska, Hezy Mutzafi, Alinda Damsma, Paul Noorlander, Dorota Molin, Johan Lundberg',
 'scientific_programmers': 'Cody Kingham, James Strachan, Dirk Roorda, Hannes Vlaardingerbroek',
 'DOI': '10.5281/zenodo.3541999',
 'corpus_repo': 'https://github.com/CambridgeSemiticsLab/nena_corpus'}

In [6]:
intFeatures = {'number'}
otext = {
    'sectionTypes': 'dialect,text,line',
    'sectionFeatures': 'dialect,title,number',
    'fmt:text-orig-full': 'word#{text}{end}',
    'fmt:text-orig-lite': 'word#{text_lite}{end}',
    'fmt:text-trans-full': 'word#{full}{full_end}',
    'fmt:text-trans-lite': 'word#{lite}{lite_end}',
    'fmt:text-trans-fuzzy': 'word#{fuzzy}{fuzzy_end}',
}

In [23]:
class NenaTfBuilder:
    """Construct Text-Fabric graph resource from parsed JSON files"""
    
    def __init__(self, input_dir, output_dir, metadata, TF_methods, **TF_kwargs):
        """Load json data from input dir to prepare for TF conversion.
        
        Args:
            input_dir: pathlib Path which is a directory that contains
                subdirectories named after the respective dialects in the
                text corpus. Each subdirectory should contain parsed JSON
                texts ready for analysis.
            output_dir: directory to save the .tf files
            metadata: dictionary containing metadata on the corpus needed
                to construct the TF graph
            TF: Text-Fabric module loaded
            **TF_kwargs: optional kwargs to pass to Text-Fabric loader
        """
        
        # load JSON data and initialize TF objects with paths
        self.metadata = metadata
        self.dialect2parsings = self.load_parsed_jsons(input_dir)
        self.Fabric = TF_methods['Fabric'](locations=str(output_dir), **TF_kwargs)
        self.cv = TF_methods['CV'](self.Fabric)
        self.message = self.Fabric.tmObj # timestamped messages
        
    def load_parsed_jsons(self, dialect_dir):
        """Map directory of dialect subdirectories to parsed json data.
        
        Args:
            dialect_dir: a pathlib Path that contains subdirectories
                named after respective dialects; each subdirectory 
                contains parsed JSON files which are each a text
        
        Returns:
            dict with structure of dict[dialect] = list(text_parsings)
        """
        dialect2parsings = collections.defaultdict(list)
        for dialect_dir in sorted(INPUT_DIR.glob('*')):
            for text_file in sorted(dialect_dir.glob('*.json')):
                dialect = dialect_dir.name
                text_data = json.loads(text_file.read_text())
                dialect2parsings[dialect].append(text_data)
        return dialect2parsings
    
    def build(self, **walk_kwargs):
        """Executes the TF conversion on the loaded source
        
        Args:
            walk_kwargs: optional. Keyword arguments to feed 
                to TF's cv.walk function.
        """
        slot_type = 'letter'
        self.good = self.cv.walk(
            self.director,
            slot_type,
            **walk_kwargs,
        )
    
    def dict_intersect(self, dict1, dict2):
        """Set intersection from one dict to another"""
        return {k:v for k,v in dict1.items() if k in dict2}
    
    def director(self, cv):
        """Call cv methods to index the graph.
        
        This function does the bulk of the work of building the TF resource.
        It operates in one large loop that walks over all parsed data. 
        The supplied cv Text-Fabric class possesses methods that create node 
        IDs and associate features with those IDs. These methods are called 
        throughout the loop. 
        
        cv methods used here:
            cv.slot: make a new slot, the atomic element of the graph. All 
                nodes active during an active slot will contain that slot.
            cv.node: make a new node in the graph with supplied object name
            cv.feature: add a string/integer feature to a supplied cv.node
            cv.terminate: deactivate a given node; this ends any further
                slot embeddings, which are calculated automatically from 
                whichever slots are activated while the node is also active.
                
        Further info about cv functionality can be referenced in the 
        Text-Fabric documentation.
        
        Args:
            cv: Text-Fabric CV class loaded with Fabric
        """
        
        features = self.metadata['object_features']
        text_features = {f for f in features if f['value'] == 'text'}
        general_features = {f for f in features if f not in text_features}
        nodes = {} # gets updated throughout
    
        def swap_node(node_type):
            """Replace any active nodes with new node."""
            try:
                cv.terminate(nodes[node_type])
                nodes[node_type] = cv.node(node_type)
            except KeyError:
                nodes[node_type] = cv.node(node_type)
        
        # parse all data for every dialect
        for dialect, texts in self.dialect2parsings.items():
            
            # make dialect node / features
            nodes['dialect'] = cv.node('dialect')
            cv.feature(nodes['dialect'], dialect=dialect)
            
            # make text node / features
            for text in texts:
        
                nodes['text'] = cv.node('text')
                text_feats, paragraphs = text
                cv.feature(nodes['text'], **self.dict_intersect(text_feats, features))
                
                for ith_paragraph, paragraph in enumerate(paragraphs):
                    
                    nodes['stress'] = cv.node('stress')
                    nodes['inton'] = cv.node('inton')
                    nodes['subsentence'] = cv.node('subsentence')
                    nodes['sentence'] = cv.node('sentence')
                    nodes['paragraph'] = cv.node('paragraph')
                
                    # -- Process Paragraph elements --
                    # which are comprised of word and span tags
                    
                    # track span features for addition to words
                    # features get updated by span triggers during element iteration
                    span_feats = {
                        'speaker': list(text_feats['speakers'].values())[0],
                        'lang': 'NENA',
                        'timestamp': None,
                    }
                    
                    for ith_element, element in enumerate(paragraph):
                        
                        # -- process span elements --
                        if element['class'] == 'span':
                            
                            # build line nodes
                            if 'line_number' in element:
                                swap_node('line')
                                line_num = element['line_number']
                                cv.feature(
                                    nodes['line'], 
                                    line_number=line_num
                                )
                            
                            # update other span fields
                            span_feats.update(
                                self.dict_intersect(element, span_feats)
                            )
                            
                        # -- Process words, their letters, and their beginnings/ends --
                        elif element['class'] == 'word':
                            
                            nodes['word'] = cv.node('word')
                            word_features = {}
                            word_features.update(span_feats)
                            word_features.update(
                                self.dict_intersect(element, general_features)
                            )
                        
                            # 1. ** process word's letters and their features **
                            # also get text from the letters
                            for letter in element['letters']:
                                letter_node = cv.slot()
                                letter_features = self.dict_intersect(letter, features)
                                
                                # process features for letter
                                # pass on text features to word 
                                for feature, value in letter_features.items():
                                    cv.feature(letter_node, feature=value)
                                    if feature in text_features:
                                        word_features[feature] = word_features.get(feature,'') + value
                                        
                                cv.terminate(letter_node)
    
                            # 2. ** process word's parsing features **
                            # current process allows multiple parsings to co-exist
                            # thus we construct a composite parsestring for each 
                            # feature
                            parse_values = collections.defaultdict(list)
                            word_features['n_parses'] = len(element['parsings'])
                            for parse in element['parsings']:
                                # gather feature / val strings to be joined next
                                for feat, val in parse.items():
                                    parse_values[feat].append(val)
                            
                            # now add parse features to word features
                            # join multiple parse features on |
                            for feat, vals in parse_values.items():
                                word_features[feat] = '|'.join(vals)
                            
                            # 3. ** Process beginnings on a word **
                            for begin in element['beginnings']:
                                
                                # build begin strings
                                text_features = self.dict_intersect(end, text_features)
                                text_feat_prfx = {k+'_begin':v for k,v in text_features.items()}
                                for feat_val in text_feat_prfx.items():
                                    word_features[feat] = word_features.get(feat,'') + val
                            
                            # 4. ** Process endings on a word ** 
                            # mark sentence/subsentence/inton/stress bounds on endings of word
                            # also add endings as their own text features of a word, with _end suffix
                            detected_boundaries = set()
                            for end in element['endings']:
                                
                                # build end strings
                                text_features = self.dict_intersect(end, text_features)
                                text_feat_sffx = {k+'_end':v for k,v in text_features.items()}
                                for feat, val in text_feat_suffx.items():
                                    word_features[feat] = word_features.get(feat,'') + val
                                
                                # skip non-separating puncts
                                if end['class'] != 'separator':
                                    continue
                                
                                # detect stress bounds; end at any of the following:
                                if end['modifies'] in {'word', 'intonation group',  'subsentence', 'sentence'}:
                                    detected_boundaries.add('stress')
                                
                                # detect inton bounds
                                if end['modifies'] in {'intonation group', 'subsentence', 'sentence'}:
                                    detected_boundaries.add('inton')
                                
                                # detect subsentence bounds
                                if end['modifies'] in {'subsentence', 'sentence'}:
                                    detected_boundaries.add('subsentence')
                                
                                # detect sentence bounds
                                if end['modifies'] in {'sentence'}:
                                    detected_boundaries.add('sentence')
                                
                            # end the word and execute boundary divisions
                            cv.feature(nodes['word'], **word_features)
                            cv.terminate(nodes['word'])
                            for bound in detected_boundaries:
                                if ith_element+1 != len(paragraph):
                                    swap_node(bound)
                                else:
                                    cv.terminate(nodes[bound])

                    # we've come to the end of the paragraph
                    # we do some house-cleaning before finishing with the paragraph
                    
                    # do a sanity check for un-closed intons, subsentences, sentences
                    # possibly due to lack of proper punctuation in the source text (to be fixed later)
                    title = text_feats['title']
                    for obj in {'stress', 'inton', 'sentence', 'subsentence'} & cv.activeTypes():
                        sys.stderr.write(f'force-closing {obj} in {title}, §{ith_paragraph}.{ith_element}\n')
                        cv.terminate(nodes[obj])
                    
                    # check for active line on last paragraph
                    # thus line can straddle paragraphs
                    # but obviously it should not straddle texts!
                    if (ith_paragraph+1 == len(paragraphs)) & nodes.get('line', None):
                        cv.terminate(nodes['line'])
                    
                    # close shop on the §
                    cv.terminate(nodes['paragraph'])
                        
                # -- trigger section node endings --
                cv.terminate(nodes['text'])
            cv.terminate(nodes['dialect'])
    
#     def exclude_keys(self, dicti, *exclusions):
#         """Filter dictionary to exclude keys"""
#         return {k:v for k,v in dicti.items() if k not in exclusions}
    
#     def build_text(self, letters, text_features):
#         """Construct text representations of word based on available features.
        
#         Letters need to be re-joined to form a word's text representation.
#         This method constructs all available text representations from a 
#         list of letter dicts.
#         """
#         text_forms = {}
#         for feat in text_features:
#             try:
#                 text_forms[feat] = self.join_letters(letters, feat)
#             except KeyError:
#                 continue
#         return text_forms
    
#     def join_letters(self, letters, feature, on=''):
#         """Joins letters based on given text feature"""
#         letter_text = [l[feature] for l in letters]
#         return f'{on}'.join(letter_text)

In [24]:
nena_builder = NenaTfBuilder(INPUT_DIR, OUTPUT_DIR, corpus_meta, TF_methods, silent='deep')

In [26]:
nena_builder.build(generateTf=False)

  0.00s Importing data from walking through the source ...
   |     0.00s Preparing metadata... 
   |   SECTION   TYPES:    
   |   SECTION   FEATURES: 
   |   STRUCTURE TYPES:    
   |   STRUCTURE FEATURES: 
   |   TEXT      FEATURES:


UnboundLocalError: local variable 'textFormats' referenced before assignment

In [14]:
#nena_builder.dialect2parsings['Barwar'][0][1][0][1]

In [15]:
#nena_builder.metadata