# Merging Phrase Atoms

Nearly all phrase atoms have been parsed with the phrase_parser. 
These objects have relations between each other within embedding phrases
that can be accessed. We will take advantage of this in order to 
complete the phrase parsing.

In [39]:
import sys
import copy
import collections
from tf.app import use
sys.path.append('../scripts/tools/')
import nav_tree as nt
from load_parse import ParseLoader

# load BHSA in Text-Fabric
A = use('bhsa', hoist=globals())
A.displaySetup(
    condenseType='phrase', hiddenTypes={'subphrase'}, 
    extraFeatures='rela pdp st', withNodes=True
)

# load parsings
parses = ParseLoader('../../results/parsing/phrase_parsings.json').load()
slot2pos = ParseLoader('../../results/parsing/slot2pos.json').load()

In [55]:
class PhraseAtomComposer:

    def __init__(self, ph2parse, tf_api):
        self.F, self.E, self.L = tf_api.F, tf_api.E, tf_api.L
        self.ph2parse = copy.deepcopy(ph2parse) # write only to local copy
        self.mom2kids = self.build_edges()
    
#     def is_PP(self, node):
#         """Determine whether a given node + parse is a PP."""
#         if self.F.typ.v(node) == 'PP':
#             return True
#         words = L.d(node,'word')
#         if slot2pos[words[0]] == 'PREP':
#             return True
#         return False
    
#     def get_first_NP(self, parse):
#         """Retrieve first non-PP from a parse."""
#         for sp in nt.get_head_path(parse):
#             if sp[-1] != 'PP':
#                 return sp
#         return sp[1]
    
#     def get_first_nmpr_sp(self, parse):
#         """Find first subphrase headed by a proper noun."""
#         for sp in nt.traverse_tree(parse):
#             head = nt.get_head(sp)
#             if self.F.pdp.v(head) == 'nmpr':
#                 return sp
#         for slot in nt.get_slots(parse):
#             if self.F.pdp.v(slot) == 'nmpr':
#                 return slot
    
    def build_edges(self):
        """Build up an edges dictionary.
        
        The ETCBC phrase atom edges are a bit complex.
        Phrase atoms can have relation edges to other 
        phrase atoms, or to individual words contained in
        another phrase atom. Additionally, there are some 
        difficult cases I want to adjust.
        
        One of these, for instance, is where [NP -Appo> PP], 
        in which the appositional rela would be better pointed 
        at the first non-prepositional phrase. 
        
        Another case is the Link (conj) rela, which points at 
        the item that is being parallelized rather that the phrase 
        which follows the conjunction. 
        
        We fix these cases here and put them in a dict. The dict is 
        a mapping from a phrase node to either another phrase node 
        or to a tuple of slots. The mapping represents child to mother
        mapping at first. But this dict is reversed before it is 
        returned to yield a mother to children mapping.
        """
        
        child2mom = {}
        relamap = {
            'Appo': 'APPO',
            'Spec': 'ADJV',
            'Link': 'CONJ',
            'Sfxs': 'ADJV',
            'Para': 'PARA',
            'NA': None,
        }
        
        # build the maps
        for ph in self.F.otype.s('phrase_atom'):
            
            # get data on this ph and its mother
            rela = self.F.rela.v(ph)
            rela = relamap[rela]
            mom = self.E.mother.f(ph)
            momotype = set(F.otype.v(m) for m in mom)
                        
            # modify Link
            if rela == 'conj':
                # reassign these edges to point at 
                # the parallel element instead
                child2mom[ph] = (ph+1, rela)
                
            # change NP -Appo> PP
            # move the apposition edge to the 
            # first non-PP phrase in the mother parse
#             elif (rela == 'appo'
#                 and 'phrase_atom' in momotype
#                 and self.is_PP(mom[0])
#                 and not self.is_PP(ph)
#             ):
                
#                 # get mom phrase parsing
#                 try:
#                     mparse = self.get_parse(mom[0])
#                 except:
#                     continue
                
#                 # get heads and potential proper nouns for apposition assignment
# #                 mom_head = nt.get_head(mparse)
# #                 ph_head = nt.get_head(ph)
                
#                 # edge to single-slot
#                 if type(mparse[0]) == int and type(mparse[1]) == int:
#                     momslots = (mparse[1],) 
                
#                 # 2 possibilities from here:
#                 # 1. proper noun -appo> noun / noun -appo> proper noun
#                 # 2. edge to next available non-PP phrase (its slots)
#                 # this involves a good deal of complexity
#                 elif (mom_pn := self.get_first_nmpr_sp(mparse)):
#                     momslots = tuple(sorted(nt.get_slots(mom_pn)))
                
#                 else:
#                     sp = self.get_first_NP(mparse)
#                     try:
#                         momslots = tuple(sorted(nt.get_slots(sp)))
#                     except:
#                         raise Exception(mparse, sp)
                    
#                 # done; point phrase rela edge to the slots
#                 child2mom[ph] = (momslots, rela)
                
            # deal with normal phrases
            elif 'phrase_atom' in momotype:
                child2mom[ph] = (mom[0], rela)
                
            # word mothers as slots
            elif 'word' in momotype:
                child2mom[ph] = (mom, rela)
                
        # reverse the dict
        mom2kids = collections.defaultdict(list)
        for child, edge in child2mom.items():
            mom, rela = edge
            mom2kids[mom].append((child, rela))
        
        return mom2kids
        
    def get_parse(self, ph_atom):
        """Retrieve phrase atom parsing."""
        try:
            return self.ph2parse[ph_atom]
        except KeyError:
            words = self.L.d(ph_atom, 'word')
            if len(words) == 1:
                return words
            else:
                raise Exception(f'No parsing found for {ph_atom}!')
                
    def sort_children(self, node, edges):
        before_mom = []
        after_mom = []
        for edge in edges:
            child, rela = edge
            if child > node:
                after_mom.append(edge)
            else:
                before_mom.append(edge)
        after_mom.sort()
        before_mom.sort(reverse=True)
        return after_mom + before_mom
        
        
    def compose_phrase(self, node):
        """Recursively compose phrase elements.
        
        !TODO: detailed description
        """
        
        parse = self.get_parse(node)

        # modify internal phrase constituents with slot-based edge mappings
        if len(parse) == 3:
            for ph in nt.traverse_tree(parse):                
                for i, sp in enumerate(ph[:-1]):
                    slots = tuple(sorted(nt.get_slots(sp)))
                    for kid, rela in self.mom2kids.get(slots, []):
                        ph[i] = [
                            self.compose_phrase(kid),
                            ph[i], # build up recursively
                            rela
                        ]
        # look for slot-based relations on single-word phrases
        elif len(parse) == 1:
            for kid, rela in self.mom2kids.get((parse[0],), []):
                parse[0] = [
                    self.compose_phrase(kid),
                    parse[0],
                    rela
                ]
        
        # single-slot parse adjustment and sanity checks
        if len(parse) == 1:
            parse = parse[0]
        elif len(parse) != 3:
            raise Exception(f'Invalid parse length of {len(parse)}: {parse}')

        # compose external phrase constituents by iteratively wrapping a new list
        for kid, rela in self.sort_children(node, self.mom2kids.get(node, [])):
            parse = [
                self.compose_phrase(kid),
                parse,
                rela
            ]
        
        # finish
        return parse

In [74]:
composer = PhraseAtomComposer(parses, A.api)

test_run = composer.compose_phrase(1014617)

test_run

[[[[[[179317, 179318, 'DEF'], 179316, 'GP'], 179315, 'APPO'],
   [179314, [[[179312, 179313, 'DEF'], 179311, 'GP'], 179310, 'APPO'], 'CONJ'],
   'PARA'],
  179309,
  'GP'],
 [179306, [179308, 179307, 'GP'], 'PP'],
 'APPO']

In [75]:
print(nt.show_relas(test_run, T.text))

אֶ֜רֶץ סִיחֹ֣ון׀ מֶ֣לֶךְ הָאֱמֹרִ֗י וְעֹג֙ מֶ֣לֶךְ הַבָּשָׁ֔ן   --APPO-->  בְּאֶ֣רֶץ גִּלְעָ֑ד 
סִיחֹ֣ון׀ מֶ֣לֶךְ הָאֱמֹרִ֗י וְעֹג֙ מֶ֣לֶךְ הַבָּשָׁ֔ן   --GP-->  אֶ֜רֶץ 
עֹג֙ מֶ֣לֶךְ הַבָּשָׁ֔ן   --PARA-->  סִיחֹ֣ון׀ מֶ֣לֶךְ הָאֱמֹרִ֗י וְ
מֶ֣לֶךְ הַבָּשָׁ֔ן   --APPO-->  עֹג֙ 
הַבָּשָׁ֔ן   --GP-->  מֶ֣לֶךְ 
הַ  --DEF-->  בָּשָׁ֔ן 
וְ  --CONJ-->  סִיחֹ֣ון׀ מֶ֣לֶךְ הָאֱמֹרִ֗י 
מֶ֣לֶךְ הָאֱמֹרִ֗י   --APPO-->  סִיחֹ֣ון׀ 
הָאֱמֹרִ֗י   --GP-->  מֶ֣לֶךְ 
הָ  --DEF-->  אֱמֹרִ֗י 
בְּ  --PP-->  אֶ֣רֶץ גִּלְעָ֑ד 
גִּלְעָ֑ד   --GP-->  אֶ֣רֶץ 


In [5]:
A.pretty(L.u(1024353,'phrase')[0], )

In [6]:
A.pretty(L.u(1002118,'phrase')[0], condenseType='phrase', hiddenTypes={'subphrase'}, extraFeatures='rela')

In [7]:
A.pretty(755310, condenseType='phrase', hiddenTypes={'subphrase'}, extraFeatures='rela')

## Run the composer

In [79]:
def get_complete_phrases(ph2parse, tf_api):
    """Retrieve phrases completely covered by the parsings.
    
    The phrase parser runs on phrase_atoms, which are
    component parts of a complete phrase. In some cases the
    parser was unable to parse a phrase_atom, meaning that 
    some phrases are left without a complete parsing. This
    function only selects those phrases with complete parses.
    """
    F, L = tf_api.F, tf_api.L
    
    parsed_atoms = set(ph2parse)
    # add unparsed conjunctions
    for atom in F.otype.s('phrase_atom'):
        if F.rela.v(atom) == 'Link':
            parsed_atoms.add(atom)

    # select only those phrases completely covered by the parser
    whole_phrases = []
    for phrase in F.otype.s('phrase'):
        ph_atoms = set(L.d(phrase, 'phrase_atom'))
        if parsed_atoms.issuperset(ph_atoms):
            whole_phrases.append(phrase)
            
    # TODO: Remove this and instead put a new feature in the 
    # data metrics report
    whole_atoms= set(at for ph in whole_phrases for at in L.d(ph,'phrase_atom') if at in parsed_atoms)
    part_atoms = parsed_atoms - whole_atoms
            
    return whole_phrases, part_atoms

In [80]:
whole_phrases, part_atoms = get_complete_phrases(parses, A.api)

print(f'N whole phrases: {len(whole_phrases)}')
print(f'N part atoms: {len(part_atoms)}')

N whole phrases: 113051
N part atoms: 807


In [81]:
755310 in whole_phrases

True

In [72]:
t = A.search('phrase function=Time')

  0.26s 4080 results


NB: part phrases count includes many phrases which were already excluded.

In [62]:
def compose_phrases(ph2parse, tf_api):
    """Compose all eligible phrases."""
    
    F, L = tf_api.F, tf_api.L
    
    whole_phrases, part_phrases = get_complete_phrases(ph2parse, tf_api)
    composer = PhraseAtomComposer(parses, A.api)
    full_parses = {}
    
    # iterate through all whole phrases and call composer on the 
    # first phrase atom of each one; the network connections between
    # all of the atoms should cause them all to be grabbed
    for phrase in whole_phrases:
        first_atom = L.d(phrase, 'phrase_atom')[0]
        comp_parse = composer.compose_phrase(first_atom)
        
        # sanity check: 
        # compare slots in parse with slots in atom 
        # to make sure all slots are accounted for
        ph_slots = set(L.d(phrase,'word'))
        comp_slots = set(nt.get_slots(comp_parse))
        if ph_slots != comp_slots:
            raise Exception(f'Missing slots for {phrase}; orig: {ph_slots}; comp: {comp_slots}')
            
        full_parses[phrase] = comp_parse
            
    return full_parses

In [63]:
full_parses = compose_phrases(parses, A.api)

In [64]:
len(full_parses)

112344