# Testing Sly Parser for Hebrew Noun Phrases

In [1]:
import json
from tf.app import use
from sly import Lexer, Parser
from sly.lex import Token
import networkx as nx

# load Hebrew data
bhsa = use('bhsa', hoist=globals())

with open('../../results/parsing/slot2pos.json', 'r') as infile:
    slot2pos = {int(k):v for k,v in json.load(infile).items()}

In [2]:
bhsa.displaySetup(condenseType='clause', withNodes=True, extraFeatures='st lex function')

## Get Testing Data

In [3]:
# get Hebrew nodes as tester data

bhsa.indent(reset=True)

queries = [

"""
phrase rela=NA
/where/
    word
/have/
    pdp=subs prs=absent
/-/
    word ls=card
    word pdp=subs ls#card

""",
    
# """
# p:phrase
#     w1:word pdp=subs st=c ls#card
#     <: w2:word pdp=subs ls=card prs#absent

# p =: w1 
# p := w2
# """,
]

samples = []
sample_nodes = []

for q in queries: 
    samples += bhsa.search(q, shallow=True)
    
samples.sort()

sample_nodes = {pn:L.d(pn, 'word') for pn in samples}
    
bhsa.info(f'total samples: {len(samples)}')

  2.40s 1186 results
  2.40s total samples: 1186


In [4]:
list(sample_nodes.values())[:10]

[(77, 78),
 (1992, 1993),
 (2175, 2176, 2177),
 (2245, 2246),
 (2286, 2287),
 (2389, 2390, 2391),
 (2433, 2434, 2435),
 (2587, 2588, 2589, 2590),
 (2876, 2877, 2878),
 (2882, 2883)]

In [5]:
bhsa.show(list(sample_nodes.values())[:40])

<hr>

In [21]:
class CX:
    def __init__(self, slot, pos):
        self.slot = slot
        self.cat = pos
    def __repr__(self):
        return f'CX({self.slot})'
        
class BhsaLexer(Lexer):
    """Wrap a BHSA node in SLY Token object.
        
    This enables us to use our custom tokens rather
    than raw string input.
    """

    # unique parts of speech values harvested from 
    # the parts of speech parser and pasted here
    # see results/parsing/uniquepos.txt
    tokens = {
#         ADJV,
         ADVB,
         ART,
         C,
         CARD,
         CONJ,
#         INRG,
#         INTJ,
#         NEGA,
         NOUN,
#         ORDN,
#         PRDE,
         PREP,
#         PRIN,
#         PROPN,
#         PRPS,
#         QUANT,
         SFX,
#         VERB,
    }
    
    def get_token(self, cx, cat, index):
        """Compile SLY token object."""
        token = Token()
        token.value = cx
        token.type = cat
        token.index = index
        token.lineno = index
        return token
    
    def tokenize(self, slots, slot2pos):
        """Write over the SLY tokenize method to yield custom data.

        Args:
            slots: iterable of integers which correspond with
                slots / word nodes in BHSA
            slot2pos: a dict that maps slots to their parts of speech
        
        Yields:
            generator of SLY Tokens
        """
        i = 0
        for slot in slots:
            
            # configure the word
            pos = slot2pos[slot]
            cx = CX(slot, pos)
            token = self.get_token(cx, pos, i)
            yield token
            i += 1
            
            # split off morphological tokens
            for morph in self.tokenize_morphs(slot):
                yield self.get_token(morph, morph.cat, i)
                i += 1
        
    def tokenize_morphs(self, slot):
        """Tokenize morphological forms."""
        if F.st.v(slot) == 'c' and slot2pos[slot] == 'NOUN':
            yield CX(slot, 'C')
        if F.prs.v(slot) != 'absent':
            yield CX(slot, 'SFX')
        
class PhraseParser(Parser):

    """A CFG parser for Biblical Hebrew phrases."""

    # initialize standard methods / attributes
    # and add a custom one: error_list
    def __init__(self, error_list):
        super().__init__()
        self.error_list = error_list
    
    tokens = BhsaLexer.tokens

    def error(self, token):
#         bhsa.prettyTuple([token.value.slot], seq=0)
#         ph = L.u(token.value.slot, 'phrase')[0]
#         tokens = list(lexer.tokenize(sample_nodes[ph], slot2pos))
        self.error_list.append(token)
    
    precedence = [
        ('left', CONJ),
        ('left', CARD, NOUN),
    ]
    
    debugfile = 'parser.out'
    
    # -- FINAL PHRASE --
    @_('np', 'pp','ccard', 
       'num', 'gp_card', 'gp_num')
    def phrase(self, p):
        return p[0]

    # -- noun-based phrases --
    @_('NOUN SFX')
    def np(self, p):
        return p[0].slot
    
    @_('df', 'appo', 'gp')
    def np(self, p):
        return p[0]
    
    # -- definite phrases --
    @_('ART NOUN')
    def df(self, p):
        return [p[0].slot, p[1].slot, 'DF']

    # -- prepositional phrases --
    @_('PREP phrase')
    def pp(self, p):
        return [p[0].slot, p[1], 'PP']
    
    @_('PREP NOUN', 'PREP ADVB')
    def pp(self, p):
        return [p[0].slot, p[1].slot, 'PP']
    
    @_('PREP SFX')
    def pp(self, p):
        return p[0]
    
    # -- genitive phrases --
    @_('NOUN C NOUN')
    def gp(self, p):
        return [p[2].slot, p[0].slot, 'GP']
    
    @_('NOUN C np')
    def gp(self, p):
        return [p[2], p[0].slot, 'GP']
    
    @_('NOUN C num')
    def gp_card(self, p):
        return [p[2], p[0].slot, 'GP_CARD']
    
    @_('NOUN C CARD')
    def gp_num(self, p):
        return [p[0].slot, p[2].slot, 'GP_NUM']
    
    # -- appositional phrases --
    @_('NOUN NOUN')
    def appo(self, p):
        return [p[0].slot, p[1].slot, 'APPO']
    
    @_('df df')
    def appo(self, p):
        return [p[1], p[0], 'APPO']
    
    # -- chained cardinal numbers --
    @_('CARD CARD')
    def ccard(self, p):
        return [p[0].slot, p[1].slot, 'CCARD']
    
    @_('CARD CONJ CARD')
    def ccard(self, p):
        return [p[0].slot, [p[1].slot, p[2].slot, 'CONJ'], 'CCARD']
    
    @_('ccard CONJ ccard')
    def ccard(self, p):
        return [p[0], [p[1].slot, p[2], 'CONJ'], 'CCARD']
    
    @_('CARD CONJ ccard')
    def ccard(self, p):
        return [p[0].slot, [p[1].slot, p[2], 'CONJ'], 'CCARD']
    
    @_('ccard CARD')
    def ccard(self, p):
        return [p[0], p[1].slot, 'CCARD']
    
    # -- cardinal quantifications --
    @_('NOUN CARD', 'CARD NOUN')
    def num(self, p):
        return [p.CARD.slot, p.NOUN.slot, 'NUM']
    
    @_('ccard NOUN')
    def num(self, p):
        return [p[0], p[1].slot, 'NUM']
    
    @_('ccard np')
    def num(self, p):
        return [p[0], p[1], 'NUM']
    
    @_('CARD np')
    def num(self, p):
        return [p[0].slot, p[1], 'NUM']
    
    @_('np CARD')
    def num(self, p):
        return [p[1].slot, p[0], 'NUM']

lexer = BhsaLexer()

Parser debugging for PhraseParser written to parser.out


In [29]:
type(None) in {None}

False

In [22]:
tokens = lexer.tokenize(list(sample_nodes.values())[0], slot2pos)

list(tokens)

[Token(type='NOUN', value=CX(77), lineno=0, index=0),
 Token(type='CARD', value=CX(78), lineno=1, index=1)]

In [23]:
tokens = lexer.tokenize(sample_nodes[676648][:2], slot2pos)

list(tokens)

[Token(type='NOUN', value=CX(40497), lineno=0, index=0),
 Token(type='CARD', value=CX(40498), lineno=1, index=1)]

In [24]:
parser.parse(lexer.tokenize(sample_nodes[676648][:2], slot2pos))

[40498, 40497, 'NUM']

In [25]:
bhsa.prettyTuple([694558], seq=0)

### Large scale tests

In [26]:
parsed = {}
error_list = []
parser = PhraseParser(error_list)


for i, phrase in enumerate(sample_nodes):
    nodeset = sample_nodes[phrase]
    parsing = parser.parse(lexer.tokenize(nodeset, slot2pos))
    parsed[phrase] = parsing

print(len(parsed), 'parsed')
print(len(error_list), 'not parsed')

1186 parsed
61 not parsed


## Display unparsed

In [62]:
for i, token in enumerate(error_records):
    bhsa.prettyTuple([token.value.slot], seq=i)
    ph = L.u(token.value.slot, 'phrase')[0]
    tokens = list(lexer.tokenize(sample_nodes[ph], slot2pos))
    print(tokens)
    print()

[Token(type='CARD', value=CX(7901), lineno=0, index=0), Token(type='NOUN', value=CX(7902), lineno=1, index=1), Token(type='NOUN', value=CX(7903), lineno=2, index=2), Token(type='NOUN', value=CX(7904), lineno=3, index=3)]



[Token(type='NOUN', value=CX(40497), lineno=0, index=0), Token(type='CARD', value=CX(40498), lineno=1, index=1), Token(type='NOUN', value=CX(40499), lineno=2, index=2)]



[Token(type='NOUN', value=CX(44015), lineno=0, index=0), Token(type='C', value=CX(44015), lineno=1, index=1), Token(type='NOUN', value=CX(44016), lineno=2, index=2), Token(type='CARD', value=CX(44017), lineno=3, index=3), Token(type='NOUN', value=CX(44018), lineno=4, index=4), Token(type='NOUN', value=CX(44019), lineno=5, index=5)]



[Token(type='NOUN', value=CX(45368), lineno=0, index=0), Token(type='NOUN', value=CX(45369), lineno=1, index=1), Token(type='C', value=CX(45369), lineno=2, index=2), Token(type='NOUN', value=CX(45370), lineno=3, index=3), Token(type='CARD', value=CX(45371), lineno=4, index=4)]



[Token(type='CARD', value=CX(49775), lineno=0, index=0), Token(type='NOUN', value=CX(49776), lineno=1, index=1), Token(type='CARD', value=CX(49777), lineno=2, index=2), Token(type='NOUN', value=CX(49778), lineno=3, index=3)]



[Token(type='CARD', value=CX(66273), lineno=0, index=0), Token(type='NOUN', value=CX(66274), lineno=1, index=1), Token(type='NOUN', value=CX(66275), lineno=2, index=2), Token(type='C', value=CX(66275), lineno=3, index=3), Token(type='NOUN', value=CX(66276), lineno=4, index=4)]



[Token(type='CARD', value=CX(67292), lineno=0, index=0), Token(type='NOUN', value=CX(67293), lineno=1, index=1), Token(type='CARD', value=CX(67294), lineno=2, index=2), Token(type='NOUN', value=CX(67295), lineno=3, index=3)]



[Token(type='NOUN', value=CX(74182), lineno=0, index=0), Token(type='CARD', value=CX(74183), lineno=1, index=1), Token(type='NOUN', value=CX(74184), lineno=2, index=2), Token(type='CARD', value=CX(74185), lineno=3, index=3), Token(type='NOUN', value=CX(74186), lineno=4, index=4), Token(type='CARD', value=CX(74187), lineno=5, index=5), Token(type='NOUN', value=CX(74188), lineno=6, index=6), Token(type='NOUN', value=CX(74189), lineno=7, index=7), Token(type='C', value=CX(74189), lineno=8, index=8), Token(type='NOUN', value=CX(74190), lineno=9, index=9), Token(type='CARD', value=CX(74191), lineno=10, index=10)]



[Token(type='NOUN', value=CX(74264), lineno=0, index=0), Token(type='CARD', value=CX(74265), lineno=1, index=1), Token(type='NOUN', value=CX(74266), lineno=2, index=2), Token(type='CARD', value=CX(74267), lineno=3, index=3), Token(type='NOUN', value=CX(74268), lineno=4, index=4), Token(type='CARD', value=CX(74269), lineno=5, index=5), Token(type='NOUN', value=CX(74270), lineno=6, index=6), Token(type='NOUN', value=CX(74271), lineno=7, index=7), Token(type='C', value=CX(74271), lineno=8, index=8), Token(type='NOUN', value=CX(74272), lineno=9, index=9), Token(type='CARD', value=CX(74273), lineno=10, index=10)]



[Token(type='NOUN', value=CX(74345), lineno=0, index=0), Token(type='CARD', value=CX(74346), lineno=1, index=1), Token(type='NOUN', value=CX(74347), lineno=2, index=2), Token(type='CARD', value=CX(74348), lineno=3, index=3), Token(type='NOUN', value=CX(74349), lineno=4, index=4), Token(type='CARD', value=CX(74350), lineno=5, index=5), Token(type='NOUN', value=CX(74351), lineno=6, index=6), Token(type='NOUN', value=CX(74352), lineno=7, index=7), Token(type='C', value=CX(74352), lineno=8, index=8), Token(type='NOUN', value=CX(74353), lineno=9, index=9), Token(type='CARD', value=CX(74354), lineno=10, index=10)]



[Token(type='NOUN', value=CX(74426), lineno=0, index=0), Token(type='CARD', value=CX(74427), lineno=1, index=1), Token(type='NOUN', value=CX(74428), lineno=2, index=2), Token(type='CARD', value=CX(74429), lineno=3, index=3), Token(type='NOUN', value=CX(74430), lineno=4, index=4), Token(type='CARD', value=CX(74431), lineno=5, index=5), Token(type='NOUN', value=CX(74432), lineno=6, index=6), Token(type='NOUN', value=CX(74433), lineno=7, index=7), Token(type='C', value=CX(74433), lineno=8, index=8), Token(type='NOUN', value=CX(74434), lineno=9, index=9), Token(type='CARD', value=CX(74435), lineno=10, index=10)]



[Token(type='NOUN', value=CX(74507), lineno=0, index=0), Token(type='CARD', value=CX(74508), lineno=1, index=1), Token(type='NOUN', value=CX(74509), lineno=2, index=2), Token(type='CARD', value=CX(74510), lineno=3, index=3), Token(type='NOUN', value=CX(74511), lineno=4, index=4), Token(type='CARD', value=CX(74512), lineno=5, index=5), Token(type='NOUN', value=CX(74513), lineno=6, index=6), Token(type='NOUN', value=CX(74514), lineno=7, index=7), Token(type='C', value=CX(74514), lineno=8, index=8), Token(type='NOUN', value=CX(74515), lineno=9, index=9), Token(type='CARD', value=CX(74516), lineno=10, index=10)]



[Token(type='NOUN', value=CX(74588), lineno=0, index=0), Token(type='CARD', value=CX(74589), lineno=1, index=1), Token(type='NOUN', value=CX(74590), lineno=2, index=2), Token(type='CARD', value=CX(74591), lineno=3, index=3), Token(type='NOUN', value=CX(74592), lineno=4, index=4), Token(type='CARD', value=CX(74593), lineno=5, index=5), Token(type='NOUN', value=CX(74594), lineno=6, index=6), Token(type='NOUN', value=CX(74595), lineno=7, index=7), Token(type='C', value=CX(74595), lineno=8, index=8), Token(type='NOUN', value=CX(74596), lineno=9, index=9), Token(type='CARD', value=CX(74597), lineno=10, index=10)]



[Token(type='NOUN', value=CX(74669), lineno=0, index=0), Token(type='CARD', value=CX(74670), lineno=1, index=1), Token(type='NOUN', value=CX(74671), lineno=2, index=2), Token(type='CARD', value=CX(74672), lineno=3, index=3), Token(type='NOUN', value=CX(74673), lineno=4, index=4), Token(type='CARD', value=CX(74674), lineno=5, index=5), Token(type='NOUN', value=CX(74675), lineno=6, index=6), Token(type='NOUN', value=CX(74676), lineno=7, index=7), Token(type='C', value=CX(74676), lineno=8, index=8), Token(type='NOUN', value=CX(74677), lineno=9, index=9), Token(type='CARD', value=CX(74678), lineno=10, index=10)]



[Token(type='NOUN', value=CX(74750), lineno=0, index=0), Token(type='CARD', value=CX(74751), lineno=1, index=1), Token(type='NOUN', value=CX(74752), lineno=2, index=2), Token(type='CARD', value=CX(74753), lineno=3, index=3), Token(type='NOUN', value=CX(74754), lineno=4, index=4), Token(type='CARD', value=CX(74755), lineno=5, index=5), Token(type='NOUN', value=CX(74756), lineno=6, index=6), Token(type='NOUN', value=CX(74757), lineno=7, index=7), Token(type='C', value=CX(74757), lineno=8, index=8), Token(type='NOUN', value=CX(74758), lineno=9, index=9), Token(type='CARD', value=CX(74759), lineno=10, index=10)]



[Token(type='NOUN', value=CX(74831), lineno=0, index=0), Token(type='CARD', value=CX(74832), lineno=1, index=1), Token(type='NOUN', value=CX(74833), lineno=2, index=2), Token(type='CARD', value=CX(74834), lineno=3, index=3), Token(type='NOUN', value=CX(74835), lineno=4, index=4), Token(type='CARD', value=CX(74836), lineno=5, index=5), Token(type='NOUN', value=CX(74837), lineno=6, index=6), Token(type='NOUN', value=CX(74838), lineno=7, index=7), Token(type='C', value=CX(74838), lineno=8, index=8), Token(type='NOUN', value=CX(74839), lineno=9, index=9), Token(type='CARD', value=CX(74840), lineno=10, index=10)]



[Token(type='NOUN', value=CX(74912), lineno=0, index=0), Token(type='CARD', value=CX(74913), lineno=1, index=1), Token(type='NOUN', value=CX(74914), lineno=2, index=2), Token(type='CARD', value=CX(74915), lineno=3, index=3), Token(type='NOUN', value=CX(74916), lineno=4, index=4), Token(type='CARD', value=CX(74917), lineno=5, index=5), Token(type='NOUN', value=CX(74918), lineno=6, index=6), Token(type='NOUN', value=CX(74919), lineno=7, index=7), Token(type='C', value=CX(74919), lineno=8, index=8), Token(type='NOUN', value=CX(74920), lineno=9, index=9), Token(type='CARD', value=CX(74921), lineno=10, index=10)]



[Token(type='NOUN', value=CX(74993), lineno=0, index=0), Token(type='CARD', value=CX(74994), lineno=1, index=1), Token(type='NOUN', value=CX(74995), lineno=2, index=2), Token(type='CARD', value=CX(74996), lineno=3, index=3), Token(type='NOUN', value=CX(74997), lineno=4, index=4), Token(type='CARD', value=CX(74998), lineno=5, index=5), Token(type='NOUN', value=CX(74999), lineno=6, index=6), Token(type='NOUN', value=CX(75000), lineno=7, index=7), Token(type='C', value=CX(75000), lineno=8, index=8), Token(type='NOUN', value=CX(75001), lineno=9, index=9), Token(type='CARD', value=CX(75002), lineno=10, index=10)]



[Token(type='NOUN', value=CX(75074), lineno=0, index=0), Token(type='CARD', value=CX(75075), lineno=1, index=1), Token(type='NOUN', value=CX(75076), lineno=2, index=2), Token(type='CARD', value=CX(75077), lineno=3, index=3), Token(type='NOUN', value=CX(75078), lineno=4, index=4), Token(type='CARD', value=CX(75079), lineno=5, index=5), Token(type='NOUN', value=CX(75080), lineno=6, index=6), Token(type='NOUN', value=CX(75081), lineno=7, index=7), Token(type='C', value=CX(75081), lineno=8, index=8), Token(type='NOUN', value=CX(75082), lineno=9, index=9), Token(type='CARD', value=CX(75083), lineno=10, index=10)]



[Token(type='NOUN', value=CX(75101), lineno=0, index=0), Token(type='C', value=CX(75101), lineno=1, index=1), Token(type='NOUN', value=CX(75102), lineno=2, index=2), Token(type='CARD', value=CX(75103), lineno=3, index=3), Token(type='CARD', value=CX(75104), lineno=4, index=4), Token(type='NOUN', value=CX(75105), lineno=5, index=5), Token(type='C', value=CX(75105), lineno=6, index=6), Token(type='NOUN', value=CX(75106), lineno=7, index=7), Token(type='CARD', value=CX(75107), lineno=8, index=8), Token(type='CARD', value=CX(75108), lineno=9, index=9), Token(type='NOUN', value=CX(75109), lineno=10, index=10), Token(type='C', value=CX(75109), lineno=11, index=11), Token(type='NOUN', value=CX(75110), lineno=12, index=12), Token(type='CARD', value=CX(75111), lineno=13, index=13), Token(type='CARD', value=CX(75112), lineno=14, index=14)]



[Token(type='NOUN', value=CX(75101), lineno=0, index=0), Token(type='C', value=CX(75101), lineno=1, index=1), Token(type='NOUN', value=CX(75102), lineno=2, index=2), Token(type='CARD', value=CX(75103), lineno=3, index=3), Token(type='CARD', value=CX(75104), lineno=4, index=4), Token(type='NOUN', value=CX(75105), lineno=5, index=5), Token(type='C', value=CX(75105), lineno=6, index=6), Token(type='NOUN', value=CX(75106), lineno=7, index=7), Token(type='CARD', value=CX(75107), lineno=8, index=8), Token(type='CARD', value=CX(75108), lineno=9, index=9), Token(type='NOUN', value=CX(75109), lineno=10, index=10), Token(type='C', value=CX(75109), lineno=11, index=11), Token(type='NOUN', value=CX(75110), lineno=12, index=12), Token(type='CARD', value=CX(75111), lineno=13, index=13), Token(type='CARD', value=CX(75112), lineno=14, index=14)]



[Token(type='NOUN', value=CX(75101), lineno=0, index=0), Token(type='C', value=CX(75101), lineno=1, index=1), Token(type='NOUN', value=CX(75102), lineno=2, index=2), Token(type='CARD', value=CX(75103), lineno=3, index=3), Token(type='CARD', value=CX(75104), lineno=4, index=4), Token(type='NOUN', value=CX(75105), lineno=5, index=5), Token(type='C', value=CX(75105), lineno=6, index=6), Token(type='NOUN', value=CX(75106), lineno=7, index=7), Token(type='CARD', value=CX(75107), lineno=8, index=8), Token(type='CARD', value=CX(75108), lineno=9, index=9), Token(type='NOUN', value=CX(75109), lineno=10, index=10), Token(type='C', value=CX(75109), lineno=11, index=11), Token(type='NOUN', value=CX(75110), lineno=12, index=12), Token(type='CARD', value=CX(75111), lineno=13, index=13), Token(type='CARD', value=CX(75112), lineno=14, index=14)]



[Token(type='NOUN', value=CX(79415), lineno=0, index=0), Token(type='CARD', value=CX(79416), lineno=1, index=1), Token(type='NOUN', value=CX(79417), lineno=2, index=2)]



[Token(type='NOUN', value=CX(79459), lineno=0, index=0), Token(type='NOUN', value=CX(79460), lineno=1, index=1), Token(type='CARD', value=CX(79461), lineno=2, index=2), Token(type='NOUN', value=CX(79462), lineno=3, index=3)]



[Token(type='NOUN', value=CX(79701), lineno=0, index=0), Token(type='NOUN', value=CX(79702), lineno=1, index=1), Token(type='C', value=CX(79702), lineno=2, index=2), Token(type='NOUN', value=CX(79703), lineno=3, index=3), Token(type='CARD', value=CX(79704), lineno=4, index=4)]



[Token(type='NOUN', value=CX(87805), lineno=0, index=0), Token(type='NOUN', value=CX(87806), lineno=1, index=1), Token(type='C', value=CX(87806), lineno=2, index=2), Token(type='NOUN', value=CX(87807), lineno=3, index=3), Token(type='CARD', value=CX(87808), lineno=4, index=4), Token(type='NOUN', value=CX(87809), lineno=5, index=5), Token(type='CARD', value=CX(87810), lineno=6, index=6), Token(type='CARD', value=CX(87811), lineno=7, index=7), Token(type='NOUN', value=CX(87812), lineno=8, index=8), Token(type='NOUN', value=CX(87813), lineno=9, index=9), Token(type='C', value=CX(87813), lineno=10, index=10), Token(type='NOUN', value=CX(87814), lineno=11, index=11)]



[Token(type='NOUN', value=CX(87805), lineno=0, index=0), Token(type='NOUN', value=CX(87806), lineno=1, index=1), Token(type='C', value=CX(87806), lineno=2, index=2), Token(type='NOUN', value=CX(87807), lineno=3, index=3), Token(type='CARD', value=CX(87808), lineno=4, index=4), Token(type='NOUN', value=CX(87809), lineno=5, index=5), Token(type='CARD', value=CX(87810), lineno=6, index=6), Token(type='CARD', value=CX(87811), lineno=7, index=7), Token(type='NOUN', value=CX(87812), lineno=8, index=8), Token(type='NOUN', value=CX(87813), lineno=9, index=9), Token(type='C', value=CX(87813), lineno=10, index=10), Token(type='NOUN', value=CX(87814), lineno=11, index=11)]



[Token(type='NOUN', value=CX(88001), lineno=0, index=0), Token(type='NOUN', value=CX(88002), lineno=1, index=1), Token(type='C', value=CX(88002), lineno=2, index=2), Token(type='NOUN', value=CX(88003), lineno=3, index=3), Token(type='CARD', value=CX(88004), lineno=4, index=4), Token(type='NOUN', value=CX(88005), lineno=5, index=5), Token(type='CARD', value=CX(88006), lineno=6, index=6), Token(type='NOUN', value=CX(88007), lineno=7, index=7), Token(type='NOUN', value=CX(88008), lineno=8, index=8), Token(type='C', value=CX(88008), lineno=9, index=9), Token(type='NOUN', value=CX(88009), lineno=10, index=10), Token(type='CARD', value=CX(88010), lineno=11, index=11)]



[Token(type='NOUN', value=CX(88001), lineno=0, index=0), Token(type='NOUN', value=CX(88002), lineno=1, index=1), Token(type='C', value=CX(88002), lineno=2, index=2), Token(type='NOUN', value=CX(88003), lineno=3, index=3), Token(type='CARD', value=CX(88004), lineno=4, index=4), Token(type='NOUN', value=CX(88005), lineno=5, index=5), Token(type='CARD', value=CX(88006), lineno=6, index=6), Token(type='NOUN', value=CX(88007), lineno=7, index=7), Token(type='NOUN', value=CX(88008), lineno=8, index=8), Token(type='C', value=CX(88008), lineno=9, index=9), Token(type='NOUN', value=CX(88009), lineno=10, index=10), Token(type='CARD', value=CX(88010), lineno=11, index=11)]



[Token(type='NOUN', value=CX(88096), lineno=0, index=0), Token(type='NOUN', value=CX(88097), lineno=1, index=1), Token(type='C', value=CX(88097), lineno=2, index=2), Token(type='NOUN', value=CX(88098), lineno=3, index=3), Token(type='CARD', value=CX(88099), lineno=4, index=4), Token(type='CARD', value=CX(88100), lineno=5, index=5), Token(type='NOUN', value=CX(88101), lineno=6, index=6), Token(type='CARD', value=CX(88102), lineno=7, index=7), Token(type='NOUN', value=CX(88103), lineno=8, index=8), Token(type='NOUN', value=CX(88104), lineno=9, index=9), Token(type='C', value=CX(88104), lineno=10, index=10), Token(type='NOUN', value=CX(88105), lineno=11, index=11), Token(type='CARD', value=CX(88106), lineno=12, index=12), Token(type='CARD', value=CX(88107), lineno=13, index=13)]



[Token(type='NOUN', value=CX(88096), lineno=0, index=0), Token(type='NOUN', value=CX(88097), lineno=1, index=1), Token(type='C', value=CX(88097), lineno=2, index=2), Token(type='NOUN', value=CX(88098), lineno=3, index=3), Token(type='CARD', value=CX(88099), lineno=4, index=4), Token(type='CARD', value=CX(88100), lineno=5, index=5), Token(type='NOUN', value=CX(88101), lineno=6, index=6), Token(type='CARD', value=CX(88102), lineno=7, index=7), Token(type='NOUN', value=CX(88103), lineno=8, index=8), Token(type='NOUN', value=CX(88104), lineno=9, index=9), Token(type='C', value=CX(88104), lineno=10, index=10), Token(type='NOUN', value=CX(88105), lineno=11, index=11), Token(type='CARD', value=CX(88106), lineno=12, index=12), Token(type='CARD', value=CX(88107), lineno=13, index=13)]



[Token(type='NOUN', value=CX(91698), lineno=0, index=0), Token(type='CARD', value=CX(91699), lineno=1, index=1), Token(type='NOUN', value=CX(91700), lineno=2, index=2), Token(type='CARD', value=CX(91701), lineno=3, index=3)]



[Token(type='NOUN', value=CX(114214), lineno=0, index=0), Token(type='CARD', value=CX(114215), lineno=1, index=1), Token(type='NOUN', value=CX(114216), lineno=2, index=2), Token(type='CARD', value=CX(114217), lineno=3, index=3)]



[Token(type='NOUN', value=CX(114414), lineno=0, index=0), Token(type='CARD', value=CX(114415), lineno=1, index=1), Token(type='NOUN', value=CX(114416), lineno=2, index=2), Token(type='CARD', value=CX(114417), lineno=3, index=3)]



[Token(type='NOUN', value=CX(124496), lineno=0, index=0), Token(type='CARD', value=CX(124497), lineno=1, index=1), Token(type='CARD', value=CX(124498), lineno=2, index=2)]



[Token(type='NOUN', value=CX(124550), lineno=0, index=0), Token(type='CARD', value=CX(124551), lineno=1, index=1), Token(type='CARD', value=CX(124552), lineno=2, index=2)]



[Token(type='NOUN', value=CX(124569), lineno=0, index=0), Token(type='CARD', value=CX(124570), lineno=1, index=1), Token(type='CARD', value=CX(124571), lineno=2, index=2)]



[Token(type='NOUN', value=CX(125133), lineno=0, index=0), Token(type='CARD', value=CX(125134), lineno=1, index=1), Token(type='CARD', value=CX(125135), lineno=2, index=2)]



[Token(type='NOUN', value=CX(139129), lineno=0, index=0), Token(type='CARD', value=CX(139130), lineno=1, index=1), Token(type='NOUN', value=CX(139131), lineno=2, index=2)]



[Token(type='NOUN', value=CX(141637), lineno=0, index=0), Token(type='CARD', value=CX(141638), lineno=1, index=1), Token(type='NOUN', value=CX(141639), lineno=2, index=2)]



[Token(type='CARD', value=CX(144620), lineno=0, index=0), Token(type='NOUN', value=CX(144621), lineno=1, index=1), Token(type='CARD', value=CX(144622), lineno=2, index=2), Token(type='CARD', value=CX(144623), lineno=3, index=3), Token(type='NOUN', value=CX(144624), lineno=4, index=4)]



[Token(type='CARD', value=CX(162616), lineno=0, index=0), Token(type='NOUN', value=CX(162617), lineno=1, index=1), Token(type='NOUN', value=CX(162618), lineno=2, index=2), Token(type='C', value=CX(162618), lineno=3, index=3), Token(type='NOUN', value=CX(162619), lineno=4, index=4)]



[Token(type='NOUN', value=CX(181160), lineno=0, index=0), Token(type='CARD', value=CX(181161), lineno=1, index=1), Token(type='CARD', value=CX(181162), lineno=2, index=2), Token(type='NOUN', value=CX(181163), lineno=3, index=3)]



[Token(type='NOUN', value=CX(181685), lineno=0, index=0), Token(type='CARD', value=CX(181686), lineno=1, index=1), Token(type='NOUN', value=CX(181687), lineno=2, index=2), Token(type='CARD', value=CX(181688), lineno=3, index=3), Token(type='NOUN', value=CX(181689), lineno=4, index=4), Token(type='CARD', value=CX(181690), lineno=5, index=5)]



[Token(type='CARD', value=CX(193218), lineno=0, index=0), Token(type='NOUN', value=CX(193219), lineno=1, index=1), Token(type='NOUN', value=CX(193220), lineno=2, index=2), Token(type='C', value=CX(193220), lineno=3, index=3), Token(type='NOUN', value=CX(193221), lineno=4, index=4)]



[Token(type='CARD', value=CX(195632), lineno=0, index=0), Token(type='NOUN', value=CX(195633), lineno=1, index=1), Token(type='NOUN', value=CX(195634), lineno=2, index=2), Token(type='C', value=CX(195634), lineno=3, index=3), Token(type='NOUN', value=CX(195635), lineno=4, index=4)]



[Token(type='NOUN', value=CX(205922), lineno=0, index=0), Token(type='CARD', value=CX(205923), lineno=1, index=1), Token(type='NOUN', value=CX(205924), lineno=2, index=2)]



[Token(type='CARD', value=CX(214000), lineno=0, index=0), Token(type='NOUN', value=CX(214001), lineno=1, index=1), Token(type='CARD', value=CX(214002), lineno=2, index=2), Token(type='NOUN', value=CX(214003), lineno=3, index=3)]



[Token(type='NOUN', value=CX(264425), lineno=0, index=0), Token(type='CARD', value=CX(264426), lineno=1, index=1), Token(type='CARD', value=CX(264427), lineno=2, index=2), Token(type='NOUN', value=CX(264428), lineno=3, index=3)]



[Token(type='NOUN', value=CX(267206), lineno=0, index=0), Token(type='CARD', value=CX(267207), lineno=1, index=1), Token(type='NOUN', value=CX(267208), lineno=2, index=2)]



[Token(type='NOUN', value=CX(380811), lineno=0, index=0), Token(type='CARD', value=CX(380812), lineno=1, index=1), Token(type='NOUN', value=CX(380813), lineno=2, index=2), Token(type='CARD', value=CX(380814), lineno=3, index=3), Token(type='NOUN', value=CX(380815), lineno=4, index=4), Token(type='CARD', value=CX(380816), lineno=5, index=5), Token(type='CARD', value=CX(380817), lineno=6, index=6)]



[Token(type='NOUN', value=CX(393437), lineno=0, index=0), Token(type='CARD', value=CX(393438), lineno=1, index=1), Token(type='CARD', value=CX(393439), lineno=2, index=2)]



[Token(type='NOUN', value=CX(394619), lineno=0, index=0), Token(type='CARD', value=CX(394620), lineno=1, index=1), Token(type='CARD', value=CX(394621), lineno=2, index=2)]



[Token(type='NOUN', value=CX(394641), lineno=0, index=0), Token(type='CARD', value=CX(394642), lineno=1, index=1), Token(type='CARD', value=CX(394643), lineno=2, index=2)]



[Token(type='NOUN', value=CX(402394), lineno=0, index=0), Token(type='C', value=CX(402394), lineno=1, index=1), Token(type='NOUN', value=CX(402395), lineno=2, index=2), Token(type='NOUN', value=CX(402396), lineno=3, index=3), Token(type='CARD', value=CX(402397), lineno=4, index=4), Token(type='CARD', value=CX(402398), lineno=5, index=5)]



[Token(type='NOUN', value=CX(408214), lineno=0, index=0), Token(type='CARD', value=CX(408215), lineno=1, index=1), Token(type='CARD', value=CX(408216), lineno=2, index=2)]



[Token(type='NOUN', value=CX(412553), lineno=0, index=0), Token(type='CARD', value=CX(412554), lineno=1, index=1), Token(type='CARD', value=CX(412555), lineno=2, index=2)]



[Token(type='NOUN', value=CX(413447), lineno=0, index=0), Token(type='CARD', value=CX(413448), lineno=1, index=1), Token(type='CARD', value=CX(413449), lineno=2, index=2)]



[Token(type='NOUN', value=CX(414843), lineno=0, index=0), Token(type='C', value=CX(414843), lineno=1, index=1), Token(type='NOUN', value=CX(414844), lineno=2, index=2), Token(type='CARD', value=CX(414845), lineno=3, index=3), Token(type='CARD', value=CX(414846), lineno=4, index=4), Token(type='CARD', value=CX(414847), lineno=5, index=5)]



[Token(type='NOUN', value=CX(421720), lineno=0, index=0), Token(type='CARD', value=CX(421721), lineno=1, index=1), Token(type='NOUN', value=CX(421722), lineno=2, index=2), Token(type='CARD', value=CX(421723), lineno=3, index=3), Token(type='NOUN', value=CX(421724), lineno=4, index=4), Token(type='CARD', value=CX(421725), lineno=5, index=5)]



[Token(type='NOUN', value=CX(425532), lineno=0, index=0), Token(type='CARD', value=CX(425533), lineno=1, index=1), Token(type='CARD', value=CX(425534), lineno=2, index=2)]



## Display Parsed

In [63]:
# bhsa.prettyTuple(L.u(59835, 'phrase'), seq=0, extraFeatures='rela')

In [64]:
def get_nodes(pgraph):
    if type(pgraph) == int:
        yield pgraph
    else:
        for node in pgraph:
            if type(node) == int:
                yield node
            elif type(node) == list:
                yield from get_nodes(node)

def print_relas(pgraph):
    """Print phrase relas embedded in a phrase."""

    snode, tnode, mod = pgraph
    if tnode is not None:
        print(snode, f'-{mod}>', list(get_nodes(tnode)))
    else:
        print(snode, mod)
    if type(snode) == list:
        print_relas(snode)
    elif type(tnode) == list:
        print_relas(tnode)

In [65]:
def show_phrases(parsed_phrases):
    for i,parse in enumerate(parsed_phrases):
        nodes = tuple(get_nodes(parse))
        bhsa.prettyTuple(nodes, seq=i, condenseType='clause', withNodes=True)
        print_relas(parse)
        print()

In [66]:
import random

In [68]:
shuff_parsed = [p for p in parsed.values()]
random.shuffle(shuff_parsed)

show_phrases(shuff_parsed[:20])

113875 -NUM> [113876]



372146 -NUM> [372145]



286853 -NUM> [286854]



213287 -NUM> [213288]



75046 -GP_CARD> [75047, 75048, 75049]
[75047, 75048, 'CCARD'] -NUM> [75049]
75047 -CCARD> [75048]



89228 -NUM> [89229]



372825 -NUM> [372826]



77823 -NUM> [77824]



286391 -NUM> [286392]



79779 -GP_NUM> [79780]



74742 -NUM> [74740, 74741]
74740 -GP> [74741]



374928 -NUM> [374929]



286126 -NUM> [286127]



[180849, 180850, 'CCARD'] -NUM> [180851]
180849 -CCARD> [180850]



61877 -NUM> [61878]



264852 -NUM> [264853]



370201 -NUM> [370200]



391448 -NUM> [391449]



147976 -NUM> [147977]



TypeError: 'NoneType' object is not iterable

# Graph Testing

In [53]:
graph = nx.DiGraph()
graph2 = nx.DiGraph()
graph.add_edge((1,), (3, 4), mod='DEF')
graph2.add_edge((5,), (6, 7), mod='DEF')

# make supergraph with conjunctive connection
graph3 = nx.DiGraph()
graph3.add_edge(graph, graph2, mod='CONJ')

In [92]:
G = nx.DiGraph()

G.add_node(1)