# Testing Sly Parser for Hebrew Noun Phrases

In [8]:
from tf.app import use
from sly import Lexer, Parser
from sly.lex import Token
import networkx as nx

# load Hebrew data
bhsa = use('bhsa', hoist=globals())

In [9]:
bhsa.displaySetup(condenseType='phrase', withNodes=True)

In [10]:
Lexer.tokenize

<function sly.lex.Lexer.tokenize(self, text, lineno=1, index=0)>

In [11]:
# get Hebrew nodes as tester data

bhsa.indent(reset=True)

queries = [
"""
ph:phrase rela=NA
    w1:word pdp=prep
    <: w2:word pdp=subs

w1 =: ph
w2 := ph
""",
    
"""
ph:phrase rela=NA
    w1:word pdp=prep
    <: word lex=H
    <: w2:word pdp=subs

w1 =: ph
w2 := ph
"""
]

samples = [] 

for q in queries: 
    samples += bhsa.search(q, shallow=True)
    
samples.sort()
    
bhsa.info(f'total samples: {len(samples)}')

  1.23s 12339 results
  1.47s 7035 results
  1.47s total samples: 19374


In [12]:
def get_POS(wnode):
    """Retrieve the part of speech for a wordnode."""
    pdp = F.pdp.v(wnode).upper()

class NodeLexer(Lexer):
    tokens = {
        PREP,
        SUBS,
        ART,
    }
    def tokenize(self, nodes):
        """Yield Sly tokens from list of phrase nodes."""
        for i, wnode in enumerate(nodes):
            token = Token()
            token.value = wnode
            token.type = get_POS(wnode)
            token.index = i
            token.lineno = i
            yield token

class PhraseParser(Parser):

    tokens = NodeLexer.tokens

    @_('np', 'defi', 'pp')
    def phrase(self, p):
        return p[0]

    @_('ART np')
    def defi(self, p):
        return [p.ART, p.np, 'DEF']

    @_('SUBS')
    def np(self, p):
        return [p.SUBS, 'NP']

    @_('PREP np', 'PREP defi', 'PREP pp')
    def pp(self, p):
        return [p[0], p[1], 'PP']

lexer = NodeLexer()
parser = PhraseParser()

In [13]:
tokens = lexer.tokenize(sample_nodes[1])

list(tokens)

NameError: name 'sample_nodes' is not defined

In [312]:
test_phrase = parser.parse(lexer.tokenize(sample_nodes[1]))

test_phrase

[43, [44, [45, 'NP'], 'DEF'], 'PP']

In [267]:
def get_nodes(pgraph):
    for node in pgraph:
        if type(node) == int:
            yield node
        elif type(node) == list:
            yield from get_nodes(node)

def print_relas(pgraph):
    """Print phrase relas embedded in a phrase."""
    if len(pgraph) == 3:
        snode, tnode, mod = pgraph
        print(snode, f'-{mod}>', list(get_nodes(tnode)))
    elif len(pgraph) == 2:
        snode, typ = pgraph
        tnode = None
        print(snode, typ)
    if type(snode) == list:
        print_relas(snode)
    elif type(tnode) == list:
        print_relas(tnode)

In [268]:
parsed = []

for nodeset in sample_nodes:
    parsing = parser.parse(lexer.tokenize(nodeset))
    parsed.append(parsing)

print(len(parsed), 'parsed')
    
parsed[:10]

19374 parsed


[[1, [2, 'NP'], 'PP'],
 [43, [44, [45, 'NP'], 'DEF'], 'PP'],
 [61, [62, [63, 'NP'], 'DEF'], 'PP'],
 [66, [67, [68, 'NP'], 'DEF'], 'PP'],
 [98, [99, [100, 'NP'], 'DEF'], 'PP'],
 [128, [129, [130, 'NP'], 'DEF'], 'PP'],
 [163, [164, [165, 'NP'], 'DEF'], 'PP'],
 [193, [194, 'NP'], 'PP'],
 [198, [199, [200, 'NP'], 'DEF'], 'PP'],
 [212, [213, 'NP'], 'PP']]

In [269]:
def show_phrases(parsed_phrases):
    for i,parse in enumerate(parsed_phrases):
        nodes = tuple(get_nodes(parse))
        bhsa.prettyTuple(nodes, seq=i, condenseType='clause', withNodes=True)
        print_relas(parse)
        print()

In [270]:
show_phrases(parsed[:20])

1 -PP> [2]
2 NP



43 -PP> [44, 45]
44 -DEF> [45]
45 NP



61 -PP> [62, 63]
62 -DEF> [63]
63 NP



66 -PP> [67, 68]
67 -DEF> [68]
68 NP



98 -PP> [99, 100]
99 -DEF> [100]
100 NP



128 -PP> [129, 130]
129 -DEF> [130]
130 NP



163 -PP> [164, 165]
164 -DEF> [165]
165 NP



193 -PP> [194]
194 NP



198 -PP> [199, 200]
199 -DEF> [200]
200 NP



212 -PP> [213]
213 NP



221 -PP> [222]
222 NP



268 -PP> [269]
269 NP



276 -PP> [277, 278]
277 -DEF> [278]
278 NP



324 -PP> [325, 326]
325 -DEF> [326]
326 NP



400 -PP> [401]
401 NP



407 -PP> [408]
408 NP



435 -PP> [436, 437]
436 -DEF> [437]
437 NP



454 -PP> [455]
455 NP



462 -PP> [463]
463 NP



474 -PP> [475]
475 NP



# Graph Testing

In [53]:
graph = nx.DiGraph()
graph2 = nx.DiGraph()
graph.add_edge((1,), (3, 4), mod='DEF')
graph2.add_edge((5,), (6, 7), mod='DEF')

# make supergraph with conjunctive connection
graph3 = nx.DiGraph()
graph3.add_edge(graph, graph2, mod='CONJ')

In [92]:
G = nx.DiGraph()

G.add_node(1)