# Testing Sly Parser for Hebrew Noun Phrases

In [1]:
from tf.app import use
from sly import Lexer, Parser

# load Hebrew data
# NOTE: hoist=globals() is what makes the bare `F` and `L` names used further
# down available in this notebook (presumably Text-Fabric's feature and
# locality APIs -- confirm against the Text-Fabric docs).
bhsa = use('bhsa', hoist=globals())

In [3]:
#import networkx

In [4]:
# Set display defaults for later bhsa.show() calls. Presumably this groups
# results per phrase and prints node numbers -- confirm against the TF display docs.
bhsa.displaySetup(condenseType='phrase', withNodes=True)

In [5]:
# Inspect the signature of Lexer.tokenize, the method that NodeLexer overrides below.
Lexer.tokenize

<function sly.lex.Lexer.tokenize(self, text, lineno=1, index=0)>

In [14]:
# Reset bhsa's message indentation (and, presumably, its timer) before searching.
bhsa.indent(reset=True)

# Search templates for simple phrases that begin with a preposition (w1) and
# end with a substantive (w2). NOTE(review): in Text-Fabric search syntax
# `<:` should mean "immediately followed by", `=:` "starts at the same slot"
# and `:=` "ends at the same slot" -- confirm against the TF search docs.
queries = [
# preposition + substantive
"""
ph:phrase rela=NA
    w1:word pdp=prep
    <: w2:word pdp=subs

w1 =: ph
w2 := ph
""",
# preposition + word with lex=H + substantive
"""
ph:phrase rela=NA
    w1:word pdp=prep
    <: word lex=H
    <: w2:word pdp=subs

w1 =: ph
w2 := ph
"""
]

# Run every template, pool the shallow results and keep them in sorted order.
samples = sorted(
    match
    for template in queries
    for match in bhsa.search(template, shallow=True)
)

bhsa.info(f'total samples: {len(samples)}')

  1.43s 12339 results
  1.76s 7035 results
  1.77s total samples: 19374


In [126]:
class Word:
    """Token object wrapping a single BHSA word node.

    Attributes:
        type: upper-cased `pdp` feature value of the node; this is what is
            matched against the token names (e.g. PREP, SUBS, ART).
        value: one-element list holding the word node, so that a Phrase can
            flatten words and sub-phrases in the same way.
    """

    def __init__(self, wnode):
        """Look up the node's `pdp` value and store the node itself."""
        part_of_speech = F.pdp.v(wnode)
        self.type = part_of_speech.upper()
        self.value = [wnode]

    def __repr__(self):
        """Show the wrapped node list."""
        return str(self.value)
        
class Phrase:
    """Flat sequence of items built by joining one or more constituents.

    Each positional argument is an iterable (a list of word nodes or another
    Phrase); their items are concatenated in argument order into `value`.
    """

    def __init__(self, *cx_lists):
        """Flatten all constituents into a single list."""
        members = []
        for constituent in cx_lists:
            members.extend(constituent)
        self.value = members

    def __repr__(self):
        """Show the flattened item list."""
        return str(self.value)

    def __iter__(self):
        """Iterate over the flattened items."""
        yield from self.value

    def __len__(self):
        """Number of flattened items."""
        return len(self.value)
        
class NodeLexer(Lexer):
    # Lexer stand-in: instead of scanning text, it walks BHSA phrase nodes and
    # emits one Word token per word they contain.

    # Token names; they correspond to the upper-cased `pdp` values that Word
    # stores in its `type` attribute.
    tokens = {PREP, SUBS, ART}

    def tokenize(self, phrase_nodes):
        """Yield tokens from phrases"""
        # Words of consecutive phrases are emitted back to back, with no
        # separator token between phrases.
        yield from (
            Word(word_node)
            for phrase_node in phrase_nodes
            for word_node in L.d(phrase_node, 'word')
        )

class PhraseParser(Parser):
    # Grammar over Word tokens (PREP, SUBS, ART) that groups them into Phrase
    # objects. The token stream produced by NodeLexer.tokenize contains the
    # words of all sample phrases back to back, so the top-level `parsing`
    # rule collects a list of Phrase objects rather than a single one.
    # NOTE(review): SLY is documented to take the first rule defined as the
    # start symbol, so the order of the rule methods below matters -- confirm
    # before reordering.
    
    tokens = NodeLexer.tokens
    
    # Unfinished precedence table, left disabled.
#     precedence = (
#         ('left', '')
#     )
    
    @_('parsing parsing')
    def parsing(self, p):
        """Join two adjacent result lists into one list."""
        # NOTE(review): `parsing : parsing parsing` looks ambiguous (it can
        # group either way); SLY presumably reports conflicts for it -- check.
        return p[0] + p[1]
    
    @_('np', 'defi', 'pp')
    def parsing(self, p):
        """Wrap a single parsed phrase in a list so that lists can be joined."""
        return [p[0]]
        
    @_('ART np')
    def defi(self, p):
        """Phrase made of an ART token followed by an np."""
        # Both symbol values are iterables of nodes; Phrase flattens them.
        return Phrase(*p)
    
    @_('SUBS')
    def np(self, p):
        """Phrase made of a single SUBS token."""
        return Phrase(p[0])
    
    @_('PREP np', 'PREP defi', 'PREP pp')
    def pp(self, p):
        """Phrase made of a PREP token followed by an np, defi or nested pp."""
        return Phrase(*p)



In [127]:
# Lex and parse every sample phrase as one continuous token stream.
lexer = NodeLexer()
parser = PhraseParser()
# tokenize() is a generator, so Word tokens are produced lazily while parsing.
parsed = parser.parse(lexer.tokenize(samples))

# Preview the first ten parses; each Phrase prints as its list of word nodes.
parsed[:10]

[[1, 2],
 [43, 44, 45],
 [61, 62, 63],
 [66, 67, 68],
 [98, 99, 100],
 [128, 129, 130],
 [163, 164, 165],
 [193, 194],
 [198, 199, 200],
 [212, 213]]

In [129]:
# Display parses 200-299; each Phrase is handed over as an iterable of word nodes.
bhsa.show(parsed[200:300])