# Prototype Semantics Labels

In [1]:
from tf.app import use
from load_parse import ParseLoader
import nav_tree as nt
# Load the BHSA corpus; `hoist=globals()` presumably exposes the
# Text-Fabric API names (e.g. F, L, T) in this namespace.
A = use('bhsa', hoist=globals())

# Read the phrase parsings produced by the parsing step.
parse_loader = ParseLoader('../../results/parsing/phrase_parsings.json')
parsed_phrases = parse_loader.load()
n_parsed = len(parsed_phrases)
print(f'loaded {n_parsed} parsed phrases')

loaded 123430 parsed phrases


In [2]:
# FIXME: cases like this one need fixing; the two PPs are joined
# by an apposition but should be parsed as parallel PPs.
parsed_phrases[929733]

[[38780, 38781, 'PP'], [38778, 38779, 'PP'], 'APPO']

In [3]:
# The parsing shown above should look like this:
# NOTE(review): this literal is identical to the current output above
# (the relation is still 'APPO'); if parallel PPs are intended, the
# label presumably needs to change -- confirm.

[
    [38780, 38781, 'PP'],
    [38778, 38779, 'PP'],
    'APPO'
]

[[38780, 38781, 'PP'], [38778, 38779, 'PP'], 'APPO']

## Haspelmath's Time Adverbial Functions

According to Haspelmath (1997), the following semantic functions of time
adverbials are attested across the world's languages:

* simultaneous location (in, on, at, Ø)
* anterior ("before")
* posterior ("after")
* anterior durative ("until")
* posterior durative ("since")
* distance future ("in two weeks' time"), at speech time
* distance past ("three centuries ago"), at speech time
* distance retrospective ("two months before"), at other time
* distance prospective ("three days later"), at other time
* distance with anterior/posterior ("six days before the passover...", "five minutes after the beginning...")
* atelic extent ("for five hours")
* telic extent ("in five hours")
* distance posterior ("since five years [ago]")

## Pipeline proposal for semantic tagging

1. generate tokens from the existing phrase parsings (semantic tokenizer)
2. parse the tokens into the appropriate semantic tags (semantic parser)

Examples

|phrase|tokens|tag|
|------|------|---|
|בלילה|B CTIME|simul|
|בשלש יומים|B DIST| distFut/telicExt|
|שׁבע שׁנים|DIST|atelicExt|

Here `DIST` is a token output by the tokenizer for quantified or pluralized times,
and `/` separates alternative tags between which manual disambiguation will be needed.

## The Challenge: Walk the tree and decide what stays and goes

* modifiers that are not relevant for locating or quantifying a time should be dropped
    * e.g. ביום יהוה: genitival/adjectival modifiers
    * e.g. כל היום: the quantifier already designates this as a temporal distance; the article is only necessary for understanding "which day"

In [31]:
from sly.lex import Token as SlyToken
from sly import Parser as SlyParser
        
class SemTokenizer:
    """Generate selected tokens from a phrase parsing.

    The tokens will be subsequently parsed to generate
    semantic function labels for the phrases.

    A parsed phrase is handled as a nested ``[src, tgt, rela]`` list.
    Every token type that is produced is also recorded in
    ``self.tokens`` so it can be fed to the parser later.
    """

    # Relations that never yield a token of their own.
    # NOTE: because 'APPO' is listed here, the appositional branch in
    # `tokenize` is currently unreachable; remove 'APPO' from this set
    # to start emitting DEM_* tokens.
    IGNORED_RELAS = frozenset({'DEF', 'ADJV', 'GP', 'APPO'})

    # Demonstrative lexeme -> distance label.
    DEMON_DISTANCES = {
        'Z>T': 'near',
        'HJ>': 'far',
        'HMH': 'far',
        '>LH': 'near',
        'HM': 'far',
        'HW>': 'far',
        'ZH': 'near',
    }

    def __init__(self, tf_api):
        """Keep the `F` and `L` attributes of the Text-Fabric API object."""
        self.F, self.L = tf_api.F, tf_api.L
        self.tokens = set()  # to feed to the parser later

    def get_sly_token(self, tag, index):
        """Get SLY token object with customized data.

        Args:
            tag: token type; also added to ``self.tokens``.
            index: position stored on the token (the callers in this
                class pass the position within the head path).

        Returns:
            A SLY token with an empty value and ``lineno`` fixed to 1.
        """
        token = SlyToken()
        self.tokens.add(tag)  # track unique tokens
        token.value = ''
        token.type = tag
        token.index = index
        token.lineno = 1
        return token

    def tokenize(self, parsedphrase):
        """Follow path to right-most item (head) and yield tokens.

        tokenize must adjudicate what gets yielded as a token
        and what does not. At times, we need to direct tokenize down
        a path that is not on the head-path (e.g. parallel relas).
        Sometimes whether a rela becomes a token is conditional
        on a number of factors tuned to produce good tokens.

        Args:
            parsedphrase: a phrase parsing accepted by
                ``nt.get_head_path``.

        Yields:
            SLY tokens, ending with a TIME/TIMES token for the last
            phrase on the head path.
        """
        # retrieve the head path and begin walking down it
        head_phrases = list(nt.get_head_path(parsedphrase))
        last = len(head_phrases) - 1

        for i, phrase in enumerate(head_phrases):

            src, _tgt, rela = phrase

            # ignored relations produce no token of their own
            if rela in self.IGNORED_RELAS:
                pass

            # tokenize prepositions
            elif rela == 'PP':
                yield self.prep_token(phrase, i)

            # handle parallels
            elif rela == 'PARA':
                # we use unfold_paras in case there are
                # additional recursively embedded parallel
                # phrases below; unfold_paras will stop as
                # soon as it encounters a phrase with a rela
                # that != PARA or CONJ
                for subphrase in nt.unfold_paras(src):
                    # recursively tokenize them
                    yield from self.tokenize(subphrase)

            # tokenize appositional relations if relevant
            # (unreachable while 'APPO' is in IGNORED_RELAS)
            elif rela == 'APPO':
                token = self.appo_token(phrase, i)
                if token:
                    yield token

            # drip-bucket tokenizer: the relation name is the token
            else:
                yield self.get_sly_token(rela, i)

            # give TIME token once reaching the head; this is checked
            # for every relation, including the ignored ones
            if i == last:
                yield self.time_token(phrase, i)

    def time_token(self, phrase, i):
        """Parse times into tokens.

        Returns a 'TIMES' token when the `nu` feature of ``phrase[1]``
        is 'pl', and a 'TIME' token otherwise.
        """
        time = phrase[1]
        tag = 'TIMES' if self.F.nu.v(time) == 'pl' else 'TIME'
        return self.get_sly_token(tag, i)

    def prep_token(self, phrase, i):
        """Parse prepositions into singular tokens.

        The token type is the `lex` feature of ``phrase[0]``.
        """
        token = self.F.lex.v(phrase[0])
        return self.get_sly_token(token, i)

    def appo_token(self, phrase, i):
        """Parse appositional tokens.

        Returns:
            A ``DEM_NEAR`` / ``DEM_FAR`` token when the head of the
            appositional item has ``pdp == 'prde'``; otherwise None.
        """
        src, _tgt, _rela = phrase

        # get the head item of the appositional phrase
        if isinstance(src, int):
            appo_head = src
        else:
            appo_head = nt.get_head(src)

        # only appositional demonstratives produce a token
        if self.F.pdp.v(appo_head) != 'prde':
            return None

        appo_lex = self.F.lex.v(appo_head)
        dist = self.demon_dist(appo_lex).upper()
        return self.get_sly_token(f'DEM_{dist}', i)

    def demon_dist(self, lex):
        """Get the distance of a demonstrative.

        Returns:
            'near' or 'far'.

        Raises:
            KeyError: if `lex` is not a known demonstrative lexeme.
        """
        return self.DEMON_DISTANCES[lex]

In [32]:
# One example phrase node per semantic function;
# None marks functions that have no example yet.
examples = dict(
    sim=905154,
    ant=913348,
    post=911277,
    ant_dur=908288,
    post_dur=905779,
    dist_fut=919136,
    dist_past=None,
    dist_retro=None,
    dist_pro=None,
    dist_antpos=None,  # see 825591,
    atelic_ext=906690,
    telic_ext=919766,
)

st = SemTokenizer(A.api)

# show each example phrase with its parsing and its semantic tokens
for ex, node in examples.items():
    if node:
        parsing = parsed_phrases[node]
        sem_tokens = [*st.tokenize(parsing)]

        A.pretty(node, withNodes=True, extraFeatures='pdp')
        print(f'{node} : {ex}')
        print(parsing)
        print(sem_tokens)

905154 : sim
[687, [[690, 691, 'DEF'], [688, 689, 'DEF'], 'APPO'], 'PP']
[Token(type='B', value='', lineno=1, index=0), Token(type='TIME', value='', lineno=1, index=2)]


913348 : ant
[13755, [13756, 13757, 'PP'], 'PP']
[Token(type='L', value='', lineno=1, index=0), Token(type='PNH/', value='', lineno=1, index=1), Token(type='TIME', value='', lineno=1, index=1)]


911277 : post
[10655, [[10658, 10659, 'DEF'], [10656, 10657, 'DEF'], 'APPO'], 'PP']
[Token(type='>XR/', value='', lineno=1, index=0), Token(type='TIMES', value='', lineno=1, index=2)]


908288 : ant_dur
[6157, 6158, 'PP']
[Token(type='<D', value='', lineno=1, index=0), Token(type='TIME', value='', lineno=1, index=0)]


905779 : post_dur
[1706, [1707, 1708, 'PP'], 'PP']
[Token(type='MN', value='', lineno=1, index=0), Token(type='QY/', value='', lineno=1, index=1), Token(type='TIMES', value='', lineno=1, index=1)]


919136 : dist_fut
[22132, [22133, [22134, 22135, 'NUM'], 'ADJV'], 'PP']
[Token(type='B', value='', lineno=1, index=0), Token(type='NUM', value='', lineno=1, index=2), Token(type='TIMES', value='', lineno=1, index=2)]


906690 : atelic_ext
[[3272, [3273, 3274, 'NUM'], 'CONJ'], [3270, 3271, 'NUM'], 'PARA']
[Token(type='NUM', value='', lineno=1, index=0), Token(type='TIME', value='', lineno=1, index=0), Token(type='NUM', value='', lineno=1, index=1), Token(type='TIME', value='', lineno=1, index=1)]


919766 : telic_ext
[23218, [23219, [[23221, 23222, 'DEF'], 23220, 'GP'], 'NUM'], 'PP']
[Token(type='B', value='', lineno=1, index=0), Token(type='NUM', value='', lineno=1, index=1), Token(type='TIMES', value='', lineno=1, index=2)]


## Semantic Class Parser

In [53]:
class SemParser(SlyParser):

    """Parse semantic tokens with a YACC grammar.

    The grammar reduces a stream of semantic tokens to one `category`,
    whose value is a semantic function label such as 'simul' or 'ante'.
    Helper rules (`duration`, `before`, `time`) return an empty string
    because only the final category label is used.
    """

    # initialize standard methods / attributes
    def __init__(self, error_tracker):
        """Store `error_tracker`, a mutable mapping written to by `error`."""
        super().__init__()
        self.error_tracker = error_tracker

    tokens = {
        '<D',
        '>XR/',
        'B',
        'L',
        'MN',
        'NUM',
        'PNH/',
        'QY/',
        'TIME',
        'TIMES'
    }

    def error(self, token):
        """Keep track of errors.

        Stores ``token.value.slot`` under ``error_tracker['e']`` when
        the offending token carries one. In every other case (no token
        at all, or a value without a `slot` attribute, such as a plain
        string) it stores ``'reached end'``.
        """
        # getattr with defaults only covers the missing-attribute case,
        # instead of swallowing every exception with a bare `except:`
        value = getattr(token, 'value', None)
        self.error_tracker['e'] = getattr(value, 'slot', 'reached end')

    #debugfile = 'parser.out'

    # -- FINAL MATCHES --
    @_('simul', 'ante', 'post', 'ant_dur',
       'post_dur', 'in_dur', 'atelic_ext')
    def category(self, p):
        """Return the label of whichever semantic function matched."""
        return p[0]

    @_('B TIME')
    def simul(self, p):
        """Simultaneous location: B + time."""
        return 'simul'

    @_('before TIME', 'before duration')
    def ante(self, p):
        """Anterior: `before` + time or duration."""
        return 'ante'

    @_('>XR/ TIME', '>XR/ duration')
    def post(self, p):
        """Posterior: >XR/ + time or duration."""
        return 'post'

    @_('<D TIME', '<D duration', '<D time')
    def ant_dur(self, p):
        """Anterior durative: <D + time or duration."""
        return 'ant_dur'

    @_('MN TIME', 'MN time', 'MN duration')
    def post_dur(self, p):
        """Posterior durative: MN + time or duration."""
        return 'post_dur'

    @_('B duration')
    def in_dur(self, p):
        """B + duration; ambiguous between the two labels returned."""
        return 'telic_ext|dist_fut'

    @_('TIME', 'duration')
    def atelic_ext(self, p):
        """Atelic extent: a bare time or duration."""
        return 'atelic_ext'

    # -- TEMPORARY MATCHES --
    @_('TIMES', 'duration duration',
       'NUM TIME', 'NUM time', 'NUM duration')
    def duration(self, p):
        """Plural, quantified, or concatenated times."""
        return ''

    @_('L PNH/')
    def before(self, p):
        """The token sequence L + PNH/."""
        return ''

    @_('QY/ duration', 'QY/ TIME')
    def time(self, p):
        """QY/ followed by a time or duration."""
        return ''



In [54]:
# shared error slot that the parser writes into
error = dict(e=None)
sem_parser = SemParser(error)

# run the parser over every example that has a phrase node
for ex, node in examples.items():
    if node:
        parsing = parsed_phrases[node]
        token_stream = st.tokenize(parsing)
        sem_parse = sem_parser.parse(token_stream)

        print(node)
        print(T.text(node))
        print(sem_parse)
        print()

905154
בַּיֹּ֣ום הַשְּׁבִיעִ֔י 
simul

913348
לִפְנֵ֥י מֹותִֽי׃ 
ante

911277
אַחֲרֵי֙ הַדְּבָרִ֣ים הָאֵ֔לֶּה 
post

908288
עַד־עֹולָֽם׃ 
ant_dur

905779
מִקֵּ֣ץ יָמִ֑ים 
post_dur

919136
בְּעֹ֣וד׀ שְׁלֹ֣שֶׁת יָמִ֗ים 
telic_ext|dist_fut

906690
אַרְבָּעִ֣ים יֹ֔ום וְאַרְבָּעִ֖ים לָֽיְלָה׃ 
atelic_ext

919766
בְּשֶׁ֖בַע שְׁנֵ֣י הַשָּׂבָ֑ע 
telic_ext|dist_fut

