In [1]:
import json
from pathlib import Path
from tf.app import use
# Load the 'bhsa' dataset through Text-Fabric. Passing hoist=globals() hands
# the notebook namespace to Text-Fabric so it can inject its API names there
# (per the Text-Fabric documentation -- confirm for the installed version).
A = use('bhsa', hoist=globals())

In [2]:
# set display defaults for later Text-Fabric renderings (e.g. A.pretty)
A.displaySetup(
    condenseType='clause',
    extraFeatures='function pdp st',
    withNodes=True,
)

# load parsings; JSON object keys are always strings, so cast them back to int
with open('../../results/parsing/phrase_parsings.json') as infile:
    parses = {int(ph): parse for ph, parse in json.load(infile).items()}
with open('../../results/parsing/slot2pos.json') as infile:
    slot2pos = {int(slot): pos for slot, pos in json.load(infile).items()}

In [3]:
def get_slots(phrase):
    """Yield every slot contained in a phrase tree.

    A tree is either a bare slot (an int) or a ``[src, tgt, rela]`` triple
    whose first two members are themselves trees. Slots are produced
    depth-first, the ``src`` side before the ``tgt`` side; the relation
    label is ignored.
    """
    if type(phrase) is int:
        # leaf: the slot itself
        yield phrase
    else:
        left, right, _ = phrase
        for branch in (left, right):
            yield from get_slots(branch)

def get_head(phrase):
    """Return the head slot of a phrase tree.

    The head is found by following the ``tgt`` member of each
    ``[src, tgt, rela]`` triple downwards until it is a bare slot.

    Args:
        phrase: a bare slot (int) or a ``[src, tgt, rela]`` triple.

    Returns:
        int: the slot at the end of the ``tgt`` chain. A bare slot is
        returned unchanged, i.e. a one-slot phrase is its own head.
    """
    # Bare ints are valid parses elsewhere in this notebook (get_slots has an
    # int base case), so accept them here instead of failing on the unpack.
    node = phrase
    while type(node) != int:
        _, node, _ = node
    return node
        
def traverse_tree(phrase):
    """Yield a phrase tree and all of its nested subphrases.

    Traversal is pre-order: the triple itself first, then everything under
    its ``src`` member, then everything under its ``tgt`` member. Only
    members that are lists are descended into; bare slots are not yielded.

    Args:
        phrase: a ``[src, tgt, rela]`` triple.

    Yields:
        Each ``[src, tgt, rela]`` triple found in the tree, as the same
        object that is stored in the tree (no copies are made).
    """
    yield phrase
    src, tgt, rela = phrase
    for child in (src, tgt):
        if type(child) == list:
            yield from traverse_tree(child)
        
def show_relas(bhsa_node, parse):
    """Render a BHSA node and print each relation in its parse tree.

    The node is displayed with ``A.pretty`` with the parse's head slot
    highlighted; afterwards one line per subphrase is printed in the form
    ``src -rela-> tgt``.
    """
    head_slot = get_head(parse)
    # collect the whole tree before rendering, then print below the display
    relations = [node for node in traverse_tree(parse)]
    A.pretty(bhsa_node, highlights={head_slot})
    for src, tgt, rela in relations:
        print(f'{src} -{rela}-> {tgt}')

In [4]:
# Spot-check the helpers on one parsed phrase: look its parse up by node
# number, then display the node and print its relations.
test_phrase = 904936
test_parse = parses[test_phrase]
show_relas(test_phrase, test_parse)

285 -PP-> [286, [[289, 290, 'DEF'], [287, 288, 'DEF'], 'APPO'], 'NUM']
286 -NUM-> [[289, 290, 'DEF'], [287, 288, 'DEF'], 'APPO']
[289, 290, 'DEF'] -APPO-> [287, 288, 'DEF']
289 -DEF-> 290
287 -DEF-> 288


# Build Some Data

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
def build_phrase_data(phrase2parse):
    """Tabulate every subphrase found in a collection of phrase parses.

    Args:
        phrase2parse: dict mapping a phrase identifier to its parse. A parse
            is either a bare int or a nested ``[src, tgt, rela]`` triple.
            Bare ints and parses with fewer than three members carry no
            relation and are skipped.

    Returns:
        pandas.DataFrame with one row per subphrase and the columns
        ``id`` (running number starting at 1), ``phrase`` (key of the
        containing parse), ``nslots`` (number of slots in the subphrase)
        and ``kind`` (the subphrase's relation label). The columns are
        present even when no subphrase qualifies, so downstream column
        access does not fail on an empty result.
    """
    columns = ['id', 'phrase', 'nslots', 'kind']
    rows = []
    for phrase, parsing in phrase2parse.items():
        if type(parsing) == int or len(parsing) < 3:
            continue
        for sp in traverse_tree(parsing):
            _, _, kind = sp
            rows.append({
                'id': len(rows) + 1,  # ids run consecutively across all phrases
                'phrase': phrase,
                'nslots': sum(1 for _ in get_slots(sp)),
                'kind': kind,
            })

    # fix the schema explicitly: pd.DataFrame([]) alone would have no columns
    return pd.DataFrame(rows, columns=columns)

In [7]:
# one row per subphrase across all loaded parses
df = build_phrase_data(parses)

In [8]:
# preview the subphrase table (pandas truncates long frames on display)
df

Unnamed: 0,id,phrase,nslots,kind
0,1,904749,2,PP
1,2,904752,7,PARA
2,3,904752,4,CONJ
3,4,904752,3,PP
4,5,904752,2,DEF
...,...,...,...,...
136375,136376,1172271,2,DEF
136376,136377,1172279,2,PP
136377,136378,1172281,2,PP
136378,136379,1172284,3,PP


In [9]:
# how often each relation label occurs among the subphrases
ph_kinds = df['kind'].value_counts()

ph_kinds

PP       53325
DEF      27257
GP       24519
CONJ      8359
PARA      7750
QUANT     4454
APPO      4072
NUM       2542
ADJV      2323
CARDC     1602
ADVB       133
DEMON       44
Name: kind, dtype: int64

In [10]:
# rows: relation label; columns: subphrase length in slots; cells: counts
length_ct = df.pivot_table(
    index='kind',
    columns='nslots',
    aggfunc='size',
    fill_value=0,
)

length_ct

nslots,2,3,4,5,6,7,8,9,10,11,...,22,23,24,25,26,27,28,30,31,34
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ADJV,1803,248,192,39,21,15,2,0,3,0,...,0,0,0,0,0,0,0,0,0,0
ADVB,133,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
APPO,1067,329,2561,20,82,4,5,1,1,1,...,0,0,0,0,0,0,0,0,0,0
CARDC,653,405,268,146,43,51,9,13,7,6,...,0,0,0,0,0,0,0,0,0,0
CONJ,3125,1884,1457,399,535,106,267,173,119,34,...,2,3,2,1,1,0,0,0,1,0
DEF,27136,27,10,69,4,0,8,0,0,3,...,0,0,0,0,0,0,0,0,0,0
DEMON,44,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GP,17094,5833,956,443,120,34,13,13,4,2,...,0,0,0,0,0,0,0,0,0,0
NUM,1568,623,246,68,22,8,3,1,1,1,...,0,0,0,0,0,0,0,0,0,0
PARA,0,2646,257,1777,415,948,470,310,161,285,...,7,10,3,6,1,2,1,2,0,1


In [11]:
# divide each row by its own total to get per-kind length proportions
length_pr = length_ct.div(length_ct.sum(axis=1), axis=0).round(2)

length_pr

nslots,2,3,4,5,6,7,8,9,10,11,...,22,23,24,25,26,27,28,30,31,34
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ADJV,0.78,0.11,0.08,0.02,0.01,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ADVB,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
APPO,0.26,0.08,0.63,0.0,0.02,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CARDC,0.41,0.25,0.17,0.09,0.03,0.03,0.01,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CONJ,0.37,0.23,0.17,0.05,0.06,0.01,0.03,0.02,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DEF,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DEMON,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GP,0.7,0.24,0.04,0.02,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NUM,0.62,0.25,0.1,0.03,0.01,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PARA,0.0,0.34,0.03,0.23,0.05,0.12,0.06,0.04,0.02,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
