# Testing Wordgrammar Wordsets

This notebook spot-checks the output of the wordsets pipeline (context conditions, accents, and conjunction pairs) to confirm that the data it produces is correct.

In [1]:
import collections
import pandas as pd
from IPython.display import display, HTML
import random
import pprint
from wordsets import WordSets
from positions import Positions
from context import Mom
from tf.app import use
# Load the BHSA corpus app. The names F, L and T used in later cells are never
# defined in this notebook; presumably hoist=globals() is what injects them
# into the global namespace (confirm against the Text-Fabric docs).
A = use('bhsa', hoist=globals(), silent=True)

   |     0.00s No structure info in otext, the structure part of the T-API cannot be used


In [2]:
# Build the word sets from the loaded corpus; silent=False presumably enables
# the per-stage "processing ..." progress messages shown in the output.
wsets = WordSets(A, silent=False)

processing accents...
	done
processing quants...
	done
processing preps...
	done
processing conjunctions...
	done
processing constructs...
	done


In [3]:
def beauty(text):
    '''
    Render text in the notebook output as large Times New Roman HTML.

    text: string inserted verbatim into the HTML wrapper, so it may
        itself contain markup (e.g. '<hr>' or '&nbsp;' entities).
    '''
    # The wrapper is closed with a real </span> tag so that the emitted
    # markup is well-formed.
    pretty = f'''
<span style="font-family:Times New Roman; font-size:20pt">
    {text}
</span>'''
    display(HTML(pretty))

In [4]:
def showconds(conddict):
    '''
    Print every node of a conddict along with its condition results.

    conddict: iterable of (node, conds) pairs, where conds maps each
        condition to its truth value.

    Each node is announced on its own line, followed by one tab-indented
    line per condition: the condition left-aligned in 25 columns and the
    stringified truth value right-aligned in 25 columns.
    '''
    for entry in conddict:
        node, conds = entry
        print('node', node)
        for cond, truth in conds.items():
            print(f'\t{cond:<25} {str(truth):>25}')

## Context

In [5]:
# Create a Mom object for node 7, handing it the quants, preps and nominals
# sets from wsets, then call every(). The next cell reads m.explain, which
# every() presumably populates (confirm in context.py).
m = Mom(7, A, quants=wsets.quants, preps=wsets.preps, noms=wsets.nominals)
m.every()

In [6]:
# Dump m.explain: per the output, it maps a label (e.g. 'adja', 'const',
# 'coord') to a tuple of (node, {condition: truth}) pairs.
pprint.pprint(m.explain)

{'adja': ((8,
           {"P(0,'st') == 'a'": True,
            "P(1,'nu') == P(0,'nu')": False,
            "P(1,'sp') in nominals": False}),
          (8,
           {"P(0,'st') == 'a'": True,
            "P(1,'nu') == P(0,'nu')": False,
            "P(1,'sp vt').issubset({'verb', 'ptcp', 'ptca'})": False}),
          (9,
           {"P(0,'st') == 'a'": True,
            "P(1,'sp') == 'art'": False,
            "P(2,'nu') == P(0,'nu')": False,
            "P(2,'sp') in nominals": False})),
 'const': ((8, {"P(0,'st') == 'c'": False, "P(1,'sp') != 'art'": True}),
           (9, {"P(0,'st') == 'c'": False, "P(1,'sp') == 'art'": False})),
 'coord': ((9,
            {'P(0) not in preps': True,
             "P(1,'sp') == 'conj'": True,
             'P(2) not in preps': False,
             "P(2,'sp') in nominals": False}),
           (10,
            {'P(0) not in preps': True,
             "P(1,'sp') == 'conj'": True,
             "P(2,'sp') == 'art'": False}),
           (10,
            

## Accents

In [4]:
# Short aliases for the accent data structures inspected in the cells below.
mwords = wsets.accents.mwords
atype = wsets.accents.accenttype
atype2set = wsets.accents.atype2set
atype2name2set = wsets.accents.atype2name2set

### mwords

In [5]:
# Pick a random chapter and tabulate, for each of its words, the enclosing
# verse followed by that word's mwords entry (the two tuples are concatenated
# into one table row). The choice is unseeded, so the chapter differs per run.
chapter = random.choice(list(F.otype.s('chapter')))
A.table(
    (L.u(w, 'verse')+mwords[w] for w in L.d(chapter, 'word'))
, end=100)

n,p,verse,word,word.1,Unnamed: 5,Unnamed: 6
1,Joshua 7:1,וַיִּמְעֲל֧וּ בְנֵֽי־יִשְׂרָאֵ֛ל מַ֖עַל בַּחֵ֑רֶם וַיִּקַּ֡ח עָכָ֣ן בֶּן־כַּרְמִי֩ בֶן־זַבְדִּ֨י בֶן־זֶ֜רַח לְמַטֵּ֤ה יְהוּדָה֙ מִן־הַחֵ֔רֶם וַיִּֽחַר־אַ֥ף יְהוָ֖ה בִּבְנֵ֥י יִשְׂרָאֵֽל׃,וַ,יִּמְעֲל֧וּ,,
2,Joshua 7:1,וַיִּמְעֲל֧וּ בְנֵֽי־יִשְׂרָאֵ֛ל מַ֖עַל בַּחֵ֑רֶם וַיִּקַּ֡ח עָכָ֣ן בֶּן־כַּרְמִי֩ בֶן־זַבְדִּ֨י בֶן־זֶ֜רַח לְמַטֵּ֤ה יְהוּדָה֙ מִן־הַחֵ֔רֶם וַיִּֽחַר־אַ֥ף יְהוָ֖ה בִּבְנֵ֥י יִשְׂרָאֵֽל׃,וַ,יִּמְעֲל֧וּ,,
3,Joshua 7:1,וַיִּמְעֲל֧וּ בְנֵֽי־יִשְׂרָאֵ֛ל מַ֖עַל בַּחֵ֑רֶם וַיִּקַּ֡ח עָכָ֣ן בֶּן־כַּרְמִי֩ בֶן־זַבְדִּ֨י בֶן־זֶ֜רַח לְמַטֵּ֤ה יְהוּדָה֙ מִן־הַחֵ֔רֶם וַיִּֽחַר־אַ֥ף יְהוָ֖ה בִּבְנֵ֥י יִשְׂרָאֵֽל׃,בְנֵֽי־,יִשְׂרָאֵ֛ל,,
4,Joshua 7:1,וַיִּמְעֲל֧וּ בְנֵֽי־יִשְׂרָאֵ֛ל מַ֖עַל בַּחֵ֑רֶם וַיִּקַּ֡ח עָכָ֣ן בֶּן־כַּרְמִי֩ בֶן־זַבְדִּ֨י בֶן־זֶ֜רַח לְמַטֵּ֤ה יְהוּדָה֙ מִן־הַחֵ֔רֶם וַיִּֽחַר־אַ֥ף יְהוָ֖ה בִּבְנֵ֥י יִשְׂרָאֵֽל׃,בְנֵֽי־,יִשְׂרָאֵ֛ל,,
5,Joshua 7:1,וַיִּמְעֲל֧וּ בְנֵֽי־יִשְׂרָאֵ֛ל מַ֖עַל בַּחֵ֑רֶם וַיִּקַּ֡ח עָכָ֣ן בֶּן־כַּרְמִי֩ בֶן־זַבְדִּ֨י בֶן־זֶ֜רַח לְמַטֵּ֤ה יְהוּדָה֙ מִן־הַחֵ֔רֶם וַיִּֽחַר־אַ֥ף יְהוָ֖ה בִּבְנֵ֥י יִשְׂרָאֵֽל׃,מַ֖עַל,,,
6,Joshua 7:1,וַיִּמְעֲל֧וּ בְנֵֽי־יִשְׂרָאֵ֛ל מַ֖עַל בַּחֵ֑רֶם וַיִּקַּ֡ח עָכָ֣ן בֶּן־כַּרְמִי֩ בֶן־זַבְדִּ֨י בֶן־זֶ֜רַח לְמַטֵּ֤ה יְהוּדָה֙ מִן־הַחֵ֔רֶם וַיִּֽחַר־אַ֥ף יְהוָ֖ה בִּבְנֵ֥י יִשְׂרָאֵֽל׃,בַּ,,חֵ֑רֶם,
7,Joshua 7:1,וַיִּמְעֲל֧וּ בְנֵֽי־יִשְׂרָאֵ֛ל מַ֖עַל בַּחֵ֑רֶם וַיִּקַּ֡ח עָכָ֣ן בֶּן־כַּרְמִי֩ בֶן־זַבְדִּ֨י בֶן־זֶ֜רַח לְמַטֵּ֤ה יְהוּדָה֙ מִן־הַחֵ֔רֶם וַיִּֽחַר־אַ֥ף יְהוָ֖ה בִּבְנֵ֥י יִשְׂרָאֵֽל׃,בַּ,,חֵ֑רֶם,
8,Joshua 7:1,וַיִּמְעֲל֧וּ בְנֵֽי־יִשְׂרָאֵ֛ל מַ֖עַל בַּחֵ֑רֶם וַיִּקַּ֡ח עָכָ֣ן בֶּן־כַּרְמִי֩ בֶן־זַבְדִּ֨י בֶן־זֶ֜רַח לְמַטֵּ֤ה יְהוּדָה֙ מִן־הַחֵ֔רֶם וַיִּֽחַר־אַ֥ף יְהוָ֖ה בִּבְנֵ֥י יִשְׂרָאֵֽל׃,בַּ,,חֵ֑רֶם,
9,Joshua 7:1,וַיִּמְעֲל֧וּ בְנֵֽי־יִשְׂרָאֵ֛ל מַ֖עַל בַּחֵ֑רֶם וַיִּקַּ֡ח עָכָ֣ן בֶּן־כַּרְמִי֩ בֶן־זַבְדִּ֨י בֶן־זֶ֜רַח לְמַטֵּ֤ה יְהוּדָה֙ מִן־הַחֵ֔רֶם וַיִּֽחַר־אַ֥ף יְהוָ֖ה בִּבְנֵ֥י יִשְׂרָאֵֽל׃,וַ,יִּקַּ֡ח,,
10,Joshua 7:1,וַיִּמְעֲל֧וּ בְנֵֽי־יִשְׂרָאֵ֛ל מַ֖עַל בַּחֵ֑רֶם וַיִּקַּ֡ח עָכָ֣ן בֶּן־כַּרְמִי֩ בֶן־זַבְדִּ֨י בֶן־זֶ֜רַח לְמַטֵּ֤ה יְהוּדָה֙ מִן־הַחֵ֔רֶם וַיִּֽחַר־אַ֥ף יְהוָ֖ה בִּבְנֵ֥י יִשְׂרָאֵֽל׃,וַ,יִּקַּ֡ח,,


### accent class

In [6]:
# Pick another random chapter (unseeded) for the accent inspection.
chapter = random.choice(list(F.otype.s('chapter')))


# Wrap every word node in a 1-tuple: show_accents reads the word as s[0].
show = [(w,) for w in L.d(chapter, 'word')]

def show_accents(wset):
    '''
    Display accent information for each result in wset.

    wset: iterable of result tuples whose first item is a word node.

    For every result this prints the node with its section reference and
    then renders, through beauty: the word's text beside the text of its
    mwords entry, the 'text-trans-full' rendering of that entry, the
    word's value in atype, and a horizontal rule.
    '''
    indent = '&nbsp;&nbsp;&nbsp;&nbsp;'
    for result in wset:
        node = result[0]
        mword = mwords[node]
        plain = f'{T.text(node)} | {T.text(mword)}'
        translit = T.text(mword, fmt='text-trans-full')
        print(result[0], T.sectionFromNode(node))
        beauty(plain)
        beauty(indent + translit)
        beauty(indent + atype[node])
        beauty('<hr>')
        
# Inspect the first 25 words of the chosen chapter.
show_accents(show[:25])

342247 ('Job', 26, 1)


342248 ('Job', 26, 1)


342249 ('Job', 26, 1)


342250 ('Job', 26, 1)


342251 ('Job', 26, 1)


342252 ('Job', 26, 2)


342253 ('Job', 26, 2)


342254 ('Job', 26, 2)


342255 ('Job', 26, 2)


342256 ('Job', 26, 2)


342257 ('Job', 26, 2)


342258 ('Job', 26, 2)


342259 ('Job', 26, 2)


342260 ('Job', 26, 2)


342261 ('Job', 26, 3)


342262 ('Job', 26, 3)


342263 ('Job', 26, 3)


342264 ('Job', 26, 3)


342265 ('Job', 26, 3)


342266 ('Job', 26, 3)


342267 ('Job', 26, 3)


342268 ('Job', 26, 3)


342269 ('Job', 26, 3)


342270 ('Job', 26, 3)


342271 ('Job', 26, 4)


### Stats

In [7]:
# Spot-check the 'text-trans-full' rendering of node 18154.
T.text(18154, fmt='text-trans-full')

'J.I52C.@52Q;9252H52W.52 '

In [8]:
# A.table(A.search('''

# verse
#     word g_word~11

# ''')[:10])

In [9]:
# Tally how many entries each accent type (key of atype2set) holds.
labels = collections.Counter()

for label, wset in atype2set.items():
    labels[label] += len(wset)
    
# Largest category first.
labels.most_common()

[('disjunct', 272898), ('conjunct', 152669), ('unknown', 1017)]

In [10]:
# Eyeball the first 25 entries filed under the 'unknown' accent type.
# NOTE(review): the slice requires atype2set['unknown'] to be a sequence, not a set.
show_accents(atype2set['unknown'][:25])

17920 ('Genesis', 32, 24)


17921 ('Genesis', 32, 24)


17922 ('Genesis', 32, 24)


30384 ('Exodus', 4, 10)


33479 ('Exodus', 9, 22)


33480 ('Exodus', 9, 22)


33805 ('Exodus', 10, 1)


33806 ('Exodus', 10, 1)


65256 ('Leviticus', 21, 18)


68621 ('Leviticus', 26, 28)


68622 ('Leviticus', 26, 28)


77731 ('Numbers', 12, 9)


77732 ('Numbers', 12, 9)


87066 ('Numbers', 27, 9)


87067 ('Numbers', 27, 9)


96253 ('Deuteronomy', 4, 46)


96254 ('Deuteronomy', 4, 46)


99272 ('Deuteronomy', 10, 7)


100363 ('Deuteronomy', 12, 2)


106199 ('Deuteronomy', 23, 18)


106556 ('Deuteronomy', 24, 10)


106557 ('Deuteronomy', 24, 10)


112656 ('Deuteronomy', 33, 28)


130445 ('Judges', 6, 2)


137795 ('Judges', 16, 25)


### accent match types

In [12]:
# Tally, across all accent types, how many entries each accent-name match
# (a tuple of names, per the output) holds.
mtypes = collections.Counter()

for kind, matches in atype2name2set.items():
    for match, wset in matches.items():
        mtypes[match] += len(wset)
        
# Most frequent match first.
mtypes.most_common()

[(('tiphchah',), 62516),
 (('munach',), 59426),
 (('mereka',), 53378),
 (('zaqeph qaton',), 41739),
 (('silluq',), 36480),
 (('atnach',), 34553),
 (('pashta',), 34346),
 (('mehuppak',), 19466),
 (('rebia',), 16542),
 (('tebir',), 12439),
 (('azla/qadma',), 11390),
 (('geresh',), 6566),
 (('darga',), 5228),
 (('rebia', 'rebia mugrash'), 4837),
 (('dechi',), 3829),
 (('gershayim',), 3273),
 (('telisha qetannah',), 3036),
 (('zaqeph gadol',), 3002),
 (('paseq',), 2650),
 (('telisha gedola',), 2108),
 (('zarqa',), 1698),
 (('segolta',), 1596),
 (('pazer qaton',), 1528),
 (('yetiv',), 1169),
 (('oleh weyored',), 550),
 (('paseq', 'azla legarmeh'), 443),
 (('yerah',), 364),
 (('paseq', 'mehuppak legarmeh'), 354),
 (('illuy',), 257),
 (('tsinor',), 227),
 (('pazer',), 133),
 (('paseq', 'shalshelet gedolah'), 51),
 (('qarney parah',), 31),
 (('mehuppak', 'azla/qadma'), 19),
 (('atnach', 'tiphchah'), 18),
 (('silluq', 'tiphchah'), 18),
 (('munach', 'mereka'), 18),
 (('paseq', 'mehuppak legarmeh

In [11]:
# A.show(atype2name2set['disjunct'][('paseq', 'mehuppak legarmeh', 'azla legarmeh')], withNodes=True, end=10)

In [31]:
# Search each phrase_atom for an sp=subs word followed by an sp=conj word and
# another sp=subs word (chained with the `<:` relation), and show the first
# two results with their node numbers.
A.table(A.search('''

phrase_atom
    word sp=subs
    <: word sp=conj
    <: word sp=subs

''')[:2], withNodes=True)

  1.56s 3785 results


n,p,phrase_atom,word,word.1,word.2
1,Genesis 1:2,תֹ֨הוּ֙ וָבֹ֔הוּ 904756,תֹ֨הוּ֙ 16,וָ 17,בֹ֔הוּ 18
2,Genesis 1:4,בֵּ֥ין הָאֹ֖ור וּבֵ֥ין הַחֹֽשֶׁךְ׃ 904781,אֹ֖ור 53,וּ 54,בֵ֥ין 55


### Conjunction Pairs

In [5]:
# Mom object for node 53, the first word column of the Genesis 1:4 search hit.
# NOTE(review): unlike the earlier `m`, every() is never called on M in this
# notebook before M.explain is read - confirm how explain gets populated.
M = Mom(53, A, quants=wsets.quants, preps=wsets.preps, noms=wsets.nominals)

In [9]:
# List the conditions evaluated under the 'coord' key, node by node.
showconds(M.explain['coord'])

node 55
	P(0) not in preps                              True
	P(1,'sp') == 'conj'                            True
	P(2,'sp') in nominals                          True
	P(2) not in preps                             False
node 56
	P(0) not in preps                              True
	P(1,'sp') == 'conj'                            True
	P(2,'sp') == 'art'                            False
node 56
	P(1,'sp') == 'conj'                            True
	P(2) in preps                                  True
	P(3,'sp') in nominals                         False
	P(-1) in preps                                False
node 57
	P(1,'sp') == 'conj'                            True
	P(2) in preps                                  True
	P(3,'sp') == 'art'                             True
	P(-1,'sp') == 'art'                            True
	P(-2) in preps                                 True


In [10]:
# Short aliases for the conjunction-pair data inspected below.
conjpairs = wsets.conjs.pairs
cpairresults = wsets.conjs.pairresults

#### Random Inspections

In [12]:
# Show the set of entries stored under '>JC/' in the conjunction pairs mapping.
conjpairs['>JC/']

{'>B/',
 '>BNR/',
 '>CH/',
 '>JC/',
 '>LHJM/',
 '>NJ',
 '>TH',
 '>X/',
 'BHMH/',
 'BJN[',
 'BJT/',
 'BN/',
 'C>WL=/',
 'CBV/',
 'DBR/',
 'DM/',
 'DWD==/',
 'FR>YR/',
 'GBR/',
 'GBWR/',
 'GR/',
 'HMH',
 'HW>',
 'JFR>LJ/',
 'JLD/',
 'JRWCLM/',
 'JW>B/',
 'JWQJM/',
 'KHN/',
 'KL/',
 'KLJ/',
 'LB/',
 'MCPXH/',
 'MLK/',
 'N<R/',
 'NBJ>/',
 'NXLH/',
 'QWL/',
 'R<=/',
 'RGM_MLK/',
 'RJB/',
 'RXB=====/',
 'VP/',
 'XJL/',
 'XM=/',
 'YWB>/'}

In [14]:
# Display the stored results for the '>JC/' + 'DBR/' pairing, with node numbers.
A.show(cpairresults['>JC/']['DBR/'], withNodes=True)

In [20]:
# Look up the `sp` feature value of node 150164.
F.sp.v(150164)

'subs'

In [10]:
# Look up the accent type recorded for node 150674.
wsets.accents.accenttype[150674]

'conjunct'