# Testing Wordgrammar Wordsets

This notebook inspects the output of the wordsets pipeline (context conditions, accents, conjunction pairs, and construct pairs) to verify that the data it produces is sound.

In [1]:
import collections
import pandas as pd
from IPython.display import display, HTML
import random
import pprint
from wordsets import WordSets
from positions import Positions
from context import Mom
from tf.app import use
# Load the BHSA corpus via Text-Fabric. hoist=globals() is what makes the
# bare names F, L and T used in later cells available in this notebook.
A = use('bhsa', hoist=globals(), silent=True)

   |     0.00s No structure info in otext, the structure part of the T-API cannot be used


In [2]:
# Build the word sets from the loaded corpus; with silent=False each
# processing stage reports its progress.
wsets = WordSets(A, silent=False)

processing accents...
	done
processing quants...
	done
processing preps...
	done
processing conjunctions...
	done
processing constructs...
	done


In [6]:
def beauty(text):
    '''
    Render text as large Times New Roman HTML in the notebook output.

    text: string (may itself contain HTML markup) that is placed
        inside a styled <span> element.
    Returns None; the markup is shown through IPython's display().
    '''
    # The span must be closed with </span>; the former </hspan> is not a
    # valid tag and left the styled span unclosed.
    pretty = f'''
<span style="font-family:Times New Roman; font-size:20pt">
    {text}
</span>'''
    display(HTML(pretty))

In [7]:
def showconds(conddict):
    '''
    Display every node in conddict together with its condition results.

    conddict is iterated as (node, conditions) pairs, where conditions
    maps a condition string to its truth value. Each condition is printed
    left-aligned and its truth value right-aligned, both padded to 25.
    '''
    for pair in conddict:
        node, conds = pair
        print('node', node)
        for cond in conds:
            outcome = str(conds[cond])
            print(f'\t{cond:<25} {outcome:>25}')

## Context

In [8]:
# Build a Mom context object from the quants, preps and nominals sets in
# wsets and run every(); m.explain is printed in the next cell.
# NOTE(review): 7 looks like a word node number, and every() presumably
# fills m.explain — confirm both against context.Mom.
m = Mom(7, A, quants=wsets.quants, preps=wsets.preps, noms=wsets.nominals)
m.every()

In [9]:
# Pretty-print the explain structure so its nesting stays readable.
pprint.pprint(m.explain)

{'adja': ((8,
           {"P(0,'st') == 'a'": True,
            "P(1,'nu') == P(0,'nu')": False,
            "P(1,'sp') in nominals": False}),
          (8,
           {"P(0,'st') == 'a'": True,
            "P(1,'nu') == P(0,'nu')": False,
            "P(1,'sp vt').issubset({'verb', 'ptcp', 'ptca'})": False}),
          (9,
           {"P(0,'st') == 'a'": True,
            "P(1,'sp') == 'art'": False,
            "P(2,'nu') == P(0,'nu')": False,
            "P(2,'sp') in nominals": False})),
 'const': ((8, {"P(0,'st') == 'c'": False, "P(1,'sp') != 'art'": True}),
           (9, {"P(0,'st') == 'c'": False, "P(1,'sp') == 'art'": False})),
 'coord': ((9,
            {'P(0) not in preps': True,
             "P(1,'sp') == 'conj'": True,
             'P(2) not in preps': False,
             "P(2,'sp') in nominals": False}),
           (10,
            {'P(0) not in preps': True,
             "P(1,'sp') == 'conj'": True,
             "P(2,'sp') == 'art'": False}),
           (10,
            

## Accents

In [10]:
# Short aliases for the accent data inspected in the cells below.
mwords = wsets.accents.mwords
atype = wsets.accents.accenttype
atype2set = wsets.accents.atype2set
atype2name2set = wsets.accents.atype2name2set

### mwords

In [11]:
# Spot-check mwords on a randomly chosen chapter: one table row per word,
# made of the word's verse node followed by its mwords entry.
chapter_nodes = list(F.otype.s('chapter'))
chapter = random.choice(chapter_nodes)
rows = (
    L.u(word, 'verse') + mwords[word]
    for word in L.d(chapter, 'word')
)
A.table(rows, end=100)

n,p,verse,word,word.1,Unnamed: 5,Unnamed: 6
1,Deuteronomy 15:1,מִקֵּ֥ץ שֶֽׁבַע־שָׁנִ֖ים תַּעֲשֶׂ֥ה שְׁמִטָּֽה׃,מִ,קֵּ֥ץ,,
2,Deuteronomy 15:1,מִקֵּ֥ץ שֶֽׁבַע־שָׁנִ֖ים תַּעֲשֶׂ֥ה שְׁמִטָּֽה׃,מִ,קֵּ֥ץ,,
3,Deuteronomy 15:1,מִקֵּ֥ץ שֶֽׁבַע־שָׁנִ֖ים תַּעֲשֶׂ֥ה שְׁמִטָּֽה׃,שֶֽׁבַע־,שָׁנִ֖ים,,
4,Deuteronomy 15:1,מִקֵּ֥ץ שֶֽׁבַע־שָׁנִ֖ים תַּעֲשֶׂ֥ה שְׁמִטָּֽה׃,שֶֽׁבַע־,שָׁנִ֖ים,,
5,Deuteronomy 15:1,מִקֵּ֥ץ שֶֽׁבַע־שָׁנִ֖ים תַּעֲשֶׂ֥ה שְׁמִטָּֽה׃,תַּעֲשֶׂ֥ה,,,
6,Deuteronomy 15:1,מִקֵּ֥ץ שֶֽׁבַע־שָׁנִ֖ים תַּעֲשֶׂ֥ה שְׁמִטָּֽה׃,שְׁמִטָּֽה׃,,,
7,Deuteronomy 15:2,וְזֶה֮ דְּבַ֣ר הַשְּׁמִטָּה֒ שָׁמֹ֗וט כָּל־בַּ֨עַל֙ מַשֵּׁ֣ה יָדֹ֔ו אֲשֶׁ֥ר יַשֶּׁ֖ה בְּרֵעֵ֑הוּ לֹֽא־יִגֹּ֤שׂ אֶת־רֵעֵ֨הוּ֙ וְאֶת־אָחִ֔יו כִּֽי־קָרָ֥א שְׁמִטָּ֖ה לַֽיהוָֽה׃,וְ,זֶה֮,,
8,Deuteronomy 15:2,וְזֶה֮ דְּבַ֣ר הַשְּׁמִטָּה֒ שָׁמֹ֗וט כָּל־בַּ֨עַל֙ מַשֵּׁ֣ה יָדֹ֔ו אֲשֶׁ֥ר יַשֶּׁ֖ה בְּרֵעֵ֑הוּ לֹֽא־יִגֹּ֤שׂ אֶת־רֵעֵ֨הוּ֙ וְאֶת־אָחִ֔יו כִּֽי־קָרָ֥א שְׁמִטָּ֖ה לַֽיהוָֽה׃,וְ,זֶה֮,,
9,Deuteronomy 15:2,וְזֶה֮ דְּבַ֣ר הַשְּׁמִטָּה֒ שָׁמֹ֗וט כָּל־בַּ֨עַל֙ מַשֵּׁ֣ה יָדֹ֔ו אֲשֶׁ֥ר יַשֶּׁ֖ה בְּרֵעֵ֑הוּ לֹֽא־יִגֹּ֤שׂ אֶת־רֵעֵ֨הוּ֙ וְאֶת־אָחִ֔יו כִּֽי־קָרָ֥א שְׁמִטָּ֖ה לַֽיהוָֽה׃,דְּבַ֣ר,,,
10,Deuteronomy 15:2,וְזֶה֮ דְּבַ֣ר הַשְּׁמִטָּה֒ שָׁמֹ֗וט כָּל־בַּ֨עַל֙ מַשֵּׁ֣ה יָדֹ֔ו אֲשֶׁ֥ר יַשֶּׁ֖ה בְּרֵעֵ֑הוּ לֹֽא־יִגֹּ֤שׂ אֶת־רֵעֵ֨הוּ֙ וְאֶת־אָחִ֔יו כִּֽי־קָרָ֥א שְׁמִטָּ֖ה לַֽיהוָֽה׃,הַ,שְּׁמִטָּה֒,,


### accent class

In [12]:
# Pick a fresh random chapter for the accent-class inspection.
chapter = random.choice(list(F.otype.s('chapter')))


# Wrap each word node in a 1-tuple: show_accents reads the node as s[0].
show = [(w,) for w in L.d(chapter, 'word')]

def show_accents(wset):
    '''
    Display accent information for each entry of wset.

    Every entry is indexed with [0] to get a word node. For that node the
    function prints the node and its section, then renders the word text
    next to its mwords text, the mwords transliteration, the accent type,
    and a horizontal rule.
    '''
    indent = '&nbsp;&nbsp;&nbsp;&nbsp;'
    for entry in wset:
        node = entry[0]
        group = mwords[node]
        hebrew = f'{T.text(node)} | {T.text(group)}'
        translit = T.text(group, fmt='text-trans-full')
        print(node, T.sectionFromNode(node))
        beauty(hebrew)
        beauty(indent + translit)
        beauty(indent + atype[node])
        beauty('<hr>')
        
# Show only the first 25 words of the chosen chapter.
show_accents(show[:25])

225377 ('Isaiah', 39, 1)


225378 ('Isaiah', 39, 1)


225379 ('Isaiah', 39, 1)


225380 ('Isaiah', 39, 1)


225381 ('Isaiah', 39, 1)


225382 ('Isaiah', 39, 1)


225383 ('Isaiah', 39, 1)


225384 ('Isaiah', 39, 1)


225385 ('Isaiah', 39, 1)


225386 ('Isaiah', 39, 1)


225387 ('Isaiah', 39, 1)


225388 ('Isaiah', 39, 1)


225389 ('Isaiah', 39, 1)


225390 ('Isaiah', 39, 1)


225391 ('Isaiah', 39, 1)


225392 ('Isaiah', 39, 1)


225393 ('Isaiah', 39, 1)


225394 ('Isaiah', 39, 1)


225395 ('Isaiah', 39, 1)


225396 ('Isaiah', 39, 1)


225397 ('Isaiah', 39, 1)


225398 ('Isaiah', 39, 1)


225399 ('Isaiah', 39, 2)


225400 ('Isaiah', 39, 2)


225401 ('Isaiah', 39, 2)


### Stats

In [15]:
# Count how many words fall under each accent type label.
labels = collections.Counter(
    {label: len(wset) for label, wset in atype2set.items()}
)

labels.most_common()

[('disjunct', 272898), ('conjunct', 152669), ('unknown', 1017)]

In [16]:
# Inspect the first 25 entries whose accent type is 'unknown'.
show_accents(atype2set['unknown'][:25])

17920 ('Genesis', 32, 24)


17921 ('Genesis', 32, 24)


17922 ('Genesis', 32, 24)


30384 ('Exodus', 4, 10)


33479 ('Exodus', 9, 22)


33480 ('Exodus', 9, 22)


33805 ('Exodus', 10, 1)


33806 ('Exodus', 10, 1)


65256 ('Leviticus', 21, 18)


68621 ('Leviticus', 26, 28)


68622 ('Leviticus', 26, 28)


77731 ('Numbers', 12, 9)


77732 ('Numbers', 12, 9)


87066 ('Numbers', 27, 9)


87067 ('Numbers', 27, 9)


96253 ('Deuteronomy', 4, 46)


96254 ('Deuteronomy', 4, 46)


99272 ('Deuteronomy', 10, 7)


100363 ('Deuteronomy', 12, 2)


106199 ('Deuteronomy', 23, 18)


106556 ('Deuteronomy', 24, 10)


106557 ('Deuteronomy', 24, 10)


112656 ('Deuteronomy', 33, 28)


130445 ('Judges', 6, 2)


137795 ('Judges', 16, 25)


### accent match types

In [17]:
# Tally, across all accent types, how many words each accent-name match has.
mtypes = collections.Counter()

for matches in atype2name2set.values():
    mtypes.update({match: len(wset) for match, wset in matches.items()})

mtypes.most_common()

[(('tiphchah',), 62516),
 (('munach',), 59426),
 (('mereka',), 53378),
 (('zaqeph qaton',), 41739),
 (('silluq',), 36480),
 (('atnach',), 34553),
 (('pashta',), 34346),
 (('mehuppak',), 19466),
 (('rebia',), 16542),
 (('tebir',), 12439),
 (('azla/qadma',), 11390),
 (('geresh',), 6566),
 (('darga',), 5228),
 (('rebia', 'rebia mugrash'), 4837),
 (('dechi',), 3829),
 (('gershayim',), 3273),
 (('telisha qetannah',), 3036),
 (('zaqeph gadol',), 3002),
 (('paseq',), 2650),
 (('telisha gedola',), 2108),
 (('zarqa',), 1698),
 (('segolta',), 1596),
 (('pazer qaton',), 1528),
 (('yetiv',), 1169),
 (('oleh weyored',), 550),
 (('paseq', 'azla legarmeh'), 443),
 (('yerah',), 364),
 (('paseq', 'mehuppak legarmeh'), 354),
 (('illuy',), 257),
 (('tsinor',), 227),
 (('pazer',), 133),
 (('paseq', 'shalshelet gedolah'), 51),
 (('qarney parah',), 31),
 (('mehuppak', 'azla/qadma'), 19),
 (('atnach', 'tiphchah'), 18),
 (('silluq', 'tiphchah'), 18),
 (('munach', 'mereka'), 18),
 (('paseq', 'mehuppak legarmeh

### Conjunction Pairs

In [20]:
# Build a second Mom context object and run every(); its explain['coord']
# entry is printed in the next cell.
# NOTE(review): 53 looks like a word node number — confirm against context.Mom.
M = Mom(53, A, quants=wsets.quants, preps=wsets.preps, noms=wsets.nominals)

M.every()

In [21]:
# Print the per-node condition results stored under the 'coord' key.
showconds(M.explain['coord'])

node 55
	P(0) not in preps                              True
	P(1,'sp') == 'conj'                            True
	P(2,'sp') in nominals                          True
	P(2) not in preps                             False
node 56
	P(0) not in preps                              True
	P(1,'sp') == 'conj'                            True
	P(2,'sp') == 'art'                            False
node 56
	P(1,'sp') == 'conj'                            True
	P(2) in preps                                  True
	P(3,'sp') in nominals                         False
	P(-1) in preps                                False
node 57
	P(1,'sp') == 'conj'                            True
	P(2) in preps                                  True
	P(3,'sp') == 'art'                             True
	P(-1,'sp') == 'art'                            True
	P(-2) in preps                                 True


In [22]:
# Short aliases for the conjunction-pair data inspected below.
conjpairs = wsets.conjs.pairs
cpairresults = wsets.conjs.pairresults

#### Random Inspections

In [75]:
# Draw 100 random conjunction-pair results for manual inspection.
# The first-level keys are listed once up front: they do not change while
# sampling, so rebuilding the list on every iteration was wasted work.
firstnouns = list(cpairresults)

showme = []
for _ in range(100):
    firstnoun = random.choice(firstnouns)
    secondnoun = random.choice(list(cpairresults[firstnoun]))
    result = random.choice(cpairresults[firstnoun][secondnoun])
    # Prepend the enclosing phrase so the table shows the pair in context.
    result = L.u(result[0], 'phrase') + result
    showme.append(result)

A.table(showme, withNodes=True)

n,p,phrase,word,word.1
1,1_Samuel 2:14,בַכִּיֹּ֜ור אֹ֣ו בַדּ֗וּד אֹ֤ו בַקַּלַּ֨חַת֙ אֹ֣ו בַפָּר֔וּר 732134,דּ֗וּד 142326,קַּלַּ֨חַת֙ 142330
2,Joshua 15:44,קְעִילָ֥ה וְאַכְזִ֖יב וּמָֽרֵאשָׁ֑ה 720398,אַכְזִ֖יב 121996,קְעִילָ֥ה 121994
3,Nehemiah 7:61,מִתֵּ֥ל מֶ֨לַח֙ תֵּ֣ל חַרְשָׁ֔א כְּר֥וּב אַדֹּ֖ון וְאִמֵּ֑ר 885488,אִמֵּ֑ר 386924,אַדֹּ֖ון 386922
4,2_Kings 20:20,אֶת־הַבְּרֵכָה֙ וְאֶת־הַתְּעָלָ֔ה 772447,תְּעָלָ֔ה 208646,בְּרֵכָה֙ 208642
5,Isaiah 30:6,לָבִ֧יא וָלַ֣יִשׁ 780152,לָבִ֧יא 221889,לַ֣יִשׁ 221891
6,2_Chronicles 35:13,בַּסִּירֹ֤ות וּבַדְּוָדִים֙ וּבַצֵּ֣לָחֹ֔ות 904300,צֵּ֣לָחֹ֔ות 425695,דְּוָדִים֙ 425691
7,Jeremiah 6:7,חֳלִ֥י וּמַכָּֽה׃ 791211,חֳלִ֥י 237775,מַכָּֽה׃ 237777
8,Genesis 36:26,חֶמְדָּ֥ן וְאֶשְׁבָּ֖ן וְיִתְרָ֥ן וּכְרָֽן׃ 663784,אֶשְׁבָּ֖ן 19954,חֶמְדָּ֥ן 19952
9,Jeremiah 41:16,גְּבָרִ֞ים אַנְשֵׁ֣י הַמִּלְחָמָ֗ה וְנָשִׁ֤ים וְטַף֙ וְסָ֣רִסִ֔ים 802676,נָשִׁ֤ים 257745,מִּלְחָמָ֗ה 257743
10,Genesis 46:21,בֶּ֤לַע וָבֶ֨כֶר֙ וְאַשְׁבֵּ֔ל גֵּרָ֥א וְנַעֲמָ֖ן אֵחִ֣י וָרֹ֑אשׁ מֻפִּ֥ים וְחֻפִּ֖ים וָאָֽרְדְּ׃ 668007,חֻפִּ֖ים 26332,מֻפִּ֥ים 26330


### Stats

In [76]:
# Number of entries in conjpairs.
len(conjpairs)

3018

### Construct Pairs

In [77]:
# Short aliases for the construct-pair data inspected below.
conpairs = wsets.consts.pairs
conpairresults = wsets.consts.pairresults

In [78]:
# Draw 100 random construct-pair results for manual inspection.
# The first-level keys are listed once up front: they do not change while
# sampling, so rebuilding the list on every iteration was wasted work.
firstnouns = list(conpairresults)

showme = []
for _ in range(100):
    firstnoun = random.choice(firstnouns)
    secondnoun = random.choice(list(conpairresults[firstnoun]))
    result = random.choice(conpairresults[firstnoun][secondnoun])
    # Prepend the enclosing phrase so the table shows the pair in context.
    result = L.u(result[0], 'phrase') + result
    showme.append(result)

A.table(showme, withNodes=True)

n,p,phrase,word,word.1
1,Job 32:8,נִשְׁמַ֖ת שַׁדַּ֣י 859104,נִשְׁמַ֖ת 343920,שַׁדַּ֣י 343921
2,Ezekiel 17:9,כָּל־טַרְפֵּ֤י צִמְחָהּ֙ 811641,טַרְפֵּ֤י 272350,צִמְחָהּ֙ 272351
3,Job 26:14,שֵּׁ֣מֶץ דָּ֭בָר 857937,שֵּׁ֣מֶץ 342362,דָּ֭בָר 342363
4,Ezekiel 9:4,עַל־מִצְחֹ֣ות הָאֲנָשִׁ֗ים 808935,מִצְחֹ֣ות 268180,אֲנָשִׁ֗ים 268182
5,Ezra 8:36,לַאֲחַשְׁדַּרְפְּנֵי֙ הַמֶּ֔לֶךְ וּפַחֲוֹ֖ות עֵ֣בֶר הַנָּהָ֑ר 883124,אֲחַשְׁדַּרְפְּנֵי֙ 382245,מֶּ֔לֶךְ 382247
6,Ezekiel 23:21,לְמַ֖עַן שְׁדֵ֥י נְעוּרָֽיִךְ׃ ס 814124,שְׁדֵ֥י 276030,נְעוּרָֽיִךְ׃ ס 276031
7,Psalms 77:5,שְׁמֻרֹ֣ות עֵינָ֑י 844310,שְׁמֻרֹ֣ות 323299,עֵינָ֑י 323300
8,Jeremiah 2:24,לִמֻּ֣ד מִדְבָּ֗ר 789833,לִמֻּ֣ד 235773,מִדְבָּ֗ר 235774
9,Isaiah 61:3,מַעֲטֵ֣ה תְהִלָּ֔ה 788007,מַעֲטֵ֣ה 233075,תְהִלָּ֔ה 233076
10,Proverbs 11:29,לַחֲכַם־לֵֽב׃ 863427,חֲכַם־ 349893,לֵֽב׃ 349894


### Stats

In [79]:
# Number of entries in conpairs.
len(conpairs)

1831