In [1]:
# Reload imported modules automatically before each cell runs, so that edits to
# library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from tf.app import use
from tf.browser.ner.power import PowerNER

In [41]:
# Load the corpus through Text-Fabric's `use`; the table in the output lists the
# node types that were found.
# NOTE(review): ":clone" and checkout="clone" presumably select a local clone of
# the ETCBC/bhsa repository instead of a downloaded release — confirm in the
# Text-Fabric documentation.
A = use("ETCBC/bhsa:clone", checkout="clone")

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


Before we start, we make a test instruction set.

We pick all lexemes that start with `BJT_` and whose occurrences show more than one written form.
We collect those forms, both in transliteration and in Hebrew script, and use them to populate a spreadsheet with instructions.

See the file `ner/sheets/places.xlsx`

In [38]:
# Shorthand for the feature API: the cells below read feature values with
# F.<feature>.v(node) and select nodes of a type with F.otype.s(type).
F = A.api.F

In [39]:
# For every lexeme starting with "BJT_", gather the distinct forms of its
# occurrences: `candidates` is keyed by the `lex` feature and holds `g_cons`
# values, `candidates_utf8` is keyed by `lex_utf8` and holds `g_cons_utf8` values.
candidates = {}
candidates_utf8 = {}

for w in F.otype.s("word"):
    lex = F.lex.v(w)
    if lex.startswith("BJT_"):
        lex_utf8 = F.lex_utf8.v(w)

        if lex not in candidates:
            candidates[lex] = set()
        candidates[lex].add(F.g_cons.v(w))

        if lex_utf8 not in candidates_utf8:
            candidates_utf8[lex_utf8] = set()
        candidates_utf8[lex_utf8].add(F.g_cons_utf8.v(w))

# Keep only the lexemes that were seen in more than one form.
multiples = {
    key: candidates[key] for key in candidates if len(candidates[key]) > 1
}
multiples_utf8 = {
    key: candidates_utf8[key]
    for key in candidates_utf8
    if len(candidates_utf8[key]) > 1
}

def show(d):
    """Print every key of `d`, followed by an indented line with its values.

    Parameters
    ----------
    d: dict
        Maps a string key to an iterable (here: a set) of strings.

    Keys are printed in sorted order. The values of a key are printed on one
    tab-indented line, separated by " ; ", with spaces inside a value replaced
    by underscores so that multi-word forms stay recognisable as one item.

    The values are sorted as well: they come from sets of strings, whose
    iteration order may differ between kernel sessions, and sorting keeps the
    printed output reproducible.
    """
    for (k, vs) in sorted(d.items()):
        print(k)
        print("\t" + (" ; ".join(v.replace(" ", "_") for v in sorted(vs))))
        
# List the lexemes that occur in more than one form: first the `multiples`
# collection, then the utf8 one.
show(multiples)
show(multiples_utf8)

BJT_C>N/
	BJT_C>N ; BJT_CN
BJT_DGWN/
	BJT_DGN ; BJT_DGWN
BJT_HJCMWT/
	BJT_HJCJMT ; BJT_HJCMWT ; BJT_HJCMT
BJT_XWRWN/
	BJT_XRN ; BJT_XWRN ; BJT_XWRWN ; BJT_XRWN
בית דגון
	בית_דגן ; בית_דגון
בית הישׁמות
	בית_הישׁימת ; בית_הישׁמת ; בית_הישׁמות
בית חורון
	בית_חרן ; בית_חורן ; בית_חרון ; בית_חורון
בית שׁאן
	בית_שׁאן ; בית_שׁן


Now we start with the entity assignment.

In [28]:
# Create the PowerNER object on top of the loaded corpus app `A`; the entity
# steps in the following cells are all methods of `PA`.
PA = PowerNER(A)

In [29]:
# Read the instruction sheet named "places" (see `ner/sheets/places.xlsx`); the
# output summarises how many entities and occurrence specs it contains.
# NOTE(review): force=True presumably bypasses an earlier cached reading of the
# sheet — confirm against the PowerNER documentation.
PA.readInstructions("places", force=True)

4 entities with 11 occurrence specs
0 entities do not have occurrence specifiers
All occurrence specifiers are unambiguous


In [30]:
# Build the inventory and print it. Judging by the output, each row gives an
# entity, its kind (LOC), one occurrence form and how often that form was
# found, ending with a grand total.
# NOTE(review): presumably the inventory is derived from the sheet read before
# this step — confirm.
PA.makeInventory()
PA.showInventory()

בית.דגון                 LOC   בית_דגון                 1 x בית דגון
בית.דגון                 LOC   בית_דגן                  1 x בית דגון
בית.חורון                LOC   בית_חורון                5 x בית חורון
בית.חורון                LOC   בית_חורן                 5 x בית חורון
בית.חורון                LOC   בית_חרון                 3 x בית חורון
בית.חורון                LOC   בית_חרן                  1 x בית חורון
Total 16


In [31]:
# Select the annotation set named "power"; the output reports how many
# annotations that set currently has.
PA.setSet("power")

Annotation set power has 0 annotations


In [32]:
# Reset the current annotation set; the output reports its annotation count.
# NOTE(review): presumably this clears all annotations in the set so that the
# marking step starts from an empty set — confirm.
PA.resetSet()

Annotation set power has 0 annotations


In [33]:
# Mark the entities; the output reports how many annotations were added and how
# many were already present.
# NOTE(review): presumably the inventory hits are what gets marked (the added
# count equals the inventory total shown earlier) — confirm.
PA.markEntities()

Already present:     0 x
Added:              16 x


In [35]:
# Filter the content and keep the result for display; the output reports the
# number of verses found.
# NOTE(review): anyEnt=True presumably restricts the result to content holding
# at least one entity, and showStats=None presumably suppresses statistics —
# confirm both.
results = PA.filterContent(anyEnt=True, showStats=None)

15 verses


In [36]:
# Display the filtered results obtained from `PA.filterContent`.
PA.showContent(results)