<img align="right" src="images/dans-small.png"/>
<img align="right" src="images/tf-small.png"/>
<img align="right" src="images/etcbc.png"/>

We link the morphology of
[Open Scriptures](http://openscriptures.org)
to the linguistics of the [BHSA](https://github.com/ETCBC/bhsa).

We proceed as follows:

* extract the morphology from the files in
  [openscriptures/morphhb/wlc](https://github.com/openscriptures/morphhb/tree/master/wlc)
* link the words in the openscripture files to slots in the BHSA
* compile the openscripture morphology data into a TF feature file.

In [1]:
import os
from glob import glob
from lxml import etree
from itertools import zip_longest
from unicodedata import normalize, category
from IPython.display import display, HTML

from tf.fabric import Fabric

# Loading BHSA

In [2]:
# Data version and local clone of the BHSA repository.
VERSION = '2017'
REPO = os.path.expanduser('~/github/etcbc/bhsa')
baseDir = '{}/tf'.format(REPO)
tempDir = '{}/_temp'.format(REPO)

# Directory holding the Text-Fabric feature files of the chosen version.
tfLocation = '{}/tf/{}'.format(REPO, VERSION)
TF = Fabric(locations=tfLocation, modules=[''])

# Only the features this notebook reads are requested.
api = TF.load('''
    book
    g_cons_utf8 g_word_utf8
''')

# Later cells use `F` and `T` without defining them;
# presumably this call is what puts those names into the notebook namespace.
api.makeAvailableIn(globals())


This is Text-Fabric 3.1.1
Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

114 features found and 0 ignored
  0.00s loading features ...
   |     0.01s B book                 from /Users/dirk/github/etcbc/bhsa/tf/2017
   |     0.21s B g_cons_utf8          from /Users/dirk/github/etcbc/bhsa/tf/2017
   |     0.21s B g_word_utf8          from /Users/dirk/github/etcbc/bhsa/tf/2017
   |     0.00s Feature overview: 108 for nodes; 5 for edges; 1 configs; 7 computed
  4.64s All features loaded/computed - for details use loadLog()


In [3]:
# Book names as stored in the BHSA `book` feature, in sorted order.
bhsBooks = [F.book.v(bookNode) for bookNode in F.otype.s('book')]
bhsBooks.sort()
print(*bhsBooks, sep='\n')

Amos
Canticum
Chronica_I
Chronica_II
Daniel
Deuteronomium
Ecclesiastes
Esra
Esther
Exodus
Ezechiel
Genesis
Habakuk
Haggai
Hosea
Iob
Jeremia
Jesaia
Joel
Jona
Josua
Judices
Leviticus
Maleachi
Micha
Nahum
Nehemia
Numeri
Obadia
Proverbia
Psalmi
Reges_I
Reges_II
Ruth
Sacharia
Samuel_I
Samuel_II
Threni
Zephania


# Reading open scriptures

In [4]:
OS_BASE = os.path.expanduser('~/github/openscriptures/morphhb/wlc')

# NB: this changes the working directory of the whole kernel;
# the relative glob below (and relative file names used later) rely on it.
os.chdir(OS_BASE)

# Book abbreviations = XML file names without their 4-character '.xml' suffix;
# VerseMap.xml is not a book.
osmBookSet = {
    fileName[:-4]
    for fileName in glob('*.xml')
    if fileName != 'VerseMap.xml'
}

In [5]:
# One OSM book abbreviation per line, sorted.
print(*sorted(osmBookSet), sep='\n')

1Chr
1Kgs
1Sam
2Chr
2Kgs
2Sam
Amos
Dan
Deut
Eccl
Esth
Exod
Ezek
Ezra
Gen
Hab
Hag
Hos
Isa
Jer
Job
Joel
Jonah
Josh
Judg
Lam
Lev
Mal
Mic
Nah
Neh
Num
Obad
Prov
Ps
Ruth
Song
Zech
Zeph


In [6]:
# OSM book abbreviations, hand-ordered so that position i corresponds to
# position i of the sorted BHSA book names (the BHSA name is given as a comment).
osmBooks = [
    'Amos',   # Amos
    'Song',   # Canticum
    '1Chr',   # Chronica_I
    '2Chr',   # Chronica_II
    'Dan',    # Daniel
    'Deut',   # Deuteronomium
    'Eccl',   # Ecclesiastes
    'Ezra',   # Esra
    'Esth',   # Esther
    'Exod',   # Exodus
    'Ezek',   # Ezechiel
    'Gen',    # Genesis
    'Hab',    # Habakuk
    'Hag',    # Haggai
    'Hos',    # Hosea
    'Job',    # Iob
    'Jer',    # Jeremia
    'Isa',    # Jesaia
    'Joel',   # Joel
    'Jonah',  # Jona
    'Josh',   # Josua
    'Judg',   # Judices
    'Lev',    # Leviticus
    'Mal',    # Maleachi
    'Mic',    # Micha
    'Nah',    # Nahum
    'Neh',    # Nehemia
    'Num',    # Numeri
    'Obad',   # Obadia
    'Prov',   # Proverbia
    'Ps',     # Psalmi
    '1Kgs',   # Reges_I
    '2Kgs',   # Reges_II
    'Ruth',   # Ruth
    'Zech',   # Sacharia
    '1Sam',   # Samuel_I
    '2Sam',   # Samuel_II
    'Lam',    # Threni
    'Zeph',   # Zephania
]

In [7]:
# Two-way lookup between BHSA and OSM book names.
# The pairing is purely positional: sorted BHSA name i <-> hand-ordered OSM name i.
osmBookFromBhs = {
    bhsName: osmBooks[pos] for (pos, bhsName) in enumerate(bhsBooks)
}
bhsBookFromOsm = {
    osmBooks[pos]: bhsName for (pos, bhsName) in enumerate(bhsBooks)
}

In [8]:
# Namespace prefix (in {uri} notation) of the element tags in the OSM XML files.
NS = '{http://www.bibletechnologies.net/2003/OSIS/namespace}'
# Unicode normalization form applied before stripping non-letters.
NFD = 'NFD'
# Unicode general category "Letter, other"; only characters of this category are kept.
LO = 'Lo'

# Final letter forms and their regular counterparts:
#
# k	05DA	ך	letter final kaf
# K	05DB	כ	letter kaf
# m	05DD	ם	letter final mem
# M	05DE	מ	letter mem
# n	05DF	ן	letter final nun
# N	05E0	נ	letter nun
# p	05E3	ף	letter final pe
# P	05E4	פ	letter pe
# y	05E5	ץ	letter final tsadi
# Y	05E6	צ	letter tsadi

# final form -> regular form
finals = {
    'ך':\
    'כ',
    'ם':\
    'מ',
    'ן':\
    'נ',
    'ף':\
    'פ',
    'ץ':\
    'צ',
}

# regular form -> final form
finalsI = {v: k for (k,v) in finals.items()}


def toCons(fw):
    """Return the consonantal skeleton of ``fw``.

    The string is NFD-normalized and every character whose Unicode category
    is not ``Lo`` (points, accents, punctuation, ...) is dropped.
    """
    return ''.join(c for c in normalize(NFD, fw) if category(c) == LO)


def final(c):
    """Return the final form of letter ``c``, or ``c`` itself if it has none."""
    return finalsI.get(c, c)


def finalCons(s):
    """Return ``s`` with its last letter replaced by its final form.

    Used to compare two consonant strings while ignoring a final/regular
    difference in the last letter. An empty string is returned unchanged
    (the unguarded ``s[-1]`` used to raise ``IndexError`` on it).
    """
    if not s:
        return s
    return s[0:-1] + final(s[-1])

def readOsmBook(osmBook, osmWords, stats):
    """Read one OSM book file and append its morphemes to ``osmWords``.

    Parameters
    ----------
    osmBook : str
        OSM book abbreviation; the file ``<osmBook>.xml`` in ``OS_BASE`` is parsed.
    osmWords : list
        Extended in place with one tuple per morpheme:
        ``(text, consonants, morph, lemma, osmBook, chapter, verse, word)``.
        Chapter, verse and word are 1-based running counts of the ``chapter``,
        ``verse`` and ``w`` elements encountered; they are not read from attributes.
    stats : dict
        Counters updated in place: ``noMorph`` (morphemes without a morph code)
        and ``xMorph`` (morphemes without text).
    """
    # Resolve against OS_BASE so the function does not depend on the kernel's
    # current working directory.
    infile = os.path.join(OS_BASE, '{}.xml'.format(osmBook))
    parser = etree.XMLParser(remove_blank_text=True, ns_clean=True)
    root = etree.parse(infile, parser).getroot()
    # The book content is addressed positionally: first child of the root,
    # then the second child of that element.
    osisTextNode = root[0]
    divNode = osisTextNode[1]
    chapterNodes = list(divNode)
    print('reading {:<5} ({:<15}) {:>3} chapters'.format(osmBook, bhsBookFromOsm[osmBook], len(chapterNodes)))
    ch = 0
    for chapterNode in chapterNodes:
        if chapterNode.tag != NS+'chapter': continue
        ch += 1
        vs = 0
        for verseNode in chapterNode:
            if verseNode.tag != NS+'verse': continue
            vs += 1
            w = 0
            for wordNode in verseNode:
                if wordNode.tag != NS+'w': continue
                w += 1
                lemma = wordNode.get('lemma')
                morph = wordNode.get('morph')
                # NOTE(review): `.text` is only the text before the first child
                # element; text inside or after child elements of <w> is not
                # collected here. Confirm whether that is intended before changing it.
                text = wordNode.text
                # Within one <w>, morphemes are separated by '/' in all three fields.
                lemmas = lemma.split('/') if lemma is not None else []
                morphs = morph.split('/') if morph is not None else []
                texts = text.split('/') if text is not None else []
                # Fields with fewer parts are padded with '' so every morpheme
                # yields a full tuple; hence tx is always a string.
                for (lm, mph, tx) in zip_longest(lemmas, morphs, texts, fillvalue=''):
                    osmWords.append((tx, toCons(tx), mph, lm, osmBook, ch, vs, w))
                    if not mph:
                        stats['noMorph'] += 1
                    if not tx:
                        stats['xMorph'] += 1

In [9]:
# All OSM morphemes, collected book by book in BHSA book order.
osmWords = []
stats = {'noMorph': 0, 'xMorph': 0}

for bookNode in F.otype.s('book'):
    bhsBook = T.sectionFromNode(bookNode, lang='la')[0]
    osmBook = osmBookFromBhs[bhsBook]
    readOsmBook(osmBook, osmWords, stats)

# Summary: morphemes lacking either a morph code or text count as not annotated.
nCollected = len(osmWords)
nAnnotated = nCollected - stats['noMorph'] - stats['xMorph']
report = '''
BHS words:       {:>6}
Collected words: {:>6}
No morphology:   {:>6}
Mismatches:      {:>6}
{} % of the words are morphologically annotated.
'''
print(report.format(
    F.otype.maxSlot,
    nCollected,
    stats['noMorph'],
    stats['xMorph'],
    round(100 * nAnnotated / nCollected),
))

reading Gen   (Genesis        )  50 chapters
reading Exod  (Exodus         )  40 chapters
reading Lev   (Leviticus      )  27 chapters
reading Num   (Numeri         )  36 chapters
reading Deut  (Deuteronomium  )  34 chapters
reading Josh  (Josua          )  24 chapters
reading Judg  (Judices        )  21 chapters
reading 1Sam  (Samuel_I       )  31 chapters
reading 2Sam  (Samuel_II      )  24 chapters
reading 1Kgs  (Reges_I        )  22 chapters
reading 2Kgs  (Reges_II       )  25 chapters
reading Isa   (Jesaia         )  66 chapters
reading Jer   (Jeremia        )  52 chapters
reading Ezek  (Ezechiel       )  48 chapters
reading Hos   (Hosea          )  14 chapters
reading Joel  (Joel           )   4 chapters
reading Amos  (Amos           )   9 chapters
reading Obad  (Obadia         )   1 chapters
reading Jonah (Jona           )   4 chapters
reading Mic   (Micha          )   7 chapters
reading Nah   (Nahum          )   3 chapters
reading Hab   (Habakuk        )   3 chapters
reading Ze

In [10]:
# First hundred collected morphemes, each with its position in osmWords.
list(enumerate(osmWords[:100]))

[(0, ('בְּ', 'ב', 'HR', 'b', 'Gen', 1, 1, 1)),
 (1, ('רֵאשִׁ֖ית', 'ראשית', 'Ncfsa', '7225', 'Gen', 1, 1, 1)),
 (2, ('בָּרָ֣א', 'ברא', 'HVqp3ms', '1254 a', 'Gen', 1, 1, 2)),
 (3, ('אֱלֹהִ֑ים', 'אלהים', 'HNcmpa', '430', 'Gen', 1, 1, 3)),
 (4, ('אֵ֥ת', 'את', 'HTo', '853', 'Gen', 1, 1, 4)),
 (5, ('הַ', 'ה', 'HTd', 'd', 'Gen', 1, 1, 5)),
 (6, ('שָּׁמַ֖יִם', 'שמים', 'Ncmpa', '8064', 'Gen', 1, 1, 5)),
 (7, ('וְ', 'ו', 'HC', 'c', 'Gen', 1, 1, 6)),
 (8, ('אֵ֥ת', 'את', 'To', '853', 'Gen', 1, 1, 6)),
 (9, ('הָ', 'ה', 'HTd', 'd', 'Gen', 1, 1, 7)),
 (10, ('אָֽרֶץ', 'ארץ', 'Ncbsa', '776', 'Gen', 1, 1, 7)),
 (11, ('וְ', 'ו', 'HC', 'c', 'Gen', 1, 2, 1)),
 (12, ('הָ', 'ה', 'Td', 'd', 'Gen', 1, 2, 1)),
 (13, ('אָ֗רֶץ', 'ארץ', 'Ncbsa', '776', 'Gen', 1, 2, 1)),
 (14, ('הָיְתָ֥ה', 'היתה', 'HVqp3fs', '1961', 'Gen', 1, 2, 2)),
 (15, ('תֹ֨הוּ֙', 'תהו', 'HNcmsa', '8414', 'Gen', 1, 2, 3)),
 (16, ('וָ', 'ו', 'HC', 'c', 'Gen', 1, 2, 4)),
 (17, ('בֹ֔הוּ', 'בהו', 'Ncmsa', '922', 'Gen', 1, 2, 4)),
 (18, ('וְ', 'ו', 

Why are there some 40,000 more words in OSM than in BHSA?
Let's explore.

In [11]:
def showCase(w, j, ln):
    """Print ``ln`` BHS words from slot ``w`` next to ``ln`` OSM morphemes from index ``j``.

    Both ranges are clamped to the available data, so a case near the end of
    the text prints what is there instead of failing on a non-existent slot
    or an out-of-range morpheme index.
    """
    print(T.sectionFromNode(w))
    print('BHS')
    # F.otype.maxSlot is the last slot (word) of the BHSA
    for n in range(w, min(w + ln, F.otype.maxSlot + 1)):
        print('word  {} = [{}]'.format(n, toCons(F.g_cons_utf8.v(n))))
    print('OSM')
    for n in range(j, min(j + ln, len(osmWords))):
        print('morph {} = [{}]'.format(n, osmWords[n][1]))

In [12]:
# Naive one-to-one comparison: BHS word number i against OSM morpheme number i.
# Report the first position where the consonants differ.
for (pos, slot) in enumerate(F.otype.s('word')):
    bhsCons = toCons(F.g_cons_utf8.v(slot))
    osmCons = osmWords[pos][1]
    if bhsCons == osmCons:
        continue
    print('Mismatch at {}: bhs=[{}] osm=[{}]'.format(pos, bhsCons, osmCons))
    break

Mismatch at 61: bhs=[] osm=[אור]


In [13]:
# Context around the first mismatch: 5 BHS words from slot 60, 5 OSM morphemes from 59.
showCase(w=60, j=59, ln=5)

('Genesis', 1, 5)
BHS
word  60 = [אלהים]
word  61 = [ל]
word  62 = []
word  63 = [אור]
word  64 = [יום]
OSM
morph 59 = [אלהים]
morph 60 = [ל]
morph 61 = [אור]
morph 62 = [יום]
morph 63 = [ו]


Aha, the BHSA has encoded an empty article here, because the pointing of surrounding letters signals an article.
So let's ignore the inserted empty articles of the BHSA.

In [14]:
# Same comparison, but BHS words without consonants do not consume an OSM morpheme.
osmPos = -1
for slot in F.otype.s('word'):
    bhsCons = toCons(F.g_cons_utf8.v(slot))
    if not bhsCons:
        continue
    osmPos += 1
    osmCons = osmWords[osmPos][1]
    if bhsCons == osmCons:
        continue
    print('''Mismatch at BHS-{} OS-{}:\nbhs=[{}]\nos=[{}]'''.format(slot, osmPos, bhsCons, osmCons))
    break

Mismatch at BHS-194 OS-187:
bhs=[מינו]
os=[מינ]


In [15]:
# Context around the next mismatch: 5 BHS words from slot 194, 5 OSM morphemes from 187.
showCase(w=194, j=187, ln=5)

('Genesis', 1, 11)
BHS
word  194 = [מינו]
word  195 = [אשר]
word  196 = [זרעו]
word  197 = [בו]
word  198 = [על]
OSM
morph 187 = [מינ]
morph 188 = [ו]
morph 189 = [אשר]
morph 190 = [זרע]
morph 191 = [ו]


Aha, the BHS works with word boundaries, and the OSM with morpheme boundaries.

# Aligning

We have to develop a way of aligning the BHS words with the OSM morphemes.

For each BHS word, we grab OSM morphemes until all consonants in the BHS word have been matched.
If needed, we grab additional BHS words, if the current OSM morpheme happens to be longer than the current BHS word.

We will encounter discrepancies, which we list in an exception list.

The exceptions are coded as follows:

If `w: n` is in the dictionary of exceptions, it means that slot (word) `w` in the BHSA is different from its counterpart morpheme(s) in the OSM.

If `n > 0`, exactly `n` OSM morphemes (counting the current one) will be gobbled to align with slot `w`.

If `n < 0`, exactly `-n` slots (counting `w` itself) will be gobbled to match the current OSM morpheme.

In [16]:
# Known discrepancies between BHSA slots and OSM morphemes.
# Key: BHSA slot number.
# Value n > 0: take exactly n OSM morphemes for this slot.
# Value n < 0: take exactly -n slots for the current OSM morpheme.
#
# NOTE(review): slot 287360 used to be listed twice (`: 1` followed by `: 2`).
# In a dict literal the later entry wins, so the effective value was 2; the
# shadowed `: 1` entry has been removed. If the second entry was meant for a
# neighbouring slot, that needs to be confirmed against the data.
exceptions = {
    58571: 1,
    87007: 2,
    97044: 1,
    97049: 1,
    139052: 1,
    166914: 1,
    166915: 1,
    199373: 1,
    199374: 1,
    199375: 2,
    207185: 1,
    207186: 1,
    207187: 2,
    215253: 1,
    224043: 1,
    224044: 1,
    224045: 2,
    227601: 1,
    238174: 1,
    238175: 1,
    243336: 1,
    243337: 1,
    250007: 1,
    250008: 1,
    256643: 1,
    266189: 1,
    287360: 2,
    309061: 1,
    309062: 1,
    324527: 1,
    324528: 1,
    337702: 1,
    337703: 1,
    345559: 1,
    345560: 2,
    345572: 1,
    345573: 2,
    351280: 1,
    351878: 1,
    351879: 1,
    376865: 1,
    376866: 2,
    376867: 2,
    383405: 2,
    384049: 1,
    384050: 1,
    384052: 2,
    405102: -2,
    426506: 1,
    426507: 1,
}

In [17]:
# Align every BHS word (slot) with the OSM morpheme(s) spelling the same consonants.
# Result: osmIndex maps a slot to a half-open range (start, end) of positions in
# osmWords, or to None for slots whose consonantal text is empty.
# j: position in osmWords of the last morpheme consumed so far.
j = -1
osmIndex = {}
# u: last slot consumed by the previous alignment step (None before the first step).
u = None
remainingErrors = False
for w in F.otype.s('word'):
    # slots up to and including u were already merged into an earlier step
    if u != None and w <= u: continue
    bhs = toCons(F.g_cons_utf8.v(w))
    if bhs == '':
        # no consonants on the BHS side: nothing to match, no OSM morpheme is consumed
        osmIndex[w] = None
        continue
    j += 1
    startJ = j
    startW = w
    osm = osmWords[j][1]

    # an exception entry fixes how many items are taken instead of the length comparison
    maxGobble = exceptions.get(w, None)
    # gobble counts the OSM morphemes taken for this step, the first one included
    gobble = 1
    # Take more OSM morphemes while the OSM side is shorter than the BHS side,
    # or while a positive exception count has not been reached yet.
    # Any exception entry caps this loop: a negative one stops it immediately.
    while len(osm) < len(bhs) or(maxGobble != None and maxGobble > 0):
        if maxGobble != None and gobble >= maxGobble: break
        j += 1
        osm += osmWords[j][1]
        gobble += 1
    u = w
    # gobble is reused: it now counts the BHS slots taken for this step, w included
    gobble = 1
    # Conversely, take more BHS slots while the OSM side is longer than the BHS side,
    # or while a negative exception count has not been reached yet.
    # A positive exception entry stops this loop immediately.
    while len(osm) > len(bhs) or (maxGobble != None and maxGobble < 0):
        if maxGobble != None and gobble >= -maxGobble: break
        u += 1
        bhs += toCons(F.g_cons_utf8.v(u))
        gobble += 1
    # Only slots without an exception entry are verified; both sides are compared
    # with their last letter put in final form, so a final/regular difference is ignored.
    if maxGobble == None and finalCons(bhs) != finalCons(osm):
        print('''Mismatch in {} at BHS-{} OS-{}->{}:\nbhs=[{}]\nos=[{}]'''.format(
            '{} {}:{}'.format(*T.sectionFromNode(w)),
            w, startJ, j, bhs, osm,
        ))
        showCase(w, startJ, j - startJ + 10)
        remainingErrors = True
        break
    # every slot consumed in this step points to the same range of OSM morphemes
    for k in range(startW, u + 1):
        osmIndex[k] = (startJ, j + 1)
        
if not remainingErrors:
    print('Succeeded in aligning BHS with OSM')
    print('{} BHS words matched against {} OSM morphemes with {} known exceptions'.format(
        len(osmIndex), len(osmWords), len(exceptions),
    ))


Succeeded in aligning BHS with OSM
426584 BHS words matched against 469448 OSM morphemes with 50 known exceptions


# Documenting exceptions
We prettyprint the list of exceptions.
For each exception we give:

* the passage reference, with a hyperlink to that verse in SHEBANQ: click the verse number,
  and you see the relevant underlying data
* the BHSA data (slot number, consonantal representation, fully pointed representation)
* the OSM data (sequence number of the relevant `<w>` element, consonantal representation, fully pointed representation).

In [18]:
def jText(j):
    """Return an HTML fragment for OSM morpheme ``j``.

    Shows the sequence number of its ``<w>`` element within the verse,
    its consonantal text and its full text.
    """
    entry = osmWords[j]
    # tuple layout: (text, cons, morph, lemma, book, chapter, verse, w)
    fullText = entry[0]
    consonants = entry[1]
    wordNumber = entry[7]
    template = '''
<p>w = {}</p>
<p>cons = <b class="h">{}</b></p>
<p>full = <span class="h">{}</span></p>
'''
    return template.format(wordNumber, consonants, fullText)

def wText(w):
    """Return an HTML fragment for BHSA slot ``w``.

    Shows the slot number and the values of the ``g_cons_utf8`` and
    ``g_word_utf8`` features for that slot.
    """
    template = '''
<p>w = {}</p>
<p>cons = <b class="h">{}</b></p>
<p>full = <span class="h">{}</span></p>    
'''
    consonants = F.g_cons_utf8.v(w)
    pointed = F.g_word_utf8.v(w)
    return template.format(w, consonants, pointed)

In [19]:
# Opening part of the HTML page: the style for Hebrew text and the start of the table.
html = '''
<html>
<head>
<style>
.h {
    font-family: Ezra SIL;
    font-size: large;
}
</style>
</head>
<body>
<table>
'''
# SHEBANQ verse link; the three placeholders are book, chapter and verse.
shUrl = 'https://shebanq.ancient-data.org/hebrew/text?book={}&chapter={}&verse={}&version=c&mr=m&qw=q&tp=txt_p&tr=hb&wget=x&qget=x&nget=x&wd4_statfl=v&ph_arela=x&wd4_statrl=v&sn_an=x&cl=x&wd1_lang=x&wd1_subpos=x&wd2_person=v&sp_rela=v&wd1_pdp=x&sn_n=v&wd3_uvf=v&ph_fun=x&wd1_nmtp=x&gl=x&sp_n=v&pt=x&ph_an=x&ph_typ=x&cl_typ=v&tt=x&wd4_statro=x&wd3_vbs=v&wd1=v&tl=x&wd3=v&wd4=x&wd2_gender=v&ph=x&wd3_vbe=v&wd1_pos=x&ph_det=x&ph_rela=x&wd4_statfo=x&tl_tlv=x&wd2_stem=v&wd2_state=v&ht=v&ph_n=v&tl_tlc=v&cl_tab=x&wd3_nme=v&hl=x&cl_par=x&cl_an=x&cl_n=v&wd3_prs=v&wd3_pfm=v&sp=x&cl_code=v&ht_hk=v&wd2=x&hl_hlc=x&cl_rela=v&wd2_gnumber=v&wd2_tense=v&cl_txt=x&wd1_n=v&sn=x&ht_ht=v&hl_hlv=x&pref=alt'

# One table row per exception: passage link, BHSA slot(s), OSM morpheme(s).
rows = []
for (w, n) in sorted(exceptions.items()):
    # the link uses the section names obtained with lang='la', the label the default ones
    shLink = shUrl.format(*T.sectionFromNode(w, lang='la'))
    section = T.sectionFromNode(w)
    passage = '<a target="_blank" href="{}">{} {}:{}</a>'.format(shLink, *section)
    # a negative count means the exception spans -n consecutive slots
    ws = list(range(w, w - n)) if n < 0 else [w]
    # all OSM positions aligned with any of these slots, without duplicates
    js = sorted({pos for slot in ws for pos in range(*osmIndex[slot])})
    wInfo = ', '.join(wText(x) for x in ws)
    jInfo = ', '.join(jText(x) for x in js)
    rows.append('''
<tr><td>{}</td><td>{}</td><td>{}</td></tr>
'''.format(passage, wInfo, jInfo))

html += ''.join(rows)
html += '''
</table>
</body>
</html>
'''
display(HTML(html))

0,1,2
Leviticus 11:42,w = 58571 cons = גחון full = גָּחֹ֜ון,w = 4 cons = גח full = גָּח֜
Numbers 27:5,w = 87007 cons = משׁפטן full = מִשְׁפָּטָ֖ן,"w = 4 cons = משפט full = מִשְׁפָּטָ֖ , w = 4 cons = full ="
Deuteronomy 6:4,w = 97044 cons = שׁמע full = שְׁמַ֖ע,w = 1 cons = שמ full = שְׁמַ֖
Deuteronomy 6:4,w = 97049 cons = אחד full = אֶחָֽד,w = 6 cons = אח full = אֶחָֽ
Judges 18:30,w = 139052 cons = מנשׁה full = מְנַשֶּׁ֜ה,w = 11 cons = מ full = מְ
2_Samuel 12:22,w = 166914 cons = י full = י,w = 11 cons = יחנ full = יחנ
2_Samuel 12:22,w = 166915 cons = חנני full = חנני,w = 11 cons = ני full = ני
2_Kings 7:15,w = 199373 cons = בה full = בה,w = 14 cons = ב full = ב
2_Kings 7:15,w = 199374 cons = חפזם full = חפזם,w = 14 cons = החפז full = החפז
2_Kings 7:15,w = 199375 cons = ו full = וַ,"w = 14 cons = ם full = ם , w = 15 cons = ו full = וַ"
