<img align="right" src="images/dans-small.png"/>
<img align="right" src="images/tf-small.png"/>
<img align="right" src="images/etcbc.png"/>

We make a link between the morphology in
[Open Scriptures](http://openscriptures.org)
and the linguistics in the [BHSA](https://github.com/ETCBC/bhsa).

We proceed as follows:

* extract the morphology from the files in
  [openscriptures/morphhb/wlc](https://github.com/openscriptures/morphhb/tree/master/wlc)
* link the words in the Open Scriptures files to slots in the BHSA
* compile the Open Scriptures morphology data into a TF feature file.

In [1]:
import os
from glob import glob
from lxml import etree
from itertools import zip_longest
from unicodedata import normalize, category

from tf.fabric import Fabric

# Loading BHSA

In [102]:
# Local clone of the BHSA repository and the data version to align against.
REPO = os.path.expanduser('~/github/etcbc/bhsa')
baseDir = '{}/tf'.format(REPO)
tempDir = '{}/_temp'.format(REPO)
VERSION = '2017'

TF = Fabric(locations='{}/tf/{}'.format(REPO, VERSION), modules=[''])
# g_uvf_utf8 and g_prs_utf8 are read further down in this notebook
# (F.g_uvf_utf8, F.g_prs_utf8), so they have to be loaded here as well;
# otherwise those cells only work with leftover kernel state.
api = TF.load('''
    book
    g_cons_utf8
    g_uvf_utf8
    g_prs_utf8
''')
# Injects the API members (F, T, ...) into the notebook namespace.
api.makeAvailableIn(globals())


This is Text-Fabric 3.1.1
Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

114 features found and 0 ignored
  0.00s loading features ...
   |     0.01s B book                 from /Users/dirk/github/etcbc/bhsa/tf/2017
   |     0.20s B g_cons_utf8          from /Users/dirk/github/etcbc/bhsa/tf/2017
   |     0.00s Feature overview: 108 for nodes; 5 for edges; 1 configs; 7 computed
  5.55s All features loaded/computed - for details use loadLog()


In [117]:
# Book names as stored in the BHSA `book` feature, in alphabetical order.
bhsBooks = [F.book.v(bookNode) for bookNode in F.otype.s('book')]
bhsBooks.sort()
print(*bhsBooks, sep='\n')

Amos
Canticum
Chronica_I
Chronica_II
Daniel
Deuteronomium
Ecclesiastes
Esra
Esther
Exodus
Ezechiel
Genesis
Habakuk
Haggai
Hosea
Iob
Jeremia
Jesaia
Joel
Jona
Josua
Judices
Leviticus
Maleachi
Micha
Nahum
Nehemia
Numeri
Obadia
Proverbia
Psalmi
Reges_I
Reges_II
Ruth
Sacharia
Samuel_I
Samuel_II
Threni
Zephania


# Reading open scriptures

In [118]:
# The OSM book files are opened by bare file name later on,
# so the working directory is switched to the directory that holds them.
OS_BASE = os.path.expanduser('~/github/openscriptures/morphhb/wlc')
os.chdir(OS_BASE)
# Book abbreviations = XML file names without extension; VerseMap.xml is not a book.
osmBookSet = {
    os.path.splitext(fileName)[0]
    for fileName in glob('*.xml')
    if fileName != 'VerseMap.xml'
}

In [119]:
# One OSM book abbreviation per line, alphabetically.
print(*sorted(osmBookSet), sep='\n')

1Chr
1Kgs
1Sam
2Chr
2Kgs
2Sam
Amos
Dan
Deut
Eccl
Esth
Exod
Ezek
Ezra
Gen
Hab
Hag
Hos
Isa
Jer
Job
Joel
Jonah
Josh
Judg
Lam
Lev
Mal
Mic
Nah
Neh
Num
Obad
Prov
Ps
Ruth
Song
Zech
Zeph


In [120]:
# OSM book abbreviations, listed in the same order as the alphabetically
# sorted BHSA book names in `bhsBooks`: the two lists are paired by position
# when the name mappings are built, so the order here is significant.
osmBooks = '''
Amos
Song
1Chr
2Chr
Dan
Deut
Eccl
Ezra
Esth
Exod
Ezek
Gen
Hab
Hag
Hos
Job
Jer
Isa
Joel
Jonah
Josh
Judg
Lev
Mal
Mic
Nah
Neh
Num
Obad
Prov
Ps
1Kgs
2Kgs
Ruth
Zech
1Sam
2Sam
Lam
Zeph
'''.strip().split()

In [121]:
# Pair every BHSA book name with the OSM abbreviation at the same position.
# The lists must be equally long: a shorter `osmBooks` would raise an
# IndexError halfway through, a longer one would silently leave books unmapped.
if len(bhsBooks) != len(osmBooks):
    raise ValueError(
        '{} BHSA books versus {} OSM books'.format(len(bhsBooks), len(osmBooks))
    )
osmBookFromBhs = dict(zip(bhsBooks, osmBooks))
bhsBookFromOsm = dict(zip(osmBooks, bhsBooks))

In [122]:
# XML namespace prefix of the element tags in the OSM files.
NS = '{http://www.bibletechnologies.net/2003/OSIS/namespace}'
# Unicode normalization form and character category used by toCons().
NFD = 'NFD'
LO = 'Lo'

# Final form -> regular form, for the five Hebrew letters that have a final form.
# Key and value are kept on separate lines because right-to-left characters
# on a single line are hard to read in most editors.
finals = {
    'ך':
    'כ',
    'ם':
    'מ',
    'ן':
    'נ',
    'ף':
    'פ',
    'ץ':
    'צ',
}

# Regular form -> final form.
finalsI = {v: k for (k, v) in finals.items()}

# k	05DA	ך	letter final kaf
# K	05DB	כ	letter kaf
# m	05DD	ם	letter final mem
# M	05DE	מ	letter mem
# n	05DF	ן	letter final nun
# N	05E0	נ	letter nun
# p	05E3	ף	letter final pe
# P	05E4	פ	letter pe
# y	05E5	ץ	letter final tsadi
# Y	05E6	צ	letter tsadi


def toCons(fw):
    """Return the letters of `fw` only.

    The string is NFD-normalized and every character whose Unicode category
    is not `Lo` is dropped (this removes e.g. combining marks and punctuation).
    """
    return ''.join(c for c in normalize(NFD, fw) if category(c) == LO)


def final(c):
    """Return the final form of letter `c`, or `c` itself if it has none."""
    return finalsI.get(c, c)


def finalCons(s):
    """Return `s` with its last letter replaced by its final form.

    The empty string is returned unchanged (slicing with `[-1:]` instead of
    indexing with `[-1]` avoids an IndexError on empty input).
    """
    return s[0:-1] + final(s[-1:])

def readOsmBook(osmBook, osmWords, stats):
    """Read one OSM book file and append its morphemes to `osmWords`.

    The file `<osmBook>.xml` is opened relative to the current working
    directory. Of every `w` element inside a `verse` inside a `chapter`,
    the `lemma` attribute, the `morph` attribute and the text are each split
    on `/`; the parts are combined position by position, and missing parts
    are filled with the empty string.

    Parameters
    ----------
    osmBook: str
        OSM book abbreviation; must be a key of `bhsBookFromOsm`.
    osmWords: list
        Receives one tuple `(text, consonants, morph, lemma)` per part.
    stats: dict
        `stats['noMorph']` is incremented for every part without morphology,
        `stats['xMorph']` for every part without text.
    """
    infile = '{}.xml'.format(osmBook)
    parser = etree.XMLParser(remove_blank_text=True, ns_clean=True)
    root = etree.parse(infile, parser).getroot()
    # The container of the chapters is located by position:
    # first child of the root, then the second child of that node.
    osisTextNode = root[0]
    divNode = osisTextNode[1]
    chapterNodes = list(divNode)
    print('reading {:<5} ({:<15}) {:>3} chapters'.format(osmBook, bhsBookFromOsm[osmBook], len(chapterNodes)))
    for chapterNode in chapterNodes:
        if chapterNode.tag != NS + 'chapter':
            continue
        for verseNode in chapterNode:
            if verseNode.tag != NS + 'verse':
                continue
            for wordNode in verseNode:
                if wordNode.tag != NS + 'w':
                    continue
                lemma = wordNode.get('lemma')
                morph = wordNode.get('morph')
                text = wordNode.text
                lemmas = [] if lemma is None else lemma.split('/')
                morphs = [] if morph is None else morph.split('/')
                texts = [] if text is None else text.split('/')
                for (lm, mph, tx) in zip_longest(lemmas, morphs, texts, fillvalue=''):
                    # tx is always a string here: either a part of the text
                    # or the fill value '', so no None check is needed.
                    osmWords.append((tx, toCons(tx), mph, lm))
                    if not mph:
                        stats['noMorph'] += 1
                    if not tx:
                        stats['xMorph'] += 1

In [123]:
osmWords = []
stats = {'noMorph': 0, 'xMorph': 0}

# Read the OSM file of every book, following the order of the BHSA book nodes.
for bn in F.otype.s('book'):
    bhsBook = T.sectionFromNode(bn, lang='la')[0]
    osmBook = osmBookFromBhs[bhsBook]
    readOsmBook(osmBook, osmWords, stats)

# Totals for the report: parts without morphology and parts without text
# both count as not annotated.
nCollected = len(osmWords)
nNoMorph = stats['noMorph']
nNoText = stats['xMorph']
percentAnnotated = round(100 * (nCollected - nNoMorph - nNoText) / nCollected)

report = '''
BHS words:       {:>6}
Collected words: {:>6}
No morphology:   {:>6}
Mismatches:      {:>6}
{} % of the words are morphologically annotated.
'''
print(report.format(F.otype.maxSlot, nCollected, nNoMorph, nNoText, percentAnnotated))

reading Gen   (Genesis        )  50 chapters
reading Exod  (Exodus         )  40 chapters
reading Lev   (Leviticus      )  27 chapters
reading Num   (Numeri         )  36 chapters
reading Deut  (Deuteronomium  )  34 chapters
reading Josh  (Josua          )  24 chapters
reading Judg  (Judices        )  21 chapters
reading 1Sam  (Samuel_I       )  31 chapters
reading 2Sam  (Samuel_II      )  24 chapters
reading 1Kgs  (Reges_I        )  22 chapters
reading 2Kgs  (Reges_II       )  25 chapters
reading Isa   (Jesaia         )  66 chapters
reading Jer   (Jeremia        )  52 chapters
reading Ezek  (Ezechiel       )  48 chapters
reading Hos   (Hosea          )  14 chapters
reading Joel  (Joel           )   4 chapters
reading Amos  (Amos           )   9 chapters
reading Obad  (Obadia         )   1 chapters
reading Jonah (Jona           )   4 chapters
reading Mic   (Micha          )   7 chapters
reading Nah   (Nahum          )   3 chapters
reading Hab   (Habakuk        )   3 chapters
reading Ze

In [124]:
# First 100 OSM morphemes as (index, (text, consonants, morph, lemma)).
list(enumerate(osmWords[0:100]))

[(0, ('בְּ', 'ב', 'HR', 'b')),
 (1, ('רֵאשִׁ֖ית', 'ראשית', 'Ncfsa', '7225')),
 (2, ('בָּרָ֣א', 'ברא', 'HVqp3ms', '1254 a')),
 (3, ('אֱלֹהִ֑ים', 'אלהים', 'HNcmpa', '430')),
 (4, ('אֵ֥ת', 'את', 'HTo', '853')),
 (5, ('הַ', 'ה', 'HTd', 'd')),
 (6, ('שָּׁמַ֖יִם', 'שמים', 'Ncmpa', '8064')),
 (7, ('וְ', 'ו', 'HC', 'c')),
 (8, ('אֵ֥ת', 'את', 'To', '853')),
 (9, ('הָ', 'ה', 'HTd', 'd')),
 (10, ('אָֽרֶץ', 'ארץ', 'Ncbsa', '776')),
 (11, ('וְ', 'ו', 'HC', 'c')),
 (12, ('הָ', 'ה', 'Td', 'd')),
 (13, ('אָ֗רֶץ', 'ארץ', 'Ncbsa', '776')),
 (14, ('הָיְתָ֥ה', 'היתה', 'HVqp3fs', '1961')),
 (15, ('תֹ֨הוּ֙', 'תהו', 'HNcmsa', '8414')),
 (16, ('וָ', 'ו', 'HC', 'c')),
 (17, ('בֹ֔הוּ', 'בהו', 'Ncmsa', '922')),
 (18, ('וְ', 'ו', 'HC', 'c')),
 (19, ('חֹ֖שֶׁךְ', 'חשך', 'Ncmsa', '2822')),
 (20, ('עַל', 'על', 'HR', '5921 a')),
 (21, ('פְּנֵ֣י', 'פני', 'HNcbpc', '6440')),
 (22, ('תְה֑וֹם', 'תהום', 'HNcbsa', '8415')),
 (23, ('וְ', 'ו', 'HC', 'c')),
 (24, ('ר֣וּחַ', 'רוח', 'Ncbsc', '7307')),
 (25, ('אֱלֹהִ֔ים', 'אלהים'

Why are there 40,000 more words in OSM than in BHSA?
Let's explore.

In [14]:
# Naive check: compare BHSA word number i with OSM morpheme number i
# on their consonants, and report the first position where they differ.
for (i, w) in enumerate(F.otype.s('word')):
    bhs = toCons(F.g_cons_utf8.v(w))
    osm = toCons(osmWords[i][0])
    if bhs == osm:
        continue
    print('Mismatch at {}: bhs=[{}] osm=[{}]'.format(i, bhs, osm))
    break

Mismatch at 61: bhs=[] osm=[אור]


In [15]:
# Consonantal text of the BHSA slots around the first reported mismatch (61).
for i in range(61,65):
    print(i, F.g_cons_utf8.v(i))

61 ל
62 
63 אור
64 יום


Aha, the BHSA has encoded an empty article here, because the pointing of surrounding letters signals an article.
So let's ignore the inserted empty articles of the BHSA.

In [16]:
# Same comparison, but BHSA words without consonants are skipped:
# j counts the OSM morphemes independently of the BHSA slot number.
j = -1
for w in F.otype.s('word'):
    bhs = toCons(F.g_cons_utf8.v(w))
    if not bhs:
        continue
    j += 1
    osm = osmWords[j][1]
    if bhs == osm:
        continue
    print('''Mismatch at BHS-{} OS-{}:\nbhs=[{}]\nos=[{}]'''.format(w, j, bhs, osm))
    break

Mismatch at BHS-194 OS-187:
bhs=[מינו]
os=[מינ]


In [17]:
# BHSA slots next to OSM morphemes around the mismatch; the offset 7 is the
# difference between the reported positions BHS-194 and OS-187.
for w in range(192,196):
    print(w, toCons(F.g_cons_utf8.v(w)), osmWords[w-7][1])

192 פרי פרי
193 ל ל
194 מינו מינ
195 אשר ו


Aha, the BHS does not split a word here, while the OSM does, or rather: the OSM specifies a morpheme boundary here.
Maybe we can remedy this by looking at the pronominal suffix in the BHS.

It turns out that we also have to account for the univalent final, e.g. in word 1096:

In [63]:
# Slot where the univalent final has to be taken into account.
w = 1096

In [64]:
# Consonantal text of slot w.
F.g_cons_utf8.v(w)

'תחתנה'

In [65]:
# Univalent final (uvf) of slot w.
F.g_uvf_utf8.v(w)

'ֶנּ'

In [66]:
# Pronominal suffix (prs) of slot w.
F.g_prs_utf8.v(w)

'ָה'

But in word 1144 we should not take the uvf with the prs:

In [68]:
# Slot where the univalent final should NOT be combined with the suffix.
w = 1144

In [69]:
# Consonantal text of slot w.
F.g_cons_utf8.v(w)

'אביו'

In [70]:
# Univalent final (uvf) of slot w.
F.g_uvf_utf8.v(w)

'ִי'

In [71]:
# Pronominal suffix (prs) of slot w.
F.g_prs_utf8.v(w)

'ו'

We will try both ways.

We will encounter coding errors in the BHSA. We define a list of corrections, as a work-around.

We specify slots where we should override the (uvf+)prs with a given value.

In [88]:
# Overrides keyed by BHSA slot number: for these slots the given consonants
# are used as the suffix instead of the value derived from g_prs_utf8.
bhsCodingErrorsPrs = {
    988: 'נו',
    993: 'נו',
    1125: 'י',
    1225: 'נו',
}

There seem to be OSM errors as well.

We specify word indexes where joins or splits should happen.
Positive number: join with so many subsequent words.

In [96]:
# Keyed by index into osmWords: the value is the number of following
# OSM morphemes whose consonants are appended to the morpheme at that index.
osmCodingErrors = {
    1277: 1,
    1288: 1,
}

In [97]:
# Alignment attempt based on pronominal suffixes: every BHSA word with
# consonants is matched against one OSM morpheme (plus a following morpheme
# for its suffix, if it has one). Stops at the first mismatch.
j = -1
bhsPrs = {}
for w in F.otype.s('word'):
    bhs = toCons(F.g_cons_utf8.v(w))
    if bhs == '': continue
    j += 1
    osm = osmWords[j][1]
    # Known OSM coding errors: glue the following morphemes onto this one.
    gobble = osmCodingErrors.get(j, 0)
    if gobble:
        for n in range(gobble):
            j += 1
            osm += osmWords[j][1]
    cprs = bhsCodingErrorsPrs.get(w, None)
    if cprs is None:
        # A missing feature value is treated as an empty univalent final.
        uvf = F.g_uvf_utf8.v(w) or ''
        prs = F.g_prs_utf8.v(w)
        if prs:
            # Suffix alone, and suffix preceded by the univalent final.
            cprs = toCons(prs.strip())
            cuprs = toCons(uvf.strip() + prs.strip())
    else:
        # An overridden suffix has no separate uvf variant. Without this
        # assignment cuprs would still hold the value of an earlier word
        # (or be undefined if no earlier word had a suffix).
        cuprs = cprs
    bhsu = bhs
    if cprs:
        fcprs = finalCons(cprs)
        fucprs = finalCons(cuprs)
        # Two candidate stems: the word minus uvf+suffix, and minus the suffix only.
        # bhsu must be computed first, from the unshortened bhs.
        if bhs.endswith(cuprs) or bhs.endswith(fucprs):
            bhsu = bhs[0:len(bhs)-len(cuprs)]
        if bhs.endswith(cprs) or bhs.endswith(fcprs):
            bhs = bhs[0:len(bhs)-len(cprs)]

    if osm not in {bhs, bhsu}:
        print('''Mismatch at BHS-{} OS-{}:\nbhs=[{}]\nos=[{}]'''.format(w, j, bhs, osm))
        break
    if cprs:
        # The suffix is expected as the next OSM morpheme.
        j += 1
        osm = osmWords[j][1]
        if osm not in {fcprs, fucprs}:
            print('''Mismatch in prs of BHS-{} OS-{}:\nbhs=[{}]\nos=[{}]'''.format(w, j, fcprs, osm))
            break

Mismatch at BHS-1249 OS-1298:
bhs=[ממנ]
os=[ממ]


In [111]:
def showCase(w, j, ln):
    """Print a stretch of BHSA words next to a stretch of OSM morphemes.

    Parameters
    ----------
    w: int
        First BHSA slot to show; its section is printed as a heading.
    j: int
        Index into `osmWords` of the first OSM morpheme to show.
    ln: int
        Number of words and of morphemes to show. Fewer are shown when the
        stretch would run past the last slot or the last morpheme; without
        that limit the function would fail near the end of the text.
    """
    print(T.sectionFromNode(w))
    print('BHS')
    for n in range(w, min(w + ln, F.otype.maxSlot + 1)):
        print('word  {} = [{}]'.format(n, toCons(F.g_cons_utf8.v(n))))
    print('OSM')
    for n in range(j, min(j + ln, len(osmWords))):
        print('morph {} = [{}]'.format(n, osmWords[n][1]))

In [98]:
# Context of the mismatch reported at BHS-1249 / OS-1298.
# NOTE(review): the recorded output below contains `prs` lines that the
# showCase defined in this notebook does not print; it was presumably
# produced by an earlier version of the function.
showCase(1249, 1298, 5)

('Genesis', 3, 5)
BHS
word  1249 = [ממנו]
prs   1249 = [ו]
word  1250 = [ו]
word  1251 = [נפקחו]
word  1252 = [עיניכם]
prs   1252 = [כם]
word  1253 = [ו]
OSM
morph 1298 = [ממ]
morph 1299 = [נו]
morph 1300 = [ו]
morph 1301 = [נפקחו]
morph 1302 = [עיני]


# Too many cases
So far, it is not going well: there are too many cases where our alignment fails.
We have to develop another way of aligning the BHS words with the OSM morphemes.

For each BHS word, we grab OSM morphemes until all consonants in the BHS word have been matched.

The exceptions mean this:

If `w: n` is in the dictionary of exceptions, it means that slot (word) `w` in the BHSA is different from its counterpart in the OSM.

If `n > 0`, slot `w` is aligned with exactly `n` consecutive OSM morphemes.

If `n < 0`, the current OSM morpheme is aligned with `-n` consecutive slots, starting at `w`.

In [228]:
# Known alignment exceptions, keyed by BHSA slot number.
# The alignment loop reads the value n as follows:
#   n > 0: the slot is aligned with exactly n consecutive OSM morphemes;
#   n < 0: the current OSM morpheme is aligned with -n consecutive slots,
#          starting at this slot.
# For these slots the comparison of consonants is skipped.
#
# NOTE(review): the original literal listed slot 287360 twice (first 1,
# then 2). Python keeps the last value of a duplicated key, so only 2 was
# ever in effect; the dead entry has been removed. Check whether one of the
# two entries was meant for a neighbouring slot.
exceptions = {
    58571: 1,
    87007: 2,
    97044: 1,
    97049: 1,
    139052: 1,
    166914: 1,
    166915: 1,
    199373: 1,
    199374: 1,
    199375: 2,
    207185: 1,
    207186: 1,
    207187: 2,
    215253: 1,
    224043: 1,
    224044: 1,
    224045: 2,
    227601: 1,
    238174: 1,
    238175: 1,
    243336: 1,
    243337: 1,
    250007: 1,
    250008: 1,
    256643: 1,
    266189: 1,
    287360: 2,
    309061: 1,
    309062: 1,
    324527: 1,
    324528: 1,
    337702: 1,
    337703: 1,
    345559: 1,
    345560: 2,
    345572: 1,
    345573: 2,
    351280: 1,
    351878: 1,
    351879: 1,
    376865: 1,
    376866: 2,
    376867: 2,
    383405: 2,
    384049: 1,
    384050: 1,
    384052: 2,
    405102: -2,
    426506: 1,
    426507: 1,
}

In [231]:
# Align BHSA words with OSM morphemes by consonants.
#
# j          : index in osmWords of the OSM morpheme consumed last
# osmIndex   : slot -> (first, last) index of the OSM morphemes aligned with it,
#              or None for slots whose consonantal text is empty
# u          : last slot consumed by the previous iteration
#              (None before the first word with consonants)
# exceptions : slot -> fixed number of morphemes (positive) or slots (negative)
#              to take, instead of deciding by length
j = -1
osmIndex = {}
u = None
remainingErrors = False
for w in F.otype.s('word'):
    # Slots that were already glued onto an earlier word are skipped.
    if u != None and w <= u: continue
    bhs = toCons(F.g_cons_utf8.v(w))
    if bhs == '':
        # No consonants: nothing in the OSM corresponds to this slot.
        osmIndex[w] = None
        continue
    j += 1
    startJ = j
    startW = w
    osm = osmWords[j][1]

    maxGobble = exceptions.get(w, None)
    # Append following OSM morphemes while the OSM string is shorter than the
    # BHSA word; for a positive exception, stop as soon as that many
    # morphemes have been taken (gobble counts the morphemes taken so far).
    gobble = 1
    while len(osm) < len(bhs) or(maxGobble != None and maxGobble > 0):
        if maxGobble != None and gobble >= maxGobble: break
        j += 1
        osm += osmWords[j][1]
        gobble += 1
    u = w
    # Conversely, append following BHSA slots while the OSM string is longer;
    # for a negative exception, stop as soon as that many slots have been taken.
    gobble = 1
    while len(osm) > len(bhs) or (maxGobble != None and maxGobble < 0):
        if maxGobble != None and gobble >= -maxGobble: break
        u += 1
        bhs += toCons(F.g_cons_utf8.v(u))
        gobble += 1
    # Only non-exceptional slots are checked; both strings get their last
    # letter converted to its final form before they are compared.
    if maxGobble == None and finalCons(bhs) != finalCons(osm):
        print('''Mismatch in {} at BHS-{} OS-{}->{}:\nbhs=[{}]\nos=[{}]'''.format(
            '{} {}:{}'.format(*T.sectionFromNode(w)),
            w, startJ, j, bhs, osm,
        ))
        # Show the failing words and morphemes plus some following context.
        showCase(w, startJ, j - startJ + 10)
        remainingErrors = True
        break
    # All slots consumed in this iteration share the same range of morphemes.
    for k in range(startW, u + 1):
        osmIndex[k] = (startJ, j)
        
if not remainingErrors:
    print('Succeeded in aligning BHS with OSM')
    print('{} BHS words matched against {} OSM morphemes with {} known exceptions'.format(
        len(osmIndex), len(osmWords), len(exceptions),
    ))


Succeeded in aligning BHS with OSM
426584 BHS words matched against 469448 OSM morphemes with 50 known exceptions


In [112]:
# Example where BHSA and OSM divide the same consonants differently over words.
showCase(12369, 13485, 5)

('Genesis', 24, 65)
BHS
word  12369 = [ה]
word  12370 = [לזה]
word  12371 = [ה]
word  12372 = [הלך]
word  12373 = [ב]
OSM
morph 13485 = [הלזה]
morph 13486 = [ה]
morph 13487 = [הלך]
morph 13488 = [ב]
morph 13489 = [שדה]


In [116]:
# Sample of 100 consecutive OSM morphemes: (text, consonants, morph, lemma).
osmWords[32000:32100]

[('אֶפְרַ֔יִם', 'אפרים', 'Np', '669'),
 ('בְּנֵ֖י', 'בני', 'HNcmpc', '1121 a'),
 ('שִׁלֵּשִׁ֑ים', 'שלשים', 'HNcmpa', '8029'),
 ('גַּ֗ם', 'גם', 'HTa', '1571'),
 ('בְּנֵ֤י', 'בני', 'HNcmpc', '1121 a'),
 ('מָכִיר֙', 'מכיר', 'HNp', '4353'),
 ('בֶּן', 'בן', 'HNcmsc', '1121 a'),
 ('מְנַשֶּׁ֔ה', 'מנשה', 'HNp', '4519'),
 ('יֻלְּד֖וּ', 'ילדו', 'HVPp3cp', '3205'),
 ('עַל', 'על', 'HR', '5921 a'),
 ('בִּרְכֵּ֥י', 'ברכי', 'HNcfdc', '1290'),
 ('יוֹסֵֽף', 'יוסף', 'HNp', '3130'),
 ('וַ', 'ו', 'HC', 'c'),
 ('יֹּ֤אמֶר', 'יאמר', 'Vqw3ms', '559'),
 ('יוֹסֵף֙', 'יוסף', 'HNp', '3130'),
 ('אֶל', 'אל', 'HR', '413'),
 ('אֶחָ֔י', 'אחי', '', '251'),
 ('ו', 'ו', '', ''),
 ('אָנֹכִ֖י', 'אנכי', 'HPp1cs', '595'),
 ('מֵ֑ת', 'מת', 'HVqrmsa', '4191'),
 ('וֵֽ', 'ו', 'HC', 'c'),
 ('אלֹהִ֞ים', 'אלהים', 'Ncmpa', '430'),
 ('פָּקֹ֧ד', 'פקד', 'HVqa', '6485 a'),
 ('יִפְקֹ֣ד', 'יפקד', 'HVqi3ms', '6485 a'),
 ('אֶתְ', 'את', 'HTo', '853'),
 ('כֶ֗ם', 'כם', 'Sp2mp', ''),
 ('וְ', 'ו', 'HC', 'c'),
 ('הֶעֱלָ֤ה', 'העלה', 'Vhq3ms', '5927