# Wrangling GBI Alignments into Workflow

GBI has graciously provided alignments for the ESV, KJV, and NIV.
Unfortunately, those alignments are closed-source, so this notebook
only provides the code used to interpret the aligned JSON files.

In [39]:
import collections
import pandas as pd
import json
from pathlib import Path
from pprint import pprint

# locate the project checkout under the home directory, then the
# private GBI alignment data folder inside it
PROJ_DIR = Path.home() / 'github' / 'CambridgeSemiticsLab' / 'translation_traditions_HB'
GBI_DATA_DIR = PROJ_DIR / 'data' / '_private_' / 'GBI_alignment'

In [9]:
# Parse every JSON file in the GBI data directory whose name contains 'ot',
# keyed by file stem (e.g. 'niv84.ot.alignment').
file2data = {}
for file in GBI_DATA_DIR.glob('*.json'):
    if 'ot' in file.name:
        # Decode explicitly as UTF-8: read_text() otherwise falls back to the
        # platform's locale encoding, which can fail on or mangle the
        # non-ASCII manuscript text on systems whose default is not UTF-8.
        file2data[file.stem] = json.loads(file.read_text(encoding='utf-8'))

print('keys:', file2data.keys())

keys: dict_keys(['niv84.ot.alignment', 'kjv.ot.alignment', 'esv.ot.alignment'])


In [11]:
# let's check out the NIV OT alignment

# look up the parsed NIV data by its file-stem key in file2data
niv_data = file2data['niv84.ot.alignment']

# number of top-level entries in the NIV alignment
# (presumably one entry per verse -- TODO confirm against the data)
len(niv_data)

23202

In [61]:
# take the entry at index 1 (the second entry, since indexing is 0-based)
# as a worked example for the cells that follow
ex_verse = niv_data[1]

# parts of individual entry
#ex_verse.keys()

In [60]:
#ex_verse['links']

In [59]:
# experiment with multiple word link: "was hovering"

def get_trans_text(manu, trans, verse):
    """Join text/translated words for comparison.

    Args:
        manu: iterable of indices into ``verse['manuscript']['words']``;
            the words are joined in the order the indices are given.
        trans: iterable of indices into ``verse['translation']['words']``;
            the words are joined in ascending index order.
        verse: alignment entry with 'manuscript' and 'translation' dicts,
            each holding a 'words' list of dicts that have a 'text' key.

    Returns:
        A ``(heb_txt, eng_txt)`` tuple of space-joined strings.
    """
    # Strip the left-to-right mark (U+200E) from every word, not from the
    # joined string: stripping after the join only touches the two ends, so
    # marks attached to interior words of a multi-word link would survive.
    heb_txt = ' '.join(
        verse['manuscript']['words'][h]['text'].strip('\u200e') for h in manu
    )
    eng_txt = ' '.join(verse['translation']['words'][e]['text'] for e in sorted(trans))
    return (heb_txt, eng_txt)
    
# walk the example verse's links, skipping any link that is aligned to
# at most one translation word
for manu, trans in ex_verse['links']:
    if len(trans) <= 1:
        continue
    trans = sorted(trans)
    heb_txt, eng_txt = get_trans_text(manu, trans, ex_verse)
    #print(f'{heb_txt} -> {eng_txt}')

In [56]:
verb_dataset = []

# experiment with collecting verbs in HB:
# keep a (manuscript text, translation text) pair for every link whose
# first manuscript word is tagged with pos == 'verb'
for verse in niv_data:
    for manu, trans in verse['links']:
        _mainword_ = manu[0]
        if verse['manuscript']['words'][_mainword_]['pos'] != 'verb':
            continue
        heb_txt, eng_txt = get_trans_text(manu, trans, verse)
        verb_dataset.append((heb_txt, eng_txt))

#verb_dataset[:5]

In [57]:
#verb_dataset[:100]

In [58]:
#ex_verse['translation']['words'][:5]