# Build Verb Translation Dataset

In this notebook, we build a translation dataset based on NIV and ESV translation
alignments provided by GBI. The GBI data, which uses an underlying WLC Hebrew text,
has already been aligned to the Amsterdam BHSA Hebrew dataset in 
[GBI_alignment_wrangling.ipynb](GBI_alignment_wrangling.ipynb). We can thus take
advantage of both databases and their associated data when building our dataset here.

In the dataset we'll attempt to parse the English text so that the syntax and (especially)
the verbal forms can be analyzed alongside the Hebrew grammar. We'll start out with 
spaCy for the English parses. 

In [1]:
import re
import json
import collections
import re
import pandas as pd
from pathlib import Path
from tf.app import use
from gbi_functions import id2ref

# project and data locations (relative to the user's home directory)
PROJ_DIR = Path.home() / 'github/CambridgeSemiticsLab/translation_traditions_HB'
GBI_DATA_DIR = PROJ_DIR / 'data/_private_/GBI_alignment'

# BHSA corpus through Text-Fabric, with short aliases for the API objects
bhsa = use('bhsa')
api = bhsa.api
F, E, T, L, Fs = api.F, api.E, api.T, api.L, api.Fs

# GBI alignment files for the two English translations
niv_path = GBI_DATA_DIR / 'niv84.ot.alignment.json'
esv_path = GBI_DATA_DIR / 'esv.ot.alignment.json'
gbi_niv = json.loads(niv_path.read_text())
gbi_esv = json.loads(esv_path.read_text())

# alignment between BHSA word nodes and GBI word ids
bhsa2gbi_path = GBI_DATA_DIR / 'bhsa2gbi.json'
bhsa2gbi = json.loads(bhsa2gbi_path.read_text())

In [2]:
# Lookup tables for convenient access to GBI word data
sources = (('niv', gbi_niv), ('esv', gbi_esv))
gbi_words = collections.defaultdict(dict)  # translation name -> {word id: word record}
wlc = {}  # WLC word id -> word record
verse2words = collections.defaultdict(lambda: collections.defaultdict(list))
linkbyid = collections.defaultdict(list)  # translation name -> [(WLC ids, translation ids), ...]

for trans_name, verses in sources:
    # the WLC words are taken (arbitrarily) from the copy stored under NIV
    store_wlc = trans_name == 'niv'

    for verse_data in verses:
        eng_words = verse_data['translation']['words']
        heb_words = verse_data['manuscript']['words']

        # index translation words by verse reference and by id
        for eng_word in eng_words:
            eng_id = eng_word['id']
            verse2words[trans_name][id2ref(eng_id, 'translation')].append(eng_word)
            gbi_words[trans_name][eng_id] = eng_word

        # index WLC words by verse reference and by id
        if store_wlc:
            for heb_word in heb_words:
                heb_id = heb_word['id']
                verse2words['wlc'][id2ref(heb_id)].append(heb_word)
                wlc[heb_id] = heb_word

        # links hold positions within the verse; convert them to word ids
        for heb_positions, eng_positions in verse_data['links']:
            linkbyid[trans_name].append((
                tuple(heb_words[i]['id'] for i in heb_positions),
                tuple(sorted(eng_words[i]['id'] for i in eng_positions)),
            ))

In [3]:
# print the first 40 ESV links as "Hebrew -> English"
for wlc_ids, trn_ids in linkbyid['esv'][:40]:
    # strip the left-to-right mark (U+200E) from the Hebrew text before printing
    hebrew_parts = [wlc[word_id]['text'].replace('\u200e', '') for word_id in wlc_ids]
    english_parts = [gbi_words['esv'][word_id]['text'] for word_id in trn_ids]
    print(' '.join(hebrew_parts), '->', ' '.join(english_parts))

בְּ -> In
רֵאשִׁ֖ית -> beginning
בָּרָ֣א -> created
אֱלֹהִ֑ים -> God
הַ -> the
שָּׁמַ֖יִם -> heavens
וְ -> and
הָ -> the
אָֽרֶץ -> earth
הָ -> The
אָ֗רֶץ -> earth
הָיְתָ֥ה -> was
תֹ֙הוּ֙ -> without form
וָ -> and
בֹ֔הוּ -> void
וְ -> and
חֹ֖שֶׁךְ -> darkness
עַל־ -> over
פְּנֵ֣י -> face of
תְה֑וֹם -> deep
וְ -> And
ר֣וּחַ -> Spirit of
אֱלֹהִ֔ים -> God
מְרַחֶ֖פֶת -> was hovering
עַל־ -> over
פְּנֵ֥י -> face of
הַ -> the
מָּֽיִם -> waters
וַ -> And
יֹּ֥אמֶר -> said
אֱלֹהִ֖ים -> God
יְהִ֣י -> Let there be
א֑וֹר -> light
וַֽ -> and
יְהִי־ -> there was
אֽוֹר -> light
וַ -> And
יַּ֧רְא -> saw
אֱלֹהִ֛ים -> God
הָ -> the


## Verb forms dataset

Target verb forms for initial dataset: we begin with qatal verb forms.

### Pre-heat GBI/BHSA verbs

As a first step, we reduce the GBI/BHSA alignments, which can be many-to-many,
to one-to-one alignments, so that one BHSA verb corresponds to exactly one GBI verb.
Many of the 1-to-2 alignments are due to the presence of pronominal
suffixes. In such cases, we filter out the non-verbal words to leave only
the verbs behind.

In [248]:
# BHSA word node in Jer 51:3 whose doubled verb (ידרך ידרך) is a single
# node in BHSA but two separate words in GBI
JER_51_3_DOUBLE_VERB = 262780

verb_disagree = []  # contexts where only one of the two sources has a verb
# Defined so the summary below runs on a fresh kernel. Nothing is appended
# to it: contexts with more than one verb raise an Exception instead.
verb_plural = []
other_verbs = []    # agreeing verb pairs whose BHSA `vt` value is not in `select`
bverb2wverb = {}    # BHSA verb node -> GBI/WLC verb id
wverb2bverb = {}    # GBI/WLC verb id -> BHSA verb node
select = {'perf'}   # BHSA `vt` values to keep (the qatal forms targeted first)

for bhsa_nodes, gbi_ids in bhsa2gbi:

    # keep only the verbs on each side
    bhsa_verbs = [w for w in bhsa_nodes if F.pdp.v(w) == 'verb']
    wlc_verbs = [w for w in gbi_ids if wlc[w]['pos'] == 'verb']

    # descriptive record used for disagreement tracking and error messages
    context = (T.text(bhsa_nodes), T.sectionFromNode(bhsa_nodes[0]), bhsa_nodes, gbi_ids)

    # one case, Jer 51:3, has a double verb mapping caused by
    # ידרך ידרך, which BHSA maps to a single word node, and gbi
    # keeps as 2 words; we disambig that here and keep only
    # first gbi word
    if bhsa_verbs and bhsa_verbs[0] == JER_51_3_DOUBLE_VERB:
        wlc_verbs = wlc_verbs[:1]

    # skip non-verbal contexts
    if not bhsa_verbs and not wlc_verbs:
        continue

    # track disagreements between 2 sources (exactly one side has a verb)
    elif bool(bhsa_verbs) != bool(wlc_verbs):
        verb_disagree.append(context)

    # store result both ways: bhsa 2 wlc, wlc 2 bhsa
    elif len(bhsa_verbs) == 1 and len(wlc_verbs) == 1:

        # make a subset selection of verbs
        bverb, wverb = bhsa_verbs[0], wlc_verbs[0]
        parse = F.vt.v(bverb)
        if parse in select:
            bverb2wverb[bverb] = wverb
            wverb2bverb[wverb] = bverb
        else:
            other_verbs.append([bverb, wverb])

    # or there's a problem: more than one verb on at least one side
    else:
        raise Exception(f'Misalignment at {context}')


print(len(verb_disagree), 'verbs excluded due to pos disagreement')
print(len(verb_plural), 'verbal contexts have more than one verb')
print(len(bverb2wverb)+len(other_verbs), 'verbs agree')
print(len(bverb2wverb), 'selected for building dataset')

4427 verbs excluded due to pos disagreement
0 verbal contexts have more than one verb
68826 verbs agree
21082 selected for building dataset


NB: the ~4.4k verbs in disagreement arise because we use contextual parts of speech
from the BHSA dataset. The GBI dataset does not seem to be as sensitive to context for
pos. A large proportion of these cases are participles used as nouns rather than verbs.

In [162]:
# Spot-check ten of the contexts where BHSA and GBI disagree on whether a word is a verb.
verb_disagree[500:510]

[('פְּקֻדֵיהֶ֖ם ', ('Numbers', 1, 39), [70153], [40010390011, 40010390012]),
 ('יֹצֵ֥א ', ('Numbers', 1, 40), [70183], [40010400141]),
 ('פְּקֻדֵיהֶ֖ם ', ('Numbers', 1, 41), [70185], [40010410011, 40010410012]),
 ('יֹצֵ֥א ', ('Numbers', 1, 42), [70214], [40010420141]),
 ('פְּקֻדֵיהֶ֖ם ', ('Numbers', 1, 43), [70216], [40010430011, 40010430012]),
 ('פְּקֻדִ֡ים ', ('Numbers', 1, 44), [70229], [40010440022]),
 ('פְּקוּדֵ֥י ', ('Numbers', 1, 45), [70250], [40010450031]),
 ('פְּקֻדִ֔ים ', ('Numbers', 1, 46), [70271], [40010460032]),
 ('פְקֻדֵיהֶ֑ם ', ('Numbers', 2, 4), [70477], [40020040022, 40020040023]),
 ('פְקֻדָ֑יו ', ('Numbers', 2, 6), [70502], [40020060022, 40020060023])]

### Cluster English text into contextualized chunks for parsing

Translations are currently aligned to each word. We'll need to be 
able to contextualize translated verbs within their sentences in order
to run the spaCy dependency parser on the full sentence. Here we build
the contexts that the parser will parse.

The translation data stores punctuators as separate parts, e.g.

```
{'id': 1001001012,
  'altId': '.-1',
  'text': '.',
  'transType': '',
  'isPunc': True,
  'isPrimary': False}
```

We will first select verses, then split on punctuators.

Procedure:

1. For each verb, get the verse in English translation, words in verse, and split verse by punctuators.
2. Map strings to string objects so they can be tracked while being manipulated like normal strings.

In [163]:
# Look up one WLC word record by its GBI id to see which fields it carries.
wlc[10070130171]

{'id': 10070130171,
 'altId': 'אֶל־\u200e-1',
 'text': 'אֶל־\u200e',
 'strong': 'H0413',
 'gloss': 'into',
 'gloss2': '入',
 'lemma': 'אֶל',
 'pos': 'prep',
 'morph': 'Pp'}

In [164]:
# Convert the same GBI word id into a (book, chapter, verse) reference tuple.
id2ref(10070130171)

('Genesis', 7, 13)

## Parsing English with Spacy

For understanding the basics of Spacy, see:
https://spacy.io/usage/linguistic-features

For each verb in the `bverb2wverb` dictionary, we retrieve its verse text in a
given translation. The translated text is parsed by spaCy, which supplies us with
a dependency tree, parts of speech, and verb tenses for the English side of things.

In [123]:
# show every ESV link whose English side includes this translation word id
target_eng_id = 1001003006
for heb_ids, eng_ids in linkbyid['esv']:
    if target_eng_id not in eng_ids:
        continue
    print([heb_ids, eng_ids])

[(10010030031,), (1001003006, 1001003007, 1001003008)]


In [128]:
# NOTE(review): `verb_map` is not defined in any cell visible in this notebook, so this
# lookup presumably depends on leftover kernel state and would fail on a fresh kernel — confirm.
verb_map[10010030031]

35

In [175]:
import spacy 
from spacy.tokens import Doc
import unicodedata as unicode
import pandas as pd
# allow up to 200 rows when a DataFrame is displayed
pd.set_option('display.max_rows', 200)
# load the spaCy English pipeline used for tagging and dependency parsing below
nlp = spacy.load('en_core_web_sm')

In [242]:
def get_links(gbi_id, source):
    """Find the first alignment link in `source` whose Hebrew side contains `gbi_id`.

    Scans `linkbyid[source]` in order and returns the matching
    `(hebrew_ids, english_ids)` pair, or None when no link contains the id.
    """
    matches = (
        (heb_ids, eng_ids)
        for heb_ids, eng_ids in linkbyid[source]
        if gbi_id in heb_ids
    )
    return next(matches, None)

def process_tokens(gbi_tokens):
    """Build a spaCy Doc from pre-tokenized GBI word records.

    Each record's 'text' becomes one token of the Doc. The Doc's `user_data`
    maps every token position to the record's GBI 'id', so parsed tokens can
    be traced back to their GBI words.
    """
    words = list(token['text'] for token in gbi_tokens)
    position2id = dict(enumerate(token['id'] for token in gbi_tokens))
    return Doc(nlp.vocab, words=words, user_data=position2id)

# Use `process_tokens` as the pipeline's tokenizer so the GBI word
# segmentation (and the GBI word ids) are kept when parsing.
nlp.tokenizer = process_tokens

# build experimental dataset
SAMPLE_SIZE = 100  # stop after this many rows while the dataset is experimental

data = []
missed_verbs = []  # (bhsa_verb, gbi_verb) pairs with no usable NIV link

for bverb, wverb in bverb2wverb.items():

    # `get_links` returns None when no NIV link contains this verb; check for
    # that explicitly rather than hiding every possible error behind a bare
    # `except`. A link with no English words cannot be referenced or parsed,
    # so it is recorded as missed as well.
    links = get_links(wverb, 'niv')
    if links is None or not links[1]:
        missed_verbs.append((bverb, wverb))
        continue
    linked_eng = links[1]

    ref_word = gbi_words['niv'][linked_eng[0]] # get arbitrary word for referencing
    eng_ref = id2ref(ref_word['id'], 'translation') # get english verse ref tuple
    eng_verse_words = verse2words['niv'][eng_ref] # get English words from the verse
    spacy_doc = nlp(eng_verse_words) # parse English
    imap = spacy_doc.user_data # token position -> GBI word id

    # keep the parsed English words linked to this verb, dropping personal
    # pronouns (tag PRP)
    of_interest = [
        w for w in spacy_doc
        if imap[w.i] in linked_eng and w.tag_ not in {'PRP'}
    ]

    parsed_tags = '|'.join(w.tag_ for w in of_interest)
    parsed_dep = '|'.join(w.dep_ for w in of_interest)
    parsed_words = ' '.join(str(w) for w in of_interest)
    heb_ref = T.sectionFromNode(bverb)
    ref_string = '{} {}:{}'.format(*heb_ref)
    data.append({
        'ref': ref_string,
        'book': heb_ref[0],
        'verb_text': T.text(bverb),
        #'sentence': T.text(L.u(bverb, 'sentence')[0]),
        'bhsa_verb': bverb,
        'gbi_verb': wverb,
        'eng_words': parsed_words,
        'eng_tags': parsed_tags,
        'eng_dep': parsed_dep
        })

    if len(data) == SAMPLE_SIZE:
        break

In [243]:
# number of dataset rows collected
len(data)

100

In [244]:
# number of verbs skipped because retrieving their NIV links failed
len(missed_verbs)

9

In [245]:
# tabulate the collected rows (one row per verb) and preview the first five
data_df = pd.DataFrame(data)

data_df.head()

Unnamed: 0,ref,book,verb_text,bhsa_verb,gbi_verb,eng_words,eng_tags,eng_dep
0,Genesis 1:1,Genesis,בָּרָ֣א,3,10010010021,created,VBD,ROOT
1,Genesis 1:2,Genesis,הָיְתָ֥ה,15,10010020021,was,VBD,ROOT
2,Genesis 1:5,Genesis,קָ֣רָא,69,10010050061,called,VBD,relcl
3,Genesis 1:10,Genesis,קָרָ֣א,172,10010100071,called,VBD,relcl
4,Genesis 1:15,Genesis,הָי֤וּ,267,10010150012,let be,VB|VB,ROOT|ccomp


In [249]:
# look up spaCy's description of the VBP tag
spacy.explain('VBP')

'verb, non-3rd person singular present'

In [250]:
# look up spaCy's description of the VBZ tag
spacy.explain('VBZ')

'verb, 3rd person singular present'

In [251]:
# look up spaCy's description of the VBN tag
spacy.explain('VBN')

'verb, past participle'

In [253]:
# preview up to the first 25 rows of the dataset
data_df.head(25)

Unnamed: 0,ref,book,verb_text,bhsa_verb,gbi_verb,eng_words,eng_tags,eng_dep
0,Genesis 1:1,Genesis,בָּרָ֣א,3,10010010021,created,VBD,ROOT
1,Genesis 1:2,Genesis,הָיְתָ֥ה,15,10010020021,was,VBD,ROOT
2,Genesis 1:5,Genesis,קָ֣רָא,69,10010050061,called,VBD,relcl
3,Genesis 1:10,Genesis,קָרָ֣א,172,10010100071,called,VBD,relcl
4,Genesis 1:15,Genesis,הָי֤וּ,267,10010150012,let be,VB|VB,ROOT|ccomp
5,Genesis 1:21,Genesis,שָׁרְצ֨וּ,397,10010210121,teems,NNS,relcl
6,Genesis 1:27,Genesis,בָּרָ֣א,545,10010270081,created,VBD,ccomp
7,Genesis 1:27,Genesis,בָּרָ֥א,550,10010270121,created,VBD,ROOT
8,Genesis 1:29,Genesis,נָתַ֨תִּי,594,10010290041,give,VBP,ccomp
9,Genesis 1:31,Genesis,עָשָׂ֔ה,660,10010310061,had made,VBD|VBN,aux|relcl


In [252]:
# look up spaCy's description of the VBD tag
spacy.explain('VBD')

'verb, past tense'

In [206]:
# look up spaCy's description of the MD tag
spacy.explain('MD')

'verb, modal auxiliary'