# Build verb dataset from pre-compiled sources

In [1]:
import sys
import json
import collections
import re
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 200)
from pathlib import Path
from tf.fabric import Fabric
from tf.app import use
import pandas as pd

# custom modules 
sys.path.append('../')
import tf_tools
from gbi_functions import id2ref
from positions import PositionsTF

# organize pathways / files
PROJ_DIR = Path.home().joinpath('github/CambridgeSemiticsLab/Gesenius_data')
VERB_DIR = PROJ_DIR.joinpath('data/_private_/verb_data')
VERB_DIR_PUBLIC = PROJ_DIR.joinpath('data/_public_/verb_data')
lxx_file = VERB_DIR.joinpath('bhsa2lxx.json')
wlc_file = VERB_DIR.joinpath('bhsa2wlc.json')
esv_file = VERB_DIR.joinpath('bhsa2esv.json')
niv_file = VERB_DIR.joinpath('bhsa2niv.json')

# file with the translation texts
trans2text_file = PROJ_DIR.joinpath('data/_private_/GBI_alignment/verse2text.json')

# load datasets
# NB: decode explicitly as UTF-8 (the standard JSON encoding) rather than
# relying on the platform's default text encoding
bhsa2lxx = json.loads(lxx_file.read_text(encoding='utf-8'))
bhsa2wlc = json.loads(wlc_file.read_text(encoding='utf-8'))
bhsa2esv = json.loads(esv_file.read_text(encoding='utf-8'))
bhsa2niv = json.loads(niv_file.read_text(encoding='utf-8'))
trans2text = json.loads(trans2text_file.read_text(encoding='utf-8'))

In [2]:
# load BHSA features with genre module
# Text-Fabric data directories handed to Fabric below
locations = [
    '~/github/etcbc/bhsa/tf/c', 
    '~/github/etcbc/genre_synvar/tf/c',
    '~/github/etcbc/valence/tf/c'
]
TF = Fabric(locations)
# features requested in addition to tf_tools.standard_features
extra_features = '''
domain txt ps gn 
nu genre sense
mother sp
'''
features = tf_tools.standard_features + extra_features
api = TF.load(features)
bhsa = use('bhsa', api=api)
# short handles onto the loaded API; the functions below reference these globals
F, E, T, L, Fs, = bhsa.api.F, bhsa.api.E, bhsa.api.T, bhsa.api.L, bhsa.api.Fs

from clause_relas import in_dep_calc as clause_relator

This is Text-Fabric 8.4.4
Api reference : https://annotation.github.io/text-fabric/cheatsheet.html

125 features found and 0 ignored
  0.00s loading features ...
   |     0.00s Dataset without structure sections in otext:no structure functions in the T-API
  6.39s All features loaded/computed - for details use loadLog()


In [3]:
# Book -> period label lookup; books in neither list get no entry.
# Credit: this handy list / code was provided by Martijn Naaijer.
ebh = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy', 'Joshua', 'Judges', '1_Samuel', '2_Samuel', '1_Kings', '2_Kings']
lbh = ['Esther', 'Daniel', 'Ezra', 'Nehemiah', '1_Chronicles', '2_Chronicles']
period_dict = {
    **dict.fromkeys(ebh, 'EBH'),
    **dict.fromkeys(lbh, 'LBH'),
}
    
def map_book_collections(label_and_ranges):
    """Apply a label to every book in an inclusive range of books.

    Args:
        label_and_ranges: iterable of (label, first_book, last_book)
            tuples, with the books given by their section names.

    Returns:
        dict mapping each book name inside a range to that range's label.
        If ranges overlap, the label of the later range wins.
    """
    label_dict = {}
    for label, book_start, book_end in label_and_ranges:
        bs_node = T.nodeFromSection((book_start,))
        be_node = T.nodeFromSection((book_end,))
        # the range is inclusive of both endpoints, so no need to add
        # them again; assumes the book nodes in between are numbered
        # consecutively from first to last book
        for book_node in range(bs_node, be_node + 1):
            label_dict[T.sectionFromNode(book_node)[0]] = label
    return label_dict

# Three-part division of the books; each tuple is
# (label, first book, last book) and the range is inclusive.
tripart = map_book_collections([
    ('Law', 'Genesis', 'Deuteronomy'), 
    ('Prophets', 'Joshua', 'Malachi'), 
    ('Writings', 'Psalms', '2_Chronicles')
])

# Smaller groupings of books. Books outside every range get no entry
# here; get_bhsa_data falls back to the book's own name for those.
subcollections = map_book_collections([
    ('Samuel', '1_Samuel', '2_Samuel'),
    ('Kings', '1_Kings', '2_Kings'),
    ('Chronicles', '1_Chronicles', '2_Chronicles'),
    ('Ezra-Neh', 'Ezra', 'Nehemiah'),
    ('Twelve', 'Hosea', 'Malachi'),
    ('Megilloth', 'Ruth', 'Esther'),])

In [60]:
# A TAM tag looks like "<name>(<tense>.<aspect>.<modality>)"
tam_re = re.compile(r'(.*)\((.*)\.(.*)\.(.*)\)')


def split_TAM(TAM_tag):
    """Break a TAM tag string into its components.

    Args:
        TAM_tag: string of the form "name(tense.aspect.modality)".

    Returns:
        dict with keys 'tense', 'aspect', 'modality', 'TAM' and 'TAMtag'.
        Empty components become NaN; 'TAM' is the dotted triple with
        empty slots left in place (e.g. 'PAST..IND'). If the tag does
        not fit the pattern, every value is NaN.
    """
    parsed = tam_re.match(TAM_tag)

    if not parsed:
        return dict.fromkeys(
            ('tense', 'aspect', 'modality', 'TAM', 'TAMtag'),
            np.nan,
        )

    name, tense, aspect, modality = parsed.groups()
    return {
        'tense': tense if tense else np.nan,
        'aspect': aspect if aspect else np.nan,
        'modality': modality if modality else np.nan,
        'TAM': '.'.join((tense, aspect, modality)),
        'TAMtag': name.strip(),
    }
    
def get_verbform(node):
    """Remap a BHSA verb tense value to a custom verb-form label.

    Args:
        node: int representing a BHSA word node

    Returns:
        'yqtl', 'qtl' or 'ptcp' for the BHSA values 'impf', 'perf' and
        'ptca'; 'wqtl' for a qtl preceded by the lexeme 'W'; any other
        BHSA value is returned unchanged.
    """

    tense_map = {
            'impf': 'yqtl',
            'perf': 'qtl',
            'ptca': 'ptcp',
    }

    bhsa_tense = F.vt.v(node)
    verb_form = tense_map.get(bhsa_tense, bhsa_tense)

    # adjust weqatal; the positional lookup is only needed (and so only
    # built) for qtl forms
    if verb_form == 'qtl':
        # presumably get(-1, ...) reads the word one position before
        # the verb within its clause -- see PositionsTF
        P = PositionsTF(node, 'clause', api)
        if P.get(-1, 'lex') == 'W':
            return 'wqtl'

    return verb_form

def get_preceding_words(node, context='clause'):
    """Return the word nodes that come before `node` inside its context.

    Args:
        node: word node to look behind.
        context: node type of the enclosing unit (default 'clause').

    Returns:
        The words of the enclosing unit up to, but excluding, `node`.
    """
    enclosing = L.u(node, context)[0]
    words = L.d(enclosing, 'word')
    cutoff = words.index(node)
    return words[:cutoff]

def join_on(nodes, jchar='_', default=np.nan):
    """Join strings on a separator and wrap the result in that separator.

    The leading and trailing separator give every item an
    easy-to-match boundary on both sides.

    Args:
        nodes: iterable of strings to join.
        jchar: separator placed between and around the items.
        default: value returned when the joined string is empty.
    """
    sep = f'{jchar}'
    body = sep.join(nodes)
    return sep + body + sep if body else default

def map_domain(node):
    """Reduce a node's txt value to one letter, being permissive with Q.

    Returns 'Q' if 'Q' occurs anywhere in the txt string, otherwise
    the last character of that string.
    """
    txt_type = F.txt.v(node)
    return 'Q' if 'Q' in txt_type else txt_type[-1]
    
def tag_cl_verbform(clause_node):
    """Tag the verb form of a supplied clause.

    Args:
        clause_node: node whose words are searched for a verb.

    Returns:
        The get_verbform label of the first word whose pdp is 'verb',
        or 'Ø' if there is no such word.
    """
    # stop at the first verb instead of collecting all of them
    first_verb = next(
        (w for w in L.d(clause_node, 'word') if F.pdp.v(w) == 'verb'),
        None,
    )
    if first_verb is None:
        return 'Ø'
    return get_verbform(first_verb)
    
def get_relative_data(clause_lookup):
    """Collect data on a related clause, if there is one.

    Args:
        clause_lookup: sequence of clause-atom nodes (may be empty);
            only the first one is described.

    Returns:
        dict with keys 'cl_atom', 'cl', 'typ', 'verb_type', 'domain',
        'rela' and 'txt', or an empty dict when the lookup is empty.
    """
    if not clause_lookup:
        return {}

    cl_atom = clause_lookup[0]
    cl = L.u(cl_atom, 'clause')[0]
    return {
        'cl_atom': cl_atom,
        'cl': cl,
        'typ': F.typ.v(cl_atom),
        'verb_type': tag_cl_verbform(cl_atom),
        'domain': map_domain(cl),
        'rela': clause_relator(cl),
        'txt': T.text(cl_atom),
    }

# (compiled pattern, simplified label) pairs; consumers try them in
# order and matching is anchored at the start of the string
cl_type_res = [
    (re.compile(pattern), label)
    for pattern, label in (
        ('Way.*', 'W'),
        ('^[xX].*', 'x'),
        ('^W[xX].*', 'Wx'),
        ('^W.*', 'W'),
        ('^_W_.+', 'Wx'),
        ('^_W_$', 'W'),
    )
]

def search_replace_re(string, patterns, default=None):
    """Return the replacement of the first pattern that matches a string.

    Args:
        string: text to test.
        patterns: iterable of (compiled regex, replacement) pairs,
            tried in order with `.match` (anchored at the start).
        default: value returned when no pattern matches.
    """
    return next(
        (replace for search, replace in patterns if search.match(string)),
        default,
    )

def simplify_cl_type(clause_atom, prec_lexs):
    """Reduce a clause type to a simplified label from cl_type_res.

    Args:
        clause_atom: clause-atom node whose typ feature is read.
        prec_lexs: joined string of the lexemes before the verb; used
            in place of typ for the types that carry no X|x data.

    Returns:
        The label of the first matching pattern, or 'Ø' if none match.
    """
    typ = F.typ.v(clause_atom)

    # these types are missing X|x data, so test the preceding lexemes
    types_without_x = {'MSyn', 'CPen', 'Voct', 'InfC', 'Ellp', 'Ptcp'}
    subject = prec_lexs if typ in types_without_x else typ

    return search_replace_re(subject, cl_type_res, 'Ø')
    
def get_bhsa_data(node):
    """Compile all relevant BHSA data for one verb node.

    Args:
        node: int representing a BHSA word node

    Returns:
        dict with one entry per dataset column: reference and book
        groupings, the word's own features, its clause / sentence
        context, data on the mother and daughter clause atoms (NaN
        where absent), and the words preceding the verb in its clause.
    """

    # data on this clause itself
    book, chapter, verse = T.sectionFromNode(node)
    ref_string = f'{book} {chapter}:{verse}'
    verse_node = L.u(node, 'verse')[0]
    clause_atom = L.u(node, 'clause_atom')[0]
    clause = L.u(node, 'clause')[0]
    sent = L.u(node, 'sentence')[0]
    clause_type = F.typ.v(clause)
    preceding_words = get_preceding_words(node)
    prec_lexes = join_on((F.lex.v(w) for w in preceding_words), default='Ø')
    prec_pos = join_on((F.pdp.v(w) for w in preceding_words), default='Ø')
    domain2 = map_domain(clause)
    cl_type_simp = simplify_cl_type(clause_atom, prec_lexes)

    # build data on the mother/daughter clause
    mo_data = get_relative_data(E.mother.f(clause_atom))
    da_data = get_relative_data(E.mother.t(clause_atom))

    # collect preceding particles only
    # NB: the BHSA pdp value for interjections is 'intj' (not 'inj')
    # NOTE(review): 'nega' is not in this set -- confirm whether negative
    # particles should count as particles here
    particle_types = {'advb', 'prep', 'conj', 'prde', 'prin', 'intj', 'inrg'}
    prec_particles = join_on(
        (F.lex.v(w) for w in preceding_words
            if F.pdp.v(w) in particle_types)
    , default='Ø')

    # map to verb form string
    verbform = get_verbform(node)

    bhsa_data = {
            'bhsa_node': node,
            'ref': ref_string,
            'book': book,
            'book_super': subcollections.get(book, book),
            'canon_part': tripart[book],
            'text_full': F.g_word_utf8.v(node),
            'text_plain': F.g_cons_utf8.v(node),
            'lex': F.lex_utf8.v(node),
            'lex_etcbc': F.lex.v(node),
            'gloss': F.gloss.v(node),
            'verb_form': verbform,
            'stem': F.vs.v(node),
            'person': F.ps.v(node),
            'gender': F.gn.v(node),
            'number': F.nu.v(node),
            'clause_atom': T.text(clause_atom),
            'clause': T.text(clause),
            'sentence': T.text(sent),
            'genre': F.genre.v(verse_node),
            'domain': F.domain.v(clause),
            'domain2': domain2,
            'period': period_dict.get(book, ''),
            'txt_type': F.txt.v(clause),
            'clause_type': clause_type,
            'cltype_simp': cl_type_simp,
            'clause_rela': clause_relator(clause),
            'mother_clause': mo_data.get('txt', np.nan),
            'mother_type': mo_data.get('typ', np.nan),
            'mother_verbtype': mo_data.get('verb_type', np.nan),
            'mother_rela': mo_data.get('rela', np.nan),
            'mother_domain2': mo_data.get('domain', np.nan),
            'daught_clause': da_data.get('txt', np.nan),
            'daught_type': da_data.get('typ', np.nan),
            'daught_verbtype': da_data.get('verb_type', np.nan),
            'daught_rela': da_data.get('rela', np.nan),
            'daught_domain2': da_data.get('domain', np.nan),
            'valence': F.sense.v(node),
            'prec_lexes': prec_lexes,
            'prec_pos': prec_pos,
            'prec_part': prec_particles,
    }

    return bhsa_data


def build_dataset(bhsa_nodes):
    """Construct a dataset on select BHSA nodes.

    Each row combines the BHSA data for a node with the data of its
    aligned LXX word and its aligned ESV / NIV words; values that are
    missing from the alignments are filled with NaN.

    Args:
        bhsa_nodes: iterable of BHSA word nodes (ints).

    Returns:
        list of dicts, one per node, in input order.
    """

    dataset = []

    for node in bhsa_nodes:

        # the alignment dicts are keyed on the node number as a string
        str_node = str(node)
        lxx_word = bhsa2lxx.get(str_node, {})
        esv_word = bhsa2esv.get(str_node, {})
        niv_word = bhsa2niv.get(str_node, {})
        transs = [('esv', esv_word), ('niv', niv_word)]

        # add BHSA data
        data_row = get_bhsa_data(node)

        # add LXX data
        data_row.update({
            'lxx': lxx_word.get('utf8', np.nan),
            'lxx_tense': lxx_word.get('tense', np.nan),
            'lxx_voice': lxx_word.get('voice', np.nan),
            'lxx_mood': lxx_word.get('mood', np.nan),
            'lxx_person': lxx_word.get('person', np.nan),
            'lxx_number': lxx_word.get('number', np.nan),
        })

        # combined "tense mood" string, matched against later on
        if lxx_word:
            data_row['lxx_tm'] = lxx_word['tense'] + ' ' + lxx_word['mood']
        else:
            data_row['lxx_tm'] = np.nan

        # add TAM data from translations
        for trans, tdata in transs:
            # verse texts are keyed on the string form of the ref tuple
            ref_tuple = tuple(tdata.get('eng_ref', ''))
            data_row[f'{trans}'] = tdata.get('words', np.nan)
            data_row[f'{trans}_tags'] = tdata.get('tags', np.nan)
            data_row[f'{trans}_VBtags'] = tdata.get('vb_tags', np.nan)
            data_row[f'{trans}_verse'] = trans2text[trans].get(str(ref_tuple), np.nan)

            for tam_key, tam_data in split_TAM(tdata.get('TAM_cx', '')).items():
                tam_key = f'{trans}_{tam_key}'
                data_row[tam_key] = tam_data

            data_row[f'{trans}_TAMspan'] = tdata.get('TAM_span', np.nan)

        dataset.append(data_row)

    print(f'{len(dataset)} rows prepared!')

    return dataset

def build_allverb_data(verb_nodes):
    """Build a BHSA-only dataset: one get_bhsa_data row per verb node."""
    return [get_bhsa_data(node) for node in verb_nodes]

def xverb_collocations(verb_nodes, context='clause'):
    """For every verb form, count lexemes in pre-verbal position.

    Args:
        verb_nodes: iterable of BHSA verb word nodes.
        context: node type within which preceding words are gathered.

    Returns:
        DataFrame with one row per verb form (as given by get_verbform)
        and one column per lexeme; cells hold counts, NaN where a
        lexeme never precedes that form.
    """

    col_data = collections.defaultdict(collections.Counter)
    for node in verb_nodes:

        verbf = get_verbform(node)

        # build count
        for w in get_preceding_words(node, context=context):
            w_lex = F.lex.v(w)
            if w_lex.endswith('/'): # NB: skip nominal items
                continue
            col_data[verbf][w_lex] += 1

    return pd.DataFrame.from_dict(col_data, orient='index')

# Qatal Dataset

We build a dataset which is specific to qatal.

In [61]:
# every verb word with tense 'perf' and language 'Hebrew'
qatals = [
    verb for verb in F.pdp.s('verb') 
        if F.vt.v(verb) == 'perf' 
        and F.language.v(verb) == 'Hebrew'
]

qatal_dataset = build_dataset(qatals)

qatal_df = pd.DataFrame(qatal_dataset)


# restrict to qatal forms (perfects relabelled 'wqtl' drop out here);
# take a copy so that columns added later are written to an independent
# frame rather than to a slice of the full one
qatal_df = qatal_df[qatal_df.verb_form == 'qtl'].copy()

qatal_df.shape

20728 rows prepared!


(14303, 67)

In [62]:
# preview the first rows of the qatal dataset
qatal_df.head()

Unnamed: 0,bhsa_node,ref,book,book_super,canon_part,text_full,text_plain,lex,lex_etcbc,gloss,...,niv,niv_tags,niv_VBtags,niv_verse,niv_tense,niv_aspect,niv_modality,niv_TAM,niv_TAMtag,niv_TAMspan
0,3,Genesis 1:1,Genesis,Genesis,Law,בָּרָ֣א,ברא,ברא,BR>[,create,...,created,VBD,VBD,In the beginning God created the heavens and t...,PAST,,IND,PAST..IND,PAST,created
1,15,Genesis 1:2,Genesis,Genesis,Law,הָיְתָ֥ה,היתה,היה,HJH[,be,...,was,VBD,VBD,"Now the earth was formless and empty , darknes...",PAST,,IND,PAST..IND,PAST,was
2,47,Genesis 1:4,Genesis,Genesis,Law,טֹ֑וב,טוב,טוב,VWB[,be good,...,,,,,,,,,,
3,69,Genesis 1:5,Genesis,Genesis,Law,קָ֣רָא,קרא,קרא,QR>[,call,...,he called,PRP|VBD,VBD,"God called the light “ day , ” and the darknes...",PAST,,IND,PAST..IND,PAST,called
4,172,Genesis 1:10,Genesis,Genesis,Law,קָרָ֣א,קרא,קרא,QR>[,call,...,he called,PRP|VBD,VBD,"God called the dry ground “ land , ” and the g...",PAST,,IND,PAST..IND,PAST,called


# !!! TO FIX !!!

In [63]:
# show one qatal whose ESV rendering was tagged as an imperative
qatal_df[qatal_df.esv_TAM == 'PRES..IMPV'].head(1)

Unnamed: 0,bhsa_node,ref,book,book_super,canon_part,text_full,text_plain,lex,lex_etcbc,gloss,...,niv,niv_tags,niv_VBtags,niv_verse,niv_tense,niv_aspect,niv_modality,niv_TAM,niv_TAMtag,niv_TAMspan
32,1189,Genesis 3:1,Genesis,Genesis,Law,אָמַ֣ר,אמר,אמר,>MR[,say,...,Did say,VBD|VB,VBD|VB,Now the serpent was more crafty than any of th...,PRES,,IMPV,PRES..IMPV,IMPV,say


## Data Corrections

There are some parsing problems with some past tense verbs and imperative verbs caused by the English parser. 
These can be detected by a lack of consensus amongst the various versions: for instance,
cases where an imperative is detected by the English parser, but that imperative is not
reflected in the other English version and is not corroborated by a future
or imperative in the LXX.

#### Lack of consensus is of interest

To be sure, cases where there is a lack of consensus might also contain some properly tagged
phrases, reflecting cases of true disagreement in the witnesses rather than a parsing mistake. 
Manual inspection has revealed that these cases are very few compared with the number of false
positives however.

#### Remedy

To address these cases, we add a value to the data based on several manually configured
boolean conditions: `safe`. Thus, if `safe == True`, the example in question has been filtered
through these requirements. If the entire dataset is desired anyways, it can be retrieved by
ignoring `safe`.

In [64]:
# English verbs for which a past-tense tag is treated as suspect
# NOTE(review): the final '|darken' sits outside the '.*( ... )' group,
# so with str.match it only applies at the start of the string -- confirm
# that this is intended
bad_past_re = r'.*([Pp]ut|[Ss]et|[Cc]ut|[Ll]ay|[Cc]ast|[Ss]pread|[Ss]pit|[Rr]ead|rid)|darken'

# rows without an aligned LXX word have NaN in lxx_tm; na=False makes
# them count as "no match" so the mask stays strictly boolean
lxx_pres_or_fut = qatal_df.lxx_tm.str.match('.*(present|future)', na=False)

# tagged past in ESV, but the LXX has a present or future
bad_past_esv = qatal_df[
    (qatal_df.esv.str.match(bad_past_re, na=False))
    & (qatal_df.esv_TAM == 'PAST..IND')
    & lxx_pres_or_fut
].bhsa_node

# same check for NIV
bad_past_niv = qatal_df[
    (qatal_df.niv.str.match(bad_past_re, na=False))
    & (qatal_df.niv_TAM == 'PAST..IND')
    & lxx_pres_or_fut
].bhsa_node

# an imperative in one English version which the other version does not
# reflect, and which the LXX does not corroborate with a future or impv
bad_impv = qatal_df[
    (
        (~qatal_df.niv_TAM.isin(['PRES..IMPV', 'FUT..IND', 'PRES..MOD', 'PRES..IND']) & (qatal_df.esv_TAM.isin(['PRES..IMPV'])))
        | (qatal_df.niv_TAM.isin(['PRES..IMPV']) & (~qatal_df.esv_TAM.isin(['PRES..IMPV', 'FUT..IND', 'PRES..MOD', 'PRES..IND'])))
    )
    & (~qatal_df.lxx_tm.str.match('.*(future|impv)', na=False))
].bhsa_node

not_safe = [bad_past_esv, bad_past_niv, bad_impv]
not_safe_nodes = {n for iset in not_safe for n in iset}

print(len(not_safe_nodes), 'nodes listed as unsafe...')

206 nodes listed as unsafe...


In [65]:
# flag every row whose node is not among the unsafe nodes
qatal_df['safe'] = ~qatal_df.bhsa_node.isin(not_safe_nodes)

In [66]:
# size of the dataset when only the rows flagged safe are kept
qatal_df[qatal_df.safe].shape

(14097, 68)

In [67]:
# export the qatal dataset (including the `safe` column) as CSV and Excel
qatal_df.to_csv(VERB_DIR.joinpath('qatal_dataset.csv'), index=False)
# NB: no `encoding` argument -- to_excel no longer accepts one in
# pandas >= 2.0, and it had no effect on .xlsx output before that
qatal_df.to_excel(VERB_DIR.joinpath('qatal_dataset.xlsx'), index=False)

## All Verb Dataset

In [68]:
# Hebrew verbs in one of the finite or participial tenses
all_verbs = [
    verb for verb in F.pdp.s('verb')
    if F.language.v(verb) == 'Hebrew'
    and F.vt.v(verb) in {'perf', 'impf', 'wayq', 'ptca', 'ptcp'}
]

print(len(all_verbs), 'ready for processing')

# BHSA-only rows (no translation data) for every selected verb
all_verbs_data = build_allverb_data(all_verbs)
av_df = pd.DataFrame(all_verbs_data)

# this dataset goes to the public data directory
av_df.to_csv(VERB_DIR_PUBLIC.joinpath('allverb_bhsa.csv'), index=False)

av_df.head()

56955 ready for processing


Unnamed: 0,bhsa_node,ref,book,book_super,canon_part,text_full,text_plain,lex,lex_etcbc,gloss,...,mother_domain2,daught_clause,daught_type,daught_verbtype,daught_rela,daught_domain2,valence,prec_lexes,prec_pos,prec_part
0,3,Genesis 1:1,Genesis,Genesis,Law,בָּרָ֣א,ברא,ברא,BR>[,create,...,,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֨הוּ֙ וָבֹ֔הוּ,WXQt,qtl,Main,?,d-,_B_R>CJT/_,_prep_subs_,_B_
1,15,Genesis 1:2,Genesis,Genesis,Law,הָיְתָ֥ה,היתה,היה,HJH[,be,...,?,וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְהֹ֑ום,NmCl,Ø,Main,?,--,_W_H_>RY/_,_conj_art_subs_,_W_
2,27,Genesis 1:2,Genesis,Genesis,Law,מְרַחֶ֖פֶת,מרחפת,רחף,RXP[,shake,...,?,,,,,,,_W_RWX/_>LHJM/_,_conj_subs_subs_,_W_
3,33,Genesis 1:3,Genesis,Genesis,Law,יֹּ֥אמֶר,יאמר,אמר,>MR[,say,...,?,יְהִ֣י אֹ֑ור,ZYqX,yqtl,Main,Q,--,_W_,_conj_,_W_
4,35,Genesis 1:3,Genesis,Genesis,Law,יְהִ֣י,יהי,היה,HJH[,be,...,N,,,,,,--,Ø,Ø,Ø


In [69]:
# verb form x preceding-lexeme counts; a missing combination counts as 0
av_col_df = xverb_collocations(all_verbs).fillna(0)
av_col_df.index.name, av_col_df.columns.name = 'verb_form', 'lex_etcbc'
av_col_df.to_csv(VERB_DIR_PUBLIC.joinpath('xverb_lexcollocations.csv'))
av_col_df.head()

lex_etcbc,B,W,H,KJ,L,>CR,HNH,L>,MN,>P,...,<BD[,LHN,CLL=[,>JKKH,BWZ[,<WT[,CLC[,HJK,RC<[,CXV=[
verb_form,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
qtl,744.0,3795,1326.0,1838.0,371.0,2974.0,182.0,1827.0,325.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ptcp,96.0,1328,1511.0,246.0,85.0,358.0,367.0,78.0,51.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yqtl,1144.0,6134,1540.0,1122.0,557.0,970.0,50.0,2746.0,503.0,51.0,...,1.0,2.0,1.0,4.0,1.0,1.0,1.0,2.0,1.0,1.0
wayq,0.0,14974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wqtl,0.0,6425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
