# Build verb dataset from pre-compiled sources

In [1]:
import sys
import json
import collections
import re
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 200)
from pathlib import Path
from tf.fabric import Fabric
from tf.app import use
import pandas as pd

# custom modules 
sys.path.append('../')
import tf_tools
from gbi_functions import id2ref
from positions import PositionsTF

# project directories: private vs. public verb data
PROJ_DIR = Path.home() / 'github/CambridgeSemiticsLab/Gesenius_data'
VERB_DIR = PROJ_DIR / 'data/_private_/verb_data'
VERB_DIR_PUBLIC = PROJ_DIR / 'data/_public_/verb_data'

# pre-compiled JSON sources (looked up by stringified BHSA node further below)
lxx_file = VERB_DIR / 'bhsa2lxx.json'
wlc_file = VERB_DIR / 'bhsa2wlc.json'
esv_file = VERB_DIR / 'bhsa2esv.json'
niv_file = VERB_DIR / 'bhsa2niv.json'

# parse every source file into its own dictionary
bhsa2lxx, bhsa2wlc, bhsa2esv, bhsa2niv = (
    json.loads(source_file.read_text())
    for source_file in (lxx_file, wlc_file, esv_file, niv_file)
)

In [45]:
# Load the BHSA Text-Fabric data together with the genre_synvar and
# valence feature modules (one TF directory per module).
locations = [
    '~/github/etcbc/bhsa/tf/c', 
    '~/github/etcbc/genre_synvar/tf/c',
    '~/github/etcbc/valence/tf/c'
]
TF = Fabric(locations)
# features requested in addition to tf_tools.standard_features
extra_features = '''
domain txt ps gn 
nu genre sense
mother sp
'''
features = tf_tools.standard_features + extra_features
api = TF.load(features)
# wrap the loaded API in the 'bhsa' app
bhsa = use('bhsa', api=api)
# short handles to the API members used by the functions below
F, E, T, L, Fs, = bhsa.api.F, bhsa.api.E, bhsa.api.T, bhsa.api.L, bhsa.api.Fs

# NOTE(review): imported here rather than with the other custom modules --
# presumably clause_relas needs the TF API to be loaded first; confirm.
from clause_relas import in_dep_calc as clause_relator

This is Text-Fabric 8.4.4
Api reference : https://annotation.github.io/text-fabric/cheatsheet.html

125 features found and 0 ignored
  0.00s loading features ...
   |     0.00s Dataset without structure sections in otext:no structure functions in the T-API
  8.08s All features loaded/computed - for details use loadLog()


In [3]:
# thanks to Martijn Naaijer
# for providing this handy list / code
# books per period label (presumably Early / Late Biblical Hebrew)
ebh = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy', 'Joshua', 'Judges', '1_Samuel', '2_Samuel', '1_Kings', '2_Kings']
lbh = ['Esther', 'Daniel', 'Ezra', 'Nehemiah', '1_Chronicles', '2_Chronicles']

# book name -> period label; books in neither list are simply absent
period_dict = {
    book_name: period
    for period, period_books in (('EBH', ebh), ('LBH', lbh))
    for book_name in period_books
}
    
def map_book_collections(label_and_ranges):
    """Apply a label to every book in a range of books.

    Args:
        label_and_ranges: iterable of (label, book_start, book_end)
            tuples, where book_start and book_end are book names that
            T.nodeFromSection can resolve.

    Returns:
        dict mapping each book name in a range to the label of that
        range. When ranges overlap, the later range wins.
    """
    label_dict = {}
    for label, book_start, book_end in label_and_ranges:
        bs_node = T.nodeFromSection((book_start,))
        be_node = T.nodeFromSection((book_end,))
        # The inclusive range already contains both endpoints.
        # NB: assumes every node number between the two book nodes is
        # itself a book node.
        for book_node in range(bs_node, be_node + 1):
            label_dict[T.sectionFromNode(book_node)[0]] = label
    return label_dict

# three-part division of the canon: (label, first book, last book)
tripart = map_book_collections((
    ('Law', 'Genesis', 'Deuteronomy'),
    ('Prophets', 'Joshua', 'Malachi'),
    ('Writings', 'Psalms', '2_Chronicles'),
))

# groups of books that are reported under one shared name
subcollections = map_book_collections((
    ('Samuel', '1_Samuel', '2_Samuel'),
    ('Kings', '1_Kings', '2_Kings'),
    ('Chronicles', '1_Chronicles', '2_Chronicles'),
    ('Ezra-Neh', 'Ezra', 'Nehemiah'),
    ('Twelve', 'Hosea', 'Malachi'),
    ('Megilloth', 'Ruth', 'Esther'),
))

In [48]:
def has_adjacent_waw(node):
    """Tell whether the word right before the verb in its clause is a waw.

    Returns True when the lexeme of the word at position -1 (relative
    to node, within the clause) is 'W', otherwise False.
    """
    clause_positions = PositionsTF(node, 'clause', api)
    # fall back to 0 when the position lookup yields nothing
    word_before = clause_positions.get(-1) or 0
    return F.lex.v(word_before) == 'W'
    
def preceding_waw(node):
    """Look for a waw before the verb in its clause and measure the gap.

    Returns a dict with:
        preceding_waw: True if any word before node in the clause has
            lexeme 'W', else False.
        wordsb4waw: node-number distance between the first such waw and
            the word directly before node; NaN when there is no waw.
    """
    clause = L.u(node, 'clause')[0]
    words = L.d(clause, 'word')
    words_before = words[:words.index(node)]

    # first waw in clause order, if there is one
    first_waw = next(
        (word for word in words_before if F.lex.v(word) == 'W'),
        None,
    )

    if not first_waw:
        return {
            'preceding_waw': False,
            'wordsb4waw': np.nan,
        }

    return {
        'preceding_waw': True,
        'wordsb4waw': abs(first_waw - (node - 1)),
    }
    
# pattern for tags shaped like "name(tense.aspect.modality)"
tam_re = re.compile(r'(.*)\((.*)\.(.*)\.(.*)\)')


def split_TAM(TAM_tag):
    """Break a TAM tag into its components.

    Returns a dict with the keys 'tense', 'aspect', 'modality', 'TAM'
    and 'TAMtag'. Empty components become NaN; when the tag does not
    match the expected pattern, every value is NaN.
    """
    parsed = tam_re.match(TAM_tag)

    # tag does not have the expected shape: everything is missing
    if parsed is None:
        return dict.fromkeys(
            ('tense', 'aspect', 'modality', 'TAM', 'TAMtag'),
            np.nan,
        )

    label, tense, aspect, modality = parsed.groups()
    components = {'tense': tense, 'aspect': aspect, 'modality': modality}

    # an empty slot in the tag counts as a missing value
    tam_data = {
        key: (value if value else np.nan)
        for key, value in components.items()
    }
    tam_data['TAM'] = '.'.join((tense, aspect, modality))
    tam_data['TAMtag'] = label.strip()
    return tam_data
    
# replacement labels for some BHSA verb-tense codes; the lookups below
# fall back to the original code for anything not listed here
bhsa_tense_map = dict(
    impf='yqtl',
    perf='qtl',
    ptca='ptcp',
)

def get_bhsa_data(node):
    """Gather the BHSA features of one word node into a flat dict.

    The dict holds the reference, book groupings, surface text, lexeme,
    morphology, and features of the enclosing verse, clause and
    sentence, as well as the text and type of the mother of the
    enclosing clause atom (empty text / NaN type when there is none).
    """
    book, chapter, verse = T.sectionFromNode(node)

    # objects that contain the word
    verse_node = L.u(node, 'verse')[0]
    clause_atom = L.u(node, 'clause_atom')[0]
    clause = L.u(node, 'clause')[0]
    sentence = L.u(node, 'sentence')[0]

    # mother of the clause atom, when it has one
    mothers = E.mother.f(clause_atom)
    if mothers:
        mother_type = F.typ.v(mothers[0])
    else:
        mother_type = np.nan

    # relabel the verb tense where a replacement label is defined
    raw_tense = F.vt.v(node)

    return {
        'bhsa_node': node,
        'ref': f'{book} {chapter}:{verse}',
        'book': book,
        'book_super': subcollections.get(book, book),
        'canon_part': tripart[book],
        'text_full': F.g_word_utf8.v(node),
        'text_plain': F.g_cons_utf8.v(node),
        'lex': F.lex_utf8.v(node),
        'lex_etcbc': F.lex.v(node),
        'gloss': F.gloss.v(node),
        'verb_form': bhsa_tense_map.get(raw_tense, raw_tense),
        'stem': F.vs.v(node),
        'person': F.ps.v(node),
        'gender': F.gn.v(node),
        'number': F.nu.v(node),
        'sentence': T.text(sentence),
        'genre': F.genre.v(verse_node),
        'domain': F.domain.v(clause),
        'period': period_dict.get(book, ''),
        'txt_type': F.txt.v(clause),
        'clause_type': F.typ.v(clause),
        'clause_rela': clause_relator(clause),
        'adjacent_waw': has_adjacent_waw(node),
        'mother_clause': T.text(mothers),
        'mother_type': mother_type,
        'valence': F.sense.v(node),
    }


def build_dataset(bhsa_nodes):
    """Construct a dataset on select BHSA nodes.

    For every node the row combines the BHSA data from get_bhsa_data,
    the aligned LXX word data, the preceding-waw data, and the ESV / NIV
    translation data including the split TAM tag. Values missing from
    the alignment dictionaries are filled with NaN.

    Args:
        bhsa_nodes: iterable of BHSA word nodes.

    Returns:
        list of dicts, one per node, in input order. The number of rows
        is also printed.
    """
    dataset = []

    for node in bhsa_nodes:

        # the alignment dictionaries are looked up by stringified node
        str_node = str(node)
        lxx_word = bhsa2lxx.get(str_node, {})
        transs = (
            ('esv', bhsa2esv.get(str_node, {})),
            ('niv', bhsa2niv.get(str_node, {})),
        )

        # add BHSA data
        data_row = get_bhsa_data(node)

        # add LXX data
        data_row.update({
            'lxx': lxx_word.get('utf8', np.nan),
            'lxx_tense': lxx_word.get('tense', np.nan),
            'lxx_voice': lxx_word.get('voice', np.nan),
            'lxx_mood': lxx_word.get('mood', np.nan),
            'lxx_person': lxx_word.get('person', np.nan),
            'lxx_number': lxx_word.get('number', np.nan),
        })

        # combined tense + mood; NaN when either part is unavailable,
        # in line with the individual LXX fields above
        lxx_tense = lxx_word.get('tense')
        lxx_mood = lxx_word.get('mood')
        if lxx_tense is None or lxx_mood is None:
            data_row['lxx_tm'] = np.nan
        else:
            data_row['lxx_tm'] = f'{lxx_tense} {lxx_mood}'

        # add any preceding waw data
        data_row.update(preceding_waw(node))

        # add TAM data from translations
        for trans, tdata in transs:

            data_row[trans] = tdata.get('words', np.nan)
            data_row[f'{trans}_tags'] = tdata.get('tags', np.nan)
            data_row[f'{trans}_VBtags'] = tdata.get('vb_tags', np.nan)

            # prefix every TAM field with the translation name
            for tam_key, tam_data in split_TAM(tdata.get('TAM_cx', '')).items():
                data_row[f'{trans}_{tam_key}'] = tam_data

            data_row[f'{trans}_TAMspan'] = tdata.get('TAM_span', np.nan)

        dataset.append(data_row)

    print(f'{len(dataset)} rows prepared!')

    return dataset

def build_allverb_data(verb_nodes):
    """Collect the BHSA-only data row of every given verb node.

    Returns a list with one get_bhsa_data dict per node, in input order.
    """
    return [get_bhsa_data(verb_node) for verb_node in verb_nodes]

def xverb_collocations(verb_nodes, context='clause'):
    """For every verb form, count lexemes in pre-verbal position.

    Args:
        verb_nodes: iterable of BHSA verb word nodes.
        context: name of the enclosing object type whose words are
            searched for pre-verbal items (default 'clause').

    Returns:
        DataFrame with one row per verb form and one column per
        lexeme; cells hold the number of times the lexeme occurs before
        a verb of that form within the context (NaN when never).
    """
    col_data = collections.defaultdict(collections.Counter)

    for node in verb_nodes:

        # all words of the context that come before the verb
        context_node = L.u(node, context)[0]
        context_words = L.d(context_node, 'word')
        preverb_words = context_words[:context_words.index(node)]

        verbt = F.vt.v(node)
        verbf = bhsa_tense_map.get(verbt, verbt)

        # build count
        for w in preverb_words:
            word_lex = F.lex.v(w)
            if word_lex.endswith('/'): # NB: skip nominal items
                continue
            col_data[verbf][word_lex] += 1

    col_df = pd.DataFrame.from_dict(col_data, orient='index')

    return col_df

## Qatal Dataset

We build a dataset restricted to Hebrew qatal (BHSA `perf`) verbs, combining the BHSA data for each verb with its aligned LXX, ESV, and NIV data.

In [31]:
# every Hebrew word tagged as a verb whose verb tense is 'perf'
qatals = [
    verb
    for verb in F.pdp.s('verb')
    if F.language.v(verb) == 'Hebrew' and F.vt.v(verb) == 'perf'
]

qatal_dataset = build_dataset(qatals)
qatal_df = pd.DataFrame(qatal_dataset)

# stored with the private verb data
qatal_df.to_csv(VERB_DIR / 'qatal_dataset.csv', index=False)

20728 rows prepared!


In [32]:
qatal_df.head()

Unnamed: 0,bhsa_node,ref,book,book_super,canon_part,text_full,text_plain,lex,lex_etcbc,gloss,...,esv_TAMspan,niv,niv_tags,niv_VBtags,niv_tense,niv_aspect,niv_modality,niv_TAM,niv_TAMtag,niv_TAMspan
0,3,Genesis 1:1,Genesis,Genesis,Law,בָּרָ֣א,ברא,ברא,BR>[,create,...,created,created,VBD,VBD,PAST,,IND,PAST..IND,PAST,created
1,15,Genesis 1:2,Genesis,Genesis,Law,הָיְתָ֥ה,היתה,היה,HJH[,be,...,was,was,VBD,VBD,PAST,,IND,PAST..IND,PAST,was
2,47,Genesis 1:4,Genesis,Genesis,Law,טֹ֑וב,טוב,טוב,VWB[,be good,...,,,,,,,,,,
3,69,Genesis 1:5,Genesis,Genesis,Law,קָ֣רָא,קרא,קרא,QR>[,call,...,called,he called,PRP|VBD,VBD,PAST,,IND,PAST..IND,PAST,called
4,172,Genesis 1:10,Genesis,Genesis,Law,קָרָ֣א,קרא,קרא,QR>[,call,...,called,he called,PRP|VBD,VBD,PAST,,IND,PAST..IND,PAST,called


## All Verb Dataset

In [33]:
# Hebrew verbs with one of the verb tenses covered by this dataset
all_verbs = [
    verb
    for verb in F.pdp.s('verb')
    if F.language.v(verb) == 'Hebrew'
    and F.vt.v(verb) in {'perf', 'impf', 'wayq', 'ptca', 'ptcp'}
]

print(len(all_verbs), 'ready for processing')

all_verbs_data = build_allverb_data(all_verbs)
av_df = pd.DataFrame(all_verbs_data)

# stored with the public verb data
av_df.to_csv(VERB_DIR_PUBLIC / 'allverb_bhsa.csv', index=False)

av_df.head()

56955 ready for processing


Unnamed: 0,bhsa_node,ref,book,book_super,canon_part,text_full,text_plain,lex,lex_etcbc,gloss,...,genre,domain,period,txt_type,clause_type,clause_rela,adjacent_waw,mother_clause,mother_type,valence
0,3,Genesis 1:1,Genesis,Genesis,Law,בָּרָ֣א,ברא,ברא,BR>[,create,...,prose,?,EBH,?,xQtX,Main,False,,,d-
1,15,Genesis 1:2,Genesis,Genesis,Law,הָיְתָ֥ה,היתה,היה,HJH[,be,...,prose,?,EBH,?,WXQt,Main,False,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,xQtX,--
2,27,Genesis 1:2,Genesis,Genesis,Law,מְרַחֶ֖פֶת,מרחפת,רחף,RXP[,shake,...,prose,?,EBH,?,Ptcp,Main,False,וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְהֹ֑ום,NmCl,
3,33,Genesis 1:3,Genesis,Genesis,Law,יֹּ֥אמֶר,יאמר,אמר,>MR[,say,...,prose,N,EBH,?N,WayX,Main,True,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,xQtX,--
4,35,Genesis 1:3,Genesis,Genesis,Law,יְהִ֣י,יהי,היה,HJH[,be,...,prose,Q,EBH,?NQ,ZYqX,Main,False,וַיֹּ֥אמֶר אֱלֹהִ֖ים,WayX,--


In [49]:
# count table: rows are verb forms, columns are pre-verbal lexemes;
# combinations that never occur are set to 0
av_col_df = (
    xverb_collocations(all_verbs)
    .fillna(0)
    .rename_axis(index='verb_form', columns='lex_etcbc')
)
av_col_df.to_csv(VERB_DIR_PUBLIC / 'xverb_lexcollocations.csv')
av_col_df.head()

lex_etcbc,B,W,H,KJ,L,>CR,HNH,L>,MN,>P,...,<BD[,LHN,CLL=[,>JKKH,BWZ[,<WT[,CLC[,HJK,RC<[,CXV=[
verb_form,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
qtl,744.0,10220,1326.0,1838.0,371.0,2974.0,182.0,1827.0,325.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ptcp,96.0,1328,1511.0,246.0,85.0,358.0,367.0,78.0,51.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yqtl,1144.0,6134,1540.0,1122.0,557.0,970.0,50.0,2746.0,503.0,51.0,...,1.0,2.0,1.0,4.0,1.0,1.0,1.0,2.0,1.0,1.0
wayq,0.0,14974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
