# Build verb dataset from pre-compiled sources

In [1]:
import sys
import json
import collections
import re
import pandas as pd
pd.set_option('display.max_rows', 200)
from pathlib import Path
from tf.fabric import Fabric
from tf.app import use
import pandas as pd

# custom modules 
sys.path.append('../')
import tf_tools
from gbi_functions import id2ref
from positions import PositionsTF

# organize pathways / files
PROJ_DIR = Path.home().joinpath('github/CambridgeSemiticsLab/translation_traditions_HB')
VERB_DIR = PROJ_DIR.joinpath('data/_private_/verb_data')
lxx_file = VERB_DIR.joinpath('bhsa2lxx.json')
wlc_file = VERB_DIR.joinpath('bhsa2wlc.json')
esv_file = VERB_DIR.joinpath('bhsa2esv.json')
niv_file = VERB_DIR.joinpath('bhsa2niv.json')

# load datasets
# Decode explicitly as UTF-8: without an encoding argument, read_text() uses
# the locale's default encoding, which is not UTF-8 on every platform.
# NOTE(review): the files presumably hold non-ASCII text (the LXX records
# expose a `utf8` field) -- confirm they are stored as UTF-8.
bhsa2lxx = json.loads(lxx_file.read_text(encoding='utf-8'))
bhsa2wlc = json.loads(wlc_file.read_text(encoding='utf-8'))
bhsa2esv = json.loads(esv_file.read_text(encoding='utf-8'))
bhsa2niv = json.loads(niv_file.read_text(encoding='utf-8'))

In [2]:
# load BHSA features with genre module
# Text-Fabric data locations: the core BHSA data plus the genre_synvar and
# valence feature modules.
locations = [
    '~/github/etcbc/bhsa/tf/c', 
    '~/github/etcbc/genre_synvar/tf/c',
    '~/github/etcbc/valence/tf/c'
]
TF = Fabric(locations)
# feature names requested in addition to tf_tools.standard_features
extra_features = '''
domain txt ps gn 
nu genre sense
mother
'''
features = tf_tools.standard_features + extra_features
api = TF.load(features)
bhsa = use('bhsa', api=api)
# short aliases for the Text-Fabric API members (F, T and L are used by the
# cells below)
F, E, T, L, Fs, = bhsa.api.F, bhsa.api.E, bhsa.api.T, bhsa.api.L, bhsa.api.Fs

# NOTE(review): this import sits after the API load rather than in the import
# cell -- presumably clause_relas needs the loaded API; confirm before moving it.
from clause_relas import in_dep_calc as clause_relator

This is Text-Fabric 8.4.0
Api reference : https://annotation.github.io/text-fabric/cheatsheet.html

125 features found and 0 ignored
  0.00s loading features ...
   |     0.00s Dataset without structure sections in otext:no structure functions in the T-API
  6.20s All features loaded/computed - for details use loadLog()


In [3]:
# thanks to Martijn Naaijer
# for providing this handy list / code
ebh = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy', 'Joshua', 'Judges', '1_Samuel', '2_Samuel', '1_Kings', '2_Kings']
lbh = ['Esther', 'Daniel', 'Ezra', 'Nehemiah', '1_Chronicles', '2_Chronicles']

# book name -> period label; books in neither list get no entry
period_dict = {
    **dict.fromkeys(ebh, 'EBH'),
    **dict.fromkeys(lbh, 'LBH'),
}
    
    
def map_book_collections(label_and_ranges):
    """Apply a label to every book in an inclusive range of books.

    Args:
        label_and_ranges: iterable of ``(label, book_start, book_end)``
            triples, where the two book names are resolved to nodes with
            ``T.nodeFromSection``.

    Returns:
        dict mapping each book name (as returned by ``T.sectionFromNode``)
        to its label. When ranges overlap, the later triple wins.
    """
    label_dict = {}
    for label, book_start, book_end in label_and_ranges:
        bs_node = T.nodeFromSection((book_start,))
        be_node = T.nodeFromSection((book_end,))
        # Tolerate endpoints given in either order.
        first_node, last_node = sorted((bs_node, be_node))
        # The range is inclusive of both endpoints, so each book node is
        # visited exactly once (no need to add the endpoints again).
        # NOTE(review): assumes the book nodes between the two endpoints are
        # consecutive integers -- confirm against the Text-Fabric dataset.
        for book_node in range(first_node, last_node + 1):
            label_dict[T.sectionFromNode(book_node)[0]] = label
    return label_dict

# (label, first book, last book) for the Law / Prophets / Writings labels
_tripart_ranges = [
    ('Law', 'Genesis', 'Deuteronomy'),
    ('Prophets', 'Joshua', 'Malachi'),
    ('Writings', 'Psalms', '2_Chronicles'),
]
tripart = map_book_collections(_tripart_ranges)

# (label, first book, last book) for multi-book groupings
_subcollection_ranges = [
    ('Samuel', '1_Samuel', '2_Samuel'),
    ('Kings', '1_Kings', '2_Kings'),
    ('Chronicles', '1_Chronicles', '2_Chronicles'),
    ('Ezra-Neh', 'Ezra', 'Nehemiah'),
    ('Twelve', 'Hosea', 'Malachi'),
    ('Megilloth', 'Ruth', 'Esther'),
]
subcollections = map_book_collections(_subcollection_ranges)

In [4]:
def has_adjacent_waw(node):
    """Tell whether the word directly before `node` in its clause is a waw.

    Returns True when the lexeme of the word at clause position -1 is 'W',
    otherwise False.
    """
    neighbour = PositionsTF(node, 'clause', api).get(-1)
    # a falsy lookup result is replaced by node 0 before the lexeme check
    return F.lex.v(neighbour or 0) == 'W'
    
def has_preceding_waw(node):
    """Tell whether any word before `node` in its clause is a waw.

    Looks at every word of the enclosing clause that comes before `node`
    and returns True if at least one has the lexeme 'W', otherwise False.
    """
    words_in_clause = L.d(L.u(node, 'clause')[0], 'word')
    cutoff = words_in_clause.index(node)
    return any(F.lex.v(word) == 'W' for word in words_in_clause[:cutoff])
    
# BHSA verb-tense value -> label written to the dataset's verb_form column
bhsa_tense_map = dict(impf='yqtl', perf='qtl')

def build_dataset(bhsa_nodes):
    """Construct a dataset on select BHSA nodes.

    Each node is looked up (by its string form) in the module-level
    ``bhsa2lxx``, ``bhsa2wlc``, ``bhsa2esv`` and ``bhsa2niv`` mappings and
    combined with BHSA features into one flat dict per node. The number of
    rows built is printed before returning.

    Args:
        bhsa_nodes: iterable of BHSA word node numbers.

    Returns:
        list of dicts, one row per node, in input order.

    Raises:
        KeyError: if a node is missing from one of the mappings, or a
            mapping entry lacks one of the fields read without a default.
    """

    dataset = []

    for node in bhsa_nodes:

        # the mappings are keyed by the node number as a string
        str_node = str(node)
        lxx_word = bhsa2lxx[str_node]
        wlc_word = bhsa2wlc[str_node]
        esv_word = bhsa2esv[str_node]
        niv_word = bhsa2niv[str_node]
        transs = [('esv', esv_word), ('niv', niv_word)]

        # compile BHSA data
        book, chapter, verse = T.sectionFromNode(node)
        ref_string = f'{book} {chapter}:{verse}'
        verse_node = L.u(node, 'verse')[0]
        clause = L.u(node, 'clause')[0]
        sent = L.u(node, 'sentence')[0]
        clause_type = F.typ.v(clause)
        # rename the tenses listed in bhsa_tense_map; others pass through
        raw_tense = F.vt.v(node)
        verb_tense = bhsa_tense_map.get(raw_tense, raw_tense)

        data_row = {
            'bhsa_node': node,
            'ref': ref_string,
            'book': book,
            # books outside any sub-collection fall back to their own name
            'book_super': subcollections.get(book, book),
            'canon_part': tripart[book],
            'text_full': F.g_word_utf8.v(node),
            'text_plain': F.g_cons_utf8.v(node),
            'lex': F.lex_utf8.v(node),
            'lex_etcbc': F.lex.v(node),
            'gloss': wlc_word['gloss'],
            'verb_form': verb_tense,
            'stem': F.vs.v(node),
            'person': F.ps.v(node),
            'gender': F.gn.v(node),
            'number': F.nu.v(node),
            'sentence': T.text(sent),
            'genre': F.genre.v(verse_node),
            'domain': F.domain.v(clause),
            'period': period_dict.get(book, ''),
            'txt_type': F.txt.v(clause),
            'clause_type': clause_type,
            'clause_rela': clause_relator(clause),
            # NOTE(review): key is misspelled ("adjecent"); kept as-is because
            # it becomes a column name that downstream readers may rely on.
            'adjecent_waw': has_adjacent_waw(node),
            'preceding_waw': has_preceding_waw(node),
            'valence': F.sense.v(node),
            'lxx': lxx_word['utf8'],
            'lxx_tense': lxx_word['tense'],
            'lxx_voice': lxx_word['voice'],
            'lxx_mood': lxx_word['mood'],
            'lxx_person': lxx_word.get('person', ''),
            'lxx_number': lxx_word.get('number', ''),
        }

        # one group of columns per English translation, prefixed by its name
        for trans, tdata in transs:
            data_row[trans] = tdata['words']
            data_row[f'{trans}_tags'] = tdata['tags']
            data_row[f'{trans}_VBtags'] = tdata['vb_tags']
            data_row[f'{trans}_TAMcx'] = tdata['TAM_cx']
            data_row[f'{trans}_TAMspan'] = tdata['TAM_span']

        dataset.append(data_row)

    print(f'{len(dataset)} rows prepared!')

    return dataset

## Build Dataset

We keep only the verbs that are present in all four source mappings (LXX, WLC, ESV and NIV), i.e. the intersection of their keys.

In [5]:
# node ids (still strings here) that occur as keys in every source mapping
shared_ids = set(bhsa2lxx).intersection(bhsa2wlc, bhsa2esv, bhsa2niv)
# convert to integers and order ascending
eligible_verbs = sorted(map(int, shared_ids))

print(f'verbs eligible for export: {len(eligible_verbs)}')

verbs eligible for export: 59196


### Qatal Dataset

We build a dataset restricted to Hebrew qatal (perfect) verbs and export it as a CSV file.

In [7]:
# keep eligible verbs whose BHSA tense is 'perf' and whose language is Hebrew
qatals = [
    vb for vb in eligible_verbs
    if F.language.v(vb) == 'Hebrew'
    if F.vt.v(vb) == 'perf'
]

qatal_dataset = build_dataset(qatals)
qatal_df = pd.DataFrame(qatal_dataset)

# write the table next to the source mappings, without the index column
qatal_csv = VERB_DIR.joinpath('qatal_dataset.csv')
qatal_df.to_csv(qatal_csv, index=False)

18267 rows prepared!
