<a href="http://laf-fabric.readthedocs.org/en/latest/" target="_blank"><img align="left" src="images/laf-fabric-xsmall.png"/></a>
<a href="http://www.godgeleerdheid.vu.nl/etcbc" target="_blank"><img align="left" src="images/VU-ETCBC-xsmall.png"/></a>
<a href="http://www.persistent-identifier.nl/?identifier=urn%3Anbn%3Anl%3Aui%3A13-048i-71" target="_blank"><img align="left" src="images/etcbc4easy-small.png"/></a>
<a href="http://tla.mpi.nl" target="_blank"><img align="right" src="images/TLA-xsmall.png"/></a>
<a href="http://www.dans.knaw.nl" target="_blank"><img align="right" src="images/DANS-xsmall.png"/></a>

# Ketiv - Qere

In [1]:
import collections
import laf
from laf.fabric import LafFabric
# NOTE(review): `prepare` is only referenced in a commented-out line of the
# load cell, and `laf` / `ExtraData` are not referenced in the cells visible
# here -- confirm whether these imports are still needed.
from etcbc.preprocess import prepare
from etcbc.lib import Transcription
from etcbc.extra import ExtraData
# Create the LAF-Fabric processor; the load cell calls `fabric.load(...)` on it.
fabric = LafFabric()

  0.00s This is LAF-Fabric 4.8.3
API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html
Feature doc: https://shebanq.ancient-data.org/static/docs/featuredoc/texts/welcome.html



## Create annotations from px file

In [10]:
# Data version to work with; the name of the LAF source is derived from it and
# is reused later to locate the matching ketiv-qere input file.
version = '4c'
etcbc_source = 'etcbc{}'.format(version)
# Load the source for the task 'ketivqere'. The first feature string lists the
# node features this notebook reads (otype, monads, label, g_word, g_cons).
# NOTE(review): '--' presumably means "no extra annotation package" and the
# second, empty feature string presumably means "no edge features" -- confirm
# against the LAF-Fabric API reference.
API=fabric.load(etcbc_source, '--', 'ketivqere', {
    "xmlids": {"node": False, "edge": False},
    "features": ('''
        otype monads label
        g_word g_cons
    ''',
    '''
    '''),
# NOTE(review): `read_kq` further down calls `L.d(...)`. Presumably `L` is
# provided by the "prepare" step that is commented out on the next line; the
# non-sequential execution counts suggest it may have leaked from an earlier
# load. Confirm the notebook still runs after Restart Kernel -> Run All.
#    "prepare": prepare,
}, verbose='DETAIL')
# Inject the API's short names into the notebook namespace; later cells use
# names such as F, L, msg and outfile unqualified (presumably defined here).
exec(fabric.localnames.format(var='fabric'))

  0.00s LOADING API: please wait ... 
  0.00s DETAIL: COMPILING m: etcbc4c: UP TO DATE
  0.00s USING main: etcbc4c DATA COMPILED AT: 2016-11-09T09-49-17
  0.01s DETAIL: keep main: G.node_anchor_min
  0.01s DETAIL: keep main: G.node_anchor_max
  0.01s DETAIL: keep main: G.node_sort
  0.02s DETAIL: keep main: G.node_sort_inv
  0.02s DETAIL: keep main: G.edges_from
  0.02s DETAIL: keep main: G.edges_to
  0.02s DETAIL: keep main: F.etcbc4_db_otype [node] 
  0.02s DETAIL: keep main: F.etcbc4_ft_g_cons [node] 
  0.02s DETAIL: keep main: F.etcbc4_ft_g_word [node] 
  0.02s DETAIL: keep main: F.etcbc4_sft_label [node] 
  0.02s DETAIL: load main: F.etcbc4_db_monads [node] 
  0.90s INFO: DATA LOADED FROM SOURCE etcbc4c AND ANNOX  FOR TASK ketivqere AT 2016-11-09T10-40-53


## Making a verse index

In [11]:
# Index every verse node by its label, so that the verse labels found at the
# start of each line of the ketiv-qere file can be resolved to verse nodes.
# When two verses share a label, the one visited last wins.
msg("Making mappings between verse labels in KQ and verse nodes in LAF")
vlab2vnode = {F.label.v(vs): vs for vs in F.otype.s('verse')}
msg("{} verses".format(len(vlab2vnode)))

  4.94s Making mappings between verse labels in KQ and verse nodes in LAF
  6.23s 23213 verses


## Method to read kq data

In [14]:
def read_kq(kq_file):
    """Read a ketiv-qere file and attach every entry to a word node.

    Each non-blank line of ``kq_file`` starts with a 10-character verse label,
    which is resolved through the notebook-level ``vlab2vnode`` index. The rest
    of the line is split on whitespace; its first two fields are taken as the
    ketiv and the qere. Trailing ``-`` characters are stripped from the ketiv.
    The qere is passed through ``Transcription.suffix_and_finales``, which
    returns a pair stored as (qere, trailer).
    NOTE(review): presumably that helper splits off trailing material and
    normalises final letters -- confirm against etcbc.lib.

    Within a verse, only words whose ``g_word`` contains ``*`` are candidates.
    They are keyed by their ``g_cons`` value (an empty value is keyed as
    ``.``). The k-th entry with a given ketiv in a verse is matched with the
    k-th candidate word carrying that key.

    Progress and diagnostics are emitted through ``msg`` and ``print``:
    verse labels that are not in the index, entries for which no candidate
    word is left ("NOT IN TEXT"), and candidate words for which no entry
    exists ("NOT IN DATA"). At most 100 detail lines are printed per category.

    Args:
        kq_file: path of the ketiv-qere text file.

    Returns:
        A list of ``(word_node, ketiv, qere, qere_trailer)`` tuples, one per
        entry that could be matched with a word.

    Raises:
        ValueError: a non-blank line has fewer than two fields after the
            verse label.
    """
    msg("Reading Ketiv-Qere data")

    info = collections.defaultdict(list)
    not_found = set()
    missing = collections.defaultdict(list)
    missed = collections.defaultdict(list)

    error_limit = 100

    n_read = 0
    # The context manager closes the file even when a line fails to parse.
    with open(kq_file) as kq_handle:
        for (lineno, line) in enumerate(kq_handle, 1):
            # Blank lines (e.g. a trailing empty line) carry no entry.
            if not line.strip():
                continue
            n_read += 1
            vlab = line[0:10]
            fields = line.rstrip('\n')[10:].split()
            if len(fields) < 2:
                raise ValueError(
                    '{}, line {}: expected a ketiv and a qere after the verse label: {!r}'.format(
                        kq_file, lineno, line,
                    )
                )
            (ketiv, qere) = fields[0:2]
            (qtrim, qtrailer) = Transcription.suffix_and_finales(qere)
            vnode = vlab2vnode.get(vlab)
            if vnode is None:
                not_found.add(vlab)
                continue
            info[vnode].append((ketiv.rstrip('-'), qtrim, qtrailer))
    msg("Read {} ketiv-qere annotations".format(n_read))

    data = []
    for vnode in info:
        # Candidate words of this verse, grouped by consonantal form, in text order.
        wlookup = collections.defaultdict(list)
        # Per ketiv: index of the last entry seen so far (-1 = none yet).
        wvisited = collections.defaultdict(lambda: -1)
        last_word = None
        for w in L.d('word', vnode):
            last_word = w
            gw = F.g_word.v(w)
            if '*' in gw:
                gw = F.g_cons.v(w)
                if gw == '':
                    gw = '.'
                wlookup[gw].append(w)
        for (ketiv, qere, qtrailer) in info[vnode]:
            wvisited[ketiv] += 1
            windex = wvisited[ketiv]
            ws = wlookup.get(ketiv)
            if ws is None or windex >= len(ws):
                # There is no word for this entry, so the last word of the
                # verse serves as a stable location hint in the report.
                anchor = '' if last_word is None else F.monads.v(last_word)
                missing[vnode].append((windex, anchor, ketiv, qere))
                continue
            data.append((ws[windex], ketiv, qere, qtrailer))
        for (ketiv, ws) in wlookup.items():
            # .get() avoids inserting keys into the defaultdict while counting.
            n_matched = wvisited.get(ketiv, -1) + 1
            if n_matched < len(ws):
                # Report the number of unmatched words and the monad of the
                # first word that received no entry.
                missed[vnode].append((len(ws) - n_matched, F.monads.v(ws[n_matched]), ketiv))
    msg("Parsed {} ketiv-qere annotations".format(len(data)))

    if not_found:
        msg("Could not find {} verses: {}".format(len(not_found), sorted(not_found)))
    else:
        msg("All verses entries found in index")
    if missing:
        msg("Could not locate ketivs in the text: {} verses".format(len(missing)))
        e = 0
        for vnode in sorted(missing):
            if e > error_limit: break
            vlab = F.label.v(vnode)
            for (windex, monad, ketiv, qere) in missing[vnode]:
                e += 1
                if e > error_limit: break
                print('NOT IN TEXT: {:<10} {{{:>6}}} {:<20} #{} {}'.format(vlab, monad, ketiv, windex, qere))
    else:
        msg("All ketivs found in the text")
    if missed:
        # Count the verses with unmatched words, not those with unmatched entries.
        msg("Could not lookup qeres in the data: {} verses".format(len(missed)))
        e = 0
        for vnode in sorted(missed):
            if e > error_limit: break
            vlab = F.label.v(vnode)
            for (windex, monad, ketiv) in missed[vnode]:
                e += 1
                if e > error_limit: break
                print('NOT IN DATA: {:<10}  {{{:>6}}} {:<20} #{}'.format(vlab, monad, ketiv, windex))
    else:
        msg("All ketivs found in the data")
    return data

## Output the kq data

In [15]:
# Read the ketiv-qere file that belongs to the loaded data version and write
# one tab-separated line per matched word: word node, ketiv, qere, qere trailer.
infile_name = '{}/{}/{}.{}'.format(API['data_dir'], 'kq', 'kq', etcbc_source)
data = read_kq(infile_name)
outf = outfile('kq.tsv')
try:
    for (w, ketiv, qere, qtrailer) in sorted(data):
        # Newlines in the trailer are escaped so that every record stays on one line.
        outf.write('{}\t{}\t{}\t{}\n'.format(str(w), ketiv, qere, qtrailer.replace('\n', '\\n')))
finally:
    # Close the handle even when a write fails, so that no half-written,
    # still-open file is left behind in the kernel.
    outf.close()

 8m 19s Reading Ketiv-Qere data
 8m 19s Read 1892 ketiv-qere annotations
 8m 19s Parsed 1892 ketiv-qere annotations
 8m 19s All verses entries found in index
 8m 19s All ketivs found in the text
 8m 19s All ketivs found in the data
