# Conversion of 1QIsaa to Text-Fabric

Conversion of 1QIsaa data to TF format. I work towards a full ETCBC encoding on the levels of characters, words, phrases, and clauses, for all the DSS.

In [37]:
import re
import collections
import json
import csv
from glob import glob
from tf.fabric import Fabric
from tf.convert.walker import CV
from tf.compose import modify

First, get the data.

In [38]:
import csv
word_dict = {}
phr_dict = {}
cl_dict = {}

# process words
with open("C:/Users/geitb/Documents/CACCHT/DSS2ETCBC/lexemes_pos_all_bib_books.csv") as file: # Use file to refer to the file object

    reader = csv.reader(file, delimiter=",")
    isa_index = 1
    for line in reader:
        if line[1] == '1Qisaa':
            word_dict[isa_index] = line
            isa_index += 1
            

# process phrases
with open("C:/Users/geitb/Documents/CACCHT/DSS2ETCBC/_1_q_isaa_phrases.csv") as file: # Use file to refer to the file object

    reader = csv.reader(file, delimiter=",")
    for i, line in enumerate(reader):
        phr_dict[i] = line

# process clauses
with open("C:/Users/geitb/Documents/CACCHT/DSS2ETCBC/_1_q_isaa_clause_boundaries.csv") as file: # Use file to refer to the file object

    reader = csv.reader(file, delimiter=",")
    for i, line in enumerate(reader):
        cl_dict[i] = line

for i in range(len(phr_dict)):
    if phr_dict[i][2] != cl_dict[i][2]:
        print(phr_dict[i], cl_dict[i])

In [41]:
def director(cv):
        
    '''
    Walks through 1QIsaa, and creates 
    '''
        
    # process books in order
    #for bo, book in bo2book.items():
    # process words
    bo = '1QIsaa'
    book = 'Isaiah'
    
    this_book = cv.node('book')
    cv.feature(this_book, book='1QIsaa')

            
    # keep track of when to trigger clause, chapter, and verse objects
    clause_track = 1 # keep counts of clauses
    prev_chap = 1 # start at 1
    prev_verse = 1 # start at 1
    this_chap = cv.node('chapter')
    this_verse = cv.node('verse')
    this_clause = cv.node('clause')      
        
    clause_end = False
        
    # iterate through words and construct objects
    for word in range(1,len(word_dict)):
            
        data = word_dict[word]
        cl_info = cl_dict[word]
        
        # segment out word data
        g_cons = data[5]
        lex = data[7]
        pos = data[9]

        chapt, verse, wrdnum = data[3], data[4], data[0]        
            
            
        # detect chapter boundary
        if prev_chap != chapt:
                
            # end verse
            cv.feature(this_verse, verse=prev_verse)
            cv.terminate(this_verse)
                
            # end chapter
            cv.feature(this_chap, chapter=prev_chap)
            cv.terminate(this_chap)
                
            # new chapter and verse begin
            this_chap = cv.node('chapter')
            prev_chap = chapt
            this_verse = cv.node('verse')
            prev_verse = verse
            
        # detect verse boundary
        elif prev_verse != verse:
            cv.feature(this_verse, verse=prev_verse)
            cv.terminate(this_verse)
            this_verse = cv.node('verse') # start a new verse
            prev_verse = verse
                
        if clause_end == True:
            cv.feature(this_clause, clause=clause_track)
            cv.terminate(this_clause)
            this_clause = cv.node('clause') # start a new clause
            clause_track += 1 # count clauses
            clause_end = False
                
        # detect clause boundary
        if cl_info[4] == 'e':
            clause_end = True
                
                
        # make word object
        this_word = cv.slot()
        cv.feature(this_word, 
                       sp=pos, 
                       lex=lex, 
                       g_cons=g_cons, 
                      )
        cv.terminate(this_word)
        
    # - end clause
    cv.feature(this_clause, clause=clause_track)
    cv.terminate(this_clause)
        
    # - end verse
    cv.feature(this_verse, verse=prev_verse)
    cv.terminate(this_verse)
        
    # - end chapter
    cv.feature(this_chap, chapter=prev_chap)
    cv.terminate(this_chap)
        
    # - end book
    cv.feature(this_book, book=book, book_code=bo)
    cv.terminate(this_book)

In [42]:
slotType = 'word'
otext = {'fmt:text-orig-full':'{g_cons} ',
         'sectionTypes':'book,chapter,verse,clause',
         'sectionFeatures':'book,chapter,verse,clause'}

generic = {'Name': 'DSSA',
           'Version': 0.1, # to be filled in
           'Editors': 'Martin Abegg',
           'Converter': 'Martijn Naaijer', 
           'Source:':'https://github.com/ETCBC/dss',
           'Note':'in progress'}

intFeatures = {'chapter', 'verse'}

featureMeta = {'book': {'description': 'A biblical book name'},
               'book_code': {'description': 'manuscript code'},
               'chapter': {'description': 'A chapter number'},
               'verse': {'description': 'A verse number'},
               'clause': {'description': 'A clause number'},
               'lex': {'description': 'lexeme in etcbc transcription'},
               'g_cons': {'description': 'surface structure of a word'},
               'sp': {'description': 'part of speech'}
              }

In [43]:
output_dir = 'C:/Users/geitb/Documents/CACCHT/DSS2ETCBC/tf'

TF = Fabric(locations=output_dir, silent=True)
cv = CV(TF)
    
good = cv.walk(director,
                   slotType,
                   otext=otext,
                   generic=generic,
                   intFeatures=intFeatures,
                   featureMeta=featureMeta,
                   warn=True,
                   force=False,)

  0.00s Importing data from walking through the source ...
   |     0.00s Preparing metadata... 
   |     0.00s No structure nodes will be set up
   |   SECTION   TYPES:    book, chapter, verse, clause
   |   SECTION   FEATURES: book, chapter, verse, clause
   |   STRUCTURE TYPES:    
   |   STRUCTURE FEATURES: 
   |   TEXT      FEATURES:
   |      |   text-orig-full       g_cons
   |     0.00s OK
   |     0.00s Following director... 
   |     0.16s "edge" actions: 0
   |     0.16s "feature" actions: 29818
   |     0.16s "node" actions: 7045
   |     0.16s "resume" actions: 0
   |     0.16s "slot" actions: 22772
   |     0.16s "terminate" actions: 29817
   |          1 x "book" node 
   |         67 x "chapter" node 
   |       5686 x "clause" node 
   |       1291 x "verse" node 
   |      22772 x "word" node  = slot type
   |      29817 nodes of all types
   |     0.17s OK
   |     0.00s Removing unlinked nodes ... 
   |      |     0.00s      1 unlinked "chapter" node: [1]
   |      







In [44]:
TF = Fabric(locations=output_dir)
api = TF.load('''

book chapter verse clause
g_cons lex sp


''')

classes = api.makeAvailableIn(globals())

This is Text-Fabric 7.9.0
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

11 features found and 0 ignored
  0.00s loading features ...
   |     0.02s T otype                from C:/Users/geitb/Documents/CACCHT/DSS2ETCBC/tf
   |     0.07s T oslots               from C:/Users/geitb/Documents/CACCHT/DSS2ETCBC/tf
   |     0.00s No section config in otext, the section part of the T-API cannot be used
   |     0.00s No structure info in otext, the structure part of the T-API cannot be used
   |     0.06s T g_cons               from C:/Users/geitb/Documents/CACCHT/DSS2ETCBC/tf
   |      |     0.01s C __levels__           from otype, oslots, otext
   |      |     0.14s C __order__            from otype, oslots, __levels__
   |      |     0.01s C __rank__             from otype, __order__
   |      |     0.15s C __levUp__            from otype, oslots, __rank__
   |      |     0.02s C __levDown__          from otype, __levUp__, __rank__
   |      |     0.07s C __boundary__

In [48]:
for w in F.otype.s('word'):
    ch = F.chapter.v(L.u(w, 'chapter')[0])
    ve = F.verse.v(L.u(w, 'verse')[0])
    if ch < 2 and ve < 5:
        print(w, ch, ve, F.sp.v(w), F.g_cons.v(w), F.lex.v(w))

1 1 1 subs XZWN XZWN/
2 1 1 nmpr JC<JHW JC<JHW/
3 1 1 subs BN BN/
4 1 1 nmpr >MWY >MWY/
5 1 1 conj >CR >CR
6 1 1 verb XZH XZH[
7 1 1 prep <L <L
8 1 1 nmpr JHWDH JHWDH/
9 1 1 conj W W
10 1 1 nmpr JRWCLM JRWCLM/
11 1 1 prep B B
12 1 1 subs JWMJ JWM/
13 1 1 nmpr <WZJH <ZJHW/
14 1 1 nmpr JWTM JWTM/
15 1 1 nmpr >XZ >XZ=/
16 1 1 nmpr JXZQJH JXZQJHW/
17 1 1 subs MLKJ MLK/
18 1 1 nmpr JHWDH JHWDH/
19 1 2 verb CM<W CM<[
20 1 2 subs CMJM CMJM/
21 1 2 conj W W
22 1 2 verb H>ZJNJ >ZN[
23 1 2 art H H
24 1 2 subs >RY >RY/
25 1 2 conj KJ> KJ
26 1 2 nmpr JHWH JHWH/
27 1 2 verb DBR DBR[
28 1 2 subs BNJM BN/
29 1 2 verb GDLTJ GDL[
30 1 2 conj W W
31 1 2 verb RWMMTJ RWM[
32 1 2 conj W W
33 1 2 prps HMH HM
34 1 2 verb PC<W PC<[
35 1 2 prep BJ B
36 1 3 verb JD< JD<[
37 1 3 subs CWR CWR/
38 1 3 verb QWNJHW QNH[
39 1 3 conj W W
40 1 3 subs XMWR XMWR/
41 1 3 subs >BWS >BWS/
42 1 3 subs B<LJW B<L/
43 1 3 nmpr JFR>L JFR>L/
44 1 3 nega LW> L>
45 1 3 verb JD< JD<[
46 1 3 conj W W
47 1 3 subs <MJ <M/
48 1 3 nega L

In [46]:
for cl in F.otype.s('clause'):
    words = L.d(cl, 'word')
    lexemes = [F.lex.v(w) for w in words]
    print(lexemes)

['XZWN/', 'JC<JHW/', 'BN/', '>MWY/']
['>CR', 'XZH[', '<L', 'JHWDH/', 'W', 'JRWCLM/', 'B', 'JWM/', '<ZJHW/', 'JWTM/', '>XZ=/', 'JXZQJHW/', 'MLK/', 'JHWDH/']
['CM<[']
['CMJM/']
['W', '>ZN[']
['H', '>RY/']
['KJ', 'JHWH/', 'DBR[']
['BN/', 'GDL[']
['W', 'RWM[']
['W', 'HM', 'PC<[', 'B']
['JD<[', 'CWR/', 'QNH[']
['W', 'XMWR/', '>BWS/', 'B<L/']
['JFR>L/', 'L>', 'JD<[']
['W', '<M/', 'L>', 'BJN[']
['HWJ', 'GWJ/', 'XV>[', '<M/', 'KBD/', '<WN/', 'ZR</', 'R<<[', 'BN/', 'CXT[']
['<ZB[', '>T', 'JHWH/']
['N>Y[', '>T', 'QDWC/', 'JFR>L/']
['ZWR[', '>XWR/']
['<L', 'MH', 'NKH[', '<WD/']
['JSP[', 'SRH/']
['KL/', 'R>C/', 'L', 'XLJ/']
['W', 'KL/', 'LBB/', 'DWJ=/']
['MN', 'KP/', 'RGL/', 'W', '<D', 'R>C/', '>JN/', 'B', 'MTM/']
['PY</', 'W', 'XBWRH/', 'W', 'MKH/', 'VRJ/']
['L>', 'ZRH[']
['W', 'L>', 'XBC[']
['W', 'L>', 'RKK[', 'B', 'CMN/']
['>RY/', 'CMMH/']
['<JR/', 'FRP[', '>C/']
['>DMH/']
['L', 'NGD/', 'ZR/', '>KL[', '>T']
['W', 'CMMH/', '<L', 'K', 'MHPKH/', 'ZR/']
['W', 'JTR[', 'BT/', 'YJWN==/', 'K', 'SKH/', 

['<L', 'NBW=/', 'W', '<L', 'MJDB>/', 'MW>B/', 'JLL[']
['B', 'KL/', 'R>C/', 'QRXH/']
['W', 'KL/', 'ZQN=/', 'GR<[']
['B', 'XWY/', 'XGR[', 'FQ/']
['<L', 'GG/', 'W', 'B', 'RXB==/', 'KL/', 'JLL[']
['W', 'JRD[', 'B', 'BKJ/']
['W', 'Z<Q[', 'XCBWN==/', 'W', '>L<LH/']
['<D', 'JHY/', 'CM<[', 'QWL/']
['<L', 'KN', 'XLY[', 'MW>B/', 'RW<[']
['NPC/', 'JR<[', 'L']
['LB/', 'L', 'MW>B/', 'Z<Q[']
['BRJX/', '<D', 'Y<R/', '<GLT_CLCJH/', '<GLT_CLCJH/']
['KJ', 'M<LH/', 'H', 'LWXJT/']
['B', 'BKJ/', '<LH[', 'B']
['KJ', 'DRK/', 'XRWNJM/', 'Z<QH/', 'CBR/', '<WR[']
['KJ', 'MJM/', 'NMRJM/', 'MCMH/', 'HJH[']
['KJ', 'JBC[', 'XYJR/']
['KLH[', 'DC>/']
['JRQ/', 'L>', 'HJH[']
['<L', 'KN', 'JTRH/', '<FH[']
['W', 'PQDH/']
['<L', 'NXL/', '<RBH=/', 'NF>[']
['KJ', 'NQP[', 'H', 'Z<QH/', '>T', 'GBWL/', 'MW>B/']
['<D', '>GLJM/', 'JLLH/']
['W', 'B>R_>JLJM/', 'B>R_>JLJM/', 'JLLH/']
['KJ', 'MJM/', 'DJMWN/', 'ML>[', 'DM/']
['KJ', 'CJT[', '<L', 'DJMWN/', 'JSP[']
['L', 'PLJVH/', 'MW>B/', '>RJH/']
['L', 'C>RJT/', '>DMH/']
['CLX[', 'KR

['W', 'H', 'KRML/', 'L', 'J<R/', 'XCB[']
['W', 'CM<[', 'B', 'JWM/', 'H', 'HW>', 'H', 'XRC=/', 'DBR/', 'SPR/']
['W', 'MN', '>PL=/', 'W', 'MN', 'XCK/', '<JN/', '<WR/', 'R>H[']
['W', 'JSP[', '<NW/', 'B', 'JHWH/', 'FMXH/']
['W', '>BJWN/', '>DM/', 'B', 'QDWC/', 'JFR>L/', 'GJL[']
['KJ', '>PS[', '<RJY/']
['W', 'KLH[', 'LY/']
['W', 'KRT[', 'KL/', 'CQD[', '>WN=/']
['XV>[', '>DM/', 'B', 'DBR/']
['W', 'L', 'JKX[', 'B', 'C<R/', 'QWC[']
['W', 'NVH[', 'B', 'THW/', 'YDJQ/']
['LKN', 'LKN']
['KH', '>MR[', 'JHWH/', '>L', 'BJT/', 'J<QB/']
['>CR', 'PDH[', '>T', '>BRHM/']
['L>', '<TH', 'BWC[', 'J<QB/']
['W', 'L>', '<TH', 'PNH/', 'XWR[']
['KJ', 'B', 'R>H[', 'JLD/', 'M<FH/', 'JD/', 'B', 'QRB/']
['QDC[', 'CM/']
['W', 'QDC[', '>T', 'QDWC/', 'J<QB/']
['W', '>T', '>LHJM/', 'JFR>L/', '<RY[']
['W', 'JD<[', 'T<H[', 'RWX/', 'BJNH/']
['W', 'RGN[', 'LMD[', 'LQX/']
['HWJ', 'BN/']
['SRR[']
['N>M/', 'JHWH/']
['L', '<FH[', '<YH/']
['W', 'L>', 'MN']
['W', 'L', 'NSK[', 'MSKH==/']
['W', 'L>', 'RWX/']
['LM<N', 'SPH[', 'XV>T/'

['W', 'KJ', '>MR[', '>L']
['<L', 'JHWH/', '>LHJM/', 'BVX[']
['H=', 'L>', 'HW>']
['>CR', 'SWR[', 'XZQJHW/', '>T', 'BMH/', 'W', '>T', 'MZBX/']
['W', '>MR[', 'L', 'JHWDH/', 'W', 'L', 'JRWCLM/']
['L', 'PNH/', 'H', 'MZBX/', 'H', 'ZH', 'XWH[']
['B', 'JRWCLM/']
['W', '<TH']
['<RB[', 'N>', '>T==', '>DWN/', 'H', 'MLK/', '>CWR/']
['W', 'NTN[', 'L', '>LP=/', 'SWS/']
['>M', 'JKL[']
['L', 'NTN[', 'L', 'RKB[', '<L']
['W', '>JK', 'CWB[', '>T', 'PNH/', 'PXH/', '>XD/', 'MN', '<BD/', '>DWN/', 'H', 'QVN=/']
['W', 'BVX[', 'L', '<L', 'MYRJM/', 'L', 'RKB/', 'W', 'L', 'PRC/']
['W', '<TH']
['H=', 'MN', 'BL<DJ', 'JHWH/', '<LH[', '<L', 'H', '>RY/', 'H', 'Z>T']
['L', 'CXT[']
['JHWH/', '>MR[', '>L']
['<LH[', '>L', 'H', '>RY/', 'H', 'Z>T']
['L', 'CXT[']
['W', '>MR[', '>LJQJM/', '>LJQJM/', 'W', 'CBN>/', 'W', 'JW>X/']
['DBR[', 'N>', '<M', '<BD/', '<M', '>RMJ=/']
['KJ', 'CM<[', '>NXNW']
['W', '>L=', 'DBR[', '>T', 'JHWDJ/', 'DBR/', 'H', '>LH', 'B', '>ZN/', 'H', '>JC/']
['H', 'JCB[', '<L', 'H', 'XWMH/']
['W', '>MR[', '

['W', 'RWX/']
['L', 'HLK[', 'B']
['>NJ', 'JHWH/', 'QR>[', 'B', 'YDQ/']
['W', 'XZQ[', 'B', 'JD/']
['W', 'NYR[']
['W', 'NTN[', 'L', 'BRJT/', '<M/', 'L', '>WR/', 'GWJ/']
['L', 'PQX[', '<JN/', '<WR/']
['L', 'JY>[', 'MN', 'MSGR/', '>SJR=/']
['W', 'MN', 'BJT/', 'KL>/', 'JCB[', 'XCK/']
['>NJ', 'JHWH/']
['HW>', 'W', 'CM/']
['W', 'KBWD/', 'L', '>XR=/', 'L>', 'NTN[']
['W', 'THLH/', 'L', 'PSJL/']
['H', 'R>CWN/']
['HNH', 'BW>[']
['W', 'H', 'XDC/', '>NJ', 'NGD[']
['B', 'VRM/', 'YMX[']
['CM<[', '>T']
['CJR[', 'L', 'JHWH/', 'CJR/', 'XDC/']
['W', 'THLH/', 'MN', 'QYH=/', 'H', '>RY/']
['JRD[', 'H', 'JM/', 'W', 'ML>=/']
['>J/', 'W', 'JCB[']
['NF>[', 'MDBR/', '<JR/', 'W', 'XYR/']
['JCB[', 'QDR/']
['W', 'RNN[', 'JCB[', 'SL</']
['MN', 'R>C/', 'HR/', 'YWX[']
['FJM[', 'L', 'JHWH/', 'KBWD/']
['W', 'THLH/', 'B', '>J/', 'NGD[']
['JHWH/', 'K', 'GBWR/', 'JY>[']
['K', '>JC/', 'MLXMH/', '<WR[', 'QN>H/']
['RW<[']
['>P', 'YRX[']
['<L', '>JB[', 'GBR[']
['XCH[', '>K', 'MN', '<WLM/']
['XRC[']
['>PQ[']
['K', 'JLD[', 'P<H[

['W', 'R>H[']
['KL/', 'QBY[']
['BW>[', 'L']
['XJ/', '>NJ']
['N>M/', 'JHWH/']
['KJ', 'KL/', 'K', '<DJ/', 'LBC[']
['W', 'QCR[', 'K', 'KLH/']
['KJ', 'XRBH=/', 'W', 'CMM[', 'W', '>RY/', 'HRJST/']
['KJ', '<TH', 'YRR[', 'MN', 'JCB[']
['W', 'RXQ[', 'BL<[']
['<WD/', '>MR[', 'B', '>ZN/', 'BN/', 'CKLJM/']
['YR/', 'L', 'H', 'MQWM/']
['NGC[', 'L']
['W', 'JCB[']
['W', '>MR[', 'B', 'LBB/']
['MJ', 'JLD[', 'L', '>T', '>LH']
['W', '>NJ', 'CKWL==/', 'W', 'GLMWD/']
['W', 'GLH[']
['W', 'SWR/']
['>LH']
['MJ', 'GDL[']
['HN', '>NJ', 'C>R[', 'L', 'BD/']
['>LH']
['>JPH', 'HM']
['KJ', 'KH', '>MR[', 'JHWH/']
['HNH', 'NF>[', '>L', 'GWJ/', 'JD/']
['W', '>L', 'H', '<M/', 'RWM[', 'NS/']
['W', 'BW>[', 'BN/', 'B', 'XYN/']
['W', 'BT/', '<L', 'KTP/', 'NF>[']
['W', 'HJH[', 'MLK/', '>MN[']
['W', 'FRH/', 'MJNQT/']
['>P/', '>RY/']
['XWH[', 'L']
['W', '<PR/', 'RGL/', 'LXK[']
['W', 'JD<[']
['KJ', '>NJ', 'JHWH/']
['>CR', 'L>', 'BWC[', 'QWH[']
['H=', 'LQX[', 'MN', 'GBWR/', 'MLQWX/']
['>M', 'CBJ/', '<RJY/', 'MLV[']
['KJ', 'KH', 

['>L=', 'XFK[']
['K', 'CWPR/', 'RWM[', 'QWL/']
['W', 'NGD[', 'L', '<M/', 'PC</']
['W', 'L', 'BJT/', 'J<QB/', 'XV>T/']
['>T', 'JWM/', 'W', 'JWM/', 'DRC[']
['W', 'D<T/', 'DRK/', 'XPY[', 'K', 'GWJ/']
['>CR', 'YDQH/', '<FH[']
['W', 'MCPV/', '>LHJM/', 'L>', '<ZB[']
['C>L[', 'MCPV/', 'YDQ/']
['QRBH/', '>LHJM/', 'XPY[']
['LMH', 'YWM[']
['W', 'L>', 'R>H[']
['<NH=[', 'NPC/']
['W', 'L>', 'JD<[']
['HN', 'B', 'JWM/', 'YWM/', 'MY>[', 'XPY/']
['W', 'KL/', '<YB=====/', 'NGF[']
['HN', 'L', 'RJB/', 'W', 'L', 'MYH=/', 'YWM[']
['W', 'L', 'NKH[', 'B', '>GRP/', 'RC<=/']
['L>', 'YWM[', 'K', 'JWM/']
['L', 'CM<[', 'B', 'MRWM/', 'QWL/']
['H=', 'K', 'ZH', 'HJH[', 'YWM/']
['BXR[']
['JWM/']
['<NH=[', '>DM/', 'NPC/']
['H=', 'L', 'KPP[', 'K', '>GMWN/', 'R>C/']
['FQ/', 'W', '>PR/', 'JY<[']
['H=', 'L', 'ZH', 'QR>[', 'YWM/', 'JWM/', 'RYWN/', 'L', 'JHWH/']
['H=', 'L>', 'ZH', 'H', 'YWM/']
['>CR', 'BXR[']
['PTX[', 'XRYB/', 'RC<=/']
['W', 'NTR[', '>GDH/', 'MWVH/']
['W', 'CLX[', 'RYY[', 'XPCJ/']
['W', 'KL/', 'MWVH/', 'NTQ[