# Convert the Samaritan Pentateuch word list (Genesis) to Text-Fabric (TF)

In [1]:
import os, re
import pandas as pd

from tf.fabric import Fabric
from tf.convert.walker import CV

## Import files

Main source:

In [8]:
# Load the corrected word list; the rows shown below hold one written word
# each (prefixes joined with '-') together with its "book chapter verse" ref.
# NOTE(review): the 'Unnamed: 0*' columns in the output look like index
# columns left over from earlier Excel round trips — confirm they are unused.
source = pd.read_excel('./data/corrected_data.xlsx')
source

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,line,ref,raw,C_disambig,g_cons_raw,lex,sp,g_cons
0,0,0,0,Genesis 1 1,BR>CJT,BR>CJT,B-R>CJT,B-R>CJT/,prep-subs,B-R>CJT
1,1,1,1,Genesis 1 1,BR>,BR>,BR>,BR>[,verb,BR>
2,2,2,2,Genesis 1 1,>LHJM,>LHJM,>LHJM,>LHJM/,subs,>LHJM
3,3,3,3,Genesis 1 1,>T,>T,>T,>T,prep,>T
4,4,4,4,Genesis 1 1,HCMJM,HCMJM,H-CMJM,H-CMJM/,art-subs,H-CMJM
...,...,...,...,...,...,...,...,...,...,...
20801,20801,20801,20875,Genesis 50 26,WJXNVW,WJXNVW,W-JXNVW,W-XNV[,conj-verb,W-JXNVW
20802,20802,20802,20876,Genesis 50 26,>TW,>TW,>TW,>T,prep,>TW
20803,20803,20803,20877,Genesis 50 26,WJWCM,WJWFM,W-JWCM,W-JFM[,conj-verb,W-JWFM
20804,20804,20804,20878,Genesis 50 26,B>RN,B>RN,B->RN,B->RWN/,prep-subs,B->RN


In [3]:
PATH = './data/'

# Annotation files whose columns are copied onto `source`.
# A tuple (not a set) keeps the processing order fixed, so that with more
# than one file it is predictable which file wins for a shared column name.
annotations = (
    'sp.csv',
)

for file in annotations:
    read_file = pd.read_csv(os.path.join(PATH, file))
    # The first CSV column is skipped; every other column is copied.
    # NOTE(review): the first column is presumably a saved row index — confirm.
    for col in read_file.columns[1:]:
        # Column assignment aligns on the row index, so the CSV rows are
        # expected to line up one-to-one with the rows of `source`.
        source[col] = read_file[col]

source

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,line,ref,raw,C_disambig,g_cons_raw,lex,g_cons,sp
0,0,0,0,Genesis 1 1,BR>CJT,BR>CJT,B-R>CJT,B-R>CJT/,B-R>CJT,prep-subs
1,1,1,1,Genesis 1 1,BR>,BR>,BR>,BR>[,BR>,verb
2,2,2,2,Genesis 1 1,>LHJM,>LHJM,>LHJM,>LHJM/,>LHJM,subs
3,3,3,3,Genesis 1 1,>T,>T,>T,>T,>T,prep
4,4,4,4,Genesis 1 1,HCMJM,HCMJM,H-CMJM,H-CMJM/,H-CMJM,art-subs
...,...,...,...,...,...,...,...,...,...,...
20801,20801,20858,20875,Genesis 50 26,WJXNVW,WJXNVW,W-JXNVW,W-XNV[,W-JXNVW,conj-verb
20802,20802,20859,20876,Genesis 50 26,>TW,>TW,>TW,>T,>TW,prep
20803,20803,20860,20877,Genesis 50 26,WJWCM,WJWFM,W-JWCM,W-JFM[,W-JWFM,conj-verb
20804,20804,20861,20878,Genesis 50 26,B>RN,B>RN,B->RN,B->RWN/,B->RN,prep-subs


#### Export for corrections

In [6]:
# Keep only the columns that go into the corrections spreadsheet.
# NOTE(review): this rebinds `source` without the 'g_cons' column, which
# director() reads further down. On a top-to-bottom run that lookup would
# fail unless `source` is reloaded afterwards (the execution counts suggest
# the load cell was re-run after this one) — confirm the intended order.
source = source[['line','ref','raw','C_disambig','g_cons_raw','lex','sp']]

In [7]:
# Write the sheet that is corrected by hand. index=False keeps the row index
# out of the file; otherwise every export/re-import round trip adds another
# 'Unnamed: 0' column to the data.
source.to_excel('./data/corrections_1.xlsx', index=False)

### Converting to Hebrew script

In [9]:
# Transliteration character -> Hebrew script character.
# For K/M/N/P/Y the lower-case key is the regular letter and the upper-case
# key the final form; see last_letter() for how the case is assigned.
script = {
    '>': '\N{HEBREW LETTER ALEF}',
    'B': '\N{HEBREW LETTER BET}',
    'G': '\N{HEBREW LETTER GIMEL}',
    'D': '\N{HEBREW LETTER DALET}',
    'H': '\N{HEBREW LETTER HE}',
    'W': '\N{HEBREW LETTER VAV}',
    'Z': '\N{HEBREW LETTER ZAYIN}',
    'X': '\N{HEBREW LETTER HET}',
    'V': '\N{HEBREW LETTER TET}',
    'J': '\N{HEBREW LETTER YOD}',
    'k': '\N{HEBREW LETTER KAF}',
    'K': '\N{HEBREW LETTER FINAL KAF}',
    'L': '\N{HEBREW LETTER LAMED}',
    'm': '\N{HEBREW LETTER MEM}',
    'M': '\N{HEBREW LETTER FINAL MEM}',
    'n': '\N{HEBREW LETTER NUN}',
    'N': '\N{HEBREW LETTER FINAL NUN}',
    'S': '\N{HEBREW LETTER SAMEKH}',
    '<': '\N{HEBREW LETTER AYIN}',
    'p': '\N{HEBREW LETTER PE}',
    'P': '\N{HEBREW LETTER FINAL PE}',
    'y': '\N{HEBREW LETTER TSADI}',
    'Y': '\N{HEBREW LETTER FINAL TSADI}',
    'Q': '\N{HEBREW LETTER QOF}',
    'R': '\N{HEBREW LETTER RESH}',
    'C': '\N{HEBREW LETTER SHIN WITH SHIN DOT}',
    'F': '\N{HEBREW LETTER SHIN WITH SIN DOT}',
    'T': '\N{HEBREW LETTER TAV}',
    # Punctuation and markers are passed through unchanged ...
    '-': '-',
    ' ': ' ',
    '/': '/',
    '[': '[',
    '=': '=',
    # ... except the underscore, which is rendered as a space.
    '_': ' ',
}

def last_letter(w):
    """Mark regular versus final letter forms through the case of K/M/N/P/Y.

    After the call a lower-case k/m/n/p/y stands for the regular letter and
    an upper-case K/M/N/P/Y for the final form. A letter gets the final form
    only when it is directly followed by whitespace, so a word without
    trailing whitespace keeps the regular form on its last letter.

    Args:
        w: transliterated text.

    Returns:
        The text with the case of K/M/N/P/Y adjusted as described.
    """
    # Step 1: every K/M/N/P/Y becomes the regular (lower-case) form.
    w = re.sub(r'[KMNPY]', lambda m: m.group(0).lower(), w)
    # Step 2: restore the final (upper-case) form right before whitespace.
    # The pattern is a raw string: '\s' in a plain literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    w = re.sub(r'[kmnpy]\s', lambda m: m.group(0).upper(), w)
    return w

def convert(w):
    """Translate a transliterated string into Hebrew script.

    Every character is looked up in the `script` table, so a character
    without an entry raises KeyError. The placeholder value 'absent' is not
    converted; None is returned for it.
    """
    if w == 'absent':
        return None
    return ''.join(script[letter] for letter in w)

### Prepare the conversion

In [10]:
def reference(ref):
    """Split a reference such as 'Genesis 1 1' into book, chapter and verse.

    The last two whitespace-separated fields are the chapter and the verse;
    everything before them is the book title, so titles of more than one
    word (e.g. '1 Samuel 3 4') are supported as well.

    Args:
        ref: reference string "<book> <chapter> <verse>".

    Returns:
        Tuple (book, chapter, verse); all three are strings.

    Raises:
        ValueError: if the reference has fewer than three fields.
    """
    fields = ref.split()
    if len(fields) < 3:
        raise ValueError(f'expected "<book> <chapter> <verse>", got {ref!r}')
    *book_words, ch, ve = fields
    return ' '.join(book_words), ch, ve

In [11]:
# Location of the TF dataset: ./<DATA_FOLDER>/<VERSION>
DATA_FOLDER = 'tf'
VERSION = '0.3'

TF_PATH = f'./{DATA_FOLDER}/{VERSION}'
# Fabric handle on that location; it is handed to the walker (CV) below.
TF = Fabric(locations=TF_PATH, silent=True)

In [12]:
# Node type of the slots: director() creates one 'sign' slot per character.
slotType = 'sign'

Metadata

In [14]:
# Metadata passed to cv.walk() as `generic`.
generic = {
    'name': 'Samaritan Pentateuch - Genesis',
    'compiler': 'Martijn Naaijer and Christian Canu Højgaard',
    'source': 'Stefan Schorch',
    # Reuse the VERSION constant that also names the output folder, so the
    # recorded version cannot drift away from the location of the dataset.
    'version': VERSION,
    'purpose': 'first tests'
}

Representations

In [15]:
# Text representation and section structure, passed to cv.walk() as `otext`.
otext = {
    # The full text is the plain sequence of `sign` values; word spacing comes
    # from the space sign that director() emits at the end of each word.
    'fmt:text-orig-full': '{sign}',
    # The section node types and the features holding their labels share
    # the same names.
    'sectionTypes': 'book,chapter,verse',
    'sectionFeatures': 'book,chapter,verse',
}

In [16]:
# Per-feature metadata for cv.walk(): each feature gets a one-line description.
featureMeta = {
    feature: {'description': description}
    for feature, description in (
        ('sign', 'consonantal letter'),
        ('book', 'book title'),
        ('chapter', 'chapter number'),
        ('verse', 'verse number'),
        ('g_cons_raw', 'word consonantal-transliterated (without disambiguation of Shin (C) and Sin (F))'),
        ('g_cons', 'word consonantal-transliterated'),
        ('g_cons_utf8', 'word in Hebrew script'),
        ('trailer', 'interword material'),
        ('lex', 'lexeme consonantal-transliterated'),
        ('lex_utf8', 'lexeme in Hebrew script'),
        ('language', 'language'),
        ('sp', 'part of speech'),
    )
}

In [17]:
# Features declared as integer-valued. Only `chapter` is listed; `verse` is
# not, which matches the section lookups further down that pass the chapter
# as an int and the verse as a string.
intFeatures = {
  'chapter'
}

In [18]:
def _hebrew_word(piece, trailer):
    """Render one transliterated word piece in Hebrew script.

    last_letter() only restores a final letter form in front of whitespace.
    For a piece that ends its word (non-empty `trailer`) the trailer is
    therefore appended before the conversion and cut off again afterwards.
    A piece for which convert() returns None stays None.
    """
    plain = convert(last_letter(piece))
    if plain is None or not trailer:
        return plain
    # convert() maps character by character, so the first len(piece)
    # characters of the result belong to the piece itself.
    return convert(last_letter(f'{piece}{trailer}'))[:len(piece)]


def director(cv):
    """Feed the rows of the global `source` frame to the TF walker.

    Each row is one written word with a "book chapter verse" reference.
    Book, chapter and verse nodes are opened whenever the reference changes.
    A written word is split on '-' and every piece becomes a `word` node
    whose characters become `sign` slots.

    Args:
        cv: the walker object handed in by cv.walk(); nodes, slots and
            features are created through it.
    """
    levels = ('book', 'chapter', 'verse')
    # Label of the currently open section per level. None never equals a
    # real label, so it means "no section open at this level".
    labels = dict.fromkeys(levels)
    nodes = dict.fromkeys(levels + ('word',))

    for _, row in source.iterrows():
        current = dict(zip(levels, reference(row['ref'])))

        for depth, level in enumerate(levels):
            if current[level] == labels[level]:
                continue
            labels[level] = current[level]
            # A new section ends all sections nested inside it. Forgetting
            # their labels makes sure they are reopened even when the next
            # label is the same as the previous one (e.g. a chapter that
            # starts with the verse number the previous chapter ended on).
            for inner in levels[depth + 1:]:
                labels[inner] = None
            # Close open nodes innermost first, up to and including this level.
            for ntp in ('word',) + levels[depth:][::-1]:
                cv.terminate(nodes[ntp])
                nodes[ntp] = None
            nodes[level] = cv.node(level)
            cv.feature(nodes[level], **{level: current[level]})

        # The trailing space marks the end of the written word; it ends up in
        # the last piece, where it yields the trailer and the space sign.
        raw_pieces = f"{row['g_cons_raw']} ".split('-')
        # The other columns are split once per row; their pieces are matched
        # with the raw pieces by position.
        cons_pieces = row['g_cons'].split('-')
        lex_pieces = row['lex'].split('-')
        sp_pieces = row['sp'].split('-')

        for w, raw_piece in enumerate(raw_pieces):
            trailer = ' ' if ' ' in raw_piece else ''
            if not raw_piece:
                # An empty piece still gets one (space) sign, so that the
                # word node is not left without slots.
                raw_piece = ' '
            nodes['word'] = cv.node('word')
            cv.feature(
                nodes['word'],
                g_cons_raw=raw_piece.rstrip(' '),
                g_cons=cons_pieces[w],
                # Word-final pieces get final letter forms, like the signs.
                g_cons_utf8=_hebrew_word(cons_pieces[w], trailer),
                lex=lex_pieces[w],
                # NOTE(review): lexemes get no final letter forms here, as
                # last_letter() needs whitespace after the letter — confirm
                # that this is intended.
                lex_utf8=convert(last_letter(lex_pieces[w])),
                trailer=trailer,
                language='Hebrew',
                sp=sp_pieces[w],
            )

            for letter in convert(last_letter(raw_piece)):
                s = cv.slot()
                # Signs come from the raw column; both dotted shin variants
                # are stored as the plain, undotted shin.
                cv.feature(s, sign=re.sub('[\uFB2A\uFB2B]', '\u05E9', letter))

            cv.terminate(nodes['word'])

    # just for informational purposes
    print('\nINFORMATION:', cv.activeTypes(), '\n')

    for ntp in ('word', 'verse', 'chapter', 'book'):
        cv.terminate(nodes[ntp])

In [19]:
# Walker bound to the Fabric handle created above.
cv = CV(TF)

# Run the conversion: director() produces the nodes, the other arguments
# carry the configuration defined in the preceding cells.
good = cv.walk(
    director,
    slotType,
    otext=otext,
    generic=generic,
    intFeatures=intFeatures,
    featureMeta=featureMeta,
    generateTf=True
)

# Result flag of the walk (True in the output below).
good


INFORMATION: {'verse', 'chapter', 'book'} 



True

### Loading and inspecting the generated dataset

In [20]:
from tf.app import use

# Load the local TF app. hoist=globals() puts the API handles into the
# notebook namespace; the cells below rely on F, L and T.
# (Plain string literal: the former f-string had no placeholders.)
A = use("app:./app", hoist=globals())

In [80]:
# Verse node of Genesis 1:18 (chapter given as int, verse as string).
T.nodeFromSection(('Genesis',1,'18'))

100368

In [4]:
# Include the lexeme and part-of-speech features in the displays below.
A.displaySetup(extraFeatures=["lex", "sp"])

In [5]:
# Display Genesis 1:18. The node is looked up by section rather than
# hard-coded, because node numbers shift when the dataset is regenerated.
A.pretty(T.nodeFromSection(('Genesis',1,'18')))

In [90]:
# Word nodes inside Genesis 1:18. The verse node is looked up by section
# rather than hard-coded, because node numbers shift when the dataset is
# regenerated.
L.d(T.nodeFromSection(('Genesis',1,'18')), 'word')

(102214,
 102215,
 102216,
 102217,
 102218,
 102219,
 102220,
 102221,
 102222,
 102223,
 102224,
 102225,
 102226,
 102227,
 102228,
 102229,
 102230,
 102231,
 102232,
 102233,
 102234,
 102235,
 102236,
 102237)

In [103]:
# Trailer of word node 102218, one of the Genesis 1:18 words listed above.
F.trailer.v(102218)

''

In [61]:
# Hebrew-script lexeme of node 105411; the lexeme marker ('/') is kept.
F.lex_utf8.v(105411)

'תבה/'

In [23]:
# Total number of word nodes in the loaded dataset.
len(F.otype.s('word'))

28950

In [None]:
# Display node 103873 (this cell has no recorded execution).
A.pretty(103873)

In [116]:
# Verse node of Genesis 1:7 (chapter given as int, verse as string).
T.nodeFromSection(('Genesis',1,'7'))

100262

In [72]:
# Full text of Genesis 1:7. The verse node is looked up by section rather
# than hard-coded, because node numbers shift when the dataset is regenerated.
T.text(T.nodeFromSection(('Genesis',1,'7')))

'ויעש אלהים את הרקיע ויבדל בין המים אשר מתחת לרקיע ובין המים אשר מעל לרקיע ויהי כן '

In [26]:
# Search for word nodes with the lexeme KDR_L>MR/ and show the results
# (limited with end=5).
query = '''
word lex=KDR_L>MR/
'''

A.show(A.search(query), end=5)

  0.06s 0 results


### Stats

In [27]:
# Report the total number of word nodes.
print(f'Number of words: {len(F.otype.s("word"))}')

Number of words: 28945


In [7]:
# Find the words whose part of speech carries the placeholder value 'absent'.
query = '''
word sp=absent
'''

A.search(query)

  0.02s 7 results


[(108291,), (108469,), (110584,), (111274,), (116255,), (119235,), (126391,)]