# Convert to TF

In [1]:
import os, re, collections
import pandas as pd

from tf.fabric import Fabric
from tf.convert.walker import CV

## Import files

Input files (spreadsheets with corrections and new data) and the existing Text-Fabric corpus:

In [17]:
# Input spreadsheets, given relative to this notebook.
# CORRECTIONS is read by importFiles(mode='corrections'); NEW_DATA is read by the
# 'add_feature' mode (as CSV) and by the 'add_books' mode (as Excel).
CORRECTIONS = '../data/corrections_1.xlsx'
NEW_DATA = '../data/g_cons_raw-Exod-Deut.xlsx'

# NOTE(review): this import sits outside the notebook's first import cell;
# consider moving it there so all dependencies are visible in one place.
from tf.app import use
# Load version 1.5.8 of the DT-UCPH/sp corpus. hoist=globals() is presumably what
# makes the names F, T and Fall (used by importFiles) available in this notebook.
A = use('DT-UCPH/sp', hoist=globals(), checkout='clone', version='1.5.8')

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
,,,
book,1.0,100676.0,100.0
chapter,50.0,2013.52,100.0
verse,1533.0,65.67,100.0
word,29046.0,3.47,100.0
sign,100676.0,1.0,100.0


In [18]:
def importFiles(mode):
    """Assemble the word-level table that the TF conversion walks over.

    Parameters
    ----------
    mode : str
        'corrections'
            Read CORRECTIONS (Excel) and add, as extra columns, every word
            feature of the loaded TF corpus that the sheet does not have yet.
        'add_feature'
            Export the word features of the loaded TF corpus and append the
            column of NEW_DATA (read as CSV) that is named after the file,
            plus 'MT_parsing' (stored as 'mt_feat') when present.
        'add_books'
            Export the word features of the loaded TF corpus and append the
            words found in NEW_DATA (read as Excel). Units joined by '-' are
            split into separate words; features unknown for the new words
            are filled with '?'.

    Returns
    -------
    pandas.DataFrame
        One row per word.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the three modes above.

    Notes
    -----
    Relies on the notebook globals CORRECTIONS and NEW_DATA and on the
    Text-Fabric API names F, T and Fall being available in the global scope.
    """
    valid_modes = {'corrections', 'add_feature', 'add_books'}
    if mode not in valid_modes:
        # Fail loudly instead of printing and silently returning None, which
        # only surfaced later as an obscure error in the conversion.
        raise ValueError(
            f'select mode! got {mode!r}, expected one of {sorted(valid_modes)}'
        )

    # Features that are not copied into the table: section/slot features,
    # constant or structural ones, and the *_utf8 renderings (regenerated later).
    skipped = {'book', 'chapter', 'verse', 'language', 'sign', 'otype'}
    word_features = [f for f in Fall() if 'utf8' not in f and f not in skipped]
    words = F.otype.s('word')

    if mode == 'corrections':
        source = pd.read_excel(CORRECTIONS, keep_default_na=False)

        # Add lacking features from the very latest TF-version
        for f in word_features:
            if f not in source.columns:
                accessor = getattr(F, f)  # same object as F.<f>, without eval()
                source[f] = [accessor.v(w) for w in words]

        return source

    # --- 'add_feature' and 'add_books' both start from the current TF data ---
    accessors = {f: getattr(F, f) for f in word_features}
    rows = {}
    for w in words:
        row = {'ref': '{} {} {}'.format(*T.sectionFromNode(w))}
        for f, accessor in accessors.items():
            row[f] = accessor.v(w)
        rows[w] = row
    df = pd.DataFrame(rows).T
    # The trailer is stored as a plain flag: does anything follow the word?
    df['trailer'] = [bool(t) for t in df['trailer']]

    if mode == 'add_feature':
        # Add new data; the column name is the file name without its extension.
        new_file = pd.read_csv(NEW_DATA, keep_default_na=False)
        name = os.path.splitext(os.path.basename(NEW_DATA))[0]
        df[name] = list(new_file[name])

        if 'MT_parsing' in new_file.columns:
            df['mt_feat'] = list(new_file['MT_parsing'])

        return df

    # --- 'add_books' ---
    new_file = pd.read_excel(NEW_DATA)

    # Split every unit into words on '-'; prediction and g_cons_raw must split
    # into the same number of parts, otherwise the unit is reported and skipped.
    records = []
    for row_index, row in new_file.iterrows():
        predictions = row['prediction'].split('-')
        raw_forms = row['g_cons_raw'].split('-')

        if len(predictions) != len(raw_forms):
            print("Error!", row_index, row['g_cons_raw'], row['prediction'])
            continue

        last_position = len(predictions) - 1
        for position, (prediction, raw_form) in enumerate(zip(predictions, raw_forms)):
            # Only the last part of a unit is followed by a word break.
            records.append((row['ref'], prediction, raw_form, position == last_position))

    new_df = pd.DataFrame(
        records,
        columns=['ref', 'prediction', 'g_cons_raw', 'trailer'],
        dtype=object,
    )

    # Fill empty feature columns in new_df with '?'
    for col in df.columns:
        if col not in new_df.columns:
            new_df[col] = '?'

    return pd.concat([df, new_df])
    
# Build the word table used by director(); 'add_books' appends the words read
# from NEW_DATA to the words already present in the loaded TF corpus.
source = importFiles(mode='add_books')
source

Unnamed: 0,ref,g_cons,g_cons_raw,g_lex,g_nme,g_pfm,g_prs,g_uvf,g_vbe,g_vbs,...,mt_feat,nu,prediction,prs_gn,prs_nu,prs_ps,ps,sp,trailer,vt
102261,Genesis 1 1,B,B,B,,,,,,,...,True,,B,,,,,prep,False,
102262,Genesis 1 1,R>CJT,R>CJT,R>CJT,/,,,,,,...,True,sg,R>CJT/,,,,,subs,True,
102263,Genesis 1 1,BR>,BR>,BR>,,,,,[,,...,True,sg,BR>[/,,,,p3,verb,True,perf
102264,Genesis 1 1,>LHJM,>LHJM,>LH,/JM,,,,,,...,True,pl,>LH(J(M/JM,,,,,subs,True,
102265,Genesis 1 1,>T,>T,>T,,,,,,,...,True,,>T,,,,,prep,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87626,Deuteronomy 34 12,?,MCH,?,?,?,?,?,?,?,...,?,?,MCH=/,?,?,?,?,?,True,?
87627,Deuteronomy 34 12,?,L,?,?,?,?,?,?,?,...,?,?,L,?,?,?,?,?,False,?
87628,Deuteronomy 34 12,?,<JNJ,?,?,?,?,?,?,?,...,?,?,<JN/J=,?,?,?,?,?,True,?
87629,Deuteronomy 34 12,?,KL,?,?,?,?,?,?,?,...,?,?,KL/,?,?,?,?,?,True,?


### Converting to Hebrew script

In [29]:
# Transliteration -> Hebrew script table used by convert().
# For kaf, mem, nun, pe and tsadi the lower-case key is the regular letter and
# the upper-case key (K, M, N, P, Y) the final form; C and F are shin and sin
# with their distinguishing dot.
script = {
    latin: chr(codepoint)
    for latin, codepoint in (
        ('>', 0x05D0),
        ('B', 0x05D1),
        ('G', 0x05D2),
        ('D', 0x05D3),
        ('H', 0x05D4),
        ('W', 0x05D5),
        ('Z', 0x05D6),
        ('X', 0x05D7),
        ('V', 0x05D8),
        ('J', 0x05D9),
        ('k', 0x05DB),
        ('K', 0x05DA),
        ('L', 0x05DC),
        ('m', 0x05DE),
        ('M', 0x05DD),
        ('n', 0x05E0),
        ('N', 0x05DF),
        ('S', 0x05E1),
        ('<', 0x05E2),
        ('p', 0x05E4),
        ('P', 0x05E3),
        ('y', 0x05E6),
        ('Y', 0x05E5),
        ('Q', 0x05E7),
        ('R', 0x05E8),
        ('C', 0xFB2A),
        ('F', 0xFB2B),
        ('T', 0x05EA),
    )
}
# Markers and punctuation pass through unchanged ...
script.update({mark: mark for mark in '- /[=!]+~?'})
# ... except the underscore, which is rendered as a space.
script['_'] = ' '

def last_letter(w, trailer):
   
    if w:
        w = re.sub('[KMNPY]', lambda m: m.group(0).lower(),w) #Lower case certain letters
        if trailer:
            last_letter = re.sub('[kmnpy]', lambda m: m.group(0).upper(),w[-1]) #Upper case last letter
            return w[:-1]+last_letter
        else:
            return w

def convert(w):
    """Render a transliterated string in Hebrew script via the ``script`` table.

    Returns None for an empty/None value and for the placeholder 'absent';
    a character missing from ``script`` raises KeyError.
    """
    if not w or w == 'absent':
        return None
    return ''.join(script[char] for char in w)

### Prepare the conversion

In [20]:
def reference(ref):
    """Split a whitespace-separated 'book chapter verse' reference.

    Returns the three parts as a tuple of strings; a reference that does not
    consist of exactly three fields raises ValueError.
    """
    parts = tuple(ref.split())
    book, chapter, verse = parts  # unpacking enforces exactly three fields
    return parts

In [21]:
# Settings of the dataset being produced; VERSION and DESCRIPTION are also
# written into the corpus metadata (see the `generic` dictionary).
DATA_FOLDER = 'tf'
VERSION = '2.0'
DESCRIPTION = 'Adding Exodus to Deuteronomy'

# Fabric instance pointed at ../<DATA_FOLDER>/<VERSION>; it is handed to CV(...)
# for the conversion. NOTE(review): presumably this is where the generated TF
# files end up -- confirm against the Text-Fabric walker documentation.
TF_PATH = f'../{DATA_FOLDER}/{VERSION}'
TF = Fabric(locations=TF_PATH, silent=True)

In [22]:
# Node type of the slots created by director() via cv.slot(); passed to cv.walk().
slotType = "sign"

Metadata

In [23]:
# Corpus-wide metadata, passed to cv.walk(generic=...).
# 'version' and 'purpose' are taken from the VERSION and DESCRIPTION settings.
generic = {
    'name': 'The Samaritan Pentateuch',
    'createdBy': "Stefan Schorch in collaboration with Evelyn Burkhardt, Ulrike Hirschfelder, Irina Wandrey and József Zsengellér",
    'convertedBy': 'Martijn Naaijer and Christian Canu Højgaard',
    'source': "Stefan Schorch's data files, personal communication",
    'licence': "Creative Commons Attribution-NonCommercial 4.0 International License",
    "licenceUrl": "http://creativecommons.org/licenses/by-nc/4.0/",
    'version': VERSION,
    'purpose': DESCRIPTION
}

Representations

In [24]:
# Text configuration, passed to cv.walk(otext=...): the full original text is
# built from the `sign` feature, and book/chapter/verse serve both as the
# section node types and as the features that name them.
otext = {
    'fmt:text-orig-full': '{sign}',
    **dict.fromkeys(('sectionTypes', 'sectionFeatures'), 'book,chapter,verse'),
}

In [25]:
# Per-feature metadata, passed to cv.walk(featureMeta=...). Every feature has a
# single 'description' entry, so the table is written as (feature, description)
# pairs and expanded into the nested form here.
featureMeta = {
    feature: {'description': text}
    for feature, text in (
        ('sign', 'consonantal letter'),
        ('book', 'book title'),
        ('chapter', 'chapter number'),
        ('verse', 'verse number'),
        ('g_cons_raw', 'word consonantal-transliterated (without disambiguation of Shin (C) and Sin (F))'),
        ('g_cons', 'word consonantal-transliterated'),
        ('g_cons_utf8', 'word in Hebrew script'),
        ('trailer', 'interword material'),
        ('prediction', 'neural network prediction'),
        ('lex', 'lexeme consonantal-transliterated'),
        ('lex_utf8', 'lexeme in Hebrew script'),
        ('language', 'language'),
        ('sp', 'part of speech'),
        ('g_lex', 'realized lexeme'),
        ('g_lex_utf8', 'realized lexeme in Hebrew script'),
        ('g_nme', 'realized nominal ending consonantal'),
        ('g_nme_utf8', 'realized nominal ending consonantal in Hebrew script'),
        ('g_vbe', 'realized verbal ending consonantal'),
        ('g_vbe_utf8', 'realized verbal ending consonantal in Hebrew script'),
        ('g_pfm', 'realized verbal preformative consonantal'),
        ('g_pfm_utf8', 'realized verbal preformative consonantal in Hebrew script'),
        ('g_vbs', 'realized verbal stem consonantal'),
        ('g_vbs_utf8', 'realized verbal stem consonantal in Hebrew script'),
        ('g_prs', 'realized pronominal suffix consonantal'),
        ('g_prs_utf8', 'realized pronominal suffix consonantal in Hebrew script'),
        ('g_uvf', 'realized univalent final'),
        ('g_uvf_utf8', 'realized univalent final in Hebrew script'),
        ('vt', 'verbal tense'),
        ('ps', 'grammatical person'),
        ('prs_ps', 'pronominal suffix person'),
        ('nu', 'grammatical number'),
        ('prs_nu', 'pronominal suffix number'),
        ('gn', 'gender'),
        ('prs_gn', 'pronominal suffix gender'),
        ('mt_feat', 'features imposed from MT'),
    )
}

In [31]:
# Features declared as integer-valued in cv.walk(intFeatures=...).
intFeatures = {'chapter', 'verse'}

In [27]:
def director(cv):
    """Feed the rows of the global ``source`` table to the TF walker ``cv``.

    For every row (one word) this

    * opens a new book / chapter / verse node whenever the corresponding part
      of the row's ``ref`` changes, after terminating the nodes that were open;
    * creates a word node carrying the row's features plus Hebrew-script
      (``*_utf8``) renderings of the transliterated ones;
    * creates one ``sign`` slot per character of the Hebrew-script ``g_cons``
      followed by the trailer.

    Parameters
    ----------
    cv
        The walker object handed in by ``cv.walk``; only its ``node``,
        ``slot``, ``feature``, ``terminate`` and ``activeTypes`` methods
        are used here.

    Reads the notebook globals ``source``, ``reference``, ``convert`` and
    ``last_letter``. Returns nothing.
    """
    # Section labels of the previous row; None never equals a real label.
    labels = dict(book=None, chapter=None, verse=None)
    # Currently open nodes per type (None = nothing open).
    nodes = dict(book=None, chapter=None, verse=None, word=None)

    def close(*node_types):
        """Terminate the open node of each given type, innermost first."""
        for ntp in node_types:
            cv.terminate(nodes[ntp])
            nodes[ntp] = None

    def hebrew(value, trailer):
        """Hebrew-script rendering of a transliterated value (None if empty)."""
        return convert(last_letter(value, trailer))

    for _, row in source.iterrows():

        book_title, chapter_number, verse_number = reference(row['ref'])

        if book_title != labels['book']:
            labels['book'] = book_title
            # A new book always starts a new chapter and verse, even when their
            # numbers happen to equal those the previous book ended on.
            labels['chapter'] = labels['verse'] = None
            close('word', 'verse', 'chapter', 'book')
            nodes['book'] = cv.node('book')
            cv.feature(nodes['book'], book=book_title)

        if chapter_number != labels['chapter']:
            labels['chapter'] = chapter_number
            # Likewise, a new chapter always starts a new verse.
            labels['verse'] = None
            close('word', 'verse', 'chapter')
            nodes['chapter'] = cv.node('chapter')
            cv.feature(nodes['chapter'], chapter=chapter_number)

        if verse_number != labels['verse']:
            labels['verse'] = verse_number
            close('word', 'verse')
            nodes['verse'] = cv.node('verse')
            cv.feature(nodes['verse'], verse=verse_number)

        nodes['word'] = cv.node('word')

        # The table stores the trailer as a flag; the TF feature is the actual
        # interword material: nothing, or a single space.
        # (`== False` is kept on purpose: only a value equal to False means
        # "no trailer"; every other value counts as a trailer.)
        trailer = '' if row['trailer'] == False else ' '

        g_cons_hebrew = hebrew(row['g_cons'], trailer)

        cv.feature(
            nodes['word'],
            g_cons_raw=row['g_cons_raw'],
            g_cons=row['g_cons'],
            g_cons_utf8=g_cons_hebrew,
            lex=row['lex'],
            lex_utf8=hebrew(row['lex'], trailer),
            trailer=trailer,
            prediction=row['prediction'],
            g_lex=row['g_lex'],
            g_lex_utf8=hebrew(row['g_lex'], trailer),
            g_nme=row['g_nme'],
            g_nme_utf8=hebrew(row['g_nme'], trailer),
            g_vbe=row['g_vbe'],
            g_vbe_utf8=hebrew(row['g_vbe'], trailer),
            g_pfm=row['g_pfm'],
            g_pfm_utf8=hebrew(row['g_pfm'], trailer),
            g_vbs=row['g_vbs'],
            g_vbs_utf8=hebrew(row['g_vbs'], trailer),
            g_prs=row['g_prs'],
            g_prs_utf8=hebrew(row['g_prs'], trailer),
            g_uvf=row['g_uvf'],
            g_uvf_utf8=hebrew(row['g_uvf'], trailer),
            language='Hebrew',
            sp=row['sp'],
            vt=row['vt'],
            ps=row['ps'],
            prs_ps=row['prs_ps'],
            nu=row['nu'],
            prs_nu=row['prs_nu'],
            gn=row['gn'],
            prs_gn=row['prs_gn'],
            mt_feat=str(row['mt_feat']),
        )

        # A word without a consonantal form still needs a slot: give it a single
        # space. Testing for None directly (instead of comparing the formatted
        # text with 'None') also covers the case where a trailer follows, which
        # previously produced the literal letters of "None" as signs.
        # NOTE(review): rows whose g_cons is '?' (importFiles fills unknown
        # columns with '?' in 'add_books' mode) get '?' as their only sign --
        # confirm whether g_cons_raw should be used for those rows instead.
        if g_cons_hebrew is None:
            signs = ' '
        else:
            signs = f"{g_cons_hebrew}{trailer}"

        for letter in signs:
            s = cv.slot()
            # Signs do not distinguish shin from sin: both dotted forms are
            # stored as the plain letter.
            cv.feature(s,
                    sign=re.sub('[\uFB2A\uFB2B]','\u05E9',letter)
                )

        cv.terminate(nodes['word'])

    # just for informational purposes
    print('\nINFORMATION:', cv.activeTypes(), '\n')

    # Close whatever is still open at the end of the table.
    close('word', 'verse', 'chapter', 'book')

In [32]:
# Converter bound to the Fabric instance created above.
cv = CV(TF)

# Run the conversion: `director` emits the nodes, slots and features, the other
# arguments supply the slot type and the metadata dictionaries defined above.
# generateTf=True asks the walker to generate the TF output as well.
good = cv.walk(
    director,
    slotType,
    otext=otext,
    generic=generic,
    intFeatures=intFeatures,
    featureMeta=featureMeta,
    generateTf=True
)

# Show the value returned by the walk.
good

  0.00s Importing data from walking through the source ...
   |     0.00s Preparing metadata... 
   |     0.01s No structure nodes will be set up
   |   SECTION   TYPES:    book, chapter, verse
   |   SECTION   FEATURES: book, chapter, verse
   |   STRUCTURE TYPES:    
   |   STRUCTURE FEATURES: 
   |   TEXT      FEATURES:
   |      |   text-orig-full       sign
   |     0.01s OK
   |     0.00s Following director... 

INFORMATION: {'chapter', 'verse', 'book'} 

   |       24s "edge" actions: 0
   |       24s "feature" actions: 372541
   |       24s "node" actions: 122710
   |       24s "resume" actions: 0
   |       24s "slot" actions: 249831
   |       24s "terminate" actions: 128944
   |          5 x "book" node 
   |        187 x "chapter" node 
   |     249831 x "sign" node  = slot type
   |       5841 x "verse" node 
   |     116677 x "word" node 
   |     372541 nodes of all types
   |       24s OK
   |     0.00s checking for nodes and edges ... 
   |     0.00s OK
   |     0.00s 

True