# Match the French dataset with the Hebrew dataset

In [5]:
import re
import csv
import json
from pathlib import Path
from tf.fabric import Fabric
from Levenshtein import distance as levdist

# Input: the French verb export that gets matched against BHSA.
FRENCH_FILE = "../../_private_/French/all verbs NBS11.CSV"
# Output: JSON file that receives the BHSA-node -> French-data mapping.
BHSA2FRENCH = "../../_private_/French/bhsa2french.json"
# Local checkout of the BHSA Text-Fabric features. Resolved against the
# current user's home directory so the notebook is not tied to one account.
BHSA_DATA = str(Path.home() / "github" / "etcbc" / "bhsa" / "tf" / "c")

In [6]:
# Book names listed in book-number order: the N-th name (counting from 1)
# belongs to book number N.
books = """
    Genesis Exodus Leviticus Numbers Deuteronomy
    Joshua Judges Ruth 1_Samuel 2_Samuel 1_Kings 2_Kings
    1_Chronicles 2_Chronicles Ezra Nehemiah Esther
    Job Psalms Proverbs Ecclesiastes Song_of_songs
    Isaiah Jeremiah Lamentations Ezekiel Daniel
    Hosea Joel Amos Obadiah Jonah Micah Nahum
    Habakkuk Zephaniah Haggai Zechariah Malachi
""".split()

# 1-based book number -> book name
int2book = dict(enumerate(books, start=1))

In [20]:
# scratch check: count down from 3 to 1
for i in range(3, 0, -1):
    print(i)

3
2
1


In [108]:
ref_re = re.compile(r'(\d\d\d)(\d\d\d)(\d\d\d)(\d\d)(\d\d\d)')
def parse_refstring(string):
    """Parse a refstring from UBS.
    
    String consists of:
        BBBCCCVVVSSWWW
    where:
        B = Book, C = chapter, V = Verse, 
        S = segment (can be ignored), W = Word

    Returns:
        A list of five ints: [book, chapter, verse, segment, word].

    Raises:
        ValueError: if the string does not start with the 14 digits
            described above.
    """
    match = ref_re.match(string)
    if match is None:
        # fail with the offending value instead of an AttributeError on None
        raise ValueError(f'malformed UBS refstring: {string!r}')
    return [int(s) for s in match.groups()]

class BhsaWord:
    """A BHSA word node paired with its distance from the target word."""

    def __init__(self, node, dist):
        self.node, self.dist = node, dist

def match_french(frenchpath, outpath, bhsa_path):
    """Match the French data to our dataset.

    Every row of the tab-separated French file is linked to the word of the
    same verse in BHSA whose ``g_word_utf8`` text has the smallest
    Levenshtein distance to the row's Hebrew string. The resulting mapping
    (BHSA word node -> French row data) is written to ``outpath`` as JSON.

    Args:
        frenchpath: path to the tab-separated French file; each non-blank
            row must hold four columns (ref string, Hebrew string, parse,
            French).
        outpath: path of the JSON file to write.
        bhsa_path: location of the BHSA Text-Fabric data.

    Raises:
        Exception: if a row points at a verse that cannot be resolved to a
            BHSA verse node.
    """

    # load the French dataset; the encoding is spelled out so the Hebrew and
    # French text does not depend on the platform default, and newline=''
    # is what the csv module expects for files it reads
    with open(frenchpath, 'r', encoding='utf-8', newline='') as infile:
        reader = csv.reader(infile, delimiter='\t')
        # csv yields [] for blank lines; drop them so the 4-column
        # unpacking below cannot fail on a stray empty line
        french_data = [row for row in reader if row]

    # load the BHSA Hebrew data for matching the Hebrew text
    TF = Fabric(locations=bhsa_path)
    API = TF.load('g_word_utf8')
    F, T, L = API.F, API.T, API.L

    # match the Hebrew verbs in the French data with the
    # Hebrew verbs in BHSA
    # we treat the ref strings as unique IDs
    # we use 2 dicts; one to hold ID -> BHSA word mappings,
    # another to hold ID -> French data
    french2bhsa = {}
    french2data = {}

    for row in french_data:

        # parse French data; segment and word number are not used for
        # the lookup, only book/chapter/verse
        wid, wstring, wparse, wfrench = row
        bk, ch, vs, _sg, _wnum = parse_refstring(wid)
        french2data[wid] = {
            'wid': wid,
            'string': wstring,
            'parse': wparse,
            'French': wfrench,
        }

        # look up BHSA data and get the verse node
        tf_book = int2book[bk]
        vrs_node = T.nodeFromSection((tf_book, ch, vs))

        if vrs_node is None:
            raise Exception((tf_book, ch, vs), wid, wstring)

        # get the closest matching word from the verse; start from a dummy
        # word (node 0, infinite distance) that any real word will beat
        best = BhsaWord(0, float('inf'))
        for word_node in L.d(vrs_node, 'word'):
            dist = levdist(F.g_word_utf8.v(word_node), wstring)
            # strict comparison: on a tie the earliest word is kept
            if dist < best.dist:
                best = BhsaWord(word_node, dist)
        french2bhsa[wid] = best

    # combine both French dicts into one dict keyed by BHSA node, skipping
    # rows that never got past the dummy word.
    # NOTE: if two rows resolve to the same BHSA node, the later row
    # replaces the earlier one.
    bhsa2french = {
        bhsa_word.node: french2data[wid]
        for wid, bhsa_word in french2bhsa.items()
        if bhsa_word.node != 0
    }

    # the linking is complete; ensure_ascii=False emits raw Hebrew/French
    # characters, so the file is written explicitly as UTF-8
    with open(outpath, 'w', encoding='utf-8') as outfile:
        json.dump(bhsa2french, outfile, indent=2, ensure_ascii=False)

In [110]:
# run the matching with the configured input, output and BHSA paths
match_french(
    frenchpath=FRENCH_FILE,
    outpath=BHSA2FRENCH,
    bhsa_path=BHSA_DATA,
)

This is Text-Fabric 8.4.5
Api reference : https://annotation.github.io/text-fabric/cheatsheet.html

114 features found and 0 ignored
  0.00s loading features ...
   |     0.00s Dataset without structure sections in otext:no structure functions in the T-API
  4.82s All features loaded/computed - for details use loadLog()


# Double Check

In [97]:
from tf.app import use
# Load the BHSA app. globals() is passed as `hoist`, which presumably
# exposes the API handles (such as the `T` used further down) as notebook
# globals -- TODO confirm against the Text-Fabric documentation.
A = use('bhsa', hoist=globals())

In [111]:
# reload the mapping written by match_french; the file is dumped with
# ensure_ascii=False, so it holds raw non-ASCII text and is decoded
# explicitly as UTF-8 rather than with the platform default
with open(BHSA2FRENCH, 'r', encoding='utf-8') as infile:
    bhsa2french = json.load(infile)

In [112]:
# eyeball the first 101 links: BHSA text on the left, the Hebrew string
# stored with the French data on the right
for i, (bhsa_node, data) in enumerate(bhsa2french.items()):
    if i > 100:
        break
    # keys loaded from JSON are strings, so convert the node back to an int
    print(T.text(int(bhsa_node)), '->', data['string'])

בָּרָ֣א  -> בָּרָ֣א
הָיְתָ֥ה  -> הָיְתָ֥ה
טֹ֑וב  -> טֹ֑וב
קָ֣רָא  -> קָ֣רָא
קָרָ֣א  -> קָרָ֣א
טֹֽוב׃  -> טֹֽוב
טֹֽוב׃  -> טֹֽוב
טֹֽוב׃  -> טֹֽוב
שָׁרְצ֨וּ  -> שָׁרְצ֨וּ
טֹֽוב׃  -> טֹֽוב
טֹֽוב׃  -> טֹֽוב
בָּרָ֣א  -> בָּרָ֣א
בָּרָ֥א  -> בָּרָ֥א
נָתַ֨תִּי  -> נָתַ֨תִּי
עָשָׂ֔ה  -> עָשָׂ֔ה
עָשָׂ֑ה  -> עָשָׂ֑ה
עָשָֽׂה׃  -> עָשָֽׂה
שָׁבַת֙  -> שָׁבַת֙
בָּרָ֥א  -> בָּרָ֥א
יָצָֽר׃  -> יָצָֽר
מָצָ֥א  -> מָצָ֥א
לָקַ֥ח  -> לָקַ֥ח
דָבַ֣ק  -> דָבַ֣ק
הָי֖וּ  -> הָי֖וּ
הָיָ֣ה  -> הָיָ֣ה
עָשָׂ֖ה  -> עָשָׂ֖ה
אָמַ֣ר  -> אָמַ֣ר
אָמַ֣ר  -> אָמַ֣ר
הְיִיתֶם֙  -> הְיִיתֶם֙
שָׁמַ֖עְתִּי  -> שָׁמַ֖עְתִּי
אָכָֽלְתָּ׃  -> אָכָֽלְתָּ
נָתַ֣תָּה  -> נָתַ֣תָּה
נָֽתְנָה־ -> נָֽתְנָה
עָשִׂ֑ית  -> עָשִׂ֑ית
עָשִׂ֣יתָ  -> עָשִׂ֣יתָ
אָמַ֗ר  -> אָמַ֗ר
אָמַ֗ר  -> אָמַ֗ר
שָׁמַעְתָּ֮  -> שָׁמַעְתָּ֮
אָכַלְתָּ֖  -> אָכַלְתָּ֖
הָֽיְתָ֖ה  -> הָֽיְתָ֖ה
הָיָה֙  -> הָיָה֙
לָקַח֙  -> לָקַח֙
אָכַ֖ל  -> אָכַ֖ל
חַ֥י  -> חַ֥י
יָדַ֖ע  -> יָדַ֖ע
קָנִ֥יתִי  -> קָנִ֥יתִי
שָׁעָ֑ה  -> שָׁעָ֑ה
חָ֣רָה  -> חָ֣רָה
נָפְל֥וּ  -> נָפְל֥וּ
יָדַ֔עְתִּ