# Porting from BHSA C to 2021

In [1]:
import ast
import csv
import json
from pathlib import Path
from pprint import pprint

from tf.fabric import Fabric

In [15]:
# Machine-specific locations of the Text-Fabric data and of the project repo.
TF_DATA = Path('/Users/cody/github/etcbc/bhsa/tf/')
REPO = Path('/Users/cody/github/BH_time_collocations/')

# Input files: the function edits (kept as a plain string path) and the
# time parsings stored inside the project repo.
FUNCT_EDITS = '/Users/cody/Downloads/edit_function.json'
TIME_PARSINGS = REPO.joinpath('data/data/parsing/time_parsings.json')

In [16]:
# Space-separated spec of the features to load from both versions.
load_features = ' '.join(['function'])

# Version c of the corpus.
tf_c = Fabric(locations=[TF_DATA / 'c'])
api_c = tf_c.load(load_features)

# Version 2021; it additionally loads the `omap@c-2021` edge feature that is
# used throughout this notebook to map version-c nodes onto 2021 nodes.
tf_2021 = Fabric(locations=[TF_DATA / '2021'])
api_2021 = tf_2021.load(f'{load_features} omap@c-2021')

In [17]:
# Load the function edits; the resulting dict is keyed by version-c node
# number (as a string) and each value carries at least 'function' and 'note'.
# JSON is UTF-8 by specification, so decode explicitly rather than relying on
# the platform's default encoding.
with open(FUNCT_EDITS, 'r', encoding='utf-8') as infile:
    function_edits = json.load(infile)

In [18]:
# Spot check: look up the `omap@c-2021` edge targets of a single node to see
# what shape the mapping result has.
api_2021.EdgeString('omap@c-2021').f(881665)

((881696, None),)

In [19]:
def get_2021_nodes(node):
    """Return the `omap@c-2021` edge targets of a version-c node.

    The value is whatever the edge feature's `.f()` lookup yields; in this
    notebook's outputs that is a tuple of `(node_2021, value)` pairs, with
    more than one pair when the node has several 2021 counterparts.
    """
    omap = api_2021.EdgeString('omap@c-2021')
    return omap.f(node)

## Check for any slot diffs

In [24]:
# Collect every version-c word slot that maps to more than one 2021 node,
# together with its mapping.
slot_diffs = []

for c_slot in api_c.F.otype.s('word'):
    targets = get_2021_nodes(c_slot)
    if len(targets) <= 1:
        continue
    slot_diffs.append((c_slot, targets))

In [25]:
# number of version-c word slots that map to more than one 2021 node
len(slot_diffs)

6

In [32]:
# For each differing slot, show its version-c text, the texts of its 2021
# counterparts, and whether its clause contains a Time phrase.
for c_slot, mapped in slot_diffs:
    clause = api_c.L.u(c_slot, 'clause')[0]
    clause_functions = {
        api_c.F.function.v(phrase)
        for phrase in api_c.L.d(clause, 'phrase')
    }
    print(api_c.T.text(c_slot))
    texts_2021 = [api_2021.T.text(target) for target, _ in mapped]
    print(texts_2021)
    has_time = 'Time' in clause_functions
    print('Time in clause:', has_time)
    print(c_slot, mapped)
    print()

אֲרַ֥ם נַֽהֲרַ֖יִם 
['אֲרַ֥ם ', 'נַֽהֲרַ֖יִם ']
Time in clause: False
11325 ((11325, 1), (11326, 1))

אֲרַ֥ם נַהֲרַ֖יִם 
['אֲרַ֥ם ', 'נַהֲרַ֖יִם ']
Time in clause: False
105981 ((105982, 1), (105983, 1))

אֲרַ֣ם נַהֲרָ֑יִם 
['אֲרַ֣ם ', 'נַהֲרָ֑יִם ']
Time in clause: False
128871 ((128873, 1), (128874, 1))

אֲרַ֣ם נַהֲרַיִם֮ 
['אֲרַ֣ם ', 'נַהֲרַיִם֮ ']
Time in clause: False
320252 ((320255, 1), (320256, 1))

אֲרַ֨ם נַהֲרַ֜יִם 
['אֲרַ֨ם ', 'נַהֲרַ֜יִם ']
Time in clause: False
401289 ((401293, 1), (401294, 1))

אֲרַ֤ם מַעֲכָה֙ 
['אֲרַ֤ם ', 'מַעֲכָה֙ ']
Time in clause: False
401292 ((401297, 1), (401298, 1))



## Check for Time phrase diffs

In [54]:
# `split`: Time phrases that map to several 2021 nodes.
# `obsoleted`: Time phrases whose single 2021 counterpart is not labeled Time.
obsoleted = []
split = []

for c_phrase in api_c.F.function.s('Time'):
    mapped = get_2021_nodes(c_phrase)

    if len(mapped) > 1:
        split.append((api_c.T.text(c_phrase), c_phrase, mapped))
    else:
        # exactly one counterpart is expected here; compare its function label
        node_2021 = mapped[0][0]
        funct_2021 = api_2021.F.function.v(node_2021)
        if funct_2021 != 'Time':
            section = api_c.T.sectionFromNode(c_phrase)
            text = api_c.T.text(c_phrase)
            obsoleted.append((section, text, c_phrase, node_2021, funct_2021))

In [55]:
# number of version-c Time phrases that map to more than one 2021 node
len(split)

1

In [56]:
# number of version-c Time phrases whose 2021 counterpart is not labeled Time
len(obsoleted)

7

In [57]:
# each entry: (version-c text, version-c phrase node, omap result)
split

[('מִן־הַיֹּ֥ום הַזֶּ֖ה וָמָ֑עְלָה מִיֹּום֩ עֶשְׂרִ֨ים וְאַרְבָּעָ֜ה לַתְּשִׁיעִ֗י לְמִן־הַיֹּ֛ום ',
  831763,
  ((831794, 12), (831795, 7)))]

In [58]:
# each entry: (section, version-c text, version-c node, 2021 node, 2021 function)
obsoleted

[(('Exodus', 14, 27), 'בֹּ֨קֶר֙ ', 674321, 674354, 'Subj'),
 (('Isaiah', 1, 26), 'כְּבָרִ֣אשֹׁנָ֔ה ', 774349, 774379, 'Adju'),
 (('Isaiah', 1, 26), 'כְּבַתְּחִלָּ֑ה ', 774352, 774382, 'Adju'),
 (('Isaiah', 9, 3), 'כְּיֹ֥ום מִדְיָֽן׃ ', 775985, 776017, 'Adju'),
 (('Joel', 3, 5), 'בְּהַר־צִיֹּ֨ון וּבִירוּשָׁלִַ֜ם ', 825329, 825360, 'Loca'),
 (('Jonah', 3, 4), 'מַהֲלַ֖ךְ יֹ֣ום אֶחָ֑ד ', 828081, 828112, 'Modi'),
 (('Psalms', 113, 6), 'בַּשָּׁמַ֥יִם וּבָאָֽרֶץ׃ ', 849296, 849330, 'Cmpl')]

## Port over the function edits and manually select those to keep

In [9]:
# Build one row per function edit so the edits can be reviewed by hand
# against the 2021 data.
header = [
    'ref', 'text', 'c_node', '2021_node',
    'c_funct', '2021_funct', 'my_funct',
]

book_limit = 426596  # restrict with new corpus boundary

rows = []

for c_node, edit in function_edits.items():
    c_node = int(c_node)  # the JSON object keys are strings

    # skip edits located in a book at or beyond the corpus boundary
    if api_c.L.u(c_node, 'book')[0] >= book_limit:
        continue

    # 2021 counterparts of the edited node and their current function labels;
    # uses the shared omap helper instead of re-spelling the edge lookup
    nodes_2021 = [node for node, _ in get_2021_nodes(c_node)]
    functs_2021 = [api_2021.F.function.v(node) for node in nodes_2021]

    book, ch, vs = api_c.T.sectionFromNode(c_node)

    rows.append([
        f'{book} {ch}:{vs}',
        api_c.T.text(c_node),
        c_node,
        nodes_2021,
        api_c.F.function.v(c_node),
        functs_2021,
        edit['function'],
    ])

In [10]:
# newline='' lets the csv module control line endings itself (as the csv
# documentation requires); utf-8 is explicit because the text column is Hebrew.
with open('curate_function_edits.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(header)
    writer.writerows(rows)

In [8]:
# NOTE(review): this reads 'curated_...', not the 'curate_...' file written
# above; presumably it is the hand-curated copy of that file, so confirm.
# newline='' is what the csv documentation requires for files handed to
# csv.reader; utf-8 is explicit because the text column is Hebrew.
with open('curated_function_edits.csv', 'r', newline='', encoding='utf-8') as infile:
    reader = csv.reader(infile)
    next(reader, None)  # drop the header row
    annotated_funct_changes = list(reader)

In [4]:
# peek at the first curated row; every field comes back from the CSV as a string
annotated_funct_changes[0]

['Exodus 12:18',
 'בָּרִאשֹׁ֡ן ',
 '673337',
 '[673370]',
 'Adju',
 "['Adju']",
 'Time']

In [None]:

# Exodus 12:18, Adju->Time 
# mislabeled as Adju
 673370: "Time",

# Exodus 12:18, Adju->Time 
# mislabeled as Adju
 673373: "Time",

# Exodus 12:18, Adju->Time 
# mislabled as Adju
 673378: "Time",
    
# 1_Samuel 1:23, Time->Modi 
# no reviewed sources took this as temporal
 731946: "Modi",


In [30]:
# Print each curated edit as a commented `node: "function",` snippet.
for edit in annotated_funct_changes:
    # Columns 3 and -2 hold the string repr of a list (e.g. '[673370]',
    # "['Adju']"); parse them as literals instead of executing file content
    # with eval().
    ph_node = ast.literal_eval(edit[3])[0]
    old_funct = ast.literal_eval(edit[-2])[0]
    new_funct = edit[-1]
    # the note is looked up by the version-c node id, which is a string key
    note = function_edits[edit[2]]['note']
    print(
        f'\n# {old_funct}->{new_funct} \n'
        f'# {note}\n'
        f' {ph_node}: "{new_funct}",'
    )


# Adju->Time 
# mislabeled as Adju
 673370: "Time",

# Adju->Time 
# mislabeled as Adju
 673373: "Time",

# Adju->Time 
# mislabled as Adju
 673378: "Time",

# Adju->Time 
# Deut 31:10, mislabled as Adju (חג)
 714507: "Time",

# Time->Time 
# 1 Sam 25: 15, mislabled as Loca
 741508: "Time",

# Time->Modi 
# no reviewed sources took this as temporal
 731946: "Modi",

# Time->PreC 
# Gen 9:29; phrase is pred. complement
 654059: "PreC",

# Time->Adju 
# Josh 4:18; waters flowed 'as before'; not temporal
 716817: "Adju",

# Time->Adju 
# 2 Kgs 13:5; 'as before'; not temporal
 769441: "Adju",

# Time->Adju 
# 1 Sam 19:7; 'as before'; not temporal
 738950: "Adju",

# Time->Adju 
# 1 Sam 21:6; 'as before'; not temporal
 739973: "Adju",

# Time->Adju 
# Exod 30:10; This is frequentive not time location / duration
 679412: "Adju",

# Time->Adju 
# Exod 30:10; This is frequentive not time location / duration
 679414: "Adju",

# Time->Adju 
# Lev 16:34; This is frequentive not time location / d

In [13]:
# Reshape the curated edits into rows keyed on the 2021 phrase node.
header = ['passage', 'text', 'phrase_node', 'old_function', 'new_function', 'note']

new_rows = []

for edit in annotated_funct_changes:
    # Columns 3 and -2 hold the string repr of a list; parse them as literals
    # instead of executing file content with eval().
    phrase_node = ast.literal_eval(edit[3])[0]
    old_function = ast.literal_eval(edit[-2])[0]
    new_rows.append([
        edit[0],
        edit[1],
        phrase_node,
        old_function,
        edit[-1],
        function_edits[edit[2]]['note'],  # keyed by version-c node id (string)
    ])

In [14]:
# peek at the first reshaped row
new_rows[0]

['Exodus 12:18', 'בָּרִאשֹׁ֡ן ', 673370, 'Adju', 'Time', 'mislabeled as Adju']

In [32]:
# newline='' lets the csv module control line endings itself (as the csv
# documentation requires); utf-8 is explicit because the text column is Hebrew.
with open('function_edits_2021.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(header)
    writer.writerows(new_rows)

## Port over temporal function annotations

In [16]:
# JSON is UTF-8 by specification, so decode explicitly rather than relying on
# the platform's default encoding.
with open(TIME_PARSINGS, 'r', encoding='utf-8') as infile:
    time_parsings_raw = json.load(infile)

# subset the time parsings to those within corpus, re-keyed by integer
# clause node (the JSON object keys are strings)
time_parsings = {}
for cl, data in time_parsings_raw.items():
    cl_node = int(cl)
    if api_c.L.u(cl_node, 'book')[0] < book_limit:
        time_parsings[cl_node] = data

In [17]:
# number of parsed clauses that fall inside the corpus boundary
len(time_parsings)

2004

In [59]:
# Find parsed clauses that map to more than one 2021 node;
# these are set aside to be solved later.
diffs = []

for c_clause in time_parsings:
    targets = [target for target, _ in get_2021_nodes(c_clause)]
    if len(targets) > 1:
        diffs.append((c_clause, targets))

# the same clauses as a set, for quick membership tests
diff_cl_set = {c_clause for c_clause, _ in diffs}

In [60]:
# number of parsed clauses that map to more than one 2021 node
len(diffs)

2

In [61]:
# each entry: (version-c clause node, list of 2021 nodes it maps to)
diffs

[(435108, [435113, 435114]), (455537, [455553, 455554])]

In [62]:
# Gather the parsings that list more than one phrase node, leaving out the
# clauses already set aside in diff_cl_set.
multi_phrase = [
    (int(cl), parsing)
    for cl, parsing in time_parsings.items()
    if cl not in diff_cl_set and len(parsing['phrase_nodes']) > 1
]

In [75]:
# look at one example of a multi-phrase parsing
multi_phrase[:1]

[(428143,
  {'times': [{'times': [3107],
     'slots': [3106, 3107, 3108, 3109],
     'quals': ['durative'],
     'tenses': [['FUT', [3108, 3109, 'ADJV']]],
     'preps': [['L', 3106]],
     'functions': ['dist_fut']},
    {'times': [{'times': [3116],
       'slots': [3115, 3116],
       'quants': [['NUMQ', 3115]],
       'quals': ['durative']},
      {'times': [3119],
       'slots': [3118, 3119],
       'quants': [['NUMQ', 3118]],
       'quals': ['durative']}],
     'slots': [3115, 3116, 3118, 3119],
     'functions': ['atelic_ext'],
     'quals': ['durative']}],
   'slots': [3106, 3107, 3108, 3109, 3115, 3116, 3118, 3119],
   'functions': ['distfut_ext'],
   'phrase_nodes': [653347, 653351]})]

It appears that most of these clauses have as many `'times'` entries as `'phrase_nodes'` entries, which means we can match each temporal function to its phrase by position. Let's check whether that holds.

In [80]:
# Compare the number of 'times' entries with the number of phrase nodes.

# parsings where the two counts differ
no_match = [
    (cl, parsing)
    for cl, parsing in multi_phrase
    if len(parsing['times']) != len(parsing['phrase_nodes'])
]

# clauses with a nested time that carries more than one function
# (one entry per such nested time)
multi_functs = [
    cl
    for cl, parsing in multi_phrase
    for time in parsing['times']
    if isinstance(time, dict) and len(time['functions']) > 1
]

no_match

[(434753,
  {'times': [34789],
   'slots': [34786, 34787, 34788, 34789, 34790, 34791, 34792, 34794],
   'refs': [['CALNUM', [34787, 34788, 'CARDC']],
    ['MONTH',
     {'times': [34792],
      'slots': [34790, 34791, 34792, 34794],
      'refs': [['THIS', 34794], ['THE', 34791]],
      'preps': [['L', 34790]],
      'functions': ['reference'],
      'ref': 'MONTH'}]],
   'functions': ['anterior_dur'],
   'preps': [['<D', 34786]],
   'quals': ['durative'],
   'phrase_nodes': [673175, 673176]}),
 (454827,
  {'times': [145729],
   'slots': [145728, 145729, 145731, 145732],
   'refs': [['LREF',
     {'times': [145732],
      'slots': [145731, 145732],
      'quals': ['durative'],
      'functions': ['reference'],
      'preps': [['L', 145731]]}],
    ['THE', 145728]],
   'functions': ['simultaneous'],
   'phrase_nodes': [734287, 734289]})]

In [86]:
# inspect the full parsing of one multi-phrase clause
time_parsings[428143]

{'times': [{'times': [3107],
   'slots': [3106, 3107, 3108, 3109],
   'quals': ['durative'],
   'tenses': [['FUT', [3108, 3109, 'ADJV']]],
   'preps': [['L', 3106]],
   'functions': ['dist_fut']},
  {'times': [{'times': [3116],
     'slots': [3115, 3116],
     'quants': [['NUMQ', 3115]],
     'quals': ['durative']},
    {'times': [3119],
     'slots': [3118, 3119],
     'quants': [['NUMQ', 3118]],
     'quals': ['durative']}],
   'slots': [3115, 3116, 3118, 3119],
   'functions': ['atelic_ext'],
   'quals': ['durative']}],
 'slots': [3106, 3107, 3108, 3109, 3115, 3116, 3118, 3119],
 'functions': ['distfut_ext'],
 'phrase_nodes': [653347, 653351]}

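The two unmatched parsings in `no_match` are cases where the `L` phrase serves to indicate an anchoring reference point (it is parsed under `'refs'` with the function `reference`). We can thus take the clause's one time function and assign it to the first time phrase.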

In [134]:
# preparing the temporal function annotations

def get_ref(node):
    """Format the section of a version-c node as a 'Book chapter:verse' string."""
    section = api_c.T.sectionFromNode(node)
    return '{} {}:{}'.format(*section)


def get_2021_ph_with_exception(node):
    """Return the single 2021 node that a version-c node maps to.

    Raises:
        Exception: carrying `(node, mapping)` when the node maps to more
            than one 2021 node.
    """
    mapping = get_2021_nodes(node)
    if len(mapping) <= 1:
        # first element of the first (node, value) pair
        return mapping[0][0]
    raise Exception(node, mapping)


# Build one (ref, clause, 2021 phrase node, temporal function) row per
# annotated time phrase.
# NOTE(review): the clause column holds the version-c clause node while the
# phrase column holds the 2021 node. Confirm whether the 2021 clause node was
# intended; the earlier, unused 2021 clause lookup has been dropped.
rows = []

for cl, parsing in time_parsings.items():

    # skip clauses that have diff boundaries compared to 2021
    if cl in diff_cl_set:
        continue

    phrases = parsing['phrase_nodes']
    times = parsing['times']

    # 'times' is either a flat list of ints or a list of nested parsings
    nested = not isinstance(times[0], int)

    if nested and len(phrases) == len(times):
        # one nested parsing per phrase: pair them up by position
        pairs = [
            (phrase, time['functions'][0])
            for phrase, time in zip(phrases, times)
        ]
    else:
        # otherwise the clause-level function goes to the first phrase
        pairs = [(phrases[0], parsing['functions'][0])]

    for phrase, funct in pairs:
        rows.append(
            (get_ref(phrase), cl, get_2021_ph_with_exception(phrase), funct)
        )

In [135]:
# number of temporal-function rows produced
len(rows)

2063

In [136]:
# spot check: which passage a given version-c node belongs to
api_c.T.sectionFromNode(652870)

('Genesis', 5, 6)

In [143]:
# inspect a parsing whose single 'functions' entry is a comma-joined pair of labels
time_parsings[427805]

{'times': [1326],
 'slots': [1325, 1326, 1327, 1328],
 'refs': [['GEN', [1327, 1328, 'DEF']]],
 'functions': ['anterior_dur, simultaneous'],
 'preps': [['L', 1325]],
 'phrase_nodes': [652298]}

In [144]:
# text of slot 1326, which the parsing of clause 427805 lists under 'times'
api_c.T.text(1326)

'ר֣וּחַ '

In [138]:
# preview only the first rows instead of dumping the whole list
rows[:20]

[('Genesis 1:1', 427553, 651573, 'simultaneous'),
 ('Genesis 2:2', 427680, 651954, 'simultaneous'),
 ('Genesis 2:2', 427682, 651960, 'simultaneous'),
 ('Genesis 2:5', 427693, 651988, 'anterior'),
 ('Genesis 2:5', 427694, 651993, 'anterior'),
 ('Genesis 3:8', 427805, 652329, 'anterior_dur, simultaneous'),
 ('Genesis 3:14', 427835, 652411, 'atelic_ext'),
 ('Genesis 3:17', 427851, 652458, 'atelic_ext'),
 ('Genesis 3:22', 427866, 652506, 'simultaneous'),
 ('Genesis 3:22', 427870, 652517, 'anterior_dur, simultaneous'),
 ('Genesis 4:3', 427887, 652566, 'posterior'),
 ('Genesis 4:11', 427916, 652659, 'simultaneous'),
 ('Genesis 4:14', 427927, 652690, 'simultaneous'),
 ('Genesis 4:26', 427975, 652845, 'simultaneous'),
 ('Genesis 5:3', 427984, 652872, 'atelic_ext'),
 ('Genesis 5:6', 427993, 652901, 'atelic_ext'),
 ('Genesis 5:7', 427995, 652910, 'atelic_ext'),
 ('Genesis 5:9', 428000, 652923, 'atelic_ext'),
 ('Genesis 5:10', 428002, 652932, 'atelic_ext'),
 ('Genesis 5:12', 428007, 652945, 'atel