# Parse g_cons

In [2]:
import os, re
import pandas as pd
import numpy as np

In [22]:
# Read the headerless, tab-separated prediction file, name its four columns,
# and index the rows by their line number (the `line` column is kept as well).
column_names = ['line','ref','raw','prediction']
source = pd.read_csv('results_predictions_SP_input.txt', sep='\t', header=None)
source.columns = column_names
source = source.set_index('line', drop=False)
source

Unnamed: 0_level_0,line,ref,raw,prediction
line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,Genesis 1 1,BR>CJT,B-R>CJT/
1,1,Genesis 1 1,BR>,BR>[/
2,2,Genesis 1 1,>LHJM,>LH(J(M/JM
3,3,Genesis 1 1,>T,>T
4,4,Genesis 1 1,HCMJM,H-CMJ(M/(JM
...,...,...,...,...
20875,20875,Genesis 50 26,WJXNVW,Wn-J!XNV[W
20876,20876,Genesis 50 26,>TW,>T+W
20877,20877,Genesis 50 26,WJWCM,Wn-J!(H](J&WCM[
20878,20878,Genesis 50 26,B>RN,B->RN/


In [19]:
def g_cons_trailer(w):
    """Reduce an annotated word form to capital letters and separators.

    Two passes are applied to ``w``:

    1. every "(" that is followed by a word character, "<" or ">" is deleted
       together with that character (e.g. "(H");
    2. every remaining character that is not an uppercase A-Z letter or one
       of "-", "<", ">" is deleted.

    Parameters
    ----------
    w : str
        Annotated form, e.g. ``'W->HL/JM~H'``.

    Returns
    -------
    str
        The cleaned form, e.g. ``'W->HLJMH'``. Hyphens are left in place;
        callers that need the bare form strip them afterwards.
    """
    retain_special = '-<>'
    w = re.sub(r'\([\w]|\([><]', '', w)  # drop "(X" pairs, e.g. (H
    # Keep only capital letters plus the special characters listed above.
    w = re.sub('[^A-Z' + retain_special + ']', '', w)

    # Splitting on "-" and joining with "-" again would rebuild the very same
    # string, so the cleaned form is returned directly.
    return w

def check_corrections(df):
    """Compare the hyphen-free parsed forms in ``df`` with its raw forms.

    Each value of ``df.parsed`` is passed through ``g_cons_trailer`` and
    stripped of hyphens; the result is stored in ``df['test']`` (``df`` is
    modified in place).

    Returns the rows whose ``raw`` value differs from ``test``. When there
    are no such rows, the string "All mismatches corrected" is returned
    instead.
    """
    stripped_forms = []
    for parsed_form in df.parsed:
        stripped_forms.append(re.sub('-', '', g_cons_trailer(parsed_form)))
    df['test'] = stripped_forms

    differing = df[df.raw != df.test]
    if len(differing) == 0:
        return "All mismatches corrected"
    return differing

In [20]:
# Quick check on one form: '/' and '~' are dropped, '-' and '>' are kept.
g_cons_trailer('W->HL/JM~H')

'W->HLJMH'

#### Make predictions and test parsing:

In [23]:
# Parse every prediction once; `test` is the parsed form without hyphens and
# is compared against `raw` in the next step. Deriving `test` from `parsed`
# avoids running g_cons_trailer a second time over the whole column.
source['parsed'] = [g_cons_trailer(w) for w in source.prediction]
source['test'] = [parsed_form.replace('-', '') for parsed_form in source['parsed']]
source

Unnamed: 0_level_0,line,ref,raw,prediction,parsed,test
line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,Genesis 1 1,BR>CJT,B-R>CJT/,B-R>CJT,BR>CJT
1,1,Genesis 1 1,BR>,BR>[/,BR>,BR>
2,2,Genesis 1 1,>LHJM,>LH(J(M/JM,>LHJM,>LHJM
3,3,Genesis 1 1,>T,>T,>T,>T
4,4,Genesis 1 1,HCMJM,H-CMJ(M/(JM,H-CMJM,HCMJM
...,...,...,...,...,...,...
20875,20875,Genesis 50 26,WJXNVW,Wn-J!XNV[W,W-JXNVW,WJXNVW
20876,20876,Genesis 50 26,>TW,>T+W,>TW,>TW
20877,20877,Genesis 50 26,WJWCM,Wn-J!(H](J&WCM[,W-JWCM,WJWCM
20878,20878,Genesis 50 26,B>RN,B->RN/,B->RN,B>RN


In [24]:
# Rows whose hyphen-free parse does not reproduce the raw form.
is_mismatch = source['raw'] != source['test']
mismatches = source[is_mismatch]
mismatches

Unnamed: 0_level_0,line,ref,raw,prediction,parsed,test
line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
174,174,Genesis 1 14,L>TWT,L->T==+W,L->TW,L>TW
179,179,Genesis 1 15,LM>WRWT,L-!M>&WR[/W,L-M>WRW,LM>WRW
442,442,Genesis 2 2,WJKL,Wn-J!(HLK[,W-JLK,WJLK
607,607,Genesis 2 14,HCLJCJ,H]CL&JC[,HCLJC,HCLJC
614,614,Genesis 2 14,HRBJ<J,H]RB&J<[,HRBJ<,HRBJ<
...,...,...,...,...,...,...
20347,20347,Genesis 49 23,WJMRRHW,Wn-J!MRRR[+HW,W-JMRRRHW,WJMRRRHW
20351,20351,Genesis 49 23,XYJM,XY/,XY,XY
20399,20399,Genesis 49 27,BNJMJM,BN/JM,BNJM,BNJM
20422,20422,Genesis 49 28,KBRKTW,KBR(H/T+W,KBRTW,KBRTW


#### Import and check corrections:

In [33]:
# Load the corrections table and overwrite `parsed` with its `g_cons_raw`
# column. The values are assigned by position, not by index, so both frames
# must have the same number of rows in the same order.
corrections = pd.read_excel('./data/corrections.xlsx')
source['parsed'] = corrections['g_cons_raw'].tolist()

In [39]:
# Display the corrections table for inspection.
corrections

Unnamed: 0.1,Unnamed: 0,line,ref,raw,C_disambig,g_cons_raw,lex
0,0,0,Genesis 1 1,BR>CJT,BR>CJT,B-R>CJT,B-R>CJT/
1,1,1,Genesis 1 1,BR>,BR>,BR>,BR>[
2,2,2,Genesis 1 1,>LHJM,>LHJM,>LHJM,>LHJM/
3,3,3,Genesis 1 1,>T,>T,>T,>T
4,4,4,Genesis 1 1,HCMJM,HCMJM,H-CMJM,H-CMJM/
...,...,...,...,...,...,...,...
20858,20858,20875,Genesis 50 26,WJXNVW,WJXNVW,W-JXNVW,W-XNV[
20859,20859,20876,Genesis 50 26,>TW,>TW,>TW,>T
20860,20860,20877,Genesis 50 26,WJWCM,WJWCM,W-JWCM,W-JCM[
20861,20861,20878,Genesis 50 26,B>RN,B>RN,B->RN,B->RN/


In [35]:
# Re-run the raw-vs-parsed comparison on the `parsed` column taken from
# `corrections`.
check_corrections(source)

'All mismatches corrected'

In [36]:
# Store `parsed` under its final name `g_cons_raw` and keep only the columns
# needed further on. The explicit .copy() makes `source` an independent frame,
# so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
source['g_cons_raw'] = source['parsed']
source = source[['ref','raw','g_cons_raw']].copy()

## Disambiguate C and F

Make dataset:

In [38]:
#source[['book','chapter','verse']] = source['ref'].str.split(' ', 2, expand=True)
#source[['book','chapter','verse','raw']].to_csv('./data/SP_input_disambiguate_C_2.txt', sep='\t', header=None, index=False)

Merge corrections:

In [40]:
# Load the Text-Fabric app from ./app. hoist=globals() presumably injects the
# Text-Fabric API handles (such as the F and T objects used in the next cell)
# into the notebook namespace -- confirm against the Text-Fabric docs.
from tf.app import use
A = use('app:./app', hoist=globals())

In [41]:
# Consistency check: for every verse node, the number of rows with that
# reference must be the same in `corrections` and in `source`; any reference
# printed here is out of sync. Row counts per reference are computed once up
# front instead of filtering both frames again for every verse.
corrections_per_ref = corrections['ref'].value_counts()
source_per_ref = source['ref'].value_counts()

for v in F.otype.s('verse'):
    bo, ch, ve = T.sectionFromNode(v)
    ref = f"{bo} {ch} {ve}"

    # A reference missing from a frame counts as zero rows.
    if corrections_per_ref.get(ref, 0) != source_per_ref.get(ref, 0):
        print(ref)

In [42]:
# Attach the disambiguated forms row by row (positional assignment: relies on
# `corrections` and `source` having the same length and row order).
source['C_disambig'] = corrections['C_disambig'].tolist()
source

Unnamed: 0_level_0,ref,raw,g_cons_raw,book,chapter,verse,C_disambig
line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Genesis 1 1,BR>CJT,B-R>CJT,Genesis,1,1,BR>CJT
1,Genesis 1 1,BR>,BR>,Genesis,1,1,BR>
2,Genesis 1 1,>LHJM,>LHJM,Genesis,1,1,>LHJM
3,Genesis 1 1,>T,>T,Genesis,1,1,>T
4,Genesis 1 1,HCMJM,H-CMJM,Genesis,1,1,HCMJM
...,...,...,...,...,...,...,...
20875,Genesis 50 26,WJXNVW,W-JXNVW,Genesis,50,26,WJXNVW
20876,Genesis 50 26,>TW,>TW,Genesis,50,26,>TW
20877,Genesis 50 26,WJWCM,W-JWCM,Genesis,50,26,WJWCM
20878,Genesis 50 26,B>RN,B->RN,Genesis,50,26,B>RN


If no disambiguated unit contains both F and C, the disambiguation can safely be imposed on `g_cons_raw` to create a new feature, `g_cons`.

In [43]:
# Count the units whose disambiguation contains an F as well as a C.
n_both = sum(1 for unit in source.C_disambig if "F" in unit and "C" in unit)
f'Number of cases with both F and C: {n_both}'

'Number of cases with both F and C: 4'

In [44]:
# Build `g_cons` from `g_cons_raw`: when the disambiguated form contains an F,
# every C of the raw form is turned into F; otherwise the raw form is kept.
# Forms whose disambiguation contains both F and C are printed (line number,
# tab, form).
# NOTE(review): those printed forms still get every C replaced by F --
# confirm that this is the intended result for them.
g_cons = []

for line_no, disambig, raw_form in zip(source.index, source['C_disambig'], source['g_cons_raw']):
    if 'F' not in disambig:
        g_cons.append(raw_form)
        continue

    g_cons.append(re.sub('C','F', raw_form))
    if 'C' in disambig:
        print(f"{line_no}\t{disambig}")

source['g_cons'] = g_cons

11456	JCFKR
14082	WJCFKR
19004	JCFKR
20274	JCFKR


In [45]:
# Keep only the columns that are exported.
source = source[['ref','raw','g_cons_raw','g_cons']]

In [46]:
# Spot-check the rows of a single verse.
source[source.ref == 'Genesis 36 25']

Unnamed: 0_level_0,ref,raw,g_cons_raw,g_cons
line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14454,Genesis 36 25,W>LH,W->LH,W->LH
14455,Genesis 36 25,BNJ,BNJ,BNJ
14456,Genesis 36 25,<NH,<NH,<NH
14457,Genesis 36 25,DJCWN,DJCWN,DJCWN
14458,Genesis 36 25,W>HLJBMH,W->HLJBMH,W->HLJBMH
14459,Genesis 36 25,BT,BT,BT
14460,Genesis 36 25,<NH,<NH,<NH


## Export

In [47]:
# Write the result as a tab-separated file; the index is written as well
# (to_csv default).
source.to_csv('./data/g_cons.txt', sep='\t')