# Lexeme and language

The lex feature is based on two sources in order to enhance the accuracy: 1) the MT g_cons feature whenever the SP g_cons agrees with MT, and 2) a probabilistic model based on MT to account for the remaining lexemes.

In [1]:
import os
import pandas as pd
import numpy as np
from re import sub, findall
import collections

### 1. g_cons to lex

This procedure searches for a match between SP and MT in terms of shared g_cons. The search pattern does not take into account the order of the words but looks through the whole verse.

In [3]:
from tf.app import use
MT = use('etcbc/bhsa')

In [4]:
def checkMT(ref, SPw):
    
    bo, ch, ve = ref
    try:
        int(ve)
    except:
        return 'False'
    
    ref = MT.api.T.nodeFromSection((bo, int(ch), int(ve)))
    words = MT.api.L.d(ref, 'word')
    
    lex = 'False'
    for w in words:
        if SPw == MT.api.F.g_cons.v(w):
            lex =  MT.api.F.lex.v(w)
            break
    
    return str(lex)

In [6]:
g_cons_df = pd.read_csv('./data/g_cons.txt', sep='\t')
g_cons_df

Unnamed: 0,line,ref,raw,g_cons_raw,g_cons
0,0,Genesis 1 1,BR>CJT,B-R>CJT,B-R>CJT
1,1,Genesis 1 1,BR>,BR>,BR>
2,2,Genesis 1 1,>LHJM,>LHJM,>LHJM
3,3,Genesis 1 1,>T,>T,>T
4,4,Genesis 1 1,HCMJM,H-CMJM,H-CMJM
...,...,...,...,...,...
20858,20875,Genesis 50 26,WJXNVW,W-JXNVW,W-JXNVW
20859,20876,Genesis 50 26,>TW,>TW,>TW
20860,20877,Genesis 50 26,WJWCM,W-JWCM,W-JWCM
20861,20878,Genesis 50 26,B>RN,B->RN,B->RN


In [7]:
g_cons_lex = []

for n, row in g_cons_df.iterrows():
    
    unit_str = ''
    for w in row['g_cons'].split('-'):
        unit_str += checkMT(row['ref'].split(), w) + '-'
    g_cons_lex.append(unit_str.rstrip('-'))
    
#g_cons_lex

In [8]:
n=0
e=0
for u in g_cons_lex:
    for w in u.split('-'):
        if w == 'False':
            e+=1
        n+=1
        
print(f"Lexemes matched: {round((n-e)/n*100, 2)}%")

Lexemes matched: 91.28%


### 2. Probabilistic model to lex

The remaining lexemes are gleaned from the probabilistic model.

In [9]:
source = pd.read_csv('results_predictions_SP_input.txt', sep='\t', header=None)
source.columns = ['line','ref','raw','prediction']
source = source.set_index('line')

Merge with disambiguated C:

In [14]:
corrections = pd.read_excel('data/corrections.xlsx')

source['C_disambig'] = list(corrections.C_disambig)
source

Unnamed: 0_level_0,ref,raw,prediction,C_disambig,lex
line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Genesis 1 1,BR>CJT,B-R>CJT/,BR>CJT,B-R>CJT/
1,Genesis 1 1,BR>,BR>[/,BR>,BR>[
2,Genesis 1 1,>LHJM,>LH(J(M/JM,>LHJM,>LHJM/
3,Genesis 1 1,>T,>T,>T,>T
4,Genesis 1 1,HCMJM,H-CMJ(M/(JM,HCMJM,H-CMJM/
...,...,...,...,...,...
20875,Genesis 50 26,WJXNVW,Wn-J!XNV[W,WJXNVW,W-XNV[
20876,Genesis 50 26,>TW,>T+W,>TW,>T
20877,Genesis 50 26,WJWCM,Wn-J!(H](J&WCM[,WJWCM,W-JCM[
20878,Genesis 50 26,B>RN,B->RN/,B>RN,B->RN/


In [15]:
def lex(w):
    
    w = sub('\(','', w) #Retain root letter
    w = sub('!?[A-Z]*=?!','', w) #Remove verbal prefixes and possible disambiguation of prefixes
    w = sub('\]?[A-Z]*\]', '', w) #Remove verbal stem
    w = sub('&[A-Z]', '', w) #Remove matres lectionis
    w = sub('\[/[A-Z]*', '[', w) #Remove nominal suffix from participle and infinitive
    w = sub('\[[A-Z]*','[', w) #Remove verbal suffixes
    w = sub('/[A-Z]*','/', w) #Remove nominal suffixes
    w = sub('\+[A-Z]*','',w) #Remove pronominal suffixes
    w = sub('\~[A-Z]*','',w) #Remove pronominal suffixes
    w = sub(':?[a-z]', '', w) #Remove state and verbal stem
    
    return w

# A list of forms for testing purposes
input_forms = ["BR>[", "B-R>CJT/", "!M!RXP[/TK:d", "W:n-!J!>MR[", 'B-!H!](N]BR>[/+M','W-!T=!](N]R>H[','CN(J(M/J=+HM','>B/~J',
               'L-!M>&WR[/W', 'H-CMJ(M/(JM', '!J!HJ(H[', 'W:n-!J!HJ(H[', 'W-L-!!]H]BD&JL[/:c', 'K-DMWT/+NW',
               'Wn-J!(H](J&WCM[','W:n-!J!JFM[','W-HCT]XWH[','W-HT]BRK[','W->HL/JM~H']

[lex(w) for w in input_forms]

['BR>[',
 'B-R>CJT/',
 'RXP[',
 'W->MR[',
 'B-BR>[',
 'W-R>H[',
 'CNJM/=',
 '>B/',
 'L-M>R[',
 'H-CMJM/',
 'HJH[',
 'W-HJH[',
 'W-L-BDL[',
 'K-DMWT/',
 'W-JCM[',
 'W-JFM[',
 'W-XWH[',
 'W-BRK[',
 'W->HL/']

In [16]:
lexemes = []

for n, row in source.iterrows():

    lexeme = lex(row['prediction'])
    
    if 'F' in row['C_disambig']:
        lexemes.append(sub('C','F', lexeme))
        if 'F' in row['C_disambig'] and 'C' in row['C_disambig']:
            print(f"{n}\t{row['C_disambig']}")
    else:
        lexemes.append(lexeme)

source['lex'] = lexemes

11456	JCFKR
14082	WJCFKR
19004	JCFKR
20274	JCFKR


Apply corrections:

In [17]:
source = source[['ref','raw','lex']]

In [18]:
source[source.ref == 'Genesis 18 6']

Unnamed: 0_level_0,ref,raw,lex
line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5672,Genesis 18 6,WJMHR,W-MHR[
5673,Genesis 18 6,>BRHM,>BRHM/
5674,Genesis 18 6,H>HLH,H->HL/
5675,Genesis 18 6,>L,>L
5676,Genesis 18 6,CRH,FRH/
5677,Genesis 18 6,WJ>MR,W->MR[
5678,Genesis 18 6,MHRJ,MN-HR/
5679,Genesis 18 6,CLC,CLC/
5680,Genesis 18 6,SJM,SJM/
5681,Genesis 18 6,QMX,QMX/


#### Merge with g_cons to lex source (cf. above):

In [19]:
source['g_cons_lex'] = g_cons_lex
source

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  source['g_cons_lex'] = g_cons_lex


Unnamed: 0_level_0,ref,raw,lex,g_cons_lex
line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Genesis 1 1,BR>CJT,B-R>CJT/,B-R>CJT/
1,Genesis 1 1,BR>,BR>[,BR>[
2,Genesis 1 1,>LHJM,>LHJM/,>LHJM/
3,Genesis 1 1,>T,>T,>T
4,Genesis 1 1,HCMJM,H-CMJM/,H-CMJM/
...,...,...,...,...
20875,Genesis 50 26,WJXNVW,W-XNV[,W-XNV[
20876,Genesis 50 26,>TW,>T,>T
20877,Genesis 50 26,WJWCM,W-JCM[,W-False
20878,Genesis 50 26,B>RN,B->RN/,B-False


How consistent is the probalistic model?

In [20]:
source['lex'].compare(source['g_cons_lex'])

Unnamed: 0_level_0,self,other
line,Unnamed: 1_level_1,Unnamed: 2_level_1
10,W-B,W-BHW/
11,W-XCK[,W-XCK/
17,MRXPT/,RXP[
32,VWB/,VWB[
49,BQR/,BQR=/
...,...,...
20867,MZH,False
20873,W-<FR/,W-<FR=/
20874,CNJM/=,CNH/
20877,W-JCM[,W-False


In [21]:
lex_result = []

for n, row in source.iterrows():
    
    lex = ''
    i=0
    
    if len(row['g_cons_lex'].split('-')) != len(row['lex'].split('-')):
        lex_result.append(row['lex'])
    
    else:
        for w in row['g_cons_lex'].split('-'):
            if w == 'False':
                lex += f"{row['lex'].split('-')[i]}-"
                i+=1
            else:
                lex += f"{w}-"
                i+=1
            
        lex_result.append(lex.rstrip('-'))
    
source['lex_result'] = lex_result

### Compare with g_cons

Test whether the lex and g_cons parsings result in the same amount of words. If not, they have to manually corrected.

In [22]:
source[source.ref == 'Genesis 24 20']

Unnamed: 0_level_0,ref,raw,lex,g_cons_lex,lex_result
line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8281,Genesis 24 20,WTMHR,W-MHR[,W-MHR[,W-MHR[
8282,Genesis 24 20,WTWRD,W-JRD[,W-False,W-JRD[
8283,Genesis 24 20,KDH,K,KD/,KD/
8284,Genesis 24 20,<L,<L,False,<L
8285,Genesis 24 20,HCQWT,H-CQH/,H-False,H-CQH/
8286,Genesis 24 20,WTRY,W-RWY[,W-RWY[,W-RWY[
8287,Genesis 24 20,<WD,<WD/,<WD/,<WD/
8288,Genesis 24 20,>L,>L,>L,>L
8289,Genesis 24 20,HB>R,H-B>R/,H-B>R/,H-B>R/
8290,Genesis 24 20,LC>B,L-C>B[,L-C>B[,L-C>B[


In [23]:
g_cons = pd.read_csv('./data/g_cons.txt', sep='\t')

source['g_cons'] = list(g_cons.g_cons)
source

Unnamed: 0_level_0,ref,raw,lex,g_cons_lex,lex_result,g_cons
line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Genesis 1 1,BR>CJT,B-R>CJT/,B-R>CJT/,B-R>CJT/,B-R>CJT
1,Genesis 1 1,BR>,BR>[,BR>[,BR>[,BR>
2,Genesis 1 1,>LHJM,>LHJM/,>LHJM/,>LHJM/,>LHJM
3,Genesis 1 1,>T,>T,>T,>T,>T
4,Genesis 1 1,HCMJM,H-CMJM/,H-CMJM/,H-CMJM/,H-CMJM
...,...,...,...,...,...,...
20875,Genesis 50 26,WJXNVW,W-XNV[,W-XNV[,W-XNV[,W-JXNVW
20876,Genesis 50 26,>TW,>T,>T,>T,>TW
20877,Genesis 50 26,WJWCM,W-JCM[,W-False,W-JCM[,W-JWCM
20878,Genesis 50 26,B>RN,B->RN/,B-False,B->RN/,B->RN


In [24]:
for n, row in source.iterrows():
    if len(findall('-', row['lex'])) != len(findall('-', row['g_cons'])):
        print(row['ref'], row['lex'], row['g_cons'])

Genesis 1 6 MN-BDJL/ MBDJL
Genesis 2 14 CLC[ H-CLJCJ
Genesis 2 14 RB<[ H-RBJ<J
Genesis 5 15 MN-H-L>L/ MHLL>L
Genesis 5 16 MN-H-LL>L/ MHLL>L
Genesis 5 26 K-CLC/ CLC
Genesis 6 16 ML<ML/ M-L-M<LH
Genesis 8 4 H-RV/ HRRV
Genesis 12 3 MN-BRK/= MBRKJK
Genesis 12 8 H-<J/ W-H-<J
Genesis 12 16 H-JVJB/ HJJVB
Genesis 15 7 >WR/ M->WR
Genesis 15 18 MNH/ M-NHR
Genesis 17 1 W W-JR>
Genesis 20 9 MN-<FJM/ M<FJM
Genesis 24 5 H-CB[ HHCB
Genesis 24 8 MCBH/ M-CBW<TJ
Genesis 27 28 W-MCNJM/= W-M-CMNJ
Genesis 29 27 B<D/ B-<BDH
Genesis 30 42 W-B-H-<VJP/ W-B-H<VJP
Genesis 31 15 KRT[ K--NKRJWT
Genesis 32 23 MN-<BR==/ M<BR
Genesis 37 32 KWT[ H-KJTNT
Genesis 43 3 H-<D/ H<D
Genesis 43 3 H-<D H<D
Genesis 43 10 H-TMWMH/ HTMHMNW
Genesis 43 11 MN-<V/ M<V
Genesis 46 33 MN-<F/ M<FJKM
Genesis 47 3 MN-<F/ M<FJKM
Genesis 47 21 H-<BJD/ H<BJD
Genesis 49 11 KSH/ K-SWTW
Genesis 49 28 KBRH/ K-BRKTW


In [25]:
mistakes_df = source[['ref','raw','lex']]
mistakes_df['cor'] = ['' for w in range(len(mistakes_df))]
mistakes_df.to_csv('lex_mistakes.txt', sep='\t')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mistakes_df['cor'] = ['' for w in range(len(mistakes_df))]


## Export

In [28]:
source = source[['ref','raw','lex_result']]
source.columns = ['ref','raw','lex']

In [29]:
source.to_csv('./data/lex.txt', sep='\t')