# Parsing morphemes

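This notebook contains scripts that assign morphemes to the words of the Samaritan Pentateuch (SP). Where possible the morphemes are taken from the Masoretic Text (MT); otherwise they are extracted from the predictions of the neural network model.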

The procedure is in three steps:
1. Imposing features from MT whenever the SP consonantal text and lexeme match MT
2. The remaining features are extracted from the neural network predictions
3. The data are merged with priority given to the data imposed from MT

In [1]:
import os
import pandas as pd
import numpy as np
import re
import collections

In [2]:
# Load the two Text-Fabric datasets used throughout this notebook:
# `MT` from 'etcbc/bhsa' and `SP` from 'DT-UCPH/sp'.
# NOTE(review): the ':hot' suffix presumably requests the latest repository
# state rather than a release — confirm against the Text-Fabric docs.
from tf.app import use
MT = use('etcbc/bhsa')
# NOTE(review): hoist=globals() presumably injects the SP API names into the
# notebook namespace; the cells below still address both datasets explicitly
# via `MT.api` / `SP.api`.
SP = use('DT-UCPH/sp:hot', hoist=globals())

## 1. g_cons to feature

In [3]:
# Name of the feature produced by this run. It selects the MT feature that
# checkMT reads (MT.api.F.<FEATURE>) and names the result column and the
# exported CSV file at the end of the notebook.
FEATURE = 'g_lex'
# Load that feature into the MT dataset before it is queried.
MT.load(FEATURE)

True

In [4]:
def checkMT(ref, SPw, SPlex):
    """Return the MT value of ``FEATURE`` for an SP word, if MT has that word.

    The MT verse addressed by ``ref`` is searched for the first word whose
    consonantal text (``g_cons``) and lexeme (``lex``) both equal those of
    the SP word.

    Parameters
    ----------
    ref : tuple
        ``(book, chapter, verse)`` of the SP word.
    SPw : str
        Consonantal text of the SP word.
    SPlex : str
        Lexeme of the SP word.

    Returns
    -------
    str
        * ``'False'``  - the verse label is not an integer, the verse does
          not exist in MT, or no MT word in the verse matches;
        * ``'absent'`` - a matching word exists but its feature value is
          empty;
        * otherwise the feature value with ``_ A E I O U @ . : ; -``
          stripped.
    """
    bo, ch, ve = ref
    try:
        verse = int(ve)
    except (TypeError, ValueError):
        # Verse labels that are not plain integers cannot be looked up in MT.
        return 'False'

    verse_node = MT.api.T.nodeFromSection((bo, int(ch), verse))
    if verse_node is None:
        # NOTE(review): nodeFromSection is expected to return None for a
        # section that MT does not contain; treat that as "no match".
        return 'False'

    # Plain attribute lookup instead of eval() on a formatted string.
    feature = getattr(MT.api.F, FEATURE)

    MT_feat = 'False'
    for w in MT.api.L.d(verse_node, 'word'):
        if SPw == MT.api.F.g_cons.v(w) and SPlex == MT.api.F.lex.v(w):
            MT_feat = feature.v(w)
            if MT_feat is None or MT_feat == '':
                MT_feat = 'absent'
            break  # only the first matching word in the verse is used

    # The sentinels 'False' and 'absent' contain none of the stripped
    # characters, so they pass through unchanged.
    return str(re.sub(r'[_AEIOU@.:;\-]', '', MT_feat))

In [5]:
# For every SP word, try to impose the feature value from MT.
# checkMT returns the string 'False' when no MT word matches.
MT_feat = [
    checkMT(
        SP.api.T.sectionFromNode(w),
        SP.api.F.g_cons.v(w),
        SP.api.F.lex.v(w),
    )
    for w in SP.api.F.otype.s('word')
]

# Parallel flag list: 'True' where the value came from MT, '' otherwise.
MT_parsing = ['' if feat == 'False' else 'True' for feat in MT_feat]

In [6]:
# Inspect the imposed values: one entry per SP word, 'False' where no MT word matched.
MT_feat

['B',
 'R>CJT',
 'BR>',
 '>LH',
 '>T',
 'H',
 'CMJ',
 'W',
 '>T',
 'H',
 '>RY',
 'W',
 'H',
 '>RY',
 'HJ',
 'THW',
 'W',
 'BHW',
 'W',
 'XCK',
 '<L',
 'PN',
 'THWM',
 'W',
 'RWX',
 '>LH',
 'RXP',
 '<L',
 'PN',
 'H',
 'MJ',
 'W',
 '>MR',
 '>LH',
 'HJ',
 '>WR',
 'W',
 'HJ',
 '>WR',
 'W',
 'R>',
 '>LH',
 '>T',
 'H',
 '>WR',
 'KJ',
 'VWB',
 'W',
 'BDL',
 '>LH',
 'BJN',
 'H',
 '>WR',
 'W',
 'BJN',
 'H',
 'XCK',
 'W',
 'QR>',
 '>LH',
 'L',
 '>WR',
 'JWM',
 'W',
 'L',
 'XCK',
 'QR>',
 'LJLH',
 'W',
 'HJ',
 '<RB',
 'W',
 'HJ',
 'BQR',
 'JWM',
 '>XD',
 'W',
 '>MR',
 '>LH',
 'HJ',
 'RQJ<',
 'B',
 'TWK',
 'H',
 'MJ',
 'W',
 'HJ',
 'BDJL',
 'BJN',
 'MJ',
 'L',
 'MJ',
 'W',
 '<F',
 '>LH',
 '>T',
 'H',
 'RQJ<',
 'W',
 'BDL',
 'BJN',
 'H',
 'MJ',
 '>CR',
 'M',
 'TXT',
 'L',
 'RQJ<',
 'W',
 'BJN',
 'H',
 'MJ',
 '>CR',
 'M',
 '<L',
 'L',
 'RQJ<',
 'W',
 'HJ',
 'KN',
 'W',
 'QR>',
 '>LH',
 'L',
 'RQJ<',
 'CMJ',
 'W',
 'HJ',
 '<RB',
 'W',
 'HJ',
 'BQR',
 'JWM',
 'CNJ',
 'W',
 '>MR',
 '>LH',
 'QW',
 'H',


In [8]:
# Share of SP words for which a value could be imposed from MT.
n = len(MT_feat)             # total number of SP words
e = MT_feat.count('False')   # words without an MT match

print(f"Lexemes matched: {round((n-e)/n*100, 2)}%")

Lexemes matched: 93.53%


## 2. Probabilistic model to feature

#### Updating prediction dataset

In [None]:
def TF2table():
    """Return the SP consonantal text as a list of space-separated units.

    Each SP word contributes its ``g_cons`` value followed by ``' '`` when
    the word has a non-empty trailer and by ``'-'`` when it has none. The
    concatenated text is then split on single spaces, so words joined by
    ``'-'`` stay together in one list element.
    """
    pieces = []
    for node in SP.api.F.otype.s('word'):
        separator = ' ' if SP.api.F.trailer.v(node) else '-'
        pieces.append(f'{SP.api.F.g_cons.v(node)}{separator}')

    return ''.join(pieces).split(' ')

In [None]:
# Read the model predictions (tab-separated, no header row).
data = pd.read_csv(
    './data/results_predictions_SP_input_corrected_09.11.22.txt',
    sep='\t',
    header=None,
)
data.columns = [0, 'ref', 'raw', 'prediction']

# Put the Text-Fabric consonantal text next to the predictions, row by row.
g_cons_string = pd.DataFrame(TF2table())
data = pd.concat([data, g_cons_string], axis=1, ignore_index=True)
data.columns = [0, 'ref', 'raw', 'prediction', 'g_cons']

# Show the first row in which the text and the prediction do not have the
# same number of '-'-separated parts, together with all rows of that reference.
for n, row in data.iterrows():
    n_cons_parts = len(row['g_cons'].split('-'))
    n_pred_parts = len(row['prediction'].split('-'))
    if n_cons_parts == n_pred_parts:
        continue
    display(row)
    display(data[data.ref == row['ref']])
    break

In [None]:
# Show the first row whose Text-Fabric consonantal text, once normalised,
# differs from the 'raw' input of the prediction file.
# Normalisation: drop '-' word joiners, turn '_' into a space, map 'F' to 'C'.
# Plain str.replace is used because these are literal substitutions; the
# former patterns '\-' and '\_' were invalid escape sequences in non-raw
# string literals.
for n, row in data.iterrows():
    g_cons = row['g_cons'].replace('-', '').replace('_', ' ').replace('F', 'C')
    if g_cons != row['raw']:
        display(row)
        display(data[data.ref == row['ref']])
        break

#### Importing data

In [46]:
def g_nme(w):
    """Extract the nominal ending from a morphologically encoded form.

    Unrealized root letters (``(`` followed by one capital letter) are
    removed first. The result is the first ``/`` together with the capital
    letters that directly follow it, e.g. ``'/JM'``; a bare ``'/'`` when no
    letters follow.

    Returns ``None`` when the form contains no ``/``.
    """
    w = re.sub(r'\([A-Z]', '', w)  # remove root letter
    match = re.search(r'/[A-Z]*', w)

    return match.group() if match else None
    
def g_vbe(w):
    """Extract the verbal ending from a morphologically encoded form.

    Unrealized root letters (``(`` followed by one capital letter) are
    removed first. The result is the first ``[`` together with the capital
    letters that directly follow it, e.g. ``'[W'``; a bare ``'['`` when no
    letters follow.

    Returns ``None`` when the form contains no ``[``.
    """
    w = re.sub(r'\([A-Z]', '', w)  # remove root letter
    match = re.search(r'\[[A-Z]*', w)

    return match.group() if match else None
    
def g_pfm(w):
    """Extract the preformative from a morphologically encoded form.

    The preformative is a run of capital letters closed by ``!`` and
    optionally opened by ``!``. The result always starts with ``!``: when
    the opening ``!`` is missing in the input it is added, so ``'J!'``
    becomes ``'!J!'``.

    Returns ``None`` when the form contains no ``!``.
    """
    match = re.search(r'!?[A-Z]*!', w)
    if match is None:
        return None

    pfm = match.group()
    # The predictions often omit the opening '!', so normalise it here.
    return pfm if pfm.startswith('!') else f"!{pfm}"
        
def g_vbs(w):
    """Extract the verbal stem marker from a morphologically encoded form.

    Unrealized root letters (``(`` followed by one capital letter) are
    removed first. The stem marker is a run of capital letters closed by
    ``]`` and optionally opened by ``]``. The result always starts with
    ``]``: when the opening ``]`` is missing in the input it is added, so
    ``'HT]'`` becomes ``']HT]'``.

    Returns ``None`` when the form contains no ``]``.
    """
    w = re.sub(r'\([A-Z]', '', w)  # remove root letter
    match = re.search(r'\]?[A-Z]*\]', w)
    if match is None:
        return None

    vbs = match.group()
    # The predictions rarely include the opening ']', so normalise it here.
    return vbs if vbs.startswith(']') else f"]{vbs}"
        
def g_prs(w):
    """Extract the pronominal suffix from a morphologically encoded form.

    Unrealized root letters (``(`` followed by one capital letter) are
    removed first. The result is the first ``+`` together with the capital
    letters that directly follow it, e.g. ``'+NW'``.

    Returns ``None`` when the form contains no ``+``.
    """
    w = re.sub(r'\([A-Z]', '', w)  # remove root letter
    match = re.search(r'\+[A-Z]*', w)

    return match.group() if match else None
    
def g_lex(w):
    """Strip all morphological markup from an encoded form.

    What remains is the bare lexeme text; ``-`` separators between words
    are kept (e.g. ``'W:n-!J!>MR['`` becomes ``'W->MR'``).

    The removals are applied in the order listed below. The order matters:
    the participle/infinitive ending ``[/...`` has to be removed before the
    plain verbal ending ``[...`` and the nominal ending ``/...``.
    """
    removals = (
        r'\([A-Z><]',      # unrealized root letter, e.g. '(H'
        r'!?[A-Z><]*=?!',  # verbal prefix, incl. an optional '=' disambiguation
        r'\]?[A-Z]*\]',    # verbal stem marker, e.g. ']H]'
        r'&',              # '&' marker; the mater lectionis letter itself stays
        r'\[/[A-Z><]*',    # nominal ending of participle and infinitive
        r'\[[A-Z><]*',     # verbal ending
        r'/[A-Z><]*',      # nominal ending
        r'\+[A-Z><]*',     # pronominal suffix
        r'~[A-Z><]*',      # univalent final
        r':?[a-z]',        # lowercase state / verbal stem codes, e.g. ':d', 'n'
        r'=',              # lexical disambiguation
    )
    for pattern in removals:
        w = re.sub(pattern, '', w)

    return w
    
def g_uvf(w):
    """Extract the univalent final from a morphologically encoded form.

    Unrealized root letters (``(`` followed by one capital letter) are
    removed first. The result is the first ``~`` together with the capital
    letters that directly follow it, e.g. ``'~H'``.

    Returns ``None`` when the form contains no ``~``.
    """
    w = re.sub(r'\([A-Z]', '', w)  # remove root letter
    match = re.search(r'~[A-Z]*', w)

    return match.group() if match else None
    

# Sample encoded forms used to eyeball the extraction functions.
input_forms = [
    "BR>[",
    "B-R>CJT/",
    "!M!RXP[/TK:d",
    "W:n-!J!>MR[",
    'B-!H!](N]BR>[/+M',
    'W-!T=!](N]R>H[',
    'CN(J(M/J=+HM',
    '>B/~J',
    'L-!M>&WR[/W',
    'H-CMJ(M/(JM',
    '!J!HJ(H[',
    'W:n-!J!HJ(H[',
    'W-L-!!]H]BD&JL[/:c',
    'K-DMWT/+NW',
    'Wn-J!(H](J&WCM[',
    'W:n-!J!JFM[W',
    'W-HCT]XWH[',
    'W-HT]BRK[',
    'W->HL/JM~H',
    '(<NC',
    '<YBWN/J+K',
]

# Display the extracted value for every sample form.
list(map(g_lex, input_forms))

['BR>',
 'B-R>CJT',
 'RXP',
 'W->MR',
 'B-BR>',
 'W-R>H',
 'CN',
 '>B',
 'L-M>WR',
 'H-CMJ',
 'HJ',
 'W-HJ',
 'W-L-BDJL',
 'K-DMWT',
 'W-WCM',
 'W-JFM',
 'W-XWH',
 'W-BRK',
 'W->HL',
 'NC',
 '<YBWN']

In [47]:
# Feature value extracted from the model prediction of every SP word.
# An empty extraction is recorded as 'absent'. `or` keeps the semantics of
# the former `x if x else 'absent'` while calling g_lex only once per word.
pred_feat = [g_lex(SP.api.F.prediction.v(w)) or 'absent' for w in SP.api.F.otype.s('word')]

In [48]:
# Tally how often each extracted value occurs among the predictions.
collections.Counter(pred_feat)

Counter({'B': 871,
         'R>CJT': 3,
         'BR>': 10,
         '>LH': 316,
         '>T': 1180,
         'H': 1653,
         'CMJ': 39,
         'W': 4215,
         '>RY': 316,
         'HJ': 213,
         'THW': 1,
         'XCK': 8,
         '<L': 301,
         'PN': 160,
         'THWM': 4,
         'RWX': 12,
         'MRXP': 1,
         'MJ': 72,
         '>MR': 610,
         '>WR': 9,
         'R>': 91,
         'KJ': 291,
         'VWB': 45,
         'BDL': 2,
         'BJN': 74,
         'QR>': 126,
         'L': 1360,
         'JWM': 83,
         'LJLH': 26,
         '<RB': 16,
         'BQR': 35,
         '>XD': 43,
         'RQJ<': 9,
         'TWK': 16,
         'MBDJL': 1,
         '<C': 66,
         '>CR': 422,
         'M': 369,
         'TXT': 32,
         'KN': 56,
         'CM': 258,
         'CN': 228,
         'QW': 1,
         '>L': 370,
         'MQWM': 47,
         'R>H': 28,
         'JBC': 3,
         'MQWH': 1,
         'JM': 91,
         'DC>': 3,
     

## 3. Merge data:

In [49]:
# One row per SP word: the predicted value, the value imposed from MT
# ('False' when no MT word matched) and the MT flag ('True' or '').
# The frame is built column by column, so lists of unequal length raise
# an error instead of being silently padded with None by a row-wise build
# followed by a transpose.
df = pd.DataFrame({
    'pred_feat': pred_feat,
    'MT_feat': MT_feat,
    'MT_parsing': MT_parsing,
})
df

Unnamed: 0,pred_feat,MT_feat,MT_parsing
0,B,B,True
1,R>CJT,R>CJT,True
2,BR>,BR>,True
3,>LH,>LH,True
4,>T,>T,True
...,...,...,...
29045,WCM,False,
29046,B,B,True
29047,>RN,False,
29048,B,B,True


In [50]:
# Merge with priority for MT: take the MT value wherever the MT flag is set,
# otherwise fall back to the predicted value.
feat_result = [
    mt_value if mt_flag else predicted
    for predicted, mt_value, mt_flag in zip(df['pred_feat'], df['MT_feat'], df['MT_parsing'])
]

# The 'absent' placeholder becomes an empty value in the final column.
feat_result = [re.sub('absent', '', w) for w in feat_result]
df[FEATURE] = feat_result

## Export

In [51]:
# Show which feature is about to be exported.
FEATURE

'g_lex'

In [52]:
# Write the merged feature and its MT flag to ./data/<FEATURE>.csv, without the row index.
export_df = df.loc[:, [FEATURE, 'MT_parsing']]
export_df.to_csv(f'./data/{FEATURE}.csv', index=False)