# Convert the Lexicon to a Standardized JSON Lexicon Format

In [52]:
import collections
import unicodedata
import re
import json
import pandas as pd
from pathlib import Path
glossary_file = 'bar glossary general.txt'

## Developing standard Lexicon and Analyzed Word JSONs

We need to match words in the text with their lexemes. We should also allow for some
additional data to be added to words later on.

A nice model to follow is that of the Eep Talstra Centre for Bible and Computer, which
encodes data about Hebrew words in two formats: 

* lexicon - 2 columns, first is the lexical form of a word, second is lexicon data (example [here](https://github.com/ETCBC/data_creation/blob/master/grammar_libraries/synvar_library/hebrew/lexicon))
* anwb (or anzb), an analytical word book - 2 columns, first is an inflected surface
    form of a word, second is a tag containing parsing data as well as the lexical form
    of the word. The anwb enables the link to the lexicon file. (example [here](https://raw.githubusercontent.com/ETCBC/data_creation/master/grammar_libraries/standard_library/hebrew/anwb))
    
We will implement a similar approach but with JSON files. The format is simple: a key and dictionary of values.
For the lexicon, the key is a lexeme string with linked attributes. For the inflections data, the key is an
inflected form, while the attributes contain a link to the lexeme as well as any localized data.

**lexicon**

In [14]:
# Illustrative lexicon: each key is a lexeme string, each value holds the
# attributes recorded for that lexeme.
lexicon_eg = {
    'ʾabaya': dict(
        pos='noun',
        gn='m',
        gloss="a man's cloak",
        examples='ʾabàyele də́rya b-réšaˈ He put his cloak over her (A26:50)...',
    ),
    # ... further lexemes follow the same shape
}

**inflections**

**NB: An inflection can contain attributes that can overwrite attributes from the lexicon.**

In [51]:
# Illustrative inflections: each key is an inflected surface form; 'lex' links
# the form to its lexeme and must therefore equal a key of the lexicon
# (the example lexicon uses the bare lexeme string 'ʾabaya').
# Any further attribute (e.g. 'nu') is local to the inflected form and can
# overwrite the attribute of the same name inherited from the lexicon.
inflections_eg = {

    'ʾabbaya': {
        'lex': 'ʾabaya',
        'nu': 'sg',
    },
    'ʾabaye': {
        'lex': 'ʾabaya',
        'nu': 'pl',
    },
    'ʾabbaye': {
        'lex': 'ʾabaya',
        'nu': 'pl',
    },

    #...
}

All text will be stored in decomposed Unicode form.

# Build the standards

We now load the glossary information we have for Barwar. We need to make a few things.

* word grammar standards - standard codes and acceptable tag values for word grammar 
    (e.g. nu=number, lex=lexeme, etc.)
* parse grammatical descriptions
* convert lexicon file
* convert inflections file

First, we load the glossary data and begin inspecting it to construct the standards.

In [20]:
def normalize_entry(text):
    """Return `text` cleaned of glossary styling artefacts.

    Steps, applied in this order:
      1. strip leading and trailing whitespace
      2. delete every asterisk (used as a style marker)
      3. delete one single quote at the very start and/or very end
      4. normalize the result to Unicode NFC
    """
    without_style = text.strip().replace('*', '')
    without_quotes = re.sub('^\u0027|\u0027$', '', without_style)
    return unicodedata.normalize('NFC', without_quotes)
    
# Load the raw glossary. Entries are separated by an empty line and every
# line inside an entry has the shape "key: value".
# The glossary is full of non-ASCII transcription characters, so it is decoded
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open(glossary_file, 'r', encoding='utf-8') as infile:
    glosses = infile.read()

glosses = re.split('\n\n', glosses)
gloss_data = []
for lemma in glosses:
    lemma_data = {}
    for dataline in lemma.split('\n'):
        # blank lines (e.g. from a trailing newline at the end of the file)
        # hold no key/value pair and would break the unpacking below
        if not dataline.strip():
            continue
        # split on the first colon only: values may themselves contain colons
        key, value = dataline.split(':', 1)
        key, value = key.strip(), value.strip()
        lemma_data[key] = value

    # an entry consisting solely of blank lines carries no data
    if lemma_data:
        gloss_data.append(lemma_data)
        
# Clean up the raw glossary entries and store them under a running integer id,
# ready for matching with word instances in the text.
nena_glosses = {}
for gl_id, gloss in enumerate(gloss_data):
    data = {}
    # the lemma field may list several variants separated by "," or ";"
    data['lemmas'] = tuple(
        normalize_entry(ent) for ent in re.split('[,;]', gloss['lemma'])
    )
    data['lemma'] = normalize_entry(gloss['lemma'])
    data['grm_desc'] = normalize_entry(gloss.get('grm_desc', ''))
    # NOTE(review): eval() executes whatever expression the glossary file
    # contains. It is kept because the exact shape of the 'forms' field cannot
    # be confirmed here; if it is always a plain literal list of
    # (label, forms) pairs, ast.literal_eval would be the safe replacement.
    forms = eval(gloss.get('forms', '[]'))
    # one (label, form) pair per individual form; the loop variable gets its
    # own name so it no longer shadows `forms`
    data['forms'] = [
        (fdata, normalize_entry(form))
            for fdata, form_list in forms
                for form in re.split('[;,]', form_list)
    ]
    # examples keep their asterisk markup; only the wrapping quotes are removed
    data['examples'] = unicodedata.normalize('NFC',
        re.sub('^\u0027|\u0027$', '', gloss.get('examples',''))
    )
    # drop parenthesised section references such as "(§1.2, §3)";
    # raw string so that \( \d \s are regex escapes, not invalid str escapes
    data['trans'] = normalize_entry(
        re.sub(r'\(§[\d.,;\s§]+\)', '', gloss.get('trans',''))
    )

    data['lang'] = normalize_entry(gloss.get('lang',''))
    data['ref'] = normalize_entry(gloss.get('ref', ''))
    nena_glosses[gl_id] = data
    
# Prepare the matchset for matching surface forms: a list of
# (compiled pattern, (lemma id, form type)) pairs. Plain lemmas get an empty
# form type; listed forms carry the label given in the glossary.
# The strings are literal word forms, not regular expressions, so they are
# escaped before being anchored with ^...$.
gloss_matchset = []

for lem_id, lemdat in nena_glosses.items():
    for lemma in lemdat['lemmas']:
        lemma = lemma.strip('-')
        # an empty pattern could only ever match an empty token
        if not lemma:
            continue
        gloss_matchset.append(
            (re.compile(f'^{re.escape(lemma)}$'), (lem_id, ''))
        )
    for form in lemdat['forms']:
        form_type = form[0]
        for formstring in form[1:]:
            formstring = formstring.strip('-')
            if not formstring:
                continue
            gloss_matchset.append(
                (re.compile(f'^{re.escape(formstring)}$'), (lem_id, form_type))
            )

In [21]:
# Sanity check: report how many compiled surface-form patterns were collected.
print(f'{len(gloss_matchset)} surface forms ready for match attempts...')

6674 surface forms ready for match attempts...


In [22]:
# Inspect the cleaned entry stored under id 0.
nena_glosses[0]

{'lemmas': ('ʾabaya', 'ʾabbaya'),
 'lemma': 'ʾabaya, ʾabbaya',
 'grm_desc': 'n.m.',
 'forms': [('pl.', 'ʾabaye'), ('pl.', 'ʾabbaye')],
 'examples': '*ʾabàyele də́rya b-réšaˈ* He put his cloak over her (A26:50); *ʾɛ́-ga kúlla ʾaġăwáθa šexìyeˈ kúlla b-ʾabbàye-wawaˈ* At that time all aghas and elders wore the abbaya gown (A26:47); *ʾíman ṱ-ila-nabólle bə̀draˈ m-gu-ʾằra,ˈ mattíwa ... ʾabàya y-amrə́xwaleˈ* When they took it (the rice) to the threshing floor from that ground, they would lay down what we called a cloak (B5:80).',
 'trans': 'man’s cloak',
 'lang': 'A.',
 'ref': ''}

In [26]:
# Tabulate the cleaned glossary entries (one dict per entry) for inspection.
lex_df = pd.DataFrame(nena_glosses.values())

# (number of rows, number of columns)
lex_df.shape

(3687, 8)

In [27]:
# Preview the first rows of the lexicon table.
lex_df.head()

Unnamed: 0,lemmas,lemma,grm_desc,forms,examples,trans,lang,ref
0,"(ʾabaya, ʾabbaya)","ʾabaya, ʾabbaya",n.m.,"[(pl., ʾabaye), (pl., ʾabbaye)]",*ʾabàyele də́rya b-réšaˈ* He put his cloak ove...,man’s cloak,A.,
1,"(ʾabona,)",ʾabona,n.m.,"[(pl., ʾabone)]",,bishop,A.,
2,"(ʾabresəm,)",ʾabresəm,,[],,,,habresəm
3,"(ʾăbu,)",ʾăbu,part.,[],*ʾăbu-bə̀rqaˈ* electrician(s) (B10:50); *băyán...,attributive particle (A. literally: father of),,
4,"(ʾAðər,)",ʾAðər,n.m.,[],,March,,


In [43]:
# Frequency of each distinct grammatical description (top 60), to see which
# spellings the tag standard has to cover.
lex_df['grm_desc'].value_counts().head(60)

n.m.                1492
n.f.                1043
adj.                 330
                     196
n.pl.                116
part.                 73
adj. invar.           71
adv.                  69
n.m./adj.             51
num.                  39
prep.                 31
n.m./f.               26
n.m                   22
m.                    19
interj.               18
mod.                   7
pron. pl.              6
n.m.,                  5
pron. fs.              4
pron. ms.              4
n.f./adj.              4
n.f                    4
n.pl.tan.              3
m.n.                   3
n.f./m.                3
f.                     3
mod. adv.              3
n.f.,                  3
pron.                  3
n.m. /adj.             2
adj. m.                2
pl.                    2
pron. cs.              2
part. pron.            2
n.m/adj.               2
adj. cs.               2
n.f. pl.               1
n.f./adj..             1
adj.f.                 1
adj./adv. invar.       1


In [None]:
# TODO: placeholder, not yet filled in (this cell has not been executed).
# NOTE(review): presumably meant to hold (regex pattern, tag) pairs that map
# grammatical descriptions to standard tags; confirm the intended shape.
re_to_tag = [
    ('', '')

]

In [49]:
# Rows whose grammatical description contains the literal abbreviation "num.".
# regex=False: str.contains treats its pattern as a regular expression by
# default, in which case the "." would match any character.
lex_df[lex_df.grm_desc.str.contains('num.', regex=False)]

Unnamed: 0,lemmas,lemma,grm_desc,forms,examples,trans,lang,ref
64,"(ʾarba, ʾarpa)","ʾarba, ʾarpa",num.,"[(f., ʾarbe)]",*ʾurzət ʾarba* the fourth man; *baxtət ʾarbe* ...,four,,
65,"(ʾarbassər,)",ʾarbassər,num.,[],,fourteen,,
66,"(ʾarbaθn–,)",ʾarbaθn–,num.,[],base for pronominal suffixes: *ʾarbaθnən* the ...,four of,,
67,"(ʾarbi,)",ʾarbi,num.,[],*b-ʾarbìye* in the (19)40s (B5:161),forty,,
124,"(ʾawwal, ʾawwəl)","ʾawwal, ʾawwəl",num.,[],*ʾáwwəl dórta* the first round (B10:13); *ʾáww...,first,A.,
165,"(ʾəč̣č̣a,)",ʾəč̣č̣a,num.,"[(f., ʾəčča)]",*ʾurzət ʾəč̣č̣a* the ninth man; *baxtət ʾəčča*...,nine,,
166,"(ʾəč̣č̣assər,)",ʾəč̣č̣assər,num.,[],,nineteen,,
167,"(ʾəč̣č̣aθn–,)",ʾəč̣č̣aθn–,num.,[],base for pronominal suffixes: *ʾəč̣č̣aθnən* th...,nine of,,
168,"(ʾəč̣č̣i,)",ʾəč̣č̣i,num.,[],,ninety,,
199,"(ʾəsri,)",ʾəsri,num.,[],,twenty,,


## Word grammar 

In [57]:
# Destination file for the word-grammar standard.
# NOTE(review): absolute, machine-specific path; it only resolves on the
# original author's machine. Consider deriving it from a configurable
# repository root instead.
word_grammar = Path('/Users/cody/github/CambridgeSemiticsLab/nena_corpus/standards/word_grammar.json')

# Word-grammar standard: feature code -> description ('desc') plus the
# accepted tag values for that feature ('tags': tag -> meaning).
grammar_data = {
    'pos': dict(
        desc='part of speech',
        tags=dict(
            NOUN='noun',
            VERB='verb',
            PREP='preposition',
            ADJV='adjective',
            ADVB='adverb',
            PART='particle',
            MODI='non-attributive modifier',
            PRON='pronoun',
            INTJ='interjection',
            NUMR='numeral',
        ),
    ),
    'form': dict(
        desc='variability of word form',
        tags=dict(INVAR='lexeme is invariable'),
    ),
    'st': dict(
        desc='state',
        tags=dict(C='construct'),
    ),
    'gn': dict(
        desc='gender',
        tags=dict(F='feminine', M='masculine', C='common'),
    ),
    'nu': dict(
        desc='number',
        tags=dict(SG='singular', PL='plural'),
    ),
    'nu_class': dict(
        desc='semantic class of number',
        tags=dict(TANT='pluralis tantum: noun with only a plural form'),
    ),
    'trns': dict(
        desc='transitivity',
        tags=dict(TR='transitive', INTR='intransitive'),
    ),
    'syn': dict(
        desc='synonymy/antonymy',
        tags=dict(AN='antonym', SN='synonym'),
    ),
    'stem': dict(
        desc='stem of a verb',
        tags=dict(
            I='stem I verb',
            II='stem II verb',
            III='stem III verb',
            Q='quadriliteral verb',
        ),
    ),
    'tense': dict(
        desc='grammatical tense of a verb',
        tags=dict(IMP='imperative'),
    ),
}

# Write the word-grammar standard to disk as indented JSON.
# ensure_ascii=False makes json emit non-ASCII characters verbatim, so the
# file encoding is pinned to UTF-8 rather than left to the platform default.
with open(word_grammar, 'w', encoding='utf-8') as outfile:
    json.dump(grammar_data, outfile, ensure_ascii=False, indent=4)