# Convert the Lexicon to a Standardized JSON Lexicon Format

In [29]:
import collections
import unicodedata
import re
import json
import numpy as np
import pandas as pd
# show up to 100 rows when a DataFrame/Series is displayed
pd.set_option('display.max_rows', 100)
from pathlib import Path
# input glossary (JSON), resolved relative to the notebook's working directory
glossary_file = 'bar glossary general.json'

## Developing standard Lexicon and Analyzed Word JSONs

We need to match words in the text with their lexemes. We should also allow for some
additional data to be added to words later on.

A nice model to follow is that of the Eep Talstra Centre for Bible and Computer, which
encodes data about Hebrew words in two formats: 

* lexicon - 2 columns, first is the lexical form of a word, second is lexicon data (example [here](https://github.com/ETCBC/data_creation/blob/master/grammar_libraries/synvar_library/hebrew/lexicon))
* anwb (or anzb) an analytical word book - 2 columns, first is an inflected surface
    form of a word, second is a tag containing parsing data as well as the lexical form
    of the word. The anwb enables the link to the lexicon file. (example [here](https://raw.githubusercontent.com/ETCBC/data_creation/master/grammar_libraries/standard_library/hebrew/anwb))
    
We will implement a similar approach but with JSON files. The format is simple: a key and a dictionary of values.
For the lexicon, the key is a lexeme string with linked attributes. For the inflections data, the key is an
inflected form, while the attributes contain a link to the lexeme as well as any localized data.

**lexicon**

In [2]:
# Illustrative sketch of the lexicon layout: lexeme id -> dict of attributes.
# NOTE(review): the tag values here are lowercase ('noun', 'm'), whereas the
# codes defined in grammar_data further down are uppercase ('NOUN', 'M').
lexicon_eg = {
    
    'ʾabaya.1': {
        'pos': 'noun',
        'gn': 'm',
        'gloss': "a man's cloak",
        'entry': 'ʾabàyele də́rya b-réšaˈ He put his cloak over her (A26:50)...',
    },
    
    # ...
}

**inflections**

**NB: An inflection can contain attributes that can overwrite attributes from the lexicon.**

Inflections can also match multiple lexemes. Each individual form receives an inflection entry.

In [355]:
# Illustrative sketch of the inflections layout: one dict per surface form.
# 'form' is the inflected form, 'lex' links to the lexeme, and the remaining
# keys are grammatical attributes local to that form.
inflections_eg = [
    {
        'form': 'ʾabbaya',
        'lex': 'ʾabaya', 
        'nu': 'sg',
    },
    {
        'form':'ʾabaye',
        'lex': 'ʾabaya', 
        'nu': 'pl',
    },
    {   
        'form': 'ʾabbaye',
        'lex': 'ʾabaya',
        'nu': 'pl',
    },
    
    #...
]

All text will be stored in decomposed Unicode form. Inflected forms should be written
without accents on vowels.

# Build the standards

We now load the glossary information we have for Barwar. We need to make a few things.

* word grammar standards - standard codes and acceptable tag values for word grammar 
    (e.g. nu=number, lex=lexeme, etc.)
* parse grammatical descriptions
* convert lexicon file
* convert inflections file

First, we load the glossary data and begin inspecting it to construct the standards.

In [412]:
def norm_entry(text):
    """Normalize text by removing style elements/quirks.

    Steps, in order: strip surrounding whitespace, drop markdown-style
    asterisks, drop a single trailing hyphen/dash, map en/em dashes to a
    plain hyphen, turn the '<|>' tag into ' | ', drop one leading and one
    trailing straight single quote, and decompose to Unicode NFD.

    Args:
        text: the raw string to normalize.

    Returns:
        The normalized string in NFD form.
    """
    # remove leading/trailing spaces
    text = text.strip()
    # remove style asterisks
    text = text.replace('*', '')
    # remove end of word hyphen/dash (raw string: '\-' is an invalid
    # escape in a normal string literal and warns on recent Pythons)
    text = re.sub(r'[-–—]$', '', text)
    # replace en/em dashes with a plain hyphen
    text = re.sub(r'—|–', '-', text)
    # replace the '<|>' tag with a spaced bar
    text = text.replace('<|>', ' | ')
    # remove begin/end straight single quotes (U+0027)
    text = re.sub('^\u0027|\u0027$', '', text)
    # all text is stored in decomposed Unicode form
    text = unicodedata.normalize('NFD', text)
    return text

# The glossary holds non-ASCII transcription characters, so read it explicitly
# as UTF-8 instead of relying on the platform's default encoding.
with open(glossary_file, 'r', encoding='utf-8') as infile:
    raw_glossary_data = json.load(infile)

In [413]:
# peek at the first raw glossary record to see which fields an entry carries
raw_glossary_data[0]

{'lemma': 'ʾabaya, ʾabbaya',
 'grm_desc': 'n.m.',
 'forms': [['pl.', 'ʾabaye, ʾabbaye']],
 'trans': '**man’s cloak**',
 'lang': 'A.',
 'examples': '*ʾabàyele də́rya b-réša*<|> He put his cloak over her (A26:50); *ʾɛ́-ga kúlla ʾaġăwáθa šexìye*<|> *kúlla b-ʾabbàye-wawa*<|> At that time all aghas and elders wore the abbaya gown (A26:47); *ʾíman ṱ-ila-nabólle bə̀dra*<|> *m-gu-ʾằra,*<|> *mattíwa ... ʾabàya y-amrə́xwale*<|> When they took it (the rice) to the threshing floor from that ground, they would lay down what we called a cloak (B5:80).',
 'raw_entry': '*ʾabaya, ʾabbaya* n.m. (pl. *ʾabaye, ʾabbaye*) (A.) **man’s cloak** | *ʾabàyele də́rya b-réša*<|> He put his cloak over her (A26:50); *ʾɛ́-ga kúlla ʾaġăwáθa šexìye*<|> *kúlla b-ʾabbàye-wawa*<|> At that time all aghas and elders wore the abbaya gown (A26:47); *ʾíman ṱ-ila-nabólle bə̀dra*<|> *m-gu-ʾằra,*<|> *mattíwa ... ʾabàya y-amrə́xwale*<|> When they took it (the rice) to the threshing floor from that ground, they would lay down what 

**Some complicated forms will not be added due to ambiguities in the entries.
Those are selected and excluded below.**

In [414]:
# Keys that mark an entry as too ambiguous to convert automatically.
AMBIGUOUS_KEYS = {'annotations', 'inbetweens', 'tail'}

# Partition the raw entries in a single pass. An entry is "bad" when it
# carries any ambiguous key or has no grammatical description. Each entry
# lands in exactly one list, so an entry meeting both conditions is counted
# once, and no list-membership scan over dicts is needed.
bads = []
glossary_data = []

for lem in raw_glossary_data:
    if not AMBIGUOUS_KEYS.isdisjoint(lem) or 'grm_desc' not in lem:
        bads.append(lem)
    else:
        glossary_data.append(lem)

print(len(bads), 'bad entries found')
print(len(glossary_data), 'good entries found')

378 bad entries found
3352 good entries found


## Analyzing Grammatical Data

In [415]:
# Tabulate the accepted glossary entries for inspection; fields that are
# missing from an entry become empty strings instead of NaN.
lex_df = pd.DataFrame(glossary_data).fillna('')

lex_df.shape

(3352, 9)

In [416]:
# first rows of the glossary table
lex_df.head()

Unnamed: 0,lemma,grm_desc,forms,trans,lang,examples,raw_entry,meanings,numeral
0,"ʾabaya, ʾabbaya",n.m.,"[[pl., ʾabaye, ʾabbaye]]",**man’s cloak**,A.,*ʾabàyele də́rya b-réša*<|> He put his cloak o...,"*ʾabaya, ʾabbaya* n.m. (pl. *ʾabaye, ʾabbaye*)...",,
1,ʾabona,n.m.,"[[pl., ʾabone]]",**bishop**,A.,,*ʾabona* n.m. (pl. *ʾabone*) (A.) **bishop**,,
2,ʾăbu,part.,,**attributive particle** (A. literally: father...,,*ʾăbu-bə̀rqa*<|> electrician(s) (B10:50); *băy...,*ʾăbu* part. **attributive particle** (A. lite...,,
3,ʾAðər,n.m.,,**March**,,,*ʾAðər* n.m. **March**,,
4,ʾadət,n.f.,"[[pl., ʾadətte, ʾadəttaθa]]",**custom**,,*ʾáxni ʾádət díyən hàtxɛla*<|> Our custom ...,"*ʾadət* n.f. (pl. *ʾadətte, ʾadəttaθa*) **cust...",,


In [417]:
# Literal substring match: the tag contains '.', which would otherwise be
# interpreted as a regex wildcard by str.contains.
lex_df[lex_df.grm_desc.str.contains('n.m. adv./adj.', regex=False)]

Unnamed: 0,lemma,grm_desc,forms,trans,lang,examples,raw_entry,meanings,numeral
2284,rəḥqa,n.m. adv./adj.,,**distance; distant.**,,,*rəḥqa* n.m. adv./adj. **distance; distant.** ...,[(1) **distance** | *xá-bena xzéle m-rə́ḥqa ...,


In [418]:
# frequency of each distinct grammatical description string
lex_df.grm_desc.value_counts()

n.m.                1460
n.f.                1011
adj.                 309
n.pl.                110
adj. invar.           71
adv.                  66
part.                 65
n.m./adj.             49
num.                  37
n.m./f.               26
n.m                   22
m.                    19
interj.               18
prep.                 11
mod.                   7
pron. pl.              6
n.f./adj.              4
pron. fs.              4
n.f                    4
pron. ms.              4
f.                     3
n.f./m.                3
m.n.                   3
mod. adv.              3
n.pl.tan.              3
pron.                  3
n.m/adj.               2
adj. m.                2
pron. cs.              2
adj. cs.               2
n.m. /adj.             2
part., prep.           1
imper. pl.             1
pl. tan.               1
n.f./adv.              1
adj. adv. mod.         1
pl.                    1
n.f./adj..             1
part. pron.            1
pl.tan.                1


## Word grammar 

In [419]:
# Destination of the word grammar standard.
# NOTE(review): this is an absolute path on one user's machine; the cell only
# runs elsewhere if the path is adjusted.
word_grammar = Path('/Users/cody/github/CambridgeSemiticsLab/nena_corpus/standards/word_grammar.json')

# Word grammar standard: feature name -> {'desc': human-readable description,
# 'tags': {tag code: meaning}}.
grammar_data = {
    'pos': {
        'desc': 'part of speech',
        'tags': {
            'NOUN': 'noun',
            'VERB': 'verb',
            'PREP': 'preposition',
            'ADJV': 'adjective',
            'ADVB': 'adverb',
            'PART': 'particle',
            'MODI': 'non-attributive modifier',
            'PRON': 'pronoun',
            'INTJ': 'interjection',
            'NUMR': 'numeral',
        }
    },
    'form': {
        'desc': 'variability of word form',
        'tags': {
            'INVAR': 'lexeme is invariable',
        }
    },
    'st': {
        'desc': 'state',
        'tags': {
            'C': 'construct',
            'A': 'absolute',
        }
    },
    'gn': {
        'desc': 'gender',
        'tags': {
            'F': 'feminine',
            'M': 'masculine',
            'MF': 'either masculine or feminine',
            'C': 'common',
        }
    },
    'nu': {
        'desc': 'number',
        'tags': {
            'SG': 'singular',
            'PL': 'plural',
        }
    },
    'nu_class': {
        'desc': 'semantic class of number',
        'tags': {
            'TANT': 'pluralis tantum: noun with only a plural form'
        }
    },
    'trns': {
        'desc': 'transitivity',
        'tags': {
            'TR': 'transitive',
            'ITR': 'intransitive',
        }
    },
    'syn': {
        'desc': 'synonymy/antonymy',
        'tags': {
            'AN': 'antonym',
            'SN': 'synonym',
        }
    },
    'stem': {
        'desc': 'stem of a verb',
        'tags': {
            'I': 'stem I verb',
            'II': 'stem II verb',
            'III': 'stem III verb',
            'Q': 'quadriliteral verb',
        }
    },
    'tense': {
        'desc': 'grammatical tense of a verb',
        'tags': {
            'IMP': 'imperative',
        }
    }
}

# write the standard out as indented JSON
with open(word_grammar, 'w') as outfile:
    json.dump(grammar_data, outfile, ensure_ascii=False, indent=4)

## Process Glossary Data


In [420]:
def test_pattern(pattern):
    """Print which unique grm_desc tags a regex matches and which it misses.

    Reads the unique values of lex_df.grm_desc, skips non-string values, and
    prints the matching tags under 'MATCHED:' followed by the remaining tags
    under 'NOT MATCHED'.
    """
    def show(want_match):
        # print every string tag whose match status equals want_match
        for tag in lex_df.grm_desc.unique():
            if type(tag) == str and bool(re.search(pattern, tag)) == want_match:
                print(tag)

    print('MATCHED:')
    show(True)
    print()
    print('NOT MATCHED')
    show(False)

In [421]:
# a series of regex patterns to parse grammatical data
# Regex patterns used to parse a grammatical description string.
# Each item is [category, [(pattern, tag), ...]]: when a pattern is found in
# the description, the tag is recorded under that category. All patterns are
# raw strings so that regex escapes such as '\.' are not read as (invalid)
# Python string escapes.
re_to_gram_raw = [
    [
        'pos',
        [
            (r'\.n\.|^n\.', 'NOUN'),
            (r'adj', 'ADJV'),
            (r'adv', 'ADVB'),
            (r'part', 'PART'),
            (r'num', 'NUMR'),
            (r'interj', 'INTJ'),
            (r'prep', 'PREP'),
            (r'mod', 'MODI'),
            (r'pron', 'PRON'),
            (r'imper', 'VERB'),
        ]
    ],
    [
        'form',
        [
            (r'inv', 'INVAR')
        ]
    ],
    [
        'st',
        [
            (r'cst', 'C'),
            (r'abs', 'A'),
        ]
    ],
    [
        'gn',
        [
            (r'[^\w]m[^\w]|^m[^\w]|[^\w]m$|[^\w]ms\.|^ms\.|mpl', 'M'),
            (r'[^\w]f[^\w]|^f[^\w]|[^\w]f$|[^\w]fs\.|^fs\.|fpl', 'F'),
            (r'[^\w]cs[^\w]|^cs[^\w]|[^\w]cs$', 'C'),
        ]
    ],
    [
        'nu',
        [
            (r'[mf]s|sing|sg', 'SG'),
            (r'pl', 'PL'),
        ]
    ],
    [
        'nu_class',
        [
            (r'tan', 'TANT'),
        ]
    ],
    [
        'trns',
        [
            # the lookbehind keeps 'tr.' from also firing inside 'intr.',
            # which would produce two conflicting transitivity values
            (r'(?<!in)tr\.', 'TR'),
            (r'intr\.', 'ITR'),
        ]
    ],
    [
        'syn',
        [
            (r'anton\.', 'AN'),
            (r'synon\.', 'SN')
        ]
    ],
    [
        'tense',
        [
            (r'imper\.', 'IMP')
        ]
    ]
]

# Same table with every pattern pre-compiled: [category, [(regex, tag), ...]]
re_to_gram = []

for gram, patts in re_to_gram_raw:
    patts_comp = [(re.compile(patt), repl) for patt, repl in patts]
    re_to_gram.append([gram, patts_comp])

In [422]:
# test_pattern(r'tan')

### Construct Lexicon Entries

In [423]:
# NOTE(review): parse_tags is defined in the cell that follows this one, so on
# a fresh kernel (Restart & Run All) this call raises NameError.
parse_tags('n.f./m.')

{'pos': ['NOUN'], 'gn': 'MF'}

In [424]:
def parse_tags(tag_string):
    """Parse a grammatical description string into tag values.

    Every pattern in re_to_gram is searched for in tag_string; the tags of
    the patterns that hit are gathered per category (in table order) and
    the result is passed through clean_duplicates.
    """
    matches = {}
    for category, compiled_pairs in re_to_gram:
        hits = [value for regex, value in compiled_pairs
                if regex.search(tag_string)]
        if hits:
            matches[category] = hits
    return clean_duplicates(matches, tag_string)

def clean_duplicates(parsed_data, tag):
    """Collapse the per-category value lists of a parsed tag string.

    Args:
        parsed_data: mapping of category -> list of matched tag values.
        tag: the original description string (used in error messages).

    Returns:
        A new dict. 'pos' is always a list: the single value, or only the
        first value with the full list kept under 'pos_context'. Any other
        category with one value maps to that bare value. A 'gn' category
        containing both 'M' and 'F' maps to 'MF'.

    Raises:
        Exception: on any other category with several values.
    """
    new_data = {}
    for cat, values in parsed_data.items():
        single = len(values) == 1

        # part of speech: keep a list, remember the rest as context
        if cat == 'pos':
            if single:
                new_data['pos'] = values
            else:
                new_data[cat] = [values[0]]
                new_data['pos_context'] = values
            continue

        # unambiguous value: unwrap it
        if single:
            new_data[cat] = values[0]
            continue

        # only gender may legitimately carry two values (M + F)
        if cat != 'gn':
            raise Exception(f'multiple {cat} values found in {values}')

        if not ('M' in values and 'F' in values):
            raise Exception(f'multiple genders found in {tag}')
        new_data['gn'] = 'MF'

    return new_data
        
def add_lex_number(form, pos, lnum):
    """Return the lexeme id for a form and its disambiguation number.

    The first lexeme (lnum == 1) is the bare form; later ones get the
    number appended after a dot, so numbering effectively starts at 2.
    The pos argument is accepted but not used.
    """
    parts = [form]
    if lnum != 1:
        parts.append(str(lnum))
    return '.'.join(parts)

def get_lex_id(form, pos, lexicon):
    """Find the first unused lexeme id for a surface form.

    Candidate ids are produced by add_lex_number with numbers 1, 2, 3, ...
    and the first one that is not already a key of lexicon is returned.
    """
    lnum = 1
    while True:
        candidate = add_lex_number(form, pos, lnum)
        if candidate not in lexicon:
            return candidate
        lnum += 1

def clean_empty_values(data_dict):
    """Return a copy of data_dict without the items whose value is falsy."""
    kept = {}
    for key, value in data_dict.items():
        if value:
            kept[key] = value
    return kept

def build_lexeme(form, gloss_data, parsed_data,
                 lexicon, default_pos=''):
    """Build a lexeme from its parsed data and gloss data.

    Args:
        form: lexical form of the word; normalized with norm_entry
        gloss_data: data direct from the glossary; the keys 'trans',
            'meanings', 'examples' and 'lang' are read when present
        parsed_data: parsing data from the grammatical
            tag
        lexicon: lexemes built so far, used to pick an unused id
        default_pos: supply part of speech value
            if missing; some forms only have gender,
            and the pos is inherited from the entry

    Yields:
        (lex_id, lex_data) for every part of speech in parsed_data
        (or for default_pos when there is none). Empty values are
        dropped from lex_data.

    Raises:
        Exception: if a part of speech is blank.
    """

    form = norm_entry(form)
    gloss = norm_entry(gloss_data.get('trans',''))
    meanings = '; '.join(gloss_data.get('meanings',[]))
    entry = norm_entry(gloss_data.get('examples','') + meanings)
    lang = norm_entry(gloss_data.get('lang',''))

    grammar = {k:v for k,v in parsed_data.items() if k != 'pos'}
    # The grammatical category 'form' (e.g. INVAR) shares its key with the
    # lexeme's own 'form' string and used to overwrite it. Keep the string
    # under 'form' and store the tag under a separate key.
    # NOTE(review): 'form_var' is a new key; align the grammar standard with
    # it or choose another name.
    if 'form' in grammar:
        grammar['form_var'] = grammar.pop('form')

    for pos in parsed_data.get('pos', [default_pos]):
        if not pos:
            raise Exception('blank pos encountered')

        # add data in desired order / headings
        lex_data = {
            'form': form,
            'pos': pos,
        }
        lex_data.update(grammar)
        lex_data.update({
            'gloss': gloss,
            'entry': entry,
            'lang': lang,
        })
        lex_data = clean_empty_values(lex_data)
        lex_id = get_lex_id(form, pos, lexicon)
        yield (lex_id, lex_data)
        
def build_inflection(form, parsed_data, new_parses=None, default_pos=''):
    """Build inflection records for one surface form.

    Args:
        form: inflected surface form; normalized with norm_entry
        parsed_data: parsing data inherited from the lexeme's
            grammatical tag
        new_parses: parsing data local to this form; its values
            overwrite those inherited from parsed_data
        default_pos: part of speech used when parsed_data has none

    Yields:
        One dict per part of speech, holding 'form', 'pos' and the
        merged grammatical attributes.
    """
    # was normalize_entry, a name that is not defined in this notebook
    form = norm_entry(form)

    grammar = {k:v for k,v in parsed_data.items() if k != 'pos'}
    grammar.update(new_parses or {}) # write over with localized parses
    # The grammatical category 'form' (e.g. INVAR) shares its key with the
    # surface form and used to overwrite it. Keep the surface string under
    # 'form' and store the tag under a separate key.
    # NOTE(review): 'form_var' is a new key; align the grammar standard with
    # it or choose another name.
    if 'form' in grammar:
        grammar['form_var'] = grammar.pop('form')

    for pos in parsed_data.get('pos', [default_pos]):
        data = {'form': form, 'pos': pos}
        data.update(grammar)
        yield data

In [425]:
def split_lem(lemma):
    """Splits a composite lemma into multiple forms.

    Forms are separated by ';' or ',' with optional surrounding
    whitespace. Returns the list of forms in their original order.
    """
    # raw string: '\s' is an invalid escape in a normal string literal
    return re.split(r'\s*[;,]\s*', lemma)

def split_forms(form):
    """Split a glossary form entry into its individual words.

    Args:
        form: a (grammatical tag, comma-separated word forms) pair.

    Yields:
        A single (grm_tag, words) pair, where words is the list of
        whitespace-stripped word forms. It is yielded once; yielding it
        once per word made callers that loop over the list process every
        word len(words) times.
    """
    grm_tag, wordforms = form
    words = [word.strip() for word in wordforms.split(',')]
    yield (grm_tag, words)

In [429]:
# lexeme id -> lexeme data
lexicon = {}
# flat list of inflection records
inflections = []

for lem in glossary_data:

    # grammatical tags of the entry and every spelling of its lemma
    parsed_grm = parse_tags(lem['grm_desc'])
    lemma_forms = split_lem(lem['lemma'])

    # the first spelling names the lexeme;
    # 'NOUN' is the default pos for when only gender is given
    new_lexemes = list(build_lexeme(lemma_forms[0], lem, parsed_grm, lexicon, 'NOUN'))
    lexicon.update(new_lexemes)
    lexemes = [lex_id for lex_id, _ in new_lexemes]

    for lexeme in lexemes:
        # every inflection below links back to this lexeme
        parsed_grm['lex'] = lexeme

        # the lemma spellings themselves
        for lemma_form in lemma_forms:
            inflections.extend(
                build_inflection(lemma_form, parsed_grm, default_pos='NOUN')
            )

        # the listed inflected forms, each with its own local tags
        for form_entry in lem.get('forms', []):
            for grm_tag, wordforms in split_forms(form_entry):
                form_parsed_grm = parse_tags(grm_tag)
                for wordform in wordforms:
                    inflections.extend(
                        build_inflection(wordform, parsed_grm, form_parsed_grm, 'NOUN')
                    )

In [430]:
# ensure_ascii=False writes the transcription characters verbatim, so the
# file must be opened as UTF-8 rather than with the platform default encoding.
with open('lexicon.json', 'w', encoding='utf-8') as outfile:
    json.dump(lexicon, outfile, ensure_ascii=False, indent=4)

In [431]:
# ensure_ascii=False writes the transcription characters verbatim, so the
# file must be opened as UTF-8 rather than with the platform default encoding.
with open('inflections.json', 'w', encoding='utf-8') as outfile:
    json.dump(inflections, outfile, ensure_ascii=False, indent=4)