# Convert Barwar lexicon

In [10]:
import os
import re
import json
from html_to_nena import html_elements, Text, element_totext, normalize_styles, text_tostring, str_replace

In [2]:
# # Replace substrings provided in `replace`
# if replace:
#     s = str_replace(s, replace)
replace = {
    # standardizing substutions
    '\u2011': '\u002d',  # non-breaking hyphen to normal hyphen-minus
    '\u01dd': '\u0259',  # 'ǝ' "turned e" to 'ə' schwa
    '\uf1ea': '\u003d',  # SIL deprecated double hyphen to '=' equals sign
    '\u2026': '...',  # '…' horizontal ellipsis to three dots
    'J\u0335': '\u0248',  # 'J' + short combining stroke to 'Ɉ' J with stroke
    'J\u0336': '\u0248',  # J' + long combining stroke to 'Ɉ' J with stroke
    '<y>': '\u02b8',  # superscript small letter y
    # special corrections for lexicon
    '*\u02b8*': '\u02b8',  # superscript small letter y in italic context
    '<|>': '\u02c8',  # superscript vertical bar
    '*\u02c8 *': '\u02c8 ',  # superscript vertical bar in italic text
    '*\u02c8': '\u02c8*',  # superscript vertical bar after italic text
    ' |** ': '** | ',  # vertical bar accidentally marked bold
    ' **|** ': ' | ',  # vertical bar accidentally marked bold
    ' *| ': ' | *',  # vertical bar accidentally marked italic
    ' | (': ' (',  # superfluous vertical bar before numbered translations
    '**; **': '; ', # unmarked punctuation between marked text
    '*, *': ', ', # unmarked punctuation between marked text
    '**.**': '.', # dot accidentally marked bold
}

replace = {re.escape(pattern): replace for pattern, replace in replace.items()}

filename = 'bar glossary general.html'

In [55]:
# these are the strings that can make up the grammatical categories of words
# in a non-capturing group (starting with ?:) of alternatives, sorted longest first
grm_cats = '(?:abs|and|adj|adv|cst|cs|fpl|fs|f|imper|interj|invar|mod|ms|m|num|n|part|pl|prep|pron|sing|tant|tan)'

languages = '(?:A|E|K|P|Ṭiy|Ṭiy|T|Urm|C.Syr)'

def parselemma(s):
        
    raw_entry = s
        
    # first try to strip off numbered list of meanings: (1) ... (2) ... etc
    result = re.split(' (\(\d+\)) ', s)
    # if not found, try other pattern for numbered list: 1. ... 2. ... etc
    # (in Barwar lexicon: only for one lemma: 'kapora')
    if len(result) == 1:
        result = re.split(' (\d+\.) ', s)
    # if there are any meanings, recombine with their numbers in list of strings;
    # otherwise, empty list
    meanings = [f'{result[i]} {result[i+1]}' for i in range(1, len(result), 2)]
    # reassign s to part of string before numbered meanings (if any)
    s = result[0]
    
    # then try to strip off any examples, separated from the lemma by '|'
    result = re.split('\s*\|\s*', s, 1)
    # examples will be empty string if there are none
    examples = result[1] if len(result) > 1 else ''
    # reassign s to part of string before examples (if any)
    s = result[0]
    
    # split on the first bold marker `**` to get the translation.
    # since bold and italic text messes up the markers,
    # cannot just select from start to end.
    result = re.split('\s*(\*\*)\s*', s, 1)
    if len(result) > 1:
        # small fix for when opening brackets of translation aren't marked bold
        if result[0].endswith(' ('):
            result[0] = result[0][:-2]
            result[2] = '(' + result[2]
        trans = result[1] + result[2]
    else:
        trans = ''
    s = result[0]
    
    # split on ' → ' for references
    result = re.split('\s*→\s*', s, 1)
    # reference will be empty string if there is none
    reference = result[1] if len(result) > 1 else ''
    # reassign s to part of string before reference (if any)
    s = result[0]
    
    # now, we will continue by looking at the start of the string for the lemma head
    result = re.split('^\*([^*]+)\*\s*', s)
    if len(result) == 1:
        raise ValueError('No lemma at start of string:', s)
    lemma = result[1]
    s = result[2]
    
    # some lemmata have a roman numeral following it (though it seems unnecessary)
    result = re.split('^\(([ivx]+)\)\s*', s)
    numeral = result[1] if len(result) > 1 else ''
    # last element of result contains remaining string, whether numeral is found or not
    s = result[-1]
    
    # now s should start with a grammatical category of a word, such as 'n.f.'.
    # remove grammatical categories from the start of the string until there are no more
#     grm_desc = ''
#     while s:
#         result = re.split(f'^({grm_cats}(?:[ /.,]+|$))', s)
#         if len(result) == 1:
#             break
#         # if there is a match, element 0 is always empty (since match is from start of string);
#         # element 1 contains the match, element 2 the remainder of the string
#         grm_desc += result[1]
#         s = result[2]
#     grm_desc = grm_desc.rstrip()
    # like above, but with repeated non-capturing group instead of while loop
    result = re.split(f'^((?:{grm_cats}(?:[.,/ ]+|$))+)', s)
    # element 0 is always empty (bcs of '^' in regex); if match, element 1 contains match
    grm_desc = result[1].rstrip() if len(result) > 1 else ''
    # last element of result contains remaining string, whether match is found or not
    s = result[-1]
        
    # split on round brackets to get annotations for lemma.
    # even elements in position >= 2 should be empty
    # e.g.: '*word* n.m. (pl. *words*) (E.)' results in:
    # ['*word* n.m.', 'pl. *words*', '', 'E.', '']
    result = re.split('\s*\(([^)]+)\)\s*', s)
    # list of annotations, if any, else empty
    annotations = [result[i] for i in range(1, len(result), 2)]
    
    # list of things in between (should all be empty strings! just checking!)
    inbetweens = [result[i] for i in range(2, len(result), 2) if result[i]]
    
    # everything after lemma and before annotations is the tail
    tail = result[0]
    
    # extract forms from annotations
    resultlist = []
    forms = []
    for e in annotations:
        # split word forms in format 'f. *worda, wordb*, pl. *wordc*' from start of string
        result = re.split(fr'^((?:(?:{grm_cats}\.?\s+)+\*[^*]+\*(?:[,;]?\s|$))+)', e)
        match = result[1] if len(result) > 1 else None
        resultlist.append(result[-1])
        if match:
            # split matched string into seperate word forms and their grammatical decribers
            result = re.split(fr'((?:{grm_cats}\.?\s+)+)\*([^*]+)\*(?:[,;]?\s|$)', match)
            # the above example results in:
            # ['', 'f.', 'worda, wordb', '', 'pl.', 'ʾwordc', '']
            # The first regex selecting the whole string makes sure that nothing can occur
            # in the elements 0, 3, 6 etc., so we can safely skip those
            forms += [(result[i].rstrip(), result[i+1]) for i in range(1, len(result), 3)]
    annotations = [e for e in resultlist if e]

    # extract languages from annotations
    resultlist = []
    lang = ''
    for e in annotations:
        # make sure that only one language string is returned
        if lang:
            resultlist.append(e)
            continue 
        result = re.split(fr'^((?:{languages}\.)(?:/{languages}\.)*)', e)
        if not lang and len(result) > 1:
            lang = result[1]
            break 
        resultlist.append(result[-1])
    annotations = [e for e in resultlist if e]
    
    # TODO most stuff is extracted. Some stuff however does not fit the regexes.
    # (Remains of) unconventional notations can be seen in the tail, annotations
    # and (especially) the inbetweens variables. Some of these may require manual
    # corrections, others may be parsed by more sophisticated regexes.
    # (e.g.: alternative forms in the notation: 'sing. *wordform* f.')
    
    # TODO Both `meanings` and `examples` are mostly structured to some degree.
    # They could be parsed further (e.g. `meanings` usually contain examples).
    
    # TODO The `grm_desc` should be parsed further, now they are just strings,
    # e.g. 'n.m.' for the head word, and 'f.' for an alternative form.
    # It should be clearer that the head word is e.g. noun, masc, sing and that
    # the alternative form is e.g. noun, fem, sing.
    
    parsed = {
        'lemma': lemma,
        'numeral': numeral,
        'grm_desc': grm_desc,
        'forms': forms,
        'trans': trans,
        'tail': tail,
        'lang': lang,
        'annotations': annotations,
        'inbetweens': inbetweens,
        'ref': reference,
        'examples': examples,
        'meanings': meanings,
        'raw_entry': raw_entry,
    }
    parsed = {k:v for k,v in parsed.items() if v}
    
    return parsed

In [60]:
lemmata = []

for e in html_elements(filename):
    t = Text(element_totext(e))
    t = normalize_styles(t, can_have_emphasis=lambda c: not c.isspace())
    s = text_tostring(t)
    s = str_replace(s, replace)

    if not s or s.isdigit():
        continue
    elif not s.startswith('*'):
        continue
        if s.startswith('/'):
            print('##', s)
        else:
            print('#', s)
    else:
        parsed = parselemma(s)
        #lemma_str = '\n'.join(f'{k}: {v!r}' for k, v in parsed.items() if v)
        lemmata.append(parsed)

In [64]:
with open('bar glossary general.json', 'w') as outfile:
    json.dump(lemmata, outfile, ensure_ascii=False, indent=4)

In [58]:
get_form('pažgir')

{'lemma': 'pažgir',
 'grm_desc': 'n.f.',
 'forms': [('pl.', 'pažgire')],
 'trans': '**towel**',
 'lang': 'K.',
 'raw_entry': '*pažgir* n.f. (pl. *pažgire*) (K.) **towel**'}

In [65]:
bads = [l for l in lemmata if {'annotations', 'inbetweens', 'tail'} & set(l.keys())]

len(bads)

164