# Convert Barwar Glossary for Verbs into Inflections and Lexicon

In [1]:
import re
import collections
from bs4 import BeautifulSoup
import mammoth
import unicodedata as ud
from pprint import pprint

# Convert the Word document to HTML with mammoth.
with open('bar glossary verbs_modded.docx', 'rb') as docx_file:
    result = mammoth.convert_to_html(docx_file)

html = result.value  # the generated HTML
messages = result.messages  # any messages, such as warnings during conversion

# Parse the HTML and collect the top-level elements of the document.
soup = BeautifulSoup(html, 'html.parser')
elements = list(soup)

# Skip the first 40 elements and keep only those with more than 3 children.
entries = [e for e in elements[40:] if len(list(e.children)) > 3]

len(entries)

977

In [51]:
# NOTE(review): entries_parsed is not referenced again in the visible code
entries_parsed = collections.defaultdict(dict)
# a run of 'I's or a single 'Q', bounded on each side by whitespace
# or by the start/end of the string
stem_marker = re.compile(r'(^|\s)(I+|Q)($|\s)')
# a run of 'I's or a single 'Q' at the very end of the string
stem_marker_end = re.compile(r'(I+|Q)$')
# a run of 'I's or a single 'Q' anywhere; used to extract the bare marker
stem_marker_plain = re.compile(r'(I+|Q)')
# digits in parentheses, e.g. "(1)"
defi_number = re.compile(r'\(\d+\)')
# parenthesized text that contains a colon
ref_tag = re.compile(r'\(.+:.+\)')
# a lone period (not next to another period), a semicolon,
# one or more '!'/'?' characters, or a colon
sent_end = re.compile(r'(?<!\.)\.(?!\.)|;|[!?]+|:')
# one or more 'i'/'v' characters in parentheses, e.g. "(ii)"
lex_number = re.compile(r'\([iv]+\)')
# uppercase letters with an optional trailing period, in parentheses
lang_tag = re.compile(r'\([A-Z]+\.?\)')
# combining grave (U+0300) or combining acute (U+0301) accent
stress_accents = re.compile(r'\u0300|\u0301')

def get_string(element, strip=True, normalize=True):
    """Retrieve a string regardless of element's class type.

    Args:
        element: an object exposing a ``text`` attribute, any other
            object (converted with ``str``), or None (e.g. a missing
            sibling).
        strip: when True, remove leading and trailing whitespace.
        normalize: when True, apply Unicode NFD normalization.

    Returns:
        The element's text, or an empty string when element is None.
    """
    # A missing sibling arrives here as None; str(None) would produce the
    # literal text 'None', which is not content of the document.
    if element is None:
        return ''
    # Only fall back to str() when there is no text attribute.
    string = element.text if hasattr(element, 'text') else str(element)
    if strip:
        string = string.strip()
    if normalize:
        string = ud.normalize('NFD', string)
    return string

def match_inflection(element, debug=False):
    """Identify inflection markers.

    An element matches when it is an ``em`` tag, the text of its
    previous sibling ends with '(' and the text of its next sibling
    starts with ')'. With debug=True the individual results are printed.
    """
    before = get_string(element.previous_sibling)
    after = get_string(element.next_sibling)

    is_emphasis = element.name == 'em'
    opened = before.endswith('(')
    closed = after.startswith(')')

    if debug:
        print([is_emphasis, opened, closed])
    return is_emphasis and opened and closed

def match_inton_end(element):
    """Match ending | intonation marker.

    True when the element is a ``sup`` tag whose text contains '|'.
    """
    is_superscript = getattr(element, 'name', '') == 'sup'
    has_bar = '|' in get_string(element)
    return is_superscript and has_bar

def match_sentence(element, debug=False):
    """Match instances of example texts.

    The element must have non-empty text and show at least one signal:
    sentence-ending punctuation, a stress accent, a '|' character,
    a following intonation marker, or a reference tag.
    With debug=True both result lists are printed.
    """
    text = get_string(element)
    following = element.next_sibling

    # any single one of these is enough
    signals = [
        sent_end.search(text),
        stress_accents.search(text),
        '|' in text,
        match_inton_end(following),
        ref_tag.search(text),
    ]
    has_text = bool(text)
    has_signal = any(signals)

    if debug:
        print('any', signals)
        print('all', [has_text, has_signal])
    return has_text and has_signal

def match_wordform(element, debug=False):
    """Match a wordform of a lexeme.

    A wordform is a non-empty ``em`` element that is neither an
    inflection nor a sentence, involves no '→' (in its own text or at
    the end of the previous sibling), is not followed by text starting
    with '-', and whose next sibling (if any) is not a sentence.
    With debug=True the individual results are printed.
    """
    text = get_string(element)
    before = get_string(element.previous_sibling)
    following = element.next_sibling

    checks = [
        bool(text),
        getattr(element, 'name', '') == 'em',
        not match_inflection(element),
        not match_sentence(element),
        '→' not in text,
        not before.endswith('→'),
        not get_string(following).startswith('-'),
    ]
    # the sibling test only applies when there is a (truthy) next sibling
    if following:
        checks.append(not match_sentence(following))

    if debug:
        print(checks)
    return all(checks)
    
def match_stem(element, debug=False):
    """Match a stem marker.

    True when the text starts with a delimited stem marker or ends with
    one, and contains no sentence-ending punctuation.
    With debug=True the individual results are printed.
    """
    text = get_string(element)
    marker = stem_marker.match(text) or stem_marker_end.search(text)
    no_sentence_end = not sent_end.search(text)

    if debug:
        print([marker, no_sentence_end])
    return bool(marker) and no_sentence_end
    
def match_gloss(element):
    """Identify element containing gloss.

    A gloss is a ``strong`` element whose stripped text starts with
    'to' and whose previous sibling contains no definition number.
    """
    preceding_text = get_string(element.previous_sibling)

    is_bold = getattr(element, 'name', '') == 'strong'
    starts_with_to = getattr(element, 'text', '').strip().startswith('to')
    unnumbered = not defi_number.search(preceding_text)

    return is_bold and starts_with_to and unnumbered
    
def match_lex_number(element, debug=False):
    """Identify a lexeme number.

    True when the element's text begins with a lexeme number;
    the debug argument is accepted but not used.
    """
    found = lex_number.match(get_string(element))
    return found is not None

def match_lang_tag(element, debug=False):
    """Identify a language tag.

    True when a language tag occurs anywhere in the element's text;
    the debug argument is accepted but not used.
    """
    found = lang_tag.search(get_string(element))
    return found is not None
    
def findall_types(element):
    """Run all match tests as parsings.

    Every child of element is tested against every matcher, in order.
    Returns a list of (tag, text) pairs for each successful test, where
    text is the child's stripped but un-normalized string. A child can
    contribute more than one pair.
    """
    matchers = (
        ('cons', match_wordform),
        ('stem', match_stem),
        ('lex_number', match_lex_number),
        ('inflections', match_inflection),
        ('language', match_lang_tag),
        ('gloss', match_gloss),
    )
    return [
        (label, get_string(child, normalize=False))
        for child in element.children
        for label, test in matchers
        if test(child)
    ]

In [52]:
# parse every entry, keeping the source element next to its parsing

parses = []
for entry in entries:
    parses.append((findall_types(entry), entry))

In [70]:
# cluster into forms

# maps each root string to its list of per-stem data groups;
# filled from the dictionaries returned by cluster_stems()
root2stems = {}

def cluster_stems(parsing):
    """Cluster data by stem and map to root string.

    Data is parsed as a list of (tag, text) elements. The elements
    here are split up by stem and delivered as a dictionary
    with a root:stem_lists mapping, where root is the text of the
    first element in the entry and stem_lists is a list in which
    each item corresponds to one stem and holds that stem's list
    of (tag, text) metadata.

    Args:
        parsing: list of (tag, text) tuples for one entry.

    Returns:
        A single-item dict {root: stem_lists}, or an empty dict when
        parsing is empty. Any 'lex_number' text is appended to the
        root (separated by a space) instead of being stored as data.
    """
    # an entry without any matches has no root to key on
    if not parsing:
        return {}

    root = parsing[0][1]

    # cluster by appending to the last embedded list;
    # every stem tag after the first one opens a new list
    groups = [[]]
    seen_stem = False

    for tag, text in parsing[1:]:

        if tag == 'lex_number':
            root += ' ' + text
            continue

        if tag == 'stem':
            # clean up the stem tag, keeping the raw text
            # if no bare marker can be extracted
            found = stem_marker_plain.search(text)
            if found:
                text = found.group()
            if seen_stem:
                groups.append([])
            seen_stem = True

        groups[-1].append((tag, text))

    return {root: groups}

# merge each entry's {root: stem groups} mapping into root2stems;
# dict.update replaces any value already stored under the same root key
for parsing, entry in parses:
    root2stems.update(cluster_stems(parsing))

In [77]:
# show the first 100 roots together with their clustered stem data
for root in list(root2stems)[:100]:
    print(root)
    pprint(root2stems[root])
    print()

ʾby
[[('stem', 'III'),
  ('cons', 'm-by'),
  ('inflections', 'mabe/măbe, mubele/mŭbele, maboye/măboye'),
  ('gloss', 'to swell; to cause to swell, to inflate')]]

ʾč̣m
[[('stem', 'I'),
  ('inflections', 'ʾač̣əm, č̣imle, č̣ama'),
  ('gloss', 'to close')]]

ʾjb
[[('stem', 'III'),
  ('cons', 'm-jb'),
  ('inflections', 'majəb/măjəb, mŭjəble/ʾŭjəble, măjobe'),
  ('gloss', 'to be astonished, to be amazed')]]

ʾmð
[[('stem', 'I'),
  ('inflections', 'ʾaməð, midle, maða'),
  ('gloss', 'to be baptized')],
 [('stem', 'III'),
  ('cons', 'm-mð'),
  ('inflections', 'maməð/măməð, mŭməðme, mămoðe'),
  ('gloss', 'to baptize')]]

ʾmr
[[('stem', 'I'), ('inflections', 'ʾamər, məre, mara'), ('gloss', 'to say')]]

ʾmṣ
[[('stem', 'I'), ('inflections', 'y-aməṣ'), ('gloss', 'to be able')]]

ʾqð
[[('stem', 'I'),
  ('inflections', 'ʾaqəð, qidle, qaða'),
  ('gloss', 'to burn (intr.)')],
 [('stem', 'III'),
  ('cons', 'm-qð'),
  ('inflections', 'maqəð, muqəðle, maqoðe'),
  ('gloss', 'to burn (tr.)')]]

ʾqr
[[('stem

In [75]:
# a loop that can be used for checking for empty fields in each entry

# probs = []

# # check for missing word forms
# for root, stems in root2stems.items():
#     for stem_data in stems:
#         data_dict = dict(stem_data)
#         if not set(data_dict.keys()) & {'gloss'}:
#             probs.append(root)
            
# len(probs)

In [76]:
# a loop for cataloguing all the unique stem strings identified

# stem2examples = collections.defaultdict(list)

# for root, stems in root2stems.items():
#     for stem_data in stems:
#         for tag, value in stem_data:
#             if tag == 'stem':
#                 stem2examples[value].append(root)
            
# stem2examples.keys()

In [66]:
# a loop for locating particular entries

# for i,entry in enumerate(entries):
#     childs = list(entry.children)
#     if 'hyr' in childs[0].text:
#         print(i, entry)

In [65]:
# These are rules that I ended up not using,
# since the text examples are left alone for now;
# i.e. we are only interested in a select group of features.

# def match_lexeme(element, debug=False):
#     """Identify element containing lexeme."""
    
#     # prepare siblings for rule processing
#     prev_sib = element.previous_sibling
#     psib_txt = get_string(prev_sib)
#     next_sib = element.next_sibling
#     nsib_txt = get_string(next_sib)
#     psib_is_stem = getattr(prev_sib,'name','') != 'em' and stem_marker_end.search(psib_txt)
#     nsib_is_stem = getattr(next_sib,'name','') != 'em' and stem_marker.match(nsib_txt)
#     nsib_is_num = lex_number.match(nsib_txt)
#     text = get_string(element)

#     # test rules
#     rules = [
#         bool(text),
#         element.name == 'em',
#         (psib_is_stem or nsib_is_stem or nsib_is_num),
        
#     ]
#     if debug: 
#         print(rules)
#     return all(rules)

# def match_definition_number(element):
#     """Identify definition numbers"""
#     text = getattr(element,'text','').strip()
#     if defi_number.fullmatch(text):
#         return True

# def match_example(element, debug=False):
#     """Identify element containing NENA example."""
#     next_sib = element.next_sibling
#     nsib_text = get_string(next_sib).strip()
#     text = get_string(element)
#     rules = [
#         getattr(element,'name','') == 'em',
#         getattr(next_sib,'name','') == 'sup',
#         nsib_text == '|',
#     ]
#     if debug: 
#         print(rules)
#     return all(rules)
    
# def match_translation(element, debug=False):
#     """Identify element containing translation of NENA example."""
#     prev_sib = element.previous_sibling
#     psib_text = get_string(prev_sib)
#     rules = [
#         getattr(element,'name',None) == None,
#         getattr(prev_sib,'name','') == 'sup',
#         psib_text == '|',
#     ]
#     if debug: 
#         print(rules)
#     return all(rules)

# -- test cases --

# gxk_case = entries[200]
# hwy_case = entries[213]
# hyr_case = entries[215]
# hyr2_case = entries[216]
# hymn_case = entries[214]
# gyd_case = entries[201]

# for i,entry in enumerate(entries):
#     childs = list(entry.children)
#     if 'gyð ' in childs[0].text:
#         print(i, entry)