# Generate Regex JSON from the Alphabet and Punctuation JSON

In [1]:
from unicodedata import normalize
from pathlib import Path
import json
import collections
import re
from pprint import pprint

# Locate the standards directory inside the local NENA corpus checkout.
# NOTE(review): assumes the repository is cloned under ~/github/... — confirm
# before running on another machine.
nena_dir = Path.home().joinpath('github/CambridgeSemiticsLab/nena_corpus')
standards = nena_dir.joinpath('standards')
alpha_file = standards.joinpath('alphabet/alphabet.json')
punct_file = standards.joinpath('punctuation/punctuation.json')

# Decode explicitly as UTF-8: the data is full of non-ASCII letters and
# diacritics, and the platform default encoding (e.g. cp1252 on Windows)
# would either fail or silently mis-decode them.
alpha_data = json.loads(alpha_file.read_text(encoding='utf-8'))
punct_data = json.loads(punct_file.read_text(encoding='utf-8'))

# Destination of the generated regex JSON (written by the export cell).
regex_file = standards.joinpath('NFD_regexes.json')

In [2]:
# One alternation per letter: the lowercase form, followed by the uppercase
# form when it differs (caseless letters contribute a single branch).
alpha_patts = []
for entry in alpha_data:
    small = entry['decomposed_string']
    capital = entry['decomposed_upper_string']
    variants = [small] if capital == small else [small, capital]
    alpha_patts.append('|'.join(filter(None, variants)))

# Punctuation marks are matched literally, so escape regex metacharacters.
punct_patts = [re.escape(mark["decomposed_string"]) for mark in punct_data]

# The trailing negative lookahead rejects a match that is immediately followed
# by a combining diacritic (U+0300-U+036F), so a base letter is never matched
# on its own when it is really the start of a longer decomposed letter.
alpha_join = '|'.join(alpha_patts)
alpha_re = f'({alpha_join})(?![\u0300-\u036F])'

re_dict = dict(
    alphabet=alpha_re,
    punctuation='|'.join(punct_patts),
)

In [3]:
# Inspect the escaped punctuation patterns produced by re.escape.
punct_patts

['⁺',
 '"',
 '\\ ',
 '\\-',
 '=',
 'ˈ',
 ',',
 '\\.\\.\\.',
 ':',
 '—',
 ';',
 '\\.',
 '\\?',
 '!',
 '"']

In [4]:
# Show the built-in documentation for re.escape.
help(re.escape)

Help on function escape in module re:

escape(pattern)
    Escape special characters in a string.



In [5]:
# Display the assembled alphabet regex stored under the 'alphabet' key.
re_dict['alphabet']

'(b|B|c|C|č|Č|c̭|C̭|č̣|Č̣|č̭|Č̭|d|D|ḍ|Ḍ|f|F|g|G|ġ|Ġ|h|H|ḥ|Ḥ|j|J|k|K|k̭|K̭|l|L|ḷ|Ḷ|m|M|ṃ|Ṃ|n|N|p|P|p̣|P̣|p̭|P̭|q|Q|r|R|ṛ|Ṛ|s|S|š|Š|ṣ|Ṣ|t|T|ṭ|Ṭ|ṱ|Ṱ|v|V|w|W|x|X|y|Y|z|Z|ž|Ž|ẓ|Ẓ|ð|Ð|ð̣|Ð̣|ɟ|ʾ|ʿ|θ|Θ|a|A|à|À|á|Á|ā|Ā|ă|Ă|ā̀|Ā̀|ā́|Ā́|ằ|Ằ|ắ|Ắ|e|E|è|È|é|É|ē|Ē|ḕ|Ḕ|ḗ|Ḗ|i|I|ì|Ì|í|Í|ī|Ī|ī̀|Ī̀|ī́|Ī́|o|O|ò|Ò|ó|Ó|ō|Ō|ṑ|Ṑ|ṓ|Ṓ|u|U|ù|Ù|ú|Ú|ū|Ū|ŭ|Ŭ|ū̀|Ū̀|ū́|Ū́|ŭ̀|Ŭ̀|ŭ́|Ŭ́|ə|Ə|ə̀|Ə̀|ə́|Ə́|ɛ|Ɛ|ɛ̀|Ɛ̀|ɛ́|Ɛ́|ɛ̄|Ɛ̄)(?![̀-ͯ])'

In [6]:
# Display the assembled punctuation regex stored under the 'punctuation' key.
re_dict['punctuation']

'⁺|"|\\ |\\-|=|ˈ|,|\\.\\.\\.|:|—|;|\\.|\\?|!|"'

## Test

In [7]:
# Load a sample corpus text and normalize it to NFD so that it lines up with
# the decomposed strings the regexes were built from. Read explicitly as
# UTF-8: the text is full of non-ASCII letters, and the platform default
# encoding could fail or mis-decode them.
test_file = normalize(
    'NFD',
    nena_dir.joinpath('texts/alpha/Barwar/A Hundred Gold Coins.nena').read_text(encoding='utf-8'),
)
# NOTE(review): the 146-character offset presumably skips a metadata header
# at the top of the file — confirm against the .nena file layout.
test_text = test_file[146:]

# Compile both generated patterns once for the checks below.
test_alpha = re.compile(re_dict['alphabet'])
test_punct = re.compile(re_dict['punctuation'])

In [8]:
# Preview the first 100 characters of the normalized test text.
test_text[:100]

'\n(1) xá-ga xèta,ˈ mállah Naṣràdin,ˈ xázəx mòdi wíða.ˈ gu-bɛ̀θa wéwa,ˈ har-zála-w\nθàya.ˈ z'

In [10]:
# Collect the distinct letters and punctuation marks matched in the sample.
# The alphabet pattern wraps its alternation in a single capturing group, so
# group 1 is the matched letter; the punctuation pattern has no group, so the
# whole match is taken.
found_alphas = {hit.group(1) for hit in test_alpha.finditer(test_text)}
found_puncts = {hit.group() for hit in test_punct.finditer(test_text)}

print(found_alphas)

{'l', 'i', 'b', 'q', 'z', 'í', 'w', 'k̭', 'ɛ̀', 'ū̀', 'ṱ', 'ù', 'š', 'u', 'N', 'ṭ', 'ì', 'é', 'ð', 'y', 'o', 'j', 't', 'è', 'ú', 'ằ', 'ò', 'ṣ', 'ă', 'k', 'g', 'p', 'ʿ', 'ə', 'ắ', 'n', 'à', 'ʾ', 'h', 'č̣', 'ē', 'a', 'ɛ', 'x', 'ž', 'á', 'm', 'ó', 'f', 'ɛ́', 'd', 's', 'č', 'r', 'θ', 'e', 'ə́', 'ə̀'}


In [11]:
# Display the set of distinct punctuation matches found in the test text.
found_puncts

{' ', '!', ',', '-', '.', '?', 'ˈ'}

## Export

In [12]:
# Write the regex dictionary to the standards directory as JSON.
# ensure_ascii=False emits the letters and diacritics as raw characters, so
# the file must be opened as UTF-8 explicitly; with the platform default
# encoding the write can raise UnicodeEncodeError or produce a non-UTF-8 file.
with open(regex_file, 'w', encoding='utf-8') as outfile:
    json.dump(re_dict, outfile, ensure_ascii=False, indent=4)