# Generate Transcription Formats

Each transcription should be a simple dictionary that maps regex patterns to strings,
where each regex pattern corresponds to an accepted letter or punctuation mark in the
NENA standards library.

In [7]:
import json
import re
import collections
from pathlib import Path
import unicodedata

# locations of the NENA corpus checkout and its standards files
corpus_dir = Path.home() / 'github/CambridgeSemiticsLab/nena_corpus'
standards_dir = corpus_dir / 'standards'

# inputs: accepted letters and punctuation marks
alphabet_json = standards_dir / 'alphabet/alphabet.json'
punctuation_json = standards_dir / 'punctuation/punctuation.json'

# output: one sub-directory of transcription files per dialect
transcription_dir = standards_dir / 'transcriptions'

In [37]:
class Transcriber:
    """Transcribe a string according to transcription rules.

    This transcription class is essentially a filter
    which determines which characters make it into a new,
    transcribed string. The filter is applied on a letter-by-letter
    basis. A "letter" (token) is defined by the `tokens` argument
    and can include diacritics/accents. Each token is handled by
    the first of these rules that applies:
        1. the whole composed (NFC) token is replaced via `replace`
        2. or the token is copied unchanged if it is punctuation
        3. or the token is decomposed (NFD) and each character is
           replaced via `replace`, kept if it matches `keep`,
           or dropped otherwise
    Text matched by neither `tokens` nor `punctuation` is dropped.

    Args:
        tokens: regex for splitting letters (tokens),
            used with findall
        replace: dict with find:replace mappings; keys are
            NFC-normalized before lookup
        punctuation: regex for punctuation that is copied as-is;
            when empty, nothing is treated as punctuation
        keep: regex for single decomposed characters to keep
            (an empty pattern keeps every character)
        keep_case: if False (default), input is lowercased first
        apply_function: optional callable applied to the finished
            transcription before the final normalization
    """

    # pattern that can never match; stands in for an empty regex argument
    _NO_MATCH = '(?!)'

    def __init__(self, tokens='', replace=None,
                 punctuation='', keep='', keep_case=False, apply_function=None):
        # avoid a shared mutable default for the replacement table
        replace = {} if replace is None else replace

        # Join only the non-empty alternatives: an empty alternative
        # matches the empty string at every position.
        alternatives = [patt for patt in (tokens, punctuation) if patt]
        token_patt = '|'.join(alternatives) if alternatives else self._NO_MATCH
        self.tokenize = re.compile(token_patt).findall

        # An empty regex matches everything, which would classify every
        # token as punctuation and bypass the per-character rules.
        self.punct = re.compile(punctuation if punctuation else self._NO_MATCH)
        self.keep = re.compile(keep)
        self.keep_case = keep_case
        self.apply_function = apply_function

        # ensure normalized characters for pattern searches
        self.repl = {
            unicodedata.normalize('NFC', find): repl
            for find, repl in replace.items()
        }

    def convert(self, string, normalize='NFC'):
        """Convert string to transcription.

        Args:
            string: the text to transcribe
            normalize: unicode normalization form applied to
                the result (passed to unicodedata.normalize)

        Returns:
            str in transcribed form
        """
        string = unicodedata.normalize('NFC', string)
        if not self.keep_case:
            string = string.lower()

        # collect output pieces and join once at the end
        pieces = []

        for token in self.tokenize(string):

            # replace a whole composed token
            if token in self.repl:
                pieces.append(self.repl[token])

            # keep punctuation
            elif self.punct.match(token):
                pieces.append(token)

            # otherwise filter the decomposed token char by char
            else:
                for char in unicodedata.normalize('NFD', token):

                    # attempt second match on char by char basis
                    if char in self.repl:
                        pieces.append(self.repl[char])

                    # attempt to keep with keep-set; anything else is dropped
                    elif self.keep.match(char):
                        pieces.append(char)

        transcription = ''.join(pieces)

        # apply optional function
        if self.apply_function:
            transcription = self.apply_function(transcription)

        return unicodedata.normalize(normalize, transcription)
    
    
# Full transcription: every letter and diacritic is rendered in ASCII.
# Regex patterns are raw strings so that \W, \d, \s and \- reach the
# regex engine without relying on invalid string escape sequences.
trans_full = {
    # a letter: optional superscript plus, a word character that is not
    # a digit or underscore, then any number of combining diacritics
    'tokens': r'[\u207A]?[^\W\d_][\u0300-\u036F]*',
    'replace': {

    # char combinations
    # NOTE(review): these whole-letter entries take precedence over the
    # per-character rules further down -- confirm the outputs are intended
    'p̌':'p<',
    'ṭ': 't',
    'ð̣': '6',

    # non-latin vowels
    '\u0131': 'i',  # 0x0131 ı dotless i
    '\u0251': 'a',  # 0x0251 ɑ alpha
    '\u0259': '9',  # 0x0259 ə schwa
    '\u025B': '3',  # 0x025B ɛ open e

    # vowel accents
    '\u0300': '`',  # 0x0300 à grave
    '\u0301': "'",  # 0x0301 á acute
    '\u0304': '_',  # 0x0304 ā macron
    '\u0306': '%',  # 0x0306 ă breve
    '\u0308': '"',  # 0x0308 ä diaeresis
    '\u0303': '~',  # 0x0303 ã tilde

    # non-latin consonants
    '\u00F0': '6',  # 0x00F0 ð eth
    '\u025F': '4',  # 0x025F ɟ small dotless j with stroke
    '\u0248': '4',  # 0x0248 Ɉ capital J with stroke
    '\u03B8': '8',  # 0x03B8 θ greek theta
    '\u02B8': '7',  # 0x02B8 ʸ small superscript y
    '\u02BE': ')',  # 0x02BE ʾ right half ring (alaph)
    '\u02BF': '(',  # 0x02BF ʿ left half ring (ayin)

    # consonant diacritics
    '\u207A': '+',  # 0x207A ⁺ superscript plus
    '\u030C': '>',  # 0x030C x̌ caron
    '\u0302': '^',  # 0x0302 x̂ circumflex
    '\u0307': ';',  # 0x0307 ẋ dot above
    '\u0323': '.',  # 0x0323 x̣ dot below
    '\u032D': '<',  # 0x032D x̭ circumflex below

    # punctuation
    # (this is the only entry for U+02C8; an earlier duplicate key that
    # mapped it to '' was silently overridden by this one and was removed)
    '\u02C8': '|', # 0x2c8 ˈ small vertical line
    },
    'punctuation': r'[\s.,?!:;–\-\u2014]',
    'keep': '[A-Za-z]',
}

# Lite transcription: one ASCII character per listed letter.
# Regex patterns are raw strings so that \W, \d, \s and \- reach the
# regex engine without relying on invalid string escape sequences.
trans_lite = {
    # a letter: optional superscript plus, a word character that is not
    # a digit or underscore, then any number of combining diacritics
    'tokens': r'[\u207A]?[^\W\d_][\u0300-\u036F]*',
    'replace': {
        'ʾ': ')',
        'ʿ': '(',
        'č': '5',
        'č̭': '5',
        'č̣': '%',
        'ḍ': 'D',
        'ð': '6',
        'ð̣': '^',
        'ġ': 'G',
        'ḥ': 'H',
        'ɟ': '4',
        'Ɉ': '4',
        'k̭': '&',
        'ḷ': 'L',
        'ṃ': 'M',
        'p̣': 'P',
        'ṛ': 'R',
        'ṣ': 'S',
        'š': '$',
        'ṱ': '+',
        'ṭ': 'T',
        'θ': '8',
        'ž': '7',
        'ẓ': 'Z',
        'ā̀': 'A',
        'ā́': 'A',
        'ă': '@',
        'ắ': '@',
        'ằ': '@',
        'ē': 'E',
        'ɛ': '3',
        'ī': 'I',
        'ĭ': '9',
        'ə': '9',
        'o': 'o',
        'ō': 'O',
        'ū': 'U',
        'ŭ': '2',
        'ı': 'i',
        'ɑ': 'a',
        'ˈ': '|'
    },
    'punctuation': r'[\s.,?!:;–\-\u2014]',
    'keep': '[A-Za-z]',
}

# Fuzzy transcription for the Urmi_C dialect: the listed letters are
# collapsed onto a shared ASCII character.
# Regex patterns are raw strings so that \W, \d, \s and \- reach the
# regex engine without relying on invalid string escape sequences.
fuzzy_urmi = {
    # a letter: optional superscript plus, a word character that is not
    # a digit or underscore, then any number of combining diacritics
    'tokens': r'[\u207A]?[^\W\d_][\u0300-\u036F]*',
    'replace': {
        'c': 'k',
        'c̭': 'k',
        'č': '5',
        'č̭': '5',
        'č̣': '5',
        'k̭': 'q',
        'ɟ': 'g',
        'Ɉ': 'g',
        'ə': 'i',
        'v': 'w',
    },
    'punctuation': r'[\s.,?!:;–\-\u2014]',
    'keep': '[A-Za-z]',
}

# Fuzzy transcription for the Barwar dialect: the listed letters are
# collapsed onto a shared ASCII character.
# Regex patterns are raw strings so that \W, \d, \s and \- reach the
# regex engine without relying on invalid string escape sequences.
fuzzy_barwar = {
    # a letter: optional superscript plus, a word character that is not
    # a digit or underscore, then any number of combining diacritics
    'tokens': r'[\u207A]?[^\W\d_][\u0300-\u036F]*',
    'replace': {
        'č': '5',
        'č̭': '5',
        'č̣': '5',
        'k̭': 'k',
        'θ': 't',
        'ð': 'd',
        'ɛ': 'e',
        'ə': 'i',
    },
    'punctuation': r'[\s.,?!:;–\-\u2014]',
    'keep': '[A-Za-z]',
}

# Plain text: no replacements and case is preserved; the characters
# excluded by 'keep' (whitespace, sentence punctuation, U+02C8) are dropped.
# Regex patterns are raw strings so that \W, \d and \s reach the
# regex engine without relying on invalid string escape sequences.
trans_text = {
    # a letter: optional superscript plus, a word character that is not
    # a digit or underscore, then any number of combining diacritics
    'tokens': r'[\u207A]?[^\W\d_][\u0300-\u036F]*',
    'replace': {
    },
    'punctuation': '[-–=]',
    'keep': r'[^\s.,?!:;\u02C8]+',
    'keep_case': True,
}

# empty placeholder configuration; nothing in this section reads it
trans_no_accent = dict()

class Normalizer:
    """Remove vowel accents from text.

    Wrapped in a class so that it offers the same `.convert`
    method as the transcriber and can be used alongside it
    within a loop.
    """

    def __init__(self):
        """Nothing to configure."""

    def convert(self, word):
        """Return `word` without its accent marks.

        Args:
            word: the text to normalize

        Returns:
            the decomposed (NFD) text with grave, acute, macron,
            breve, diaeresis and tilde combining marks removed;
            the result is not recomposed
        """
        accent_marks = '\u0300|\u0301|\u0304|\u0306|\u0308|\u0303'
        # accents are only separate code points in the decomposed form
        decomposed = unicodedata.normalize('NFD', word)
        return re.compile(accent_marks).sub('', decomposed)

# converters per dialect; each value offers a .convert(string) method
dialect2trans = {
    # applied to every dialect
    'ALL': {
        # full utf8 text without punctuation
        'text': Transcriber(**trans_text),
        # full transcription
        'text_trans': Transcriber(**trans_full),
        # lite transcription
        'text_lite': Transcriber(**trans_lite),
        # text with vowel accents stripped
        'text_noaccent': Normalizer(),
    },
    # dialect-specific fuzzy transcriptions
    'Barwar': {
        'fuzzy': Transcriber(**fuzzy_barwar),
    },
    'Urmi_C': {
        'fuzzy': Transcriber(**fuzzy_urmi),
    },
}

## Transcribe Alphabet and Punctuation

In [38]:
# load alphabet/punct data
# read the JSON files as UTF-8 explicitly rather than with the
# platform's default encoding
alpha_data = json.loads(alphabet_json.read_text(encoding='utf-8'))
punct_data = json.loads(punctuation_json.read_text(encoding='utf-8'))

# (decomposed regex, decomposed string) pairs, alphabet entries first
char_data = [
    (d['decomposed_regex'], d['decomposed_string'])
    for dtype in (alpha_data, punct_data)
    for d in dtype
]

print(len(char_data), 'chars loaded')

104 chars loaded


In [39]:
# compose transcription data
# data is simply a regex string to transcribed string mapping in a dict

for dialect, transes in dialect2trans.items():

    # one output directory per dialect; parents=True also creates the
    # transcriptions directory itself when it does not exist yet
    dialect_dir = transcription_dir.joinpath(dialect)
    dialect_dir.mkdir(parents=True, exist_ok=True)

    for trans_name, tscriber in transes.items():

        trans_file = dialect_dir.joinpath(trans_name+'.json')
        trans_data = {
            re_patt: tscriber.convert(string)
            for re_patt, string in char_data
        }

        # ensure_ascii=False writes raw non-ASCII characters, so the file
        # is opened as UTF-8 explicitly instead of with the locale default
        with open(trans_file, 'w', encoding='utf-8') as outfile:
            json.dump(trans_data, outfile, ensure_ascii=False, indent=4)