# Generating Character Tables

In [1]:
import collections
import csv
import unicodedata

## Transcription Lite

In [3]:
# Lookup table from a transcription character (possibly a base letter plus
# combining diacritics) to its ASCII replacement string.
# Values are usually a single ASCII character, but not always ('ṱ' -> '<+>').
# Several keys share one replacement (e.g. 'č' and 'č̭' both give '5'),
# so the mapping cannot be inverted.
trans_lite = {
    # --- consonants ---
    'ʾ': ')',
    'ʿ': '(',
    'c̭': 'c',
    'č': '5',
    'č̭': '5',
    'č̣': '%',
    'ḍ': 'D',
    'ð': '6',
    'ð̣': '^',
    'ġ': 'G',
    'ḥ': 'H',
    'ɟ': '4',
    'Ɉ': '4',
    'k̭': '&',
    'ḷ': 'L',
    'ṃ': 'M',
    'p̭': 'p',
    'p̌': 'p',
    'p̣': 'P',
    'ṛ': 'R',
    'ṣ': 'S',
    'š': '$',
    'ṱ': '<+>',
    'ṭ': 'T',
    'θ': '8',
    'ž': '7',
    'ẓ': 'Z',
    # --- vowels ---
    # NOTE(review): plain 'ā' maps to lowercase 'a', whereas the accented
    # forms 'ā̀'/'ā́' and the other macron vowels (ē, ī, ō, ū) map to
    # uppercase letters — confirm that 'ā' -> 'a' is intended.
    'á': 'a',
    'à': 'a',
    'ā': 'a',
    'ā̀': 'A',
    'ā́': 'A',
    'ă': '@',
    'ắ': '@',
    'ằ': '@',
    'e': 'e',
    'ē': 'E',
    'ɛ': '3',
    'i': 'i',
    'ī': 'I',
    'ĭ': '9',
    'ə': '9',
    'o': 'o',
    'ō': 'O',
    'u': 'u',
    'ū': 'U',
    'ŭ': '2',
    'ı': 'i',
    'ɑ': 'a',
}

In [50]:
# Build one row per entry of `trans_lite`: the NFC-normalised character(s),
# the ASCII transcription, and the hex value and Unicode name of every code
# point in the normalised string (joined with '+' when there are several).
table = []

for source_char, ascii_trans in trans_lite.items():
    composed = unicodedata.normalize('NFC', source_char)
    code_points = [hex(ord(point)) for point in composed]
    point_names = [unicodedata.name(point) for point in composed]

    table.append(dict(
        chars=composed,
        trans=ascii_trans,
        hexs='+'.join(code_points),
        names='+'.join(point_names),
    ))

In [21]:
# Sanity check: count the code points left after NFC normalisation of 'c̭'.
# The recorded output is 2, i.e. NFC does not collapse this base letter +
# combining mark into a single code point — which is why the table rows
# join per-code-point hex values and names with '+'.
len(unicodedata.normalize('NFC', 'c̭'))

2

In [53]:
# Dump the table to a tab-separated file, one row per character, with a
# header row taken from the keys of the first table entry.
#  - encoding='utf-8': every row contains non-ASCII characters, so relying on
#    the platform default encoding can fail with UnicodeEncodeError.
#  - newline='': required by the csv module so that it alone controls line
#    endings (otherwise rows end in '\r\r\n' on Windows).
with open('trans_lite.tsv', 'w', encoding='utf-8', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=table[0].keys(), delimiter='\t')
    writer.writeheader()
    writer.writerows(table)

# Dialect-dependent Transcription

We will prepare an ASCII transcription for the NENA dialects. Character transcriptions, however, can differ from dialect to dialect, because the semantic value of a consonant character depends on the dialect in which it is used. For instance, the character `q` in Barwar should be transcribed as `k̭` in Christian Urmi.

In the next section, we build up these consonant mappings by describing character sets, which consist of Unicode code points and regex patterns. We can then map those sets in various ways depending on the dialect.

The foundation of our transcription is the [*Comprehensive Aramaic Lexicon*](http://cal.huc.edu/), which is likewise used in the [Dukhrana database](https://www.dukhrana.com/lexicon/PayneSmith/).

We will use [unicode literals](https://docs.python.org/3/howto/unicode.html#unicode-literals-in-python-source-code) for the tables.

In [9]:
# import csv
# from pathlib import Path
# char_table_dir = Path('/Users/cody/github/CambridgeSemiticsLab/nena_tf/char_tables/')

In [20]:
# # map ASCII universal transcription symbol
# # to a colloquial name of the character
# universal_transcription = {
#     'consonants': {
#         ')': '', 
#         'b': '', 
#         'g': '', 
#         'd': '', 
#         'D': '', 
#         'h': '', 
#         'w': '', 
#         'z': '', 
#         'x': '', 
#         'T': '', 
#         'y': '', 
#         'k': '',
#         'l': '', 
#         'm': '', 
#         'n': '', 
#         's': '', 
#         '(': '', 
#         'p': '', 
#         'f': '', 
#         'S': '', 
#         'q': '', 
#         'r': '', 
#         '$': '',
#         't': '',
#         '&': '',
#     },
#     'vowels': {
#         'a': 'a', 
#         'e': 'e', 
#         'i': 'i',
#         'o': 'o', 
#         'u': 'u', 
#         'E': 'shewa', # ə
        
#         # get rid of?
#         '1': 'dotless i', # ı
#         '@': 'alpha',     # ɑ 
#         '3': 'open E',    # ɛ
#     },
#     'vowel_diacritics': {# e.g.
#         '`': 'grave',     # à
#         "'": 'acute',     # á
#         '_': 'macron',    # ā
#         '%': 'breve',     # ă
#         '"': 'diaeresis', # ä
#         '~': 'tilde'      # ã
#     },
#     'consonant_diacritics': {   # e.g.
#         '<': 'caron',             # x̌
#         '^': 'circumflex',        # x̂
#         ';': 'dot above',         # ẋ
#         '.': 'dot below',         # x̣
#         '>': 'circumflex below',  # x̭
        
#         # !! WHAT TO NAME +?
#         '+': '',                  # ⁺x 
#     },
# }


In [21]:
# table_spreadsheets = collections.defaultdict(list)

# for t_name, table in universal_transcription.items():
#     for char, name in table.items():
#         table_spreadsheets[t_name].append([char, name])

In [22]:
# for t_name, data in table_spreadsheets.items():
#     universal = char_table_dir.joinpath('universal')
#     filename = universal.joinpath(t_name+'.tsv')
#     with open(filename, 'w') as outfile:
#         writer = csv.writer(outfile, delimiter='\t')
#         writer.writerow(['char', 'name'])
#         writer.writerows(data)

In [23]:
# # map universal characters to 
# # UTF8 instances

# typical_vowels = {
#     'a': 'a', 
#     'e': 'e',
#     'i': 'i',
#     'o': 'o',
#     'u': 'u',
#     '\u0259': 'E', # shewa
    
#     # get rid of?
#     '\u0131': '1',  # dotless i
#     '\u0251': '@',  # alpha
#     '\u025B': '3',  # ɛ open e
    
# }
# typical_accents = {
#     '\u0300': '`', # grave
#     '\u0301': "'", # acute
#     '\u0304': '_', # macron
#     '\u0306': '%', # breve
#     '\u0308': '"', # diaeresis
#     '\u0303': '~', # tilde
# }
# typical_diacritics = {
#     '\u030C': '<', # caron
#     '\u0302': '^', # circumflex
#     '\u0307': ';', # dot above
#     '\u0323': '.', # dot below
#     '\u032D': '>', # circumflex below
#     '\u207A': '+', # plus for Urmi
# }

# # build mappings from UTF8 representation
# # to universal transcription symbol
# dialect_transcriptions = {
#     'Barwar': {
#         'consonants': {
#             '\u02be': ')', # aleph
#             'b': 'b',
#             'g': 'g', 
#             'd': 'd',
#             '\u00f0': 'D', # eth
#             'h': 'h', 
#             'w': 'w', 
#             'z': 'z', 
#             'x': 'x', 
#             '\u1e6d': 'T', # t with dot below
#             'y': 'y', 
#             'k': 'k', 
#             'l': 'l', 
#             'm': 'm', 
#             'n': 'n', 
#             's': 's', 
#             '\u02bf': '(', # ayin
#             'p': 'p',
#             'f': 'f',
#             '\u1e63': 'S', # s with dot below
#             'q': 'q',
#             'r': 'r',
#             '\u0161': '$', # s with caron
#             't': 't',
#             '\u03b8': '&', # theta
#             '\u02b8': 'y', # small letter y (!CHECK THIS ONE!)
#         },
#         'vowels': 
#             typical_vowels,
#         'consonant_diacritics': 
#             typical_diacritics,
#         'vowel_diacritics': 
#             typical_accents,
#     },
#     'Urmi_C': {
#         'consonants': {
#             '\u02be': ')', # aleph
#             'b': 'b',
#             '\u025f': 'g', # ɟ
#             '\u0248': 'g', # Ɉ i.e. capital ɟ
#             'd': 'd',
#             # no ð
#             'h': 'h', 
#             'w': 'w', 
#             'z': 'z', 
#             'x': 'x', 
#             '\u1e71': 'T', # t with circumflex below
#             'y': 'y', 
#             'k': 'k', 
#             'l': 'l', 
#             'm': 'm', 
#             'n': 'n', 
#             's': 's', 
#             # no ayin
#             'p': 'p',
#             'f': 'f',
#             's': 'S', # !! NB THIS IS A PROBLEM IN URMI! See other s!
#             'q': 'q',
#             'r': 'r',
#             '\u0161': '$', # s with caron
#             't': 't',
#             # no theta
#         },
#         'vowels': 
#             typical_vowels,
#         'consonant_diacritics': 
#             typical_diacritics,
#         'vowel_diacritics': 
#             typical_accents,
#     },
# }

In [25]:
# table_spreadsheets = collections.defaultdict(list)
# for dialect, table in dialect_transcriptions.items():
#     for char, trans in table.items():
#         table_spreadsheets[dialect].append([char, trans])

In [26]:
# for dialect, data in table_spreadsheets.items():
#     dialect = char_table_dir.joinpath(dialect)
#     filename = dialect.joinpath(t_name+'.tsv')
#     with open(filename, 'w') as outfile:
#         writer = csv.writer(outfile, delimiter='\t')
#         writer.writerow(['char', 'trans'])
#         writer.writerows(data)