In [1]:
import json
import sys
import unicodedata
from collections import defaultdict
from pathlib import Path

# Destination for the JSON mapping files written by the cells below.
output_dir = Path('final_data')
# Scratch space for the "unedited" TSV dumps produced by the cells below.
tmp_dir = Path('tmp')

# exist_ok keeps the cell re-runnable when the directories are already there.
for _directory in (output_dir, tmp_dir):
    _directory.mkdir(exist_ok=True)

In [2]:
from IPython.display import HTML, display

def table(fn):
    """Decorator: render the rows produced by ``fn()`` as a scrollable HTML table.

    ``fn`` is called once, with no arguments, and must return an iterable of
    rows; each row is an iterable of cell values.  Every cell is converted with
    ``str`` and HTML-escaped, so values such as ``<`` or ``&`` are shown
    literally instead of being interpreted as markup.

    Returns ``fn`` unchanged, so the decorated name stays usable and the
    decorator can be stacked underneath others.
    """
    import html  # local import keeps this helper self-contained

    body = ''.join(
        '<tr>' + ''.join(f'<td>{html.escape(str(v))}</td>' for v in row) + '</tr>'
        for row in fn()
    )

    display(HTML(
        '<div style="max-height: 15em; overflow-y:auto"><table>'
        + body
        + '</table></div>'
    ))

    return fn
    
def tsv_symbols(filename):
    """Decorator factory: dump the rows produced by a function to a TSV file.

    ``filename`` is the path to write (a ``str`` or ``Path``).  The returned
    decorator calls the decorated function once with no arguments, writes each
    row as one tab-separated line (cells converted with ``str``), and returns
    the function unchanged so that the decorated name is not replaced by
    ``None``.

    The file is always written as UTF-8: the rows contain arbitrary Unicode
    characters, which the platform's default encoding may be unable to encode.
    """
    def wrapper(fn):
        with open(filename, 'w', encoding='utf-8') as f:
            for row in fn():
                f.write('\t'.join(str(x) for x in row) + '\n')
        return fn

    return wrapper

# Enumerate Unicode characters

Enumerate all characters in Unicode, and then pick out subsets corresponding to mathematical letters and symbols, Greek letters, and subscript and superscript characters.

In [3]:
def valid_unicode(c):
    """Return True if the character ``c`` has a name in the Unicode database.

    Characters for which ``unicodedata.name`` has no entry are reported as
    invalid (False).
    """
    return unicodedata.name(c, None) is not None

# Every code point up to sys.maxunicode that has a Unicode name, in code point order.
all_unicode = list(filter(valid_unicode, map(chr, range(sys.maxunicode + 1))))


In [4]:
# math: characters whose name mentions MATH, or whose category is Sm (Symbol, math).
# unicode_category: general category code -> characters in that category.
# Both are filled in one pass and keep the order of all_unicode.
math = []
unicode_category = defaultdict(list)

for c in all_unicode:
    category_code = unicodedata.category(c)
    unicode_category[category_code].append(c)
    if category_code == 'Sm' or 'MATH' in unicodedata.name(c):
        math.append(c)

Characters already understood by Numbas JME:

In [5]:
# Category codes that the cells below treat as already handled.
categories = ['Ll', 'Lu', 'Lo', 'Lt', 'Nl', 'Nd', 'White_space']
# Individual characters treated as already handled.
# NOTE(review): this is a raw string, so `\n` contributes a backslash and the
# letter "n", not a newline - confirm that is intended.
characters = r""" '".{}?/\n:&;|^>=<-+*#!(),[] """


# Create letter mappings

For all mathematical letters, work out which Latin letter they correspond to, and extract any annotations from the unicodedata name.

In [6]:
import re

# Every annotation word (BOLD, ITALIC, ...) seen in any letter mapping.
classes = set()
# Names of mathematical letters that neither pattern below could classify.
rest = []

# Maps a character to [replacement string, list of annotation words].
letter_mapping = {}
for c in math:
    # Only letters (general category L*) are handled here.
    if unicodedata.category(c)[0] != 'L':
        continue
    name = unicodedata.name(c)
    n = unicodedata.normalize('NFKD', c)

    # "MATHEMATICAL <annotations> <letter description>": group 1 holds the annotations.
    m = re.match(r'^MATHEMATICAL (.*?) ?(NABLA|PARTIAL DIFFERENTIAL|(CAPITAL|SMALL( FINAL| DOTLESS)?) \w+|(CAPITAL )?\w+ SYMBOL)?$', name)
    if m:
        words = m.group(1).split(' ')
    else:
        # "ARABIC MATHEMATICAL [<annotations>] <letter>": the annotations are optional.
        # Only tried when the first pattern did not match.
        m2 = re.match(r'^ARABIC MATHEMATICAL(?: (.*))? (\w+)', name)
        if not m2:
            rest.append(name)
            continue
        words = m2.group(1).split(' ') if m2.group(1) else []

    classes.update(words)
    # A list, to match the entries added from other-letters.tsv below
    # (JSON serialises lists and tuples identically).
    letter_mapping[c] = [n, words]

print('Unclassified letters: ', rest)

# Hand-maintained extra letters.  Column 2 is the character, column 7 its
# replacement, and the optional column 8 a space-separated list of annotations.
# The file contains non-ASCII characters, so read it as UTF-8 explicitly instead
# of relying on the platform's default encoding.
with open('other-letters.tsv', encoding='utf-8') as f:
    lines = [l.split('\t') for l in f.read().strip().split('\n')]

for line in lines:
    annotations = [w for w in line[7].split(' ') if w] if len(line)>=8 else []
    classes.update(annotations)
    letter_mapping[line[1]] = [line[6], annotations]

print('Annotation words:')
for cc in sorted(classes):
    print(cc)

with open(output_dir / 'letters.json', 'w', encoding='utf-8') as f:
    json.dump(letter_mapping, f, indent=4)

Unclassified letters:  []
Annotation words:
BLACK-LETTER
BOLD
DOTLESS
DOUBLE-STRUCK
FRAKTUR
INITIAL
ITALIC
LOOPED
MONOSPACE
RING
SANS-SERIF
SCRIPT
STRETCHED
TAILED


In [7]:
# Spot check: look up the general category of the empty-set sign.
unicodedata.category('∅')

'Sm'

# Greek letters

Mathematical Greek letters are dealt with above, but standard Greek letters should map to their names in Latin, e.g. α to 'alpha'.

There are many combinations of Greek letters with diacritics, but these are normalized to multi-character strings, and not usually used in maths, so should be treated the same as other accented characters such as ü.

In [8]:
# Every named character whose Unicode name mentions GREEK.
greek = [ch for ch in all_unicode if 'GREEK' in unicodedata.name(ch)]

# First character of each NFKD decomposition, de-duplicated and ordered by Unicode name.
normalized_greek = sorted(
    {unicodedata.normalize('NFKD', ch)[0] for ch in greek},
    key=unicodedata.name,
)

@table
def greek_things():
    """Yield a [character, Unicode name] row for every Greek character."""
    yield from ([ch, unicodedata.name(ch)] for ch in greek)

0,1
͂,COMBINING GREEK PERISPOMENI
̓,COMBINING GREEK KORONIS
̈́,COMBINING GREEK DIALYTIKA TONOS
ͅ,COMBINING GREEK YPOGEGRAMMENI
Ͱ,GREEK CAPITAL LETTER HETA
ͱ,GREEK SMALL LETTER HETA
Ͳ,GREEK CAPITAL LETTER ARCHAIC SAMPI
ͳ,GREEK SMALL LETTER ARCHAIC SAMPI
ʹ,GREEK NUMERAL SIGN
͵,GREEK LOWER NUMERAL SIGN


In [9]:
# greek.tsv holds exactly two tab-separated columns per line: key and value.
# The Latin name of each letter is stored with an empty annotation list.
# The markdown above describes mapping Greek letters to names, so the file is
# presumably non-ASCII; read it as UTF-8 explicitly instead of relying on the
# platform's default encoding.
with open('greek.tsv', encoding='utf-8') as f:
    lines = [l.split('\t') for l in f.read().strip().split('\n')]
    greek_mapping = {k:[v,[]] for k,v in lines}

with open(output_dir / 'greek.json', 'w', encoding='utf-8') as f:
    json.dump(greek_mapping, f, indent=4)

# Symbol mappings

There are hundreds of characters in the "Mathematical symbols" category.
Lots of them are combinations of symbols that would be very hard to disentangle, while others correspond to operations that we're extremely unlikely to see in e-assessment.

So the best method I can think of for coming up with a mapping to common function names or operator symbols is to go through the list manually, removing rare combinations or operations.

In [10]:
# Spot check: look up the official Unicode name of the long double arrow.
unicodedata.name('⟹')

'LONG RIGHTWARDS DOUBLE ARROW'

In [11]:
@tsv_symbols(tmp_dir / 'unedited-symbols.tsv')
@table
def symbols_to_map():
    """Yield the mathematical symbols that still need a mapping.

    Candidates are every character in `math` plus the modifier circumflex.
    Letters already in `letter_mapping`, decimal digits, and anything whose
    category or (NFKC-normalized) text is already covered by `categories` /
    `characters` are skipped.  Each row is
    (character, name, NFKC form, name of the NFKC form's first character,
    length of the NFKC form).
    """
    candidates = math + ['ˆ']
    for symbol in candidates:
        if symbol in letter_mapping:
            continue
        category = unicodedata.category(symbol)
        if category == 'Nd':
            continue
        normal_form = unicodedata.normalize('NFKC', symbol)
        already_handled = (
            category in categories
            or symbol in characters
            or normal_form in characters
        )
        if already_handled:
            continue
        yield (
            symbol,
            unicodedata.name(symbol),
            normal_form,
            unicodedata.name(normal_form[0]),
            len(normal_form),
        )


0,1,2,3,4
~,TILDE,~,TILDE,1
¬,NOT SIGN,¬,NOT SIGN,1
±,PLUS-MINUS SIGN,±,PLUS-MINUS SIGN,1
×,MULTIPLICATION SIGN,×,MULTIPLICATION SIGN,1
÷,DIVISION SIGN,÷,DIVISION SIGN,1
϶,GREEK REVERSED LUNATE EPSILON SYMBOL,϶,GREEK REVERSED LUNATE EPSILON SYMBOL,1
؆,ARABIC-INDIC CUBE ROOT,؆,ARABIC-INDIC CUBE ROOT,1
؇,ARABIC-INDIC FOURTH ROOT,؇,ARABIC-INDIC FOURTH ROOT,1
؈,ARABIC RAY,؈,ARABIC RAY,1
⁄,FRACTION SLASH,⁄,FRACTION SLASH,1


Copy `tmp/unedited-symbols.tsv` to `symbols.tsv`, remove characters we won't deal with, and give mappings for what remains.

In [12]:
# Maps a symbol to [alias, list of annotation words].
symbol_mapping = {}

# symbols.tsv is the hand-edited copy of tmp/unedited-symbols.tsv: column 1 is
# the symbol, column 6 its alias, and the optional column 7 a space-separated
# list of annotations.  It contains non-ASCII characters, so read it as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open('symbols.tsv', encoding='utf-8') as f:
    for row in f.read().strip().split('\n'):
        bits = row.split('\t')
        c = bits[0]
        alias = bits[5]
        # Drop empty words so that a blank annotations column gives [] rather
        # than [''], matching how other-letters.tsv is parsed.
        annotations = [w for w in bits[6].split(' ') if w] if len(bits)>=7 else []
        symbol_mapping[c] = [alias, annotations]

with open(output_dir / 'symbols.json', 'w', encoding='utf-8') as f:
    json.dump(symbol_mapping, f, indent=4)

Show symbols that are not explicitly mapped, and aren't in the existing JME grammar:

In [13]:
@table
def not_explicitly_mapped():
    """Yield mathematical characters with no letter or symbol mapping.

    A character is listed only when the first character of its NFKC form is
    not one of `characters`.  Each row is [character, name, NFKC form,
    name of the NFKC form's first character, length of the NFKC form].
    """
    unmapped = (
        ch for ch in math
        if ch not in letter_mapping and ch not in symbol_mapping
    )
    for ch in unmapped:
        normal_form = unicodedata.normalize('NFKC', ch)
        lead = normal_form[0]
        if lead in characters:
            continue
        yield [ch, unicodedata.name(ch), normal_form, unicodedata.name(lead), len(normal_form)]

0,1,2,3,4
϶,GREEK REVERSED LUNATE EPSILON SYMBOL,϶,GREEK REVERSED LUNATE EPSILON SYMBOL,1
؈,ARABIC RAY,؈,ARABIC RAY,1
⁻,SUPERSCRIPT MINUS,−,MINUS SIGN,1
⅁,TURNED SANS-SERIF CAPITAL G,⅁,TURNED SANS-SERIF CAPITAL G,1
⅂,TURNED SANS-SERIF CAPITAL L,⅂,TURNED SANS-SERIF CAPITAL L,1
⅃,REVERSED SANS-SERIF CAPITAL L,⅃,REVERSED SANS-SERIF CAPITAL L,1
⅄,TURNED SANS-SERIF CAPITAL Y,⅄,TURNED SANS-SERIF CAPITAL Y,1
⅋,TURNED AMPERSAND,⅋,TURNED AMPERSAND,1
←,LEFTWARDS ARROW,←,LEFTWARDS ARROW,1
↑,UPWARDS ARROW,↑,UPWARDS ARROW,1


# Punctuation characters

There are lots and lots of punctuation characters. Of particular interest are the brackets and parentheses.
It looks like we can rely on the standard normalization algorithm to convert these to the most common characters, so we don't need to produce a manual mapping dictionary.

Any character in category `Po` which doesn't match another kind of token should be treated as a punctuation token, and normalized.

The cell below displays all punctuation symbols whose normalized versions are not in the existing JME grammar.

In [14]:
@table
def unrecognised_punctuation():
    """Yield [character, NFKD form, name] for each `Po` character whose
    NFKD form does not occur in `characters`."""
    for mark in unicode_category['Po']:
        decomposed = unicodedata.normalize('NFKD', mark)
        if decomposed in characters:
            continue
        yield [mark, decomposed, unicodedata.name(mark)]

0,1,2
%,%,PERCENT SIGN
@,@,COMMERCIAL AT
¡,¡,INVERTED EXCLAMATION MARK
§,§,SECTION SIGN
¶,¶,PILCROW SIGN
·,·,MIDDLE DOT
¿,¿,INVERTED QUESTION MARK
·,·,GREEK ANO TELEIA
՚,՚,ARMENIAN APOSTROPHE
՛,՛,ARMENIAN EMPHASIS MARK


# Digits

There are a few variations on the European digits that normalize to the ASCII digits.

There are also a couple of sets of circled digits that don't normalize to anything else - should these be converted to ASCII digits too?

Finally, there are lots of digits in other scripts. When these are used in a base-10 system and map directly on to the digits 0-9, they could be mapped, but there are a variety of other symbols representing fractions, such as "TELUGU FRACTION DIGIT ONE FOR EVEN POWERS OF FOUR". It's probably safest to ignore these and throw an error if they're used, or deal with them in a dedicated number notation style.

So just normalize everything in class `Nd`.

In [15]:
# All characters in category Nd (decimal digits).  Kept under its own name:
# the table function below is called `digits`, and sharing the name would
# replace this list with the function once the cell has run.
digit_characters = [c for c in all_unicode if unicodedata.category(c) == 'Nd']

@table
def digits():
    """Group the decimal digits by their NFKD form.

    Yields one row per distinct normalized form:
    [normalized form, its length, names of its characters, original characters].
    Rows are ordered by how many characters share the form (most first), then
    by the name of the form's first character.
    """
    # Single pass instead of rescanning the whole list once per normalized form.
    by_normal_form = defaultdict(list)
    for c in digit_characters:
        by_normal_form[unicodedata.normalize('NFKD', c)].append(c)

    ordered = sorted(
        by_normal_form.items(),
        key=lambda item: (-len(item[1]), unicodedata.name(item[0][0])),
    )
    for k, cs in ordered:
        names = ', '.join(unicodedata.name(part) for part in k)
        yield [k, len(k), names, cs]

0,1,2,3
8,1,DIGIT EIGHT,"['8', '８', '𝟖', '𝟠', '𝟪', '𝟴', '𝟾', '🯸']"
5,1,DIGIT FIVE,"['5', '５', '𝟓', '𝟝', '𝟧', '𝟱', '𝟻', '🯵']"
4,1,DIGIT FOUR,"['4', '４', '𝟒', '𝟜', '𝟦', '𝟰', '𝟺', '🯴']"
9,1,DIGIT NINE,"['9', '９', '𝟗', '𝟡', '𝟫', '𝟵', '𝟿', '🯹']"
1,1,DIGIT ONE,"['1', '１', '𝟏', '𝟙', '𝟣', '𝟭', '𝟷', '🯱']"
7,1,DIGIT SEVEN,"['7', '７', '𝟕', '𝟟', '𝟩', '𝟳', '𝟽', '🯷']"
6,1,DIGIT SIX,"['6', '６', '𝟔', '𝟞', '𝟨', '𝟲', '𝟼', '🯶']"
3,1,DIGIT THREE,"['3', '３', '𝟑', '𝟛', '𝟥', '𝟯', '𝟹', '🯳']"
2,1,DIGIT TWO,"['2', '２', '𝟐', '𝟚', '𝟤', '𝟮', '𝟸', '🯲']"
0,1,DIGIT ZERO,"['0', '０', '𝟎', '𝟘', '𝟢', '𝟬', '𝟶', '🯰']"


Class `Nl` is "letter numbers", such as Roman numerals. These normalize to the corresponding letters. 
They should map to numbers, but they don't work the same way as the conventional base-10 system, so should be implemented as dedicated number notation styles, and throw an error if used otherwise.

In [16]:
# Characters in category Nl (letter numbers).
numerals = [ch for ch in all_unicode if unicodedata.category(ch) == 'Nl']

@table
def show_numerals():
    """Yield [numeral, name, NFKD form, names of the NFKD form's characters]."""
    for numeral in numerals:
        decomposed = unicodedata.normalize('NFKD', numeral)
        part_names = [unicodedata.name(part) for part in decomposed]
        yield [numeral, unicodedata.name(numeral), decomposed, ', '.join(part_names)]

0,1,2,3
ᛮ,RUNIC ARLAUG SYMBOL,ᛮ,RUNIC ARLAUG SYMBOL
ᛯ,RUNIC TVIMADUR SYMBOL,ᛯ,RUNIC TVIMADUR SYMBOL
ᛰ,RUNIC BELGTHOR SYMBOL,ᛰ,RUNIC BELGTHOR SYMBOL
Ⅰ,ROMAN NUMERAL ONE,I,LATIN CAPITAL LETTER I
Ⅱ,ROMAN NUMERAL TWO,II,"LATIN CAPITAL LETTER I, LATIN CAPITAL LETTER I"
Ⅲ,ROMAN NUMERAL THREE,III,"LATIN CAPITAL LETTER I, LATIN CAPITAL LETTER I, LATIN CAPITAL LETTER I"
Ⅳ,ROMAN NUMERAL FOUR,IV,"LATIN CAPITAL LETTER I, LATIN CAPITAL LETTER V"
Ⅴ,ROMAN NUMERAL FIVE,V,LATIN CAPITAL LETTER V
Ⅵ,ROMAN NUMERAL SIX,VI,"LATIN CAPITAL LETTER V, LATIN CAPITAL LETTER I"
Ⅶ,ROMAN NUMERAL SEVEN,VII,"LATIN CAPITAL LETTER V, LATIN CAPITAL LETTER I, LATIN CAPITAL LETTER I"


# Subscripts and superscripts

There is a smattering of glyphs representing characters rendered in superscript or subscript.

These normalize with the algorithm `NFKD` to their standard equivalents, but this would lose semantics in mathematical expressions: superscripts usually represent an exponent, and subscripts are used for indexing names.

So make dictionaries of superscript and subscript characters, for the parser to use appropriately.

In JME, a string of superscript characters produces a `^` operator token followed by whatever partial expression the normalized version corresponds to, wrapped in parentheses.
For example, `x²⁺³` is equivalent to `x^(2+3)`.

In [17]:
# Every named character whose Unicode name mentions SUBSCRIPT.
subscript = [ch for ch in all_unicode if 'SUBSCRIPT' in unicodedata.name(ch)]

@tsv_symbols(tmp_dir / 'unedited-subscripts.tsv')
@table
def subscript_things():
    """Yield [character, name, NFKD form, names of the NFKD form's characters]."""
    for glyph in subscript:
        decomposed = unicodedata.normalize('NFKD', glyph)
        part_names = [unicodedata.name(part) for part in decomposed]
        yield [glyph, unicodedata.name(glyph), decomposed, ', '.join(part_names)]

0,1,2,3
ٖ,ARABIC SUBSCRIPT ALEF,ٖ,ARABIC SUBSCRIPT ALEF
ᵢ,LATIN SUBSCRIPT SMALL LETTER I,i,LATIN SMALL LETTER I
ᵣ,LATIN SUBSCRIPT SMALL LETTER R,r,LATIN SMALL LETTER R
ᵤ,LATIN SUBSCRIPT SMALL LETTER U,u,LATIN SMALL LETTER U
ᵥ,LATIN SUBSCRIPT SMALL LETTER V,v,LATIN SMALL LETTER V
ᵦ,GREEK SUBSCRIPT SMALL LETTER BETA,β,GREEK SMALL LETTER BETA
ᵧ,GREEK SUBSCRIPT SMALL LETTER GAMMA,γ,GREEK SMALL LETTER GAMMA
ᵨ,GREEK SUBSCRIPT SMALL LETTER RHO,ρ,GREEK SMALL LETTER RHO
ᵩ,GREEK SUBSCRIPT SMALL LETTER PHI,φ,GREEK SMALL LETTER PHI
ᵪ,GREEK SUBSCRIPT SMALL LETTER CHI,χ,GREEK SMALL LETTER CHI


In [18]:
# Maps a subscript character to [alias, annotations]; annotations are always empty here.
subscript_mapping = {}

# Column 1 of subscripts.tsv is the subscript character and column 5 its alias.
# The file is presumably a hand-edited copy of tmp/unedited-subscripts.tsv and
# so contains non-ASCII characters: read it as UTF-8 explicitly instead of
# relying on the platform's default encoding.
with open('subscripts.tsv', encoding='utf-8') as f:
    for row in f.read().strip().split('\n'):
        bits = row.split('\t')
        c = bits[0]
        alias = bits[4]
        subscript_mapping[c] = [alias, []]

with open(output_dir / 'subscripts.json', 'w', encoding='utf-8') as f:
    json.dump(subscript_mapping, f, indent=4)

In [19]:
# Every named character whose Unicode name mentions SUPERSCRIPT.
superscript = [ch for ch in all_unicode if 'SUPERSCRIPT' in unicodedata.name(ch)]

@tsv_symbols(tmp_dir / 'unedited-superscripts.tsv')
@table
def superscript_things():
    """Yield [character, name, NFKD form, names of the NFKD form's characters]."""
    for glyph in superscript:
        decomposed = unicodedata.normalize('NFKD', glyph)
        part_names = [unicodedata.name(part) for part in decomposed]
        yield [glyph, unicodedata.name(glyph), decomposed, ', '.join(part_names)]

0,1,2,3
²,SUPERSCRIPT TWO,2,DIGIT TWO
³,SUPERSCRIPT THREE,3,DIGIT THREE
¹,SUPERSCRIPT ONE,1,DIGIT ONE
ٰ,ARABIC LETTER SUPERSCRIPT ALEF,ٰ,ARABIC LETTER SUPERSCRIPT ALEF
ܑ,SYRIAC LETTER SUPERSCRIPT ALAPH,ܑ,SYRIAC LETTER SUPERSCRIPT ALAPH
⁰,SUPERSCRIPT ZERO,0,DIGIT ZERO
ⁱ,SUPERSCRIPT LATIN SMALL LETTER I,i,LATIN SMALL LETTER I
⁴,SUPERSCRIPT FOUR,4,DIGIT FOUR
⁵,SUPERSCRIPT FIVE,5,DIGIT FIVE
⁶,SUPERSCRIPT SIX,6,DIGIT SIX


In [20]:
# Maps a superscript character to [alias, annotations]; annotations are always empty here.
superscript_mapping = {}

# Column 1 of superscripts.tsv is the superscript character and column 5 its alias.
# The file is presumably a hand-edited copy of tmp/unedited-superscripts.tsv and
# so contains non-ASCII characters: read it as UTF-8 explicitly instead of
# relying on the platform's default encoding.
with open('superscripts.tsv', encoding='utf-8') as f:
    for row in f.read().strip().split('\n'):
        bits = row.split('\t')
        c = bits[0]
        alias = bits[4]
        superscript_mapping[c] = [alias, []]

with open(output_dir / 'superscripts.json', 'w', encoding='utf-8') as f:
    json.dump(superscript_mapping, f, indent=4)

# Symbols from autolatex
The file `autolatex-data.json` contains thousands of mappings from LaTeX math-mode commands to Unicode characters.
Let's find out which symbols in there haven't already been dealt with.

In [21]:
# The JSON document is iterated as a sequence of pairs; only the second element
# of each pair (presumably the Unicode character - confirm against the data file)
# is kept.  JSON text may contain raw non-ASCII characters, so decode it as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open('autolatex-data.json', encoding='utf-8') as f:
    d = json.load(f)
    autolatex_symbols = [symbol for _command, symbol in d]

In [22]:
# Distinct autolatex symbols, sorted, that have no symbol or letter mapping and
# do not occur in `characters`.
unmapped_autolatex = [
    symbol
    for symbol in sorted(set(autolatex_symbols))
    if not (symbol in symbol_mapping or symbol in letter_mapping or symbol in characters)
]

@tsv_symbols(tmp_dir / 'autolatex.tsv')
@table
def autolatex_todo():
    """Yield a row for each unmapped autolatex symbol that is a single character.

    Rows are [length, symbol, category, name, NFKD form, names of the NFKD
    form's characters].  Multi-character symbols are skipped; a symbol with no
    Unicode name is reported with an "oops" line instead of a row.
    """
    single_characters = (symbol for symbol in unmapped_autolatex if len(symbol) <= 1)
    for symbol in single_characters:
        try:
            symbol_names = ', '.join(map(unicodedata.name, symbol))
            decomposed = unicodedata.normalize('NFKD', symbol)
            decomposed_names = ', '.join(map(unicodedata.name, decomposed))
            row = [len(symbol), symbol, unicodedata.category(symbol[0]), symbol_names, decomposed, decomposed_names]
        except ValueError:
            print("oops", symbol)
        else:
            yield row

oops 


0,1,2,3,4,5
1,$,Sc,DOLLAR SIGN,$,DOLLAR SIGN
1,%,Po,PERCENT SIGN,%,PERCENT SIGN
1,@,Po,COMMERCIAL AT,@,COMMERCIAL AT
1,A,Lu,LATIN CAPITAL LETTER A,A,LATIN CAPITAL LETTER A
1,B,Lu,LATIN CAPITAL LETTER B,B,LATIN CAPITAL LETTER B
1,C,Lu,LATIN CAPITAL LETTER C,C,LATIN CAPITAL LETTER C
1,D,Lu,LATIN CAPITAL LETTER D,D,LATIN CAPITAL LETTER D
1,E,Lu,LATIN CAPITAL LETTER E,E,LATIN CAPITAL LETTER E
1,F,Lu,LATIN CAPITAL LETTER F,F,LATIN CAPITAL LETTER F
1,G,Lu,LATIN CAPITAL LETTER G,G,LATIN CAPITAL LETTER G


oops 


To deal with:

* Vulgar fraction characters, such as `¼ ½ ¾`. These might be combined with a string of digits to represent a fractional quantity, e.g. 13¾, "thirteen and three quarters".
* Some double-struck, script and black-letter characters: I've put these in `other-letters.tsv`.
* Some characters representing physical constants, e.g. ℎ, `PLANCK CONSTANT`, also in `other-letters.tsv`.
* Punctuation characters: primes, doubled and tripled exclamation and question marks.
* Ceiling and floor delimiters, and angle brackets: `⌈ ⌉ ⌊ ⌋ 〈 〉`. These need to be dealt with in the parser.