In [None]:
# 2024-03
# Oracle's UTF-EBCDIC variant
# (making a point about emoji)

In [1]:
import ebcdic

In [20]:
# correct dialect for UTF-EBCDIC
# Codec name used by utfe_remap to encode the single-byte range (< 0xA0).
# NOTE(review): 'cp1047' is presumably registered by the `ebcdic` import
# above rather than by the standard library -- confirm before dropping it.
EBCDIC = 'cp1047'

In [65]:
def utf8(x):
    """Encode a string or a single code point as UTF-8 bytes.

    Args:
        x: Either a ``str`` (every character is encoded in turn and the
            pieces are concatenated) or an ``int`` code point.

    Returns:
        bytes: one to four bytes per code point. No validity check is
        made on the value itself, so surrogate values are encoded as
        ordinary three-byte sequences (``cesu8`` relies on this).

    Raises:
        ValueError: if the value is negative or needs more than the 21
            payload bits of a four-byte sequence. The offending value is
            attached to the exception, as ``utf8mod`` does.
    """
    if isinstance(x, str):
        return b''.join(utf8(ord(c)) for c in x)
    if x < 0:
        raise ValueError(x)
    if x < 0x80:
        # ASCII range is encoded as itself
        return bytes([x])
    prefix = 0x80
    for n_bytes in [2, 3, 4]:
        # the lead-byte marker gains one high bit per extra byte:
        # 0xC0, 0xE0, 0xF0
        prefix = 0x80 + (prefix >> 1)
        # payload: (7 - n_bytes) bits in the lead byte plus 6 bits in
        # each continuation byte
        bit_depth = (7 - n_bytes) + (n_bytes - 1) * 6
        if x < 2**bit_depth:
            # collect least-significant group first, then flip the order
            out = []
            for _ in range(n_bytes - 1):
                out.append(0x80 + (0x3F & x))
                x >>= 6
            out.append(prefix + x)
            return bytes(reversed(out))
    raise ValueError(x)

def utf16(x):
    """Split a string or a single code point into UTF-16 code units.

    Args:
        x: Either a ``str`` (the code units of all characters are
            returned in order) or an ``int`` code point.

    Returns:
        List[int]: one code unit for values below 0x10000, otherwise a
        high/low surrogate pair.

    Raises:
        AssertionError: if the code point lies above 0x10FFFF and so does
            not fit in a surrogate pair.
    """
    if isinstance(x, str):
        # Flatten in a single pass; summing lists would re-copy the
        # accumulated result once per character.
        return [unit for c in x for unit in utf16(ord(c))]
    if x < 0x10000:
        # BMP
        return [x]
    # astral: 20 bits split into two 10-bit halves
    y = x - 0x10000
    assert y < 0x100000
    return [0xD800 + (y >> 10), 0xDC00 + (y & 0x03FF)]

def utf8mod(x):
    """Encode a string or a single code point in the UTF-8-Mod byte layout.

    Values below 0xA0 occupy one byte. Larger values use a lead byte
    followed by continuation bytes of the form 0xA0 + five payload bits,
    for a total of two to five bytes.

    Args:
        x: a ``str`` (encoded character by character) or an ``int``.

    Returns:
        bytes: the encoded sequence.

    Raises:
        ValueError: carrying the value, if it does not fit in five bytes.
    """
    if isinstance(x, str):
        return b''.join(utf8mod(ord(ch)) for ch in x)
    if x < 0xA0:
        return bytes([x])

    # (total length, lead-byte marker) for each multi-byte form
    for length, lead in ((2, 0xC0), (3, 0xE0), (4, 0xF0), (5, 0xF8)):
        tail_count = length - 1
        # lead byte holds (7 - length) bits, each trailing byte holds 5
        capacity = (7 - length) + tail_count * 5
        if x < 2**capacity:
            rest = x
            tail = []
            for _ in range(tail_count):
                tail.append(0xA0 + (0x1F & rest))
                rest >>= 5
            tail.reverse()
            return bytes([lead + rest] + tail)
    raise ValueError(x)

def utfe_remap(x):
    """Translate one UTF-8-Mod byte value into its UTF-EBCDIC byte value.

    Values below 0xA0 are single-byte characters and take their encoding
    from the ``EBCDIC`` codec. Continuation bytes (0xA0-0xBF) and start
    bytes (0xC0-0xF9) are assigned, in ascending order, to the byte
    values covered by the span tables below.

    Args:
        x: integer byte value produced by ``utf8mod``.

    Returns:
        int: the remapped byte value.

    Raises:
        ValueError: carrying the value, for 0xFA and above.
    """
    if x < 0xA0:
        # one-byte: EBCDIC back-compat
        return chr(x).encode(EBCDIC)[0]

    if x < 0xC0:
        # continuation byte
        base = 0xA0
        spans = (
            (0x41, 0x4A),
            (0x51, 0x59),
            (0x62, 0x6A),
            (0x70, 0x73),
        )
    elif x < 0xFA:  # max: 1111 1001
        # start byte
        base = 0xC0
        spans = (
            (0x74, 0x78),
            (0x80, 0x80),
            (0x8A, 0x90),
            (0x9A, 0xA0),
            (0xAA, 0xAC),
            (0xAE, 0xBC),
            (0xBE, 0xBF),
            (0xCA, 0xCF),
            (0xDA, 0xDF),
            (0xE1, 0xE1),
            (0xEA, 0xEE),
        )
    else:
        raise ValueError(x)

    # Walk the inclusive (first, last) spans, using up the index until it
    # lands inside one of them.
    index = x - base
    for first, last in spans:
        size = last - first + 1
        if index < size:
            return first + index
        index -= size

def utfe(x):
    """Encode a string or code point as UTF-EBCDIC.

    The input is first encoded with ``utf8mod``; every resulting byte is
    then passed through ``utfe_remap``.
    """
    return bytes(map(utfe_remap, utf8mod(x)))

def cesu8(x):
    """Encode ``x`` by UTF-8-encoding each of its UTF-16 code units.

    Surrogate halves are therefore encoded separately rather than as one
    four-byte sequence.
    """
    return b''.join(utf8(unit) for unit in utf16(x))

def oracle_utfe(x):
    """Encode ``x`` by applying ``utfe`` to each of its UTF-16 code units.

    This is the same construction as ``cesu8``, with UTF-EBCDIC in place
    of UTF-8.
    """
    return b''.join(utfe(unit) for unit in utf16(x))

### TESTS

In [22]:
# Every code point below 0xA0 must encode to exactly one EBCDIC byte:
# utfe_remap keeps only the first byte of the encoded result.
for x in range(0xA0):
    y = chr(x).encode(EBCDIC)
    assert len(y) == 1, hex(x)

In [80]:
# compare against Python implementation

for i in range(0x110000):
    if 0xd800 <= i < 0xe000:
        continue  # surrogate: skipped, no reference encoding to compare with
    assert utf8(chr(i)) == chr(i).encode('utf8')

In [53]:
# The last character lies outside the BMP, so it is emitted as two
# three-byte sequences, one per UTF-16 surrogate (six bytes in total).
cesu8('E ȅ 𐐀')

b'E \xc8\x85 \xed\xa0\x81\xed\xb0\x80'

In [106]:
# Single-byte reference point: code page 1252 stores each letter in one byte.
'Þß'.encode('1252')

b'\xde\xdf'

In [103]:
# Unicode code points of the two letters, as hex strings.
[hex(ord(c)) for c in 'Þß']

['0xde', '0xdf']

In [85]:
# cf. Oracle documentation
# Hex value of every encoded byte of the euro sign, concatenated without
# separators (each byte keeps its own '0x' prefix).

''.join(map(hex, oracle_utfe('€')))

'0xca0x460x53'

In [77]:
def bin8(n):
    """Return the binary digits of a non-negative integer as a string.

    The result is left-padded with zeros to eight characters; values that
    need more than eight digits are returned unpadded and untruncated.

    Raises:
        AssertionError: if ``n`` is negative.
    """
    assert n >= 0
    return bin(n)[2:].zfill(8)

def prettybin(n):
    """Render the bits of ``n`` as a row of glyphs.

    Each digit returned by ``bin8`` becomes ``'# '`` for a one and
    ``'. '`` for a zero, most significant bit first.
    """
    glyph = {'0': '. ', '1': '# '}
    return ''.join(glyph[bit] for bit in bin8(n))

In [107]:
# draw a smiley face ;)
# Encode the emoji and print every resulting byte as one row of a bitmap,
# most significant bit on the left.

bs = oracle_utfe('😃')
for n in bs:
    print(prettybin(n))

# # . # # # . # 
. # # . . # . # 
. # . . . . # . 
. # # # . . . # 
# # . # # # . # 
. # # . . # # . 
. # . # . # # # 
. # . . . # . . 


In [109]:
# Seven hand-written byte values printed with the same renderer, one row each.
# NOTE(review): presumably a hand-drawn face for contrast with the encoded
# emoji bytes printed earlier -- confirm intent.
for n in [0x42, 0xA5, 0x00, 0x00, 0x7E, 0x42, 0x7E]:
    print(prettybin(n))

. # . . . . # . 
# . # . . # . # 
. . . . . . . . 
. . . . . . . . 
. # # # # # # . 
. # . . . . # . 
. # # # # # # . 


In [15]:
# Scratch check: right-shifting zero leaves it at zero (prints 0).
# NOTE(review): looks like a leftover experiment; candidate for removal.
x = 0
x >>= 5
print(x)

0
