In [52]:
import pandas as pd
import numpy as np

# Read the tab-separated Unihan readings file as (code point, field, value)
# triples and reshape it to one row per code point with one column per field.
# NOTE(review): comment="#" also cuts a data row at an embedded "#";
# confirm no field value contains one.
df = pd.read_csv(
    "data/Unihan_Readings.txt",
    sep="\t",
    comment="#",
    names=["Unicode", "col1", "col2"],
).pivot(index="Unicode", columns="col1", values="col2")

# Remove the leftover "col1" label from the column axis.
df.columns.name = None

columns_of_interest = [
    "kDefinition",
    "kHangul",
    "kKorean",
    "kMandarin",
    "kJapanese",
    "kJapaneseOn",
]

df = df[columns_of_interest].reset_index()

# "U+4E00" -> "一": parse the hexadecimal code point and convert it to a character.
df["Character"] = df["Unicode"].apply(
    lambda code_point: chr(int(code_point.replace('U+', ''), 16))
)

# Keep only the first space-separated entry of each reading field.
for reading_column in ["kMandarin", "kHangul", "kKorean", "kJapanese", "kJapaneseOn"]:
    df[reading_column] = df[reading_column].str.split(" ").str[0]

# kHangul entries carry a ":"-separated suffix; keep only the part before it.
df["kHangul"] = df["kHangul"].str.split(":").str[0]

In [53]:
# Lift the row limit so displayed frames / Series are never truncated.
pd.set_option("display.max_rows", None)

minimal_columns = [
    "kDefinition",
    "Character",
    "kMandarin",
    "kHangul",
    "kKorean",
    "kJapanese",
    "kJapaneseOn",
]

# Keep only rows that have a value in every selected column, then renumber.
df = (
    df.loc[:, minimal_columns]
    .dropna()
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,kDefinition,Character,kMandarin,kHangul,kKorean,kJapanese,kJapaneseOn
0,"one; a, an; alone",一,yī,일,IL,イチ,ICHI
1,"male adult; robust, vigorous; 4th heavenly stem",丁,dīng,정,CENG,チョウ,TEI
2,seven,七,qī,칠,CHIL,シチ,SHICHI
3,ten thousand; innumerable,万,wàn,만,MAN,マン,MAN
4,"unit of length equal 3.3 meters; gentleman, ma...",丈,zhàng,장,CANG,ジョウ,JOU


In [55]:
# Code point of the first precomposed Hangul syllable.
HANGUL_BASE = 0xAC00
# Syllables per initial consonant: 21 vowels x 28 finals.
CHOSUNG_BASE = 588
# Syllables per vowel: one for each of the 28 finals (including "no final").
JUNGSUNG_BASE = 28

# Initial consonants, in syllable-block order.
CHOSUNG_LIST = list("ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ")

# Vowels, in syllable-block order.
JUNGSUNG_LIST = list("ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ")

# Final consonants, in syllable-block order; index 0 means "no final".
JONGSUNG_LIST = [''] + list("ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ")


def decompose_hangul(syllable):
    """Split one precomposed Hangul syllable into its jamo.

    Args:
        syllable: A single-character string.

    Returns:
        A 3-tuple ``(initial, vowel, final)`` of jamo strings, where ``final``
        is ``''`` for an open syllable. A character outside the range
        U+AC00..U+D7A3 is returned unchanged as ``(syllable, '', '')``.
    """
    code = ord(syllable)
    if code < 0xAC00 or code > 0xD7A3:
        return (syllable, '', '')  # Not a Hangul syllable

    # Peel the three indices off the offset, most significant first.
    chosung_index, within_chosung = divmod(code - HANGUL_BASE, CHOSUNG_BASE)
    jungsung_index, jongsung_index = divmod(within_chosung, JUNGSUNG_BASE)

    return (
        CHOSUNG_LIST[chosung_index],
        JUNGSUNG_LIST[jungsung_index],
        JONGSUNG_LIST[jongsung_index],
    )

# Attach the (initial, vowel, final) jamo tuple for every Hangul reading.
df["decomposed_hangul"] = df["kHangul"].map(decompose_hangul)

# Count the rows for each (Hangul, romanisation, jamo tuple) combination.
df[["kHangul", "kKorean", "decomposed_hangul"]].value_counts()

kHangul  kKorean  decomposed_hangul
구        KWU      (ㄱ, ㅜ, )             101
기        KI       (ㄱ, ㅣ, )              87
비        PI       (ㅂ, ㅣ, )              84
사        SA       (ㅅ, ㅏ, )              78
유        YU       (ㅇ, ㅠ, )              76
조        CO       (ㅈ, ㅗ, )              75
수        SWU      (ㅅ, ㅜ, )              74
전        CEN      (ㅈ, ㅓ, ㄴ)             73
정        CENG     (ㅈ, ㅓ, ㅇ)             67
부        PWU      (ㅂ, ㅜ, )              63
경        KYENG    (ㄱ, ㅕ, ㅇ)             63
고        KO       (ㄱ, ㅗ, )              63
소        SO       (ㅅ, ㅗ, )              59
주        CWU      (ㅈ, ㅜ, )              58
도        TO       (ㄷ, ㅗ, )              57
장        CANG     (ㅈ, ㅏ, ㅇ)             55
호        HO       (ㅎ, ㅗ, )              53
초        CHO      (ㅊ, ㅗ, )              51
추        CHWU     (ㅊ, ㅜ, )              50
자        CA       (ㅈ, ㅏ, )              49
지        CI       (ㅈ, ㅣ, )              49
저        CE       (ㅈ, ㅓ, )              46
시        SI       

In [132]:
import re
import unicodedata

# Pinyin initials (including digraphs)
INITIALS = [
    "zh", "ch", "sh",  # must come before "z", "c", "s"
    "b", "p", "m", "f", "d", "t", "n", "l",
    "g", "k", "h", "j", "q", "x",
    "r", "z", "c", "s", "y", "w"
]

# Vowel mapping with tones
PINYIN_TONE_MAP = {
    "ā": ("a", 1), "á": ("a", 2), "ǎ": ("a", 3), "à": ("a", 4),
    "ē": ("e", 1), "é": ("e", 2), "ě": ("e", 3), "è": ("e", 4),
    "ī": ("i", 1), "í": ("i", 2), "ǐ": ("i", 3), "ì": ("i", 4),
    "ō": ("o", 1), "ó": ("o", 2), "ǒ": ("o", 3), "ò": ("o", 4),
    "ū": ("u", 1), "ú": ("u", 2), "ǔ": ("u", 3), "ù": ("u", 4),
    "ǖ": ("ü", 1), "ǘ": ("ü", 2), "ǚ": ("ü", 3), "ǜ": ("ü", 4),
    # plain vowels (neutral tone)
    "a": ("a", 5), "e": ("e", 5), "i": ("i", 5), "o": ("o", 5),
    "u": ("u", 5), "ü": ("ü", 5),
}

def split_pinyin(pinyin):
    """Split one tone-marked pinyin syllable into (initial, final, tone).

    Args:
        pinyin: A pinyin syllable written with tone diacritics, e.g. "zhàng".

    Returns:
        A tuple ``(initial, final, tone)``:
        ``initial`` is the first entry of ``INITIALS`` the syllable starts
        with ('' if none matches), ``final`` is the rest of the syllable with
        the tone mark removed, and ``tone`` is 1-4 for a marked syllable or 5
        when no tone-marked vowel is present.
    """
    # Compose "base letter + combining mark" into the precomposed characters
    # that PINYIN_TONE_MAP is keyed on.
    pinyin = unicodedata.normalize("NFC", pinyin)

    # Step 1: find and normalize the tone-marked vowel.
    tone = 5
    final_chars = []
    for ch in pinyin:
        if ch in PINYIN_TONE_MAP:
            base, ch_tone = PINYIN_TONE_MAP[ch]
            final_chars.append(base)
            # Plain vowels are mapped to 5 as well; they must not overwrite a
            # tone already read from a marked vowel ("hǎo" is tone 3, not 5).
            if ch_tone != 5:
                tone = ch_tone
        else:
            final_chars.append(ch)
    pinyin_norm = ''.join(final_chars)

    # Step 2: match the initial from the start (digraphs are listed first).
    initial = next(
        (init for init in INITIALS if pinyin_norm.startswith(init)),
        '',
    )
    final = pinyin_norm[len(initial):]

    return initial, final, tone

# (initial, final, tone) tuple for every Mandarin reading.
df["decomposed_pinyin"] = df["kMandarin"].map(split_pinyin)

# Initial + final without the tone, with "ü" spelled as "v".
df["toneless_pinyin"] = df["decomposed_pinyin"].map(
    lambda parts: parts[0] + parts[1].replace("ü", "v")
)

df[["kMandarin", "decomposed_pinyin", "toneless_pinyin"]].value_counts()

kMandarin  decomposed_pinyin  toneless_pinyin
yì         (y, i, 4)          yi                 60
xī         (x, i, 1)          xi                 50
bì         (b, i, 4)          bi                 49
zhì        (zh, i, 4)         zhi                44
yù         (y, u, 4)          yu                 43
fú         (f, u, 2)          fu                 41
lì         (l, i, 4)          li                 40
shì        (sh, i, 4)         shi                40
jī         (j, i, 1)          ji                 39
yú         (y, u, 2)          yu                 38
qí         (q, i, 2)          qi                 35
jì         (j, i, 4)          ji                 31
jí         (j, i, 2)          ji                 29
yàn        (y, an, 4)         yan                27
huì        (h, ui, 4)         hui                27
jiàn       (j, ian, 4)        jian               26
fù         (f, u, 4)          fu                 26
xiè        (x, ie, 4)         xie                25
jué        (j, ue,

In [110]:
import pykakasi

# New-style converter
# `kks` is the pykakasi converter object and `converter` is its bound
# `convert` method. Further down it is called with one kana unit at a time,
# and the first element of its result is read as a mapping with a 'hepburn' key.
kks = pykakasi.kakasi()
converter = kks.convert

# Small kana that are merged with the character in front of them.
SMALL_KANA = "ァィゥェォャュョヮヵヶ"
# These two marks are always emitted as a unit of their own.
SOKUON = "ッ"
CHOONPU = "ー"


def decompose_katakana(word):
    """Split a katakana string into a list of one- or two-character units.

    A character followed by one of ``SMALL_KANA`` is emitted together with it
    (e.g. "チョ"). ``SOKUON`` and ``CHOONPU`` are never merged with what
    follows them. Every other character becomes its own unit.

    Args:
        word: The katakana string to split.

    Returns:
        The list of units in order; ``[]`` for an empty string.
    """
    morae = []
    pos, length = 0, len(word)
    while pos < length:
        current = word[pos]
        absorbs_next = (
            current not in (SOKUON, CHOONPU)
            and pos + 1 < length
            and word[pos + 1] in SMALL_KANA
        )
        # Consume two characters when a small kana follows, otherwise one.
        width = 2 if absorbs_next else 1
        morae.append(word[pos:pos + width])
        pos += width
    return morae

def katakana_to_romaji_tuple(word):
    """Romanise a katakana string unit by unit and return the joined result.

    The string is first split with ``decompose_katakana``; each unit is passed
    to the module-level ``converter`` and the 'hepburn' value of the first
    result item is used. Despite the ``_tuple`` suffix, the return value is a
    single ``str``.

    Args:
        word: The katakana string to romanise.

    Returns:
        The concatenated romanisation of all units.
    """
    def _romanise(mora):
        converted = converter(mora)
        # An empty result means there is nothing to read; keep the unit as is
        # (fallback for symbols like ッ or ー).
        return converted[0]['hepburn'] if converted else mora

    return "".join(_romanise(mora) for mora in decompose_katakana(word))


# Romanised form of every katakana reading.
df["katakana_romaji"] = df["kJapanese"].map(katakana_to_romaji_tuple)

# Count the rows for each (katakana, romaji) pair.
df[["kJapanese", "katakana_romaji"]].value_counts()

kJapanese  katakana_romaji
コウ         kou                272
ショウ        shou               191
シ          shi                168
カン         kan                161
キ          ki                 160
セン         sen                142
ソウ         sou                139
トウ         tou                139
ケン         ken                111
ケイ         kei                109
キョウ        kyou               108
カ          ka                 103
ホウ         hou                 95
エン         en                  94
カイ         kai                 93
シン         shin                89
ヨウ         you                 89
ヒ          hi                  86
チョウ        chou                83
イ          i                   81
セイ         sei                 80
シュウ        shuu                79
テイ         tei                 75
コ          ko                  68
サン         san                 66
タン         tan                 65
キュウ        kyuu                64
フ          fu                  63
サイ         sai       

In [111]:
# Preview the first rows of the frame with the derived columns.
# NOTE(review): the saved output lists a `pinyin_without_accent` column that no
# visible cell creates and lacks `toneless_pinyin` — probably stale output from
# an earlier run; re-run the notebook top to bottom to confirm.
df.head()

Unnamed: 0,kDefinition,Character,kMandarin,kHangul,kKorean,kJapanese,kJapaneseOn,decomposed_hangul,decomposed_pinyin,katakana_romaji,pinyin_without_accent
0,"one; a, an; alone",一,yī,일,IL,イチ,ICHI,"(ㅇ, ㅣ, ㄹ)","(y, i, 1)",ichi,yi
1,"male adult; robust, vigorous; 4th heavenly stem",丁,dīng,정,CENG,チョウ,TEI,"(ㅈ, ㅓ, ㅇ)","(d, ing, 1)",chou,ding
2,seven,七,qī,칠,CHIL,シチ,SHICHI,"(ㅊ, ㅣ, ㄹ)","(q, i, 1)",shichi,qi
3,ten thousand; innumerable,万,wàn,만,MAN,マン,MAN,"(ㅁ, ㅏ, ㄴ)","(w, an, 4)",man,wan
4,"unit of length equal 3.3 meters; gentleman, ma...",丈,zhàng,장,CANG,ジョウ,JOU,"(ㅈ, ㅏ, ㅇ)","(zh, ang, 4)",jou,zhang


In [None]:
# Persist the frame to disk; with the default compression="infer", pandas
# picks xz compression from the ".xz" suffix.
df.to_pickle("triolingo.pkl.xz")

In [None]:
# katakana_romaji
# decomposed_pinyin
# decomposed_hangul
# kMandarin
# kHangul
# toneless_pinyin

def inequality_score(lst, cap=5):
    """Gini coefficient of ``lst``, down-weighted for small totals.

    The Gini coefficient is computed from the sorted values as
    ``sum((2*i - n - 1) * x_i) / (n * sum(x))`` with ``i`` starting at 1, then
    multiplied by ``min(cap, sum(x)) / cap`` so that groups whose total is
    below ``cap`` score proportionally lower.

    Args:
        lst: Sequence (list or 1-D array) of non-negative counts.
        cap: Total at or above which the score is not scaled down. Must be
            positive. Defaults to 5.

    Returns:
        The scaled Gini coefficient; ``0.0`` when ``lst`` is empty or sums to
        zero (there is no inequality to measure in that case).
    """
    n = len(lst)
    total = sum(lst)
    # Guard the division below: an empty or all-zero input has no spread.
    if n == 0 or total == 0:
        return 0.0

    cumulative = sum(
        (2 * rank - n - 1) * val
        for rank, val in enumerate(sorted(lst), start=1)
    )
    gini = cumulative / (n * total)

    scaled_capped_sum = min(cap, total) / cap
    return gini * scaled_capped_sum

INPUT = "toneless_pinyin"
OUTPUT = "katakana_romaji"


def _summarize_group(group):
    """Describe how the OUTPUT values are distributed within one INPUT group."""
    outputs = group[OUTPUT]
    counts = np.unique(outputs, return_counts = True)[1]
    proportions = counts / len(outputs)
    return pd.Series({
        "inequality_score": inequality_score(counts),
        "max_proportion": max(proportions),
        "sum_squared_proportion": sum(np.square(proportions)),
        "distribution": outputs.value_counts().to_dict(),
    })


# One summary row per INPUT value, most unequal distributions first.
results = (
    df.groupby(INPUT)
    .apply(_summarize_group, include_groups = False)
    .sort_values("inequality_score", ascending = False)
)

# Round the three numeric summaries to two decimals for display.
score_columns = ["inequality_score", "max_proportion", "sum_squared_proportion"]
results[score_columns] = results[score_columns].apply(lambda column: column.round(2))

results

Unnamed: 0_level_0,inequality_score,max_proportion,sum_squared_proportion,distribution
toneless_pinyin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fu,0.71,0.55,0.37,"{'fu': 51, 'fuku': 20, 'futsu': 8, 'bu': 3, 'h..."
li,0.66,0.41,0.28,"{'ri': 31, 'rei': 22, 'reki': 9, 'ritsu': 4, '..."
huan,0.65,0.79,0.64,"{'kan': 23, 'en': 2, 'gyou': 1, 'gen': 1, 'gan..."
qi,0.64,0.51,0.28,"{'ki': 41, 'sei': 9, 'kei': 6, 'kitsu': 5, 'sh..."
wei,0.64,0.59,0.37,"{'i': 37, 'bi': 6, 'wai': 3, 'gi': 3, 'kai': 3..."
jin,0.63,0.56,0.37,"{'kin': 27, 'shin': 9, 'jin': 5, 'sen': 2, 'ko..."
han,0.63,0.87,0.76,"{'kan': 26, 'gan': 2, 'yama': 1, 'zen': 1}"
shen,0.63,0.77,0.6,"{'shin': 23, 'chin': 2, 'jin': 2, 'juu': 1, 's..."
you,0.63,0.83,0.71,"{'yuu': 35, 'yu': 5, 'u': 1, 'you': 1}"
gu,0.63,0.77,0.61,"{'ko': 30, 'koku': 4, 'kotsu': 3, 'ka': 1, 'it..."
