In [1]:
import pandas as pd  # !mamba install -y pandas
import difflib
import dragonmapper.hanzi
from itertools import combinations
import jieba # !mamba install -y jieba
from tqdm import tqdm    # !mamba install -y tqdm

remove_tone_symbols = lambda char_in_ipa: char_in_ipa.rstrip('˥').rstrip('˧˥').rstrip('˧˩˧').rstrip('˥˩')

def compare(english_word_ipa, subset_ipa):
    ratio = difflib.SequenceMatcher(None, subset_ipa, english_word_ipa).ratio()
    # Punish length difference.
    len_diff = abs(len(subset_ipa)-len(english_word_ipa))
    score = ratio*(1-len_diff)
    return score

In [3]:
# Import English dictionary.
ENGLISH_DICT_PATH = "english.csv"
try:
    df = pd.read_csv(ENGLISH_DICT_PATH)
except:
    print('Failed to load the English dictionary. Creating...')
    import eng_to_ipa as ipa # !pip install eng-to-ipa
    def convert_to_ipa(word, pbar):
        pbar.update()
        try:
            return ipa.convert(word)
        except:
            return ''
    words = ipa.mode_type('json').keys()
    df = pd.DataFrame({'word': words})
    with tqdm(df) as pbar: # Takes about 20 min.
        df['ipa'] = df['word'].apply(convert_to_ipa, pbar=pbar)
    # Filter words.
    df = df[~df['ipa'].str.endswith('*')]
    df.drop_duplicates('ipa', inplace=True)
    df.reset_index(drop=True, inplace=True)
    print(f"{len(df)} words remain.")
    df.to_csv(ENGLISH_DICT_PATH, index=False)
df.head(2)

Failed to load the English dictionary. Creating...


100%|██████████| 125000/125000 [19:23<00:00, 107.44it/s]


107973 words remain.


Unnamed: 0,word,ipa
0,a,ə
1,a's,eɪz


In [5]:
chinese = "山重水复疑无路"
chinese_in_ipa = dragonmapper.hanzi.to_ipa(chinese)
chars_in_ipa = chinese_in_ipa.split(' ')
chars_in_ipa = list(map(remove_tone_symbols, chars_in_ipa))

words = []
words_in_ipa = []
for word, start, end in jieba.tokenize(chinese):
    word_in_ipa = chars_in_ipa[start:end]
    word_in_ipa = ''.join(word_in_ipa)
    words_in_ipa.append(word_in_ipa)
    words.append(word)
print(' '.join(words))

dfs = []
# For each possible substring.
for x, y in combinations(range(len(words) + 1), r = 2):
    subset_ipa = words_in_ipa[x:y]
    subset_ipa = ''.join(subset_ipa)
    this_df = df.assign(
        diff_sim = df['ipa'].apply(compare, subset_ipa=subset_ipa)
    )
    this_df.sort_values('diff_sim', ascending=False, inplace=True)
    this_df = this_df.head(3)
    # These columns have the same content across rows. Thus, it is better to add them after truncating the rows.
    this_df = this_df.assign(
        x=x,
        y=y,
        subset_ipa=subset_ipa,
        subset_chinese=''.join(words[x:y]),
        result=''.join(words[:x])+this_df['word']+''.join(words[y:]),
    )
    dfs.append(this_df)
res_df = pd.concat(dfs)
res_df.sort_values('diff_sim', ascending=False)

山重水复 疑无路


Unnamed: 0,word,ipa,diff_sim,x,y,subset_ipa,subset_chinese,result
18374,clune,klun,0.5,1,2,iulu,疑无路,山重水复clune
35457,fluet,flut,0.5,1,2,iulu,疑无路,山重水复fluet
69364,ooley,ˈuli,0.5,1,2,iulu,疑无路,山重水复ooley
21345,counterweight,ˈkaʊntərˌweɪt,0.384615,0,1,ʂanʈʂʊŋʂweɪfu,山重水复,counterweight疑无路
9688,biosafety,ˌbaɪoʊˈseɪfti,0.384615,0,1,ʂanʈʂʊŋʂweɪfu,山重水复,biosafety疑无路
21529,cowperthwaite,ˈkaʊpərθˌweɪt,0.384615,0,1,ʂanʈʂʊŋʂweɪfu,山重水复,cowperthwaite疑无路
62843,microwavable,ˌmaɪkroʊˈweɪvəbəl,0.352941,0,2,ʂanʈʂʊŋʂweɪfuiulu,山重水复疑无路,microwavable
67569,nomenclatorial,ˌnoʊmɪnkləˈtɔriəl,0.294118,0,2,ʂanʈʂʊŋʂweɪfuiulu,山重水复疑无路,nomenclatorial
62840,microtubules,ˈmaɪˌkroʊˈtubjulz,0.294118,0,2,ʂanʈʂʊŋʂweɪfuiulu,山重水复疑无路,microtubules
