In [1]:
import pandas as pd
import difflib
import dragonmapper.hanzi
from itertools import combinations

remove_tone_symbols = lambda char_in_ipa: char_in_ipa.rstrip('˥').rstrip('˧˥').rstrip('˧˩˧').rstrip('˥˩')

def compare(english_word_ipa, subset_ipa):
    ratio = difflib.SequenceMatcher(None, subset_ipa, english_word_ipa).ratio()
    # Punish length difference.
    len_diff = abs(len(subset_ipa)-len(english_word_ipa))
    score = ratio*(1-len_diff)
    return score

In [2]:
# 导入英文单词
df = pd.read_csv("english.csv", index_col=0)
df.head(2)

Unnamed: 0,word,phonetic
0,"a, an",[ə；æn]
1,abandon,[əˈbændən]


In [3]:
# Clean up.
df['phonetic'] = df['phonetic'].str.strip().str.lstrip('[').str.rstrip(']')
df['phonetic'] = df['phonetic'].apply(lambda x: x[:x.find('；')] if '；' in x else x)
df['word'] = df['word'].apply(lambda x: x[:x.find(', ')] if ', ' in x else x)

df.head(2)

Unnamed: 0,word,phonetic
0,a,ə
1,abandon,əˈbændən


In [4]:
# 单词筛选
df.drop_duplicates('word', inplace=True)
df.drop_duplicates('phonetic', inplace=True)
df.reset_index(drop=True, inplace=True)
print(f"{len(df)} words remain.")

2933 words remain.


In [9]:
chinese = "山重水复疑无路"
chinese_in_ipa = dragonmapper.hanzi.to_ipa(chinese)
chars_in_ipa = chinese_in_ipa.split(' ')
chars_in_ipa = list(map(remove_tone_symbols, chars_in_ipa))

dfs = []
# For each possible substring.
for x, y in combinations(range(len(chinese) + 1), r = 2):
    if y-x<2: continue
    subset_chars_in_ipa = chars_in_ipa[x:y]
    subset_ipa = ''.join(subset_chars_in_ipa)
    this_df = df.assign(
        diff_sim = df['phonetic'].apply(compare, subset_ipa=subset_ipa)
    )
    this_df.sort_values('diff_sim', ascending=False, inplace=True)
    this_df = this_df.head(3)
    # These columns have the same content across rows. Thus, it is better to add them after truncating the rows.
    this_df = this_df.assign(
        x=x,
        y=y,
        subset_ipa=subset_ipa,
        subset_chinese=chinese[x:y],
        result=chinese[:x]+this_df['word']+chinese[y:],
    )
    dfs.append(this_df)
res_df = pd.concat(dfs)
res_df.sort_values('diff_sim', ascending=False)

Unnamed: 0,word,phonetic,diff_sim,x,y,subset_ipa,subset_chinese,result
1005,fee,fiː,0.666667,3,5,fui,复疑,山重水fee无路
2244,safety,ˈseɪfti,0.571429,2,5,ʂweɪfui,水复疑,山重safety无路
1006,feed,fiːd,0.500000,3,6,fuiu,复疑无,山重水feed路
1089,free,friː,0.500000,3,6,fuiu,复疑无,山重水free路
1008,feel,fiːl,0.500000,4,7,iulu,疑无路,山重水复feel
...,...,...,...,...,...,...,...,...
568,contribution,ˌkɑːntrɪˈbjuːʃn,0.200000,0,6,ʂanʈʂʊŋʂweɪfuiu,山重水复疑无,contribution路
504,communication,kəˌmjuːnɪˈkeɪʃn,0.200000,0,6,ʂanʈʂʊŋʂweɪfuiu,山重水复疑无,communication路
2190,responsibility,rɪˌspɑːnsəˈbɪləti,0.176471,0,7,ʂanʈʂʊŋʂweɪfuiulu,山重水复疑无路,responsibility
1330,increase,ɪnˈkris; ˈɪnˌkris,0.176471,0,7,ʂanʈʂʊŋʂweɪfuiulu,山重水复疑无路,increase
