In [1]:
import pandas as pd  # !mamba install -y pandas
import difflib
import dragonmapper.hanzi
from itertools import combinations
import jieba # !mamba install -y jieba

def remove_tone_symbols(char_in_ipa):
    """Strip trailing tone letters from the IPA transcription of one syllable.

    Args:
        char_in_ipa: IPA string for a single syllable, optionally ending in
            a tone contour written with the tone letters ˥, ˧ and ˩
            (e.g. ``'ma˥˩'``).

    Returns:
        The same string with every trailing tone letter removed; a string
        without trailing tone letters is returned unchanged.
    """
    # str.rstrip() treats its argument as a *set of characters*, not as a
    # suffix, so one call listing all three tone letters removes any trailing
    # contour. Chaining one rstrip per tone (˥, ˧˥, ˧˩˧, ˥˩) is order-dependent
    # and leaves residue on other contours, e.g. 'a˧˥˩' would become 'a˧'.
    return char_in_ipa.rstrip('˥˧˩')

def compare(english_word_ipa, subset_ipa):
    """Score how closely an English word's IPA matches a target IPA string.

    The score is ``difflib.SequenceMatcher``'s similarity ratio, scaled down
    by the relative length difference between the two strings.

    Args:
        english_word_ipa: IPA transcription of the candidate English word.
        subset_ipa: IPA transcription of the span that should be imitated.

    Returns:
        A float in ``[0, 1]``. Strings of equal length score exactly their
        similarity ratio; the larger the length mismatch, the lower the score.
    """
    ratio = difflib.SequenceMatcher(None, subset_ipa, english_word_ipa).ratio()
    longest = max(len(subset_ipa), len(english_word_ipa))
    if longest == 0:
        # Both strings are empty: there is no length difference to punish.
        return ratio
    # Punish length difference. The difference is normalised by the longer
    # string so the factor stays within [0, 1]; multiplying by the raw
    # `1 - abs(len difference)` zeroes the score at a difference of 1 and turns
    # it negative beyond that, which ranks *better* matches lower.
    len_penalty = abs(len(subset_ipa) - len(english_word_ipa)) / longest
    return ratio * (1 - len_penalty)

In [None]:
# Load the English words: a table with a `word` column and its `ipa`
# transcription, cached on disk as a CSV file.
ENGLISH_DICT_PATH = "english.csv"
try:
    # keep_default_na=False: by default pandas parses strings such as "null",
    # "nan" or "NA" as missing values, which would turn those words into NaN.
    df = pd.read_csv(ENGLISH_DICT_PATH, keep_default_na=False)
except FileNotFoundError:
    # Only a missing cache triggers the (slow) rebuild; any other error, and
    # Ctrl-C, propagates instead of silently starting a long conversion.
    print('Failed to load the English dictionary. Creating...')
    # Imported lazily: these packages are only needed to rebuild the cache.
    import eng_to_ipa as ipa # !pip install eng-to-ipa
    from tqdm import tqdm    # !mamba install -y tqdm
    def convert_to_ipa(word, pbar):
        """Transcribe one word with `ipa.convert` and advance the progress bar."""
        pbar.update()
        return ipa.convert(word)
    # One word per line, no header row.
    df = pd.read_table('/usr/share/dict/words', header=None, names=['word'],
                       keep_default_na=False)
    # The bar is advanced manually from convert_to_ipa, once per row.
    with tqdm(total=len(df)) as pbar:
        df['ipa'] = df['word'].apply(convert_to_ipa, pbar=pbar)
    # Filter the words: drop transcriptions ending in '*' (presumably
    # eng_to_ipa's marker for words it could not transcribe -- TODO confirm),
    # then keep one row per word and one row per pronunciation.
    df = (
        df[~df['ipa'].str.endswith('*')]
        .drop_duplicates('word')
        .drop_duplicates('ipa')
        .reset_index(drop=True)
    )
    print(f"{len(df)} words remain.")
    df.to_csv(ENGLISH_DICT_PATH, index=False)
df.head(2)

  0%|          | 0/235886 [00:00<?, ?it/s]

Failed to load the English dictionary. Creating...


 27%|██▋       | 62896/235886 [09:44<27:11, 106.04it/s]

In [None]:
chinese = "据说明天有雨"
# Transcribe the sentence to IPA and drop the tone marks of every syllable.
# Splitting on spaces is assumed to give one IPA syllable per character of
# `chinese` -- the slicing by token offsets below relies on that.
chinese_in_ipa = dragonmapper.hanzi.to_ipa(chinese)
chars_in_ipa = [remove_tone_symbols(char_in_ipa) for char_in_ipa in chinese_in_ipa.split(' ')]

# Segment the sentence into words. Each token is unpacked as
# (word, start, end); the offsets are used to slice the per-character IPA list.
tokens = list(jieba.tokenize(chinese))
words = [word for word, _, _ in tokens]
words_in_ipa = [''.join(chars_in_ipa[start:end]) for _, start, end in tokens]
print(' '.join(words))

dfs = []
# Every contiguous run of words words[x:y] is a candidate for replacement.
for x, y in combinations(range(len(words) + 1), r=2):
    subset_ipa = ''.join(words_in_ipa[x:y])
    # Score every English word against this run and keep the 3 best matches.
    top_matches = (
        df.assign(diff_sim=df['ipa'].apply(compare, subset_ipa=subset_ipa))
        .sort_values('diff_sim', ascending=False)
        .head(3)
    )
    before = ''.join(words[:x])
    after = ''.join(words[y:])
    # These columns are constant across the rows, so they are attached only
    # after truncating to the few rows that are kept.
    dfs.append(
        top_matches.assign(
            x=x,
            y=y,
            subset_ipa=subset_ipa,
            subset_chinese=''.join(words[x:y]),
            result=before + top_matches['word'] + after,
        )
    )
res_df = pd.concat(dfs)
res_df.sort_values('diff_sim', ascending=False)