In [1]:
!pip install xpinyin



In [2]:
from xpinyin import Pinyin
import pandas as pd
import numpy as np

In [3]:
import difflib
import Levenshtein

In [4]:
p = Pinyin()

In [5]:
# 导入英文单词
data = pd.read_csv("english.csv", index_col=0)

In [6]:
data.head(2)

Unnamed: 0,word,phonetic
0,"a, an",[ə；æn]
1,abandon,[əˈbændən]


In [7]:
yb2py ={" ":"","'":"","[":"","]":"",",":"", "/":"","ˈ":"","ˌ":"","ː":"",
        "i":"yi","ɪ":"yi","e":"ai","æ":"a","ə":"a","ʌ":"a","a":"a","ɑ":"a","ɔ":"ou","u":"u",
       "eɪ":"ei","aɪ":"ai","ɔɪ":"wei","əu":"ou","au":"ao","ɪə":"yi","ɛə":"aei","uə":'wei',
       "p":"p","b":"b","t":"t","d":"d","k":"k","ɡ":"g","f":"f","v":"u","s":"s","z":"s",
       "θ":"si","ð":"zhe","ʃ":"shi","ʒ":"yi","h":"he","r":"r","tʃ":"qi","dʒ":"ji",
       "tr":"que","dr":"jue","ts":"ci","dz":"zi","m":"m","n":"n","ŋ":"en","w":"wo","j":"","ɛ":"ai",
       "o":"o","ɝ":"er","ɜ":"e","ʊ":"o","ɚ":"er","ō":"o","l":""}

def yb_2_py(wordYB):
    '''Converts IPA symbols to Pinyin.'''
    py = ""
    index = 0
    YBLen = len(wordYB)
    while index<YBLen:
        if index+1<YBLen and wordYB[index:index+2] in yb2py:
            py += yb2py[wordYB[index:index+2]]
            index+=2
        else:
            py += yb2py[wordYB[index]]
            index+=1
    return py

In [8]:
yb_2_py("skaɪ")

'skai'

In [15]:
# 预处理
# 词
data.loc[0, "word"] = "an"
data.loc[992, "word"] = "an"
# 音标
phonetic_to_yb = lambda item: item.split(";")[0].split(")")[-1].replace(":","ː").strip()
ybs = data["phonetic"].apply(phonetic_to_yb)
ybs[0]='æn'
data = data.assign(ybs=ybs)
# 拼音
pys = data["ybs"].apply(yb_2_py)
data = data.assign(pys=pys)

data.head(2)

Unnamed: 0,word,phonetic,ybs,pys
0,an,[ə；æn],æn,an
1,abandon,[əˈbændən],[əˈbændən],abandan


In [25]:
# 单词筛选
data = data.loc[(data['word'].str.len()<=5) & (data['word'].str.len()>=3)]
data.reset_index(drop=True, inplace=True)
print(f"{len(data)} words remain.")

1196 words remain.


In [27]:
chinese = u"有备而来"
pinyin = p.get_pinyin(chinese).split("-")
str1 = ''.join(pinyin[1:3])
print(str1)

beier


In [38]:
# 转换音标
data = data.assign(
    diff_sim = data['word'].apply(lambda str2: difflib.SequenceMatcher(None, str1,str2).ratio()),
    leven_dis = data['word'].apply(lambda str2: Levenshtein.distance(str1, str2)),
    leven_ratio = data['word'].apply(lambda str2: Levenshtein.ratio(str1, str2)),
)

In [40]:
print('Top 5 most similar words by `difflib.SequenceMatcher`:')
data.sort_values('diff_sim', ascending=False).head(5)

Top 5 most similar words by `difflib.SequenceMatcher`:


Unnamed: 0,word,phonetic,ybs,pys,diff_sim,leven_dis,leven_ratio
83,beer,[bɪr],[bɪr],byir,0.888889,1,0.888889
81,bee,[biː],[biː],byi,0.75,2,0.75
82,beef,[biːf],[biːf],byif,0.666667,2,0.666667
77,bear,[ber],[ber],bair,0.666667,2,0.666667
98,bird,[bɜːrd],[bɜːrd],berd,0.666667,3,0.666667


In [41]:
print('Top 5 most similar words by `Levenshtein.distance`:')
data.sort_values('leven_dis', ascending=False).head(5)

Top 5 most similar words by `Levenshtein.distance`:


Unnamed: 0,word,phonetic,ybs,pys,diff_sim,leven_dis,leven_ratio
598,mark,[mɑːrk],[mɑːrk],mark,0.222222,5,0.222222
944,sort,[sɔːrt],[sɔːrt],sourt,0.222222,5,0.222222
942,soon,[suːn],[suːn],sun,0.0,5,0.0
941,song,[ˈsʌmwer],[ˈsʌmwer],samwoair,0.0,5,0.0
940,son,[sʌn],[sʌn],san,0.0,5,0.0


In [42]:
print('Top 5 most similar words by `Levenshtein.ratio`:')
data.sort_values('leven_ratio', ascending=False).head(5)

Top 5 most similar words by `Levenshtein.ratio`:


Unnamed: 0,word,phonetic,ybs,pys,diff_sim,leven_dis,leven_ratio
83,beer,[bɪr],[bɪr],byir,0.888889,1,0.888889
81,bee,[biː],[biː],byi,0.75,2,0.75
303,ever,[ˈevər],[ˈevər],aiuar,0.666667,2,0.666667
77,bear,[ber],[ber],bair,0.666667,2,0.666667
78,bear,[ber],[ber],bair,0.666667,2,0.666667


In [28]:
# 直接
n = len(wordsList)
diff_sim = np.zeros(n)
leven_dis = np.zeros(n)
leven_ratio = np.zeros(n)
for i,str2 in enumerate(wordsList):
    diff_sim[i] = difflib.SequenceMatcher(None, str1,str2).ratio()
    leven_dis[i] = Levenshtein.distance(str1, str2)
    leven_ratio[i] = Levenshtein.ratio(str1, str2)
print(wordsList[diff_sim==max(diff_sim)],max(diff_sim))
print(wordsList[leven_ratio==max(leven_ratio)],max(leven_ratio))
print(wordsList[leven_dis==min(leven_dis)],min(leven_dis))

['beer'] 0.8888888888888888
['beer'] 0.8888888888888888
['beer'] 1.0


In [29]:
print(wordsList[leven_ratio==max(leven_ratio)][0]+chinese[2:])

beer而来


In [31]:
testID = 10
wordYB = ybs[testID]
py = pys[testID]
print(data["word"][testID],wordYB,py)

accept [əkˈsept] aksaipt


In [32]:
min(1,2,3)

1

In [33]:
class Solution(object):
    def minDistance(self, word1, word2):
        """
        :type word1: str
        :type word2: str
        :rtype: int
        """
        # init
        counter = [[0]*(len(word2)+1) for i in range(len(word1)+1)]
        for i in range(1,len(word1)+1):
            counter[i][0]=i
        for i in range(1,len(word2)+1):
            counter[0][i]=i
        # 开始计算
        for i in range(1,len(word1)+1):
            for j in range(1,len(word2)+1):
                if word1[i-1]==word2[j-1]:
                    thisone = counter[i-1][j-1]
                else:
                    thisone = min(counter[i][j-1]+1,counter[i-1][j]+1,counter[i-1][j-1]+1) # 加，减，替换
                counter[i][j] = thisone 
        return counter[-1][-1]


In [34]:
so = Solution()

In [35]:
so.minDistance("horse","ros")

3