In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.tokenize import word_tokenize
# Fetch the NLTK data packages this notebook relies on: the perceptron POS
# tagger model, WordNet, the Open Multilingual WordNet data and the Punkt
# tokenizer models. The last call's return value is shown as the cell output.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\97263\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\97263\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\97263\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\97263\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    The decision is made on the tag's leading letter:
    'J' -> wn.ADJ, 'V' -> wn.VERB, 'N' -> wn.NOUN, 'R' -> wn.ADV.

    Returns:
        The WordNet POS constant, or None when the tag starts with none of
        those letters.
    """
    # Attribute names are resolved with getattr only after a prefix matches,
    # so `wn` is not touched for tags that have no WordNet counterpart.
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            return getattr(wn, attr_name)
    return None

def find_polysemous_words(sentence):
    """Return the words of `sentence` that have more than one WordNet sense.

    The sentence is tokenized and POS-tagged. Each token whose tag maps to a
    WordNet part of speech (via get_wordnet_pos) is looked up in WordNet
    restricted to that part of speech.

    Returns:
        A list of (word, sense_count) tuples, in sentence order, for every
        token that has more than one synset.
    """
    results = []
    for token, treebank_tag in pos_tag(word_tokenize(sentence)):
        wn_pos = get_wordnet_pos(treebank_tag)
        if not wn_pos:
            # Tag is not an adjective/verb/noun/adverb tag: nothing to look up.
            continue
        sense_count = len(wn.synsets(token, pos=wn_pos))
        if sense_count > 1:  # more than one sense -> polysemous
            results.append((token, sense_count))
    return results

# Example sentence: list its polysemous words together with their sense counts.
sentence = "The bank can deny the loan."
polysemous_words = find_polysemous_words(sentence)
print(polysemous_words)


[('bank', 10), ('deny', 7), ('loan', 2)]


# 方法1：基于Lesk算法的简化词义消歧

In [4]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
nltk.download('omw-1.4')

def simplified_lesk(word, sentence):
    """Pick the WordNet sense of `word` that best fits `sentence` (simplified Lesk).

    For every synset of `word`, a "signature" is built from the tokens of the
    synset's definition and of all its example sentences. The sense whose
    signature shares the most tokens with the tokenized `sentence` wins.

    Args:
        word: The ambiguous word to look up in WordNet.
        sentence: The sentence providing the context for `word`.

    Returns:
        The synset with the largest token overlap, or None when `word` has no
        synsets or no sense shares any token with the sentence. On a tie the
        sense listed first by WordNet is kept.
    """
    best_sense = None
    max_overlap = 0
    context = set(word_tokenize(sentence))
    for sense in wn.synsets(word):
        signature = set(word_tokenize(sense.definition()))
        for example in sense.examples():
            # update() extends the signature in place. The previous
            # signature.union(...) returned a new set that was thrown away,
            # so the example sentences never contributed to the overlap.
            signature.update(word_tokenize(example))
        overlap = len(context & signature)
        # Strict '>' keeps the earliest sense when several tie.
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense
    return best_sense

# Usage example: disambiguate "bank" in a sentence; simplified_lesk may return
# None, hence the fallback text in the print below.
sentence = "The bank of China can deny the loan."
word = "bank"
sense = simplified_lesk(word, sentence)
print(f"Best sense for '{word}':", sense.definition() if sense else "No sense found")


Best sense for 'bank': sloping land (especially the slope beside a body of water)


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\97263\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
pip install -U pywsd

Collecting pywsd
  Downloading pywsd-1.2.5-py3-none-any.whl (26.9 MB)
     ---------------------------------------- 26.9/26.9 MB 3.6 MB/s eta 0:00:00
Collecting wn==0.0.23 (from pywsd)
  Downloading wn-0.0.23.tar.gz (31.6 MB)
     ---------------------------------------- 31.6/31.6 MB 2.3 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: wn
  Building wheel for wn (setup.py): started
  Building wheel for wn (setup.py): finished with status 'done'
  Created wheel for wn: filename=wn-0.0.23-py3-none-any.whl size=31792911 sha256=7ef61066099b2043671569e69430152f9fbb9d5b2aff1d703b480a9204d040c6
  Stored in directory: c:\users\97263\appdata\local\pip\cache\wheels\ec\47\17\409766c99dd470f34c512000b90b83f34747c2c975769654d7
Successfully built wn
Installing collected packages: wn, pywsd
Successfully installed pywsd-1.2.5 wn-0.0.23
Note: you may need to restart the kernel to use updated pa



# 使用其他包

In [19]:
from pywsd.lesk import simple_lesk
sent = 'I went to the bank to deposit my money'
ambiguous = 'bank'
# "bank" is a noun in this sentence, so request WordNet noun senses ('n').
# The previous pos='r' is the WordNet code for adverbs.
answer = simple_lesk(sent, ambiguous, pos='n')
print(answer)

Synset('depository_financial_institution.n.01')


In [37]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.tag import pos_tag
from pywsd.lesk import simple_lesk

# Make sure the required NLTK resources have already been downloaded
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into a WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' become wn.ADJ, wn.VERB, wn.NOUN
    and wn.ADV respectively; every other tag yields None.
    """
    prefix_to_attr = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    # First prefix (in the order above) that the tag starts with, if any.
    matched_attr = next(
        (attr for prefix, attr in prefix_to_attr.items()
         if treebank_tag.startswith(prefix)),
        None,
    )
    if matched_attr is None:
        return None
    # Look the constant up on `wn` only once a prefix has matched.
    return getattr(wn, matched_attr)

def find_polysemous_words(sentence):
    """Collect the polysemous words of `sentence` together with their POS tags.

    The sentence is tokenized and POS-tagged. Tokens whose tag maps to a
    WordNet part of speech (via get_wordnet_pos) are looked up in WordNet
    restricted to that part of speech.

    Returns:
        A list of (word, treebank_tag, sense_count) tuples, in sentence
        order, for every token that has more than one synset.
    """
    tagged_tokens = pos_tag(word_tokenize(sentence))
    found = []
    for token, treebank_tag in tagged_tokens:
        wn_pos = get_wordnet_pos(treebank_tag)
        if not wn_pos:
            # No WordNet part of speech for this tag: skip the token.
            continue
        senses = wn.synsets(token, pos=wn_pos)
        if len(senses) > 1:  # more than one sense -> polysemous
            found.append((token, treebank_tag, len(senses)))
    return found

def disambiguate_sentence(sentence):
    """Disambiguate every polysemous word of `sentence` with simple_lesk.

    Each word reported by find_polysemous_words is passed to simple_lesk
    together with the whole sentence and the word's WordNet part of speech.

    Returns:
        A list of (word, definition) tuples, in sentence order, for the words
        for which simple_lesk returned a sense.
    """
    resolved = []
    for token, treebank_tag, _sense_count in find_polysemous_words(sentence):
        chosen = simple_lesk(sentence, token, pos=get_wordnet_pos(treebank_tag))
        if not chosen:
            # simple_lesk gave no usable sense for this word.
            continue
        resolved.append((token, chosen.definition()))
    return resolved

# Example sentence: print each disambiguated word with its chosen definition,
# then the raw list of (word, definition) tuples.
sentence = "I went to the bank to deposit my money"
disambiguated_words = disambiguate_sentence(sentence)
for word, meaning in disambiguated_words:
    print(f"'{word}': {meaning}")

    
print(disambiguated_words)


'went': to be spent or finished
'bank': a financial institution that accepts deposits and channels the money into lending activities
'deposit': put into a bank account
'money': the official currency issued by a government or national bank
[('went', 'to be spent or finished'), ('bank', 'a financial institution that accepts deposits and channels the money into lending activities'), ('deposit', 'put into a bank account'), ('money', 'the official currency issued by a government or national bank')]


In [34]:
sentence = "I went to the bank to deposit my money."
word = "bank"

# Disambiguate the word with simple_lesk, restricted to noun senses
disambiguated = simple_lesk(sentence, word,pos="n")

# Print the chosen definition and, when the synset has any, its examples
print(f"Best sense for '{word}':", disambiguated.definition())
if disambiguated.examples():
    print("Examples:", disambiguated.examples())

Best sense for 'bank': a financial institution that accepts deposits and channels the money into lending activities
Examples: ['he cashed a check at the bank', 'that bank holds the mortgage on my home']


In [36]:
import sys
# Print the path of the Python interpreter this kernel is running on.
print(sys.executable)


D:\Toobox\Anaconda\python.exe
