In [None]:
# pip install wn
import wn

wn.add("data/dictionary/english-wordnet-2024.xml.gz") 

POS_MAP = {
    'n': 'noun',    
    'v': 'verb',     
    'a': 'adjective', 
    's': 'adjective',
    'r': 'adverb',   
}

syns = wn.synsets("bank", lexicon="oewn:2024", lang="en")  

for s in syns:
    sid = s.id
    pos_code = s.pos      
    pos_readable = POS_MAP.get(pos_code, pos_code)  
    definition = s.definition()    

    lemma_names = sorted(set(s.lemmas()))  

    print(f"{sid} ({pos_readable}) - {definition}")        
    print("  Synonyms:", ", ".join(lemma_names) if lemma_names else "(none)") 
    # print("  Antonyms:", ", ".join(sorted(antonym_names)) if antonym_names else "(none)") 
    print("-" * 60)


oewn-09236472-n (noun) - sloping land (especially the slope beside a body of water)
  Synonyms: bank
------------------------------------------------------------
oewn-08437235-n (noun) - a financial institution that accepts deposits and channels the money into lending activities
  Synonyms: bank, banking company, banking concern, depository financial institution
------------------------------------------------------------
oewn-09236341-n (noun) - a long ridge or pile
  Synonyms: bank
------------------------------------------------------------
oewn-08479077-n (noun) - an arrangement of similar objects in a row or in tiers
  Synonyms: bank
------------------------------------------------------------
oewn-13389491-n (noun) - a supply or stock held in reserve for future use (especially in emergencies)
  Synonyms: bank
------------------------------------------------------------
oewn-13377435-n (noun) - the funds held by a gambling house or the dealer in some gambling games
  Synonyms: ban

[KSkipping oewn:2024 (Open Engish Wordnet); already added



In [6]:
print(nltk.data.path)

['/home/snt/nltk_data', '/home/snt/projects_lujun/agi_index_tournament/.venv/nltk_data', '/home/snt/projects_lujun/agi_index_tournament/.venv/share/nltk_data', '/home/snt/projects_lujun/agi_index_tournament/.venv/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [5]:
# pip install nltk
import nltk
from nltk.corpus import wordnet as wn

nltk.download('wordnet:2024')  # 下载 WordNet 2024 版本[web:50][web:51]
# 若使用自带 OMW/WordNet，需先 nltk.download('wordnet')；自备 WNDB 时可用路径参数加载[web:50][web:51]
# 示例：基本查询
for s in wn.synsets("bank")[:5]:
    print(s.name(), s.definition())  # 传统接口：name()/definition()/lemmas() 等[web:50][web:51]
    print([l.name() for l in s.lemmas()])  # 同义词表面形式[web:50][web:51]

bank.n.01 sloping land (especially the slope beside a body of water)
['bank']
depository_financial_institution.n.01 a financial institution that accepts deposits and channels the money into lending activities
['depository_financial_institution', 'bank', 'banking_concern', 'banking_company']
bank.n.03 a long ridge or pile
['bank']
bank.n.04 an arrangement of similar objects in a row or in tiers
['bank']
bank.n.05 a supply or stock held in reserve for future use (especially in emergencies)
['bank']


[nltk_data] Error loading wordnet:2024: Package 'wordnet:2024' not
[nltk_data]     found in index


In [4]:
nltk.data.path

['/home/snt/nltk_data',
 '/home/snt/projects_lujun/agi_index_tournament/.venv/nltk_data',
 '/home/snt/projects_lujun/agi_index_tournament/.venv/share/nltk_data',
 '/home/snt/projects_lujun/agi_index_tournament/.venv/lib/nltk_data',
 '/usr/share/nltk_data',
 '/usr/local/share/nltk_data',
 '/usr/lib/nltk_data',
 '/usr/local/lib/nltk_data']

In [42]:
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn
sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.']
ss = lesk(sent, 'bank', 'n')
synonyms = [lemma.name() for lemma in ss.lemmas()] 
print(ss, ss.definition())

Synset('depository_financial_institution.n.01') a financial institution that accepts deposits and channels the money into lending activities


In [None]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')
from nltk import word_tokenize, pos_tag


sent_text = "NLTK can tag parts of speech for words in a sentence."
tokens = word_tokenize(sent_text)
tags = pos_tag(tokens)
print(tags) 


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/snt/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('NLTK', 'NNP'), ('can', 'MD'), ('tag', 'VB'), ('parts', 'NNS'), ('of', 'IN'), ('speech', 'NN'), ('for', 'IN'), ('words', 'NNS'), ('in', 'IN'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]


In [None]:
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk import word_tokenize, pos_tag
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn

def ptb_to_wordnet_pos(ptb_tag: str):
    if not ptb_tag:
        return None
    if ptb_tag.startswith('J'):
        return wn.ADJ   # 'a'
    elif ptb_tag.startswith('V'):
        return wn.VERB  # 'v'
    elif ptb_tag.startswith('N'):
        return wn.NOUN  # 'n'
    elif ptb_tag.startswith('R'):
        return wn.ADV   # 'r'
    else:
        return None

def synset_id_str(ss):
    return f"{ss.offset():08d}-{ss.pos()}"

def wsd_sentence(sentence: str):
    tokens = word_tokenize(sentence)
    tags = pos_tag(tokens)
    tag_map = dict(tags)

    results = []
    for w in tokens:
        ptb = tag_map.get(w)
        wn_pos = ptb_to_wordnet_pos(ptb)
        ss = lesk(tokens, w, wn_pos) if wn_pos else lesk(tokens, w)
        if ss:
            item = {
                "word": w,
                "wnet_number": synset_id_str(ss),
                "gloss": ss.definition(),
                "synonyms": [lemma.name() for lemma in ss.lemmas()]
            }
        else:
            item = {
                "word": w,
                "wnet_number": None,
                "gloss": None,
                "synonyms": []
            }
        results.append(item)
    return results

results = wsd_sentence("NLTK can tag parts of speech for words in a sentence.")


[nltk_data] Downloading package punkt_tab to /home/snt/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/snt/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/snt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/snt/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
# Test the package
from agi_toolkit import DictRewriter
import importlib
importlib.reload(DictRewriter)

rewriter = DictRewriter.DictRewriter()
sentence = """
Before you begin rewriting, it's a must to fully understand the meaning of the original text. This ensures that the paraphrase is accurate and captures all the essential points of the source, avoiding misinterpretation."""

## RB, adverb
print("ORIGINAL SENTENCE:", sentence)
rewritten, wsd= rewriter.rewrite(sentence, ratio=0.9, exclude_word_class=["DT","VB","VBD","VBG","VBN","VBP","VBZ","RB"])
# rewritten, wsd= rewriter.rewrite(sentence, ratio=0.9)
print("REWRITTEN SENTENCE:", rewritten)

ORIGINAL SENTENCE: 
Before you begin rewriting, it's a must to fully understand the meaning of the original text. This ensures that the paraphrase is accurate and captures all the essential points of the source, avoiding misinterpretation.
REWRITTEN SENTENCE: 
ahead you begin rewriting , it 's a must to fully understand the import of the original text . This ensures that the paraphrasis is accurate and captures all the essential point of the reference , avoiding mistaking .


[nltk_data] Downloading package punkt_tab to /home/snt/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/snt/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/snt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/snt/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
wsd

for item in wsd:
    print(item["word"])
    print(item["synonyms"])
    print(item["gloss"])
    print(item["wnet_pos"])
    print("-----------------------------------------------------------")

You
[]
None
None
-----------------------------------------------------------
are
['be']
have the quality of being; (copula, used with an adjective or a predicate noun)
v
-----------------------------------------------------------
an
['Associate_in_Nursing', 'AN']
an associate degree in nursing
None
-----------------------------------------------------------
expert
['expert']
a person with special knowledge or ability who performs skillfully
n
-----------------------------------------------------------
in
['indium', 'In', 'atomic_number_49']
a rare soft silvery metallic element; occurs in small quantities in sphalerite
None
-----------------------------------------------------------
Luxembourgish-English
[]
None
a
-----------------------------------------------------------
translation
['translation', 'interlingual_rendition', 'rendering', 'version']
a written communication in a second language having the same meaning as the written communication in a first language
n
-------------------