In [1]:
import nltk
import os
import wikipedia
from collections import defaultdict
from gensim.models import KeyedVectors
from nltk.corpus import wordnet as wn
from scipy.spatial.distance import cosine
# nltk.download("wordnet")

# Word2Vec

In [2]:
model_path = './udn.word2vec.bin'
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

# Synset

In [3]:
target_synsets = []
for line in open('lab02_input.txt'):
    word, synset, offset = line.strip().split('\t')
    target_synsets.append((word, wn.synset_from_pos_and_offset('n', int(offset))))

print("The amount of target synsets :", len(target_synsets))

The amount of target synsets : 54


# WordNet EC

In [4]:
wnec = {}
with open('wn.ec.noun.txt') as f:
    for line in f:
        word, sense, ch = line.split('\t', 3)
        res = [w for w in ch.split('|') if w in model]
        if word in model:
            res.append(word)
        if res:
            wnec[sense] = res

# Wiki Link EC

In [5]:
wikiec = {}
with open('ec.link.txt') as f:
    for line in f:
        _, word, ch = line.split('\t', 3)
        word.replace('_', ' ')
        ch = ch[2:]
        if ch in model:
            wikiec[word] = model[ch]

# Word Sense

In [6]:
f = open("lab02_output.tsv","w+")
result = []

In [7]:
for word, synset in target_synsets:
    # synset: hypernyms, lemmas, hyponyms
    s_hyper = list(synset.closure(lambda s: s.hypernyms(), 2))
    s_hypo = list(synset.closure(lambda s: s.hyponyms(), 2))
    s_hy = s_hyper + s_hypo
    s_lemma = synset.lemmas()
    s_family = s_hy + s_lemma
    # corresponding words
    n_s = synset.name().split('.')[0]
    n_hy = [s.name().split('.')[0] for s in s_hy]
    n_lemma = [s.name().split('.')[-1] for s in s_lemma]
    n_family = n_hy + n_lemma
    # corresponding chinese
    ec_lemma = list(n_lemma)
    for n in ec_lemma:
        ec_lemma.extend(wnec.get(n, []))
    ec_family = list(n_family)
    for n in ec_family:
        ec_family.extend(wnec.get(n, []))
    # w2v model
    v_l = [model[n] for n in ec_lemma if n in model]
    if v_l:
        m_s = model[n_s] if n_s in model else sum(v_l) / len(v_l)
        m_family = [n for n in ec_family if n in model]
        v_family = [model[n] - m_s for n in m_family]
        v_s = sum(v_family) / len(v_family) + m_s
    # wiki candidate
    candidate = []
    for lemma in m_family:
        cur = [lemma]
        if '_' in lemma:
            cur = wikipedia.search(lemma, 3)
        for page in cur:
            try:
                wiki = wikipedia.page(page)
                candidate.append(page)
            except:
                pass
    # score by vector cosine similarity
    scores = {}
    for cd in candidate:
        s = [1 - cosine(v_s, model[c]) if c in model else 0 for c in cd.lower().split()]
        scores[cd] = sum(s) / len(s)
    page = 'can not match to wiki'
    if scores:
        page, score = min(scores.items(), key=lambda k: k[1])
        #if score >= 0.5:
        page = wikipedia.page(page).url
    print(score)
    result.append('\t'.join([word, str(synset), page]))



  lis = BeautifulSoup(html).find_all('li')


0.7801557779312134
0.6944633722305298
0.9233576655387878
0.8686987161636353
0.30378568172454834
0.5829584002494812
0.6366953253746033
0.6906927227973938
0.6194887757301331
0.7347137928009033
0.9471968412399292
0.5368343591690063
0.5441094636917114
1.0
0.6883959770202637
0.9753903746604919
1.0
0.980416476726532
0.8759234547615051
0.8759234547615051
0.8759234547615051
0.9173532724380493
0.2547925114631653
0.2547925114631653
0.2547925114631653
0.2547925114631653
0.8186461925506592
0.8186461925506592
0.8186461925506592
0.8186461925506592
0.8186461925506592
0.9156430959701538
0.8181314468383789
0.9940304160118103
0.30323734879493713
0.3981686532497406
0.7702730894088745
0.9581615924835205
0.6506733298301697
0.8989349007606506
0.9080107808113098
0.9080107808113098
0.5159494280815125
0.7146669030189514
0.7146669030189514
0.7146669030189514
0.8175163865089417
0.5118826627731323
0.545118510723114
0.4713805019855499
0.6494073867797852
0.43743735551834106
0.7783246040344238
0.5843098759651184


In [8]:
f.write('\n'.join(result))
f.close()