In [14]:
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from konlpy.tag import Kkma
import torch

In [15]:
# Three example sentences that all contain the ambiguous word 'bass'.
# sentence1: 'bass' appears next to fishing/cooking vocabulary.
sentence1 = 'I went fishing last weekend and I got a bass and cooked it'
# sentence2: 'bass' appears next to music/speaker/beat vocabulary.
sentence2 = 'I love the music from the speaker which has strong beat and bass'
# sentence3: 'bass' is compared with 'guitar'.
sentence3 = 'I think the bass is more important than guitar'

In [16]:
# Target word passed to do_lesk, and the sentences in which it is disambiguated.
word = 'bass'
sentences = [sentence1, sentence2, sentence3]

In [17]:
# Read the Korean practice corpus: one entry per non-blank line, with
# surrounding whitespace removed.
with open('한글_5.9.7_실습text.txt', encoding='utf-8') as f:
    korean_sentences = [
        stripped
        for stripped in (raw_line.strip() for raw_line in f.read().splitlines())
        if stripped
    ]

In [18]:
# Print every WordNet synset registered for 'bass' next to its definition,
# i.e. the candidate senses the disambiguation below can choose from.
for ss in wn.synsets('bass'):
    print(ss, ss.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [19]:
def do_lesk(sentence, word):
    """Disambiguate `word` in `sentence` with NLTK's Lesk algorithm and print the result.

    The sentence is tokenized by plain whitespace splitting and handed to
    `lesk` as the context for `word`.

    Args:
        sentence: The sentence (str) that provides the context.
        word: The ambiguous word (str) to resolve.

    Returns:
        The synset chosen by `lesk`, or None when `lesk` finds no sense for
        `word`.
    """
    best_synset = lesk(sentence.split(), word)
    if best_synset is None:
        # lesk returns None when the word has no candidate synsets; calling
        # .definition() on it would raise AttributeError.
        print(f'No WordNet sense found for {word!r}')
        return None

    print(best_synset, best_synset.definition())
    return best_synset

In [20]:
# Run Lesk disambiguation of `word` once per example sentence; each call
# prints the chosen synset and its definition.
for sentence in sentences:
    do_lesk(sentence, word)

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('bass.n.02') the lowest part in polyphonic music
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae


In [21]:
# NOTE(review): this cell repeats the assignments of In [15] verbatim.
# Re-running it is harmless, but the duplication looks unintentional.
sentence1 = 'I went fishing last weekend and I got a bass and cooked it'
sentence2 = 'I love the music from the speaker which has strong beat and bass'
sentence3 = 'I think the bass is more important than guitar'

In [22]:
# NOTE(review): this cell repeats In [18] verbatim and prints the same
# synset listing again.
for ss in wn.synsets('bass'):
    print(ss, ss.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [23]:
def count_seen_headwords(lines, predicate='VV', headword='NNG'):
    """Collect, per predicate word, the headwords seen just before it.

    Each line is POS-tagged with a Kkma tagger and scanned left to right up
    to the first token whose tag is `predicate` (or a compound tag starting
    with `predicate + '+'`). The last token tagged `headword` before that
    predicate is recorded as a headword seen with it. Lines with no
    predicate, or no headword before the predicate, contribute nothing.

    Args:
        lines: Iterable of sentences (str) to tag.
        predicate: POS tag that marks the predicate token.
        headword: POS tag that marks candidate headword tokens.

    Returns:
        dict mapping each predicate word to the list of headwords seen with
        it, most recently processed line first. Repeated headwords are kept.
    """
    tagger = Kkma()
    # Compound tags look like '<tag>+<tag>'. Match the prefix by its real
    # length so predicate tags that are not two characters long also work.
    compound_prefix = predicate + '+'
    seen_dict = {}

    for line in lines:
        word_h = None
        word_p = None
        for word, pos in tagger.pos(line):
            if pos == predicate or pos.startswith(compound_prefix):
                word_p = word
                break
            if pos == headword:
                # Keep overwriting so that the headword closest to the
                # predicate wins.
                word_h = word

        if word_h is not None and word_p is not None:
            # Insert at the front to keep the newest-first ordering without
            # rebuilding the whole list on every hit.
            seen_dict.setdefault(word_p, []).insert(0, word_h)

    return seen_dict

In [24]:
# Build the predicate -> seen-headwords table from the Korean corpus.
seen_headwords = count_seen_headwords(korean_sentences)
# NOTE(review): echoing the whole dict produces a very long cell output;
# consider displaying a summary (e.g. list lengths) instead.
seen_headwords

{'카': ['지미'],
 '늘': ['일부',
  '주기율표',
  '뉴랜즈',
  '하',
  '로',
  '역사',
  '주기율표',
  '주기율표',
  '주기율표',
  '주기표',
  '사이',
  '이론',
  '키',
  '하',
  '선소',
  '베르트',
  '이론',
  '진동수',
  '시계',
  '편이',
  '하',
  '이론',
  '하',
  '다루',
  '그라스',
  '작품',
  '전후',
  '그라스',
  '작자',
  '소설',
  '이야기',
  '작가',
  '이야기',
  '일',
  '문서',
  '소프트웨어',
  '이후',
  '공개',
  '이',
  '하',
  '하',
  '하',
  '초반',
  '화',
  '장르',
  '수',
  '소프트웨어',
  '소프트웨어',
  '키',
  '버전',
  '키',
  '하',
  '키',
  '키',
  '소프트웨어',
  '키',
  '링크',
  '문서',
  '지',
  '키',
  '소프트웨어',
  '사용자',
  '문서',
  '본문',
  '소프트웨어',
  '편집자',
  '키',
  '하',
  '일부',
  '문서',
  '키',
  '키',
  '하',
  '키',
  '키',
  '문서',
  '키',
  '키',
  '키',
  '텍스트',
  '키',
  '한국',
  '가위',
  '가위',
  '종류',
  '가위',
  '가위',
  '시절',
  '음식',
  '불리',
  '서부',
  '얼',
  '종교',
  '업체',
  '의회',
  '주',
  '세계',
  '하',
  '이전',
  '당시',
  '일',
  '수',
  '처음',
  '전투',
  '수',
  '중반',
  '수도',
  '여름',
  '보이',
  '보',
  '유럽',
  '길이',
  '공용어',
  '사이',
  '산업',
  '로',
  '공용어',
  '남동쪽',
  '하',
  '하',
  '비엔날레',
  '전시',
  '전시

In [25]:
def get_cosine_similarity(x1, x2):
    """Return the cosine similarity of two vectors.

    Computed as sum(x1 * x2) / (sqrt(sum(x1 ** 2)) * sqrt(sum(x2 ** 2))),
    using only elementwise arithmetic and `.sum()`, so any array type that
    supports those operations can be passed.

    Args:
        x1: First vector.
        x2: Second vector, same shape as `x1`.

    Returns:
        A scalar of the input's numeric type. When the product of the two
        norms is zero (at least one vector is all zeros) the result is 0
        instead of the nan that 0 / 0 would produce.
    """
    dot = (x1 * x2).sum()
    norm_product = (x1 ** 2).sum() ** .5 * (x2 ** 2).sum() ** .5
    if norm_product == 0:
        # Multiply by 0 so the returned zero keeps the same type as `dot`.
        return dot * 0
    return dot / norm_product

In [26]:
def get_selectional_association(predicate, headword, lines, dataframe, metric):
    """Sum the similarity between `headword` and every headword seen with `predicate`.

    The seen headwords are read from the module-level `seen_headwords` dict.
    Each word's vector is the row `dataframe.loc[word]` converted to a
    `torch.FloatTensor`.

    Args:
        predicate: Key into `seen_headwords`.
        headword: Candidate headword; must be a row label of `dataframe`.
        lines: Unused; kept so existing callers keep working.
        dataframe: Table indexed by word, supporting `.loc[word].values`.
        metric: Callable `(v1, v2) -> similarity` applied to the two vectors.

    Returns:
        The accumulated similarity, or the int 0 when none of the seen
        headwords has a row in `dataframe`.

    Raises:
        KeyError: If `predicate` is not in `seen_headwords` or `headword`
            has no row in `dataframe`.
    """
    v1 = torch.FloatTensor(dataframe.loc[headword].values)

    total = 0
    for seen in seen_headwords[predicate]:
        try:
            seen_row = dataframe.loc[seen]
        except KeyError:
            # The seen word has no row in the table: skip it. Any other
            # failure (bad data, a bug in `metric`) is allowed to surface
            # instead of being silently swallowed.
            continue
        total += metric(v1, torch.FloatTensor(seen_row.values))

    return total

In [27]:
def wsd(predicate, headwords):
    """Print the selectional association of `predicate` with each candidate headword.

    Every headword is scored with `get_selectional_association`, using the
    module-level `korean_sentences`, `co` and `get_cosine_similarity`. The
    scores are printed as one list, in the order of `headwords`.

    Args:
        predicate: Predicate word to score against.
        headwords: Iterable of candidate headwords.
    """
    scores = [
        get_selectional_association(
            predicate, candidate, korean_sentences, co, get_cosine_similarity
        )
        for candidate in headwords
    ]
    print(scores)

In [28]:
# Load the word table that wsd passes on as `dataframe`.
# NOTE(review): torch.load unpickles the file, so only load 'co.pth' from a
# trusted source. `co` is later indexed with `.loc`, so it is presumably a
# pandas DataFrame; recent PyTorch releases default to weights_only=True,
# which may refuse such objects. Confirm against the installed version.
co = torch.load('co.pth')

In [29]:
# Score the predicate '만들' against two candidate headwords; the printed
# list holds one score per headword, in the same order.
wsd('만들',['연구', '요리'])

[tensor(1.5971), tensor(0.8774)]
