#### Word Vectors : 어휘 사전

In [1]:
# import nltk
# nltk.download('wordnet')

In [2]:
from nltk.corpus import wordnet as wn

* "good"의 synonym(유의어)

In [3]:
poses = {'n':'noun', 'v':'verb', 's':'adj (s)', 'a':'adj', 'r':'adv'}
poses

{'n': 'noun', 'v': 'verb', 's': 'adj (s)', 'a': 'adj', 'r': 'adv'}

In [4]:
for synset in wn.synsets("good"):
    pos = poses[synset.pos()] # POS Tag, Part-Of-Speech: 품사
    lemmas = [lemma.name() for lemma in synset.lemmas()] # Lemma : 단어의 원형
    lemmas = ', '.join(lemmas)
    print(f"{pos:7s}: {lemmas}")

noun   : good
noun   : good, goodness
noun   : good, goodness
noun   : commodity, trade_good, good
adj    : good
adj (s): full, good
adj    : good
adj (s): estimable, good, honorable, respectable
adj (s): beneficial, good
adj (s): good
adj (s): good, just, upright
adj (s): adept, expert, good, practiced, proficient, skillful, skilful
adj (s): good
adj (s): dear, good, near
adj (s): dependable, good, safe, secure
adj (s): good, right, ripe
adj (s): good, well
adj (s): effective, good, in_effect, in_force
adj (s): good
adj (s): good, serious
adj (s): good, sound
adj (s): good, salutary
adj (s): good, honest
adj (s): good, undecomposed, unspoiled, unspoilt
adj (s): good
adv    : well, good
adv    : thoroughly, soundly, good


* "cat"의 hypernym(상위어)

In [5]:
cat = wn.synset("cat.n.01")
hyper = lambda s: s.hypernyms()
list(cat.closure(hyper))

[Synset('feline.n.01'),
 Synset('carnivore.n.01'),
 Synset('placental.n.01'),
 Synset('mammal.n.01'),
 Synset('vertebrate.n.01'),
 Synset('chordate.n.01'),
 Synset('animal.n.01'),
 Synset('organism.n.01'),
 Synset('living_thing.n.01'),
 Synset('whole.n.02'),
 Synset('object.n.01'),
 Synset('physical_entity.n.01'),
 Synset('entity.n.01')]

어휘 사전의 문제점 : 단어와 단어 간의 유사도(similartiy)를 계산할 수 없음!

#### SVD Method Example

In [6]:
import numpy as np

X = np.array([[0, 2, 2, 0, 0, 0, 0, 0, 0, 0], # I
              [2, 0, 0, 1, 0, 0, 0, 0, 0, 0], # ate
              [2, 0, 0, 0, 0, 0, 1, 0, 1, 0], # like
              [0, 1, 0, 0, 1, 1, 0, 0, 0, 0], # a
              [0, 0, 0, 1, 0, 0, 0, 0, 0, 1], # banana
              [0, 0, 0, 1, 0, 0, 0, 0, 0, 1], # cherry
              [0, 0, 1, 0, 0, 0, 0, 1, 0, 0], # deep
              [0, 0, 0, 0, 0, 0, 1, 0, 0, 1], # learning
              [0, 0, 1, 0, 0, 0, 0, 0, 0, 1], # NLP
              [0, 0, 0, 0, 1, 1, 0, 1, 1, 0]]) # .

In [7]:
# X_svd
U, S, V_t = np.linalg.svd(X)

In [8]:
k = 5
theta = sum(S[:k]) / sum(S)
print(f'theta : {theta}')

theta : 0.8174988844487285


In [9]:
U = U[:, :k]
U

array([[-0.58951321,  0.63338858, -0.07381218, -0.31561873,  0.08921915],
       [-0.43880494, -0.4679372 , -0.0950592 , -0.15540716,  0.40200628],
       [-0.49906814, -0.51894471,  0.18296896, -0.18902671, -0.32887242],
       [-0.21718772,  0.19140752,  0.37405398,  0.29204662,  0.48061906],
       [-0.12612756, -0.06426321, -0.39796596,  0.39641327,  0.19296524],
       [-0.12612756, -0.06426321, -0.39796596,  0.39641327,  0.19296524],
       [-0.19421905,  0.18666537,  0.02954001,  0.04266901, -0.41679693],
       [-0.11890892, -0.06274144, -0.25333285,  0.28215586, -0.35443303],
       [-0.21471749,  0.16369162, -0.31774506,  0.17599977, -0.30080158],
       [-0.184132  ,  0.00884932,  0.57389513,  0.57316285, -0.16426787]])

In [10]:
US = np.dot(U, np.diag(S[:k]))
X_hat = np.dot(US, V_t[:k ,:]) # shape
X_hat = np.round(X_hat)
X_hat

array([[ 0.,  2.,  2., -0.,  0.,  0., -0.,  0., -0.,  0.],
       [ 2.,  0., -0.,  1., -0., -0.,  0., -0.,  0.,  0.],
       [ 2., -0.,  0.,  0.,  0.,  0.,  1.,  0.,  1., -0.],
       [-0.,  1.,  0.,  0.,  1.,  1., -0.,  0.,  0., -0.],
       [ 0., -0.,  0.,  1.,  0.,  0.,  0., -0., -0.,  1.],
       [ 0., -0.,  0.,  1.,  0.,  0.,  0., -0., -0.,  1.],
       [-0.,  0.,  1., -0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0., -0.,  0.,  0., -0., -0.,  0.,  0.,  0.,  1.],
       [-0.,  0.,  1.,  0., -0., -0.,  0.,  0.,  0.,  1.],
       [ 0.,  0., -0., -0.,  1.,  1.,  0.,  1.,  1.,  0.]])

In [11]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

bow = ['I', 'ate', 'like', 'a',
      'banana', 'cherry', 'deep', 'learning', 'NLP', '.']

fig = plt.figure(1, figsize=(8, 5))
ax = Axes3D(fig)

for idx, word in enumerate(bow):
    ax.axis([-.75, .75, -.75, .75])
    ax.text3D(U[idx, 0], U[idx, 1], U[idx, 2], word, size=15)
    
ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
plt.show()

<Figure size 800x500 with 1 Axes>

* SVD Method의 문제점
    * 새로운 단어 출현으로 인해 Matrix의 차원(dimension)이 자주 바뀜 -> SVD 다시 계산
    * 대부분의 단어가 동시에 출현(co-occur)하지 않으므로 Matrix가 매우 스파스(sparse) 함
    * 일반적으로 Matrix는 매우 높은 차원을 가짐
    * 계산 비용이 높음 -> mxn Matrix인 경우 O(mn^2)

* SVD Method의 문제점 보완 방법들
    * 'the', 'he', 'has' 등 불용어(stop words) 제거
    * Window size에서 단어의 거리를 반영해주는 ramp window 적용
    * 단순 카운팅이 아닌 Pearson correlation & negative count 사용하기

#### Negative Sampling (NEG)

In [12]:
from keras.preprocessing.sequence import skipgrams

Using TensorFlow backend.


In [13]:
text = 'The quick brwon fox jumps over the lazy dog'
text = text.split()

In [14]:
index2word = {idx: word for idx, word in enumerate(text)}
word2index = {word: idx for idx, word in index2word.items()}
seq = [word2index[word] for word in text]

In [15]:
pairs, labels = skipgrams(sequence=seq, vocabulary_size=len(word2index.keys()),
                         window_size=2, negative_samples=.5)

for pair, label in zip(pairs, labels):
    w_i, w_o = pair
    print(f'({index2word[w_i]}, {index2word[w_o]}), {label}')

(over, lazy), 1
(fox, over), 1
(the, lazy), 1
(fox, fox), 0
(fox, quick), 1
(quick, fox), 0
(over, jumps), 0
(jumps, brwon), 1
(jumps, over), 1
(fox, brwon), 1
(the, brwon), 0
(dog, lazy), 0
(over, fox), 1
(jumps, the), 1
(brwon, fox), 1
(brwon, quick), 1
(lazy, dog), 1
(the, jumps), 1
(lazy, over), 1
(jumps, dog), 0
(lazy, the), 1
(the, quick), 0
(brwon, dog), 0
(quick, fox), 1
(over, the), 1
(fox, jumps), 1
(the, lazy), 0
(the, dog), 1
(the, over), 1
(quick, brwon), 1
(brwon, fox), 0
(dog, the), 1
(over, fox), 0
(jumps, fox), 1
(dog, lazy), 1
(brwon, quick), 0
(over, jumps), 1
(dog, brwon), 0
(brwon, jumps), 1
