In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk

In [2]:
corpus = "I have a puppy. His name is Bori. I love him."
tokenized = nltk.word_tokenize(corpus)
vocabulary = list(set(tokenized)) # 단어의 집합(중복 x)
print(tokenized)
print(vocabulary)

['I', 'have', 'a', 'puppy', '.', 'His', 'name', 'is', 'Bori', '.', 'I', 'love', 'him', '.']
['I', 'is', 'love', 'him', 'name', 'His', 'have', '.', 'a', 'Bori', 'puppy']


In [3]:
word2index={}
for voca in vocabulary:
    if word2index.get(voca)==None:
        word2index[voca]=len(word2index)
print(word2index)

{'I': 0, 'name': 4, 'is': 1, 'His': 5, 'have': 6, 'love': 2, '.': 7, 'a': 8, 'him': 3, 'Bori': 9, 'puppy': 10}


In [4]:
WINDOW_SIZE = 2
windows = list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + tokenized + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1))

In [17]:
train_data = []

for window in windows:
    for i in range(WINDOW_SIZE * 2 + 1):
        if i == WINDOW_SIZE or window[i] == '<DUMMY>': 
            continue
        train_data.append((window[WINDOW_SIZE], window[i]))

print(train_data[:WINDOW_SIZE * 2])

[('I', 'have'), ('I', 'a'), ('have', 'I'), ('have', 'a')]


In [6]:
# 각 단어를 index로 바꾸고 LongTensor로 바꿔주는 함수
def prepare_word(word, word2index):
    return Variable(torch.LongTensor([word2index[word]]))

X_p,y_p=[],[]

for (center,context) in train_data:
    X_p.append(prepare_word(center, word2index).view(1, -1))
    y_p.append(prepare_word(context, word2index).view(1, -1))
    
train_data = list(zip(X_p,y_p))
train_data[0]

(Variable containing:
  0
 [torch.LongTensor of size 1x1], Variable containing:
  6
 [torch.LongTensor of size 1x1])

In [7]:
# 3차원으로 임베딩
center_embed = nn.Embedding(len(word2index),3)
context_embed = nn.Embedding(len(word2index),3)

print(center_embed.weight)
print(context_embed.weight)

Parameter containing:
-0.2524  1.3636 -0.1962
 0.6567 -0.4086  0.4950
 0.7263  0.7845 -0.5036
 0.0126  1.4632  0.5829
 0.5574 -0.2718  0.2905
-0.4669  0.8146  0.1285
 1.4794  0.1370 -0.6939
 0.5348 -0.7883  2.0635
 0.3054  0.4519 -0.1813
 1.2936 -0.2578 -1.0669
-1.8129  0.9762 -1.0742
[torch.FloatTensor of size 11x3]

Parameter containing:
-1.5218 -0.1553 -0.6425
-0.8546 -0.8382 -0.5070
 0.7839  0.4105  0.0647
 0.7736  0.1882  0.1087
 0.1280  0.6651 -0.9395
 0.1514  0.8391 -1.2819
-0.0180 -1.0372  0.3673
 0.0377 -1.1143  0.1572
-0.0252  0.3677  0.8179
 0.7805  0.6672  0.4507
-0.4009 -1.7529  0.6296
[torch.FloatTensor of size 11x3]



In [9]:
center,context = train_data[0]

center_vector = center_embed(center)
context_vector = context_embed(context)
print(center_vector)
print(context_vector)

Variable containing:
(0 ,.,.) = 
 -0.2524  1.3636 -0.1962
[torch.FloatTensor of size 1x1x3]

Variable containing:
(0 ,.,.) = 
 -0.0180 -1.0372  0.3673
[torch.FloatTensor of size 1x1x3]



In [10]:
score = torch.exp(context_vector.bmm(center_vector.transpose(1,2))).squeeze(2)
score

Variable containing:
 0.2272
[torch.FloatTensor of size 1x1]

In [11]:
vocabulary

['I', 'is', 'love', 'him', 'name', 'His', 'have', '.', 'a', 'Bori', 'puppy']

In [12]:
#  시퀀스(단어들의 연속된 리스트)가 들어오면 LongTensor로 매핑
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w], seq))
    return Variable(torch.LongTensor(idxs))

vocabulary_tensor = prepare_sequence(vocabulary,word2index).view(1,-1)
print(vocabulary_tensor)

Variable containing:
    0     1     2     3     4     5     6     7     8     9    10
[torch.LongTensor of size 1x11]



In [13]:
vocabulary_vector = context_embed(vocabulary_tensor)

In [14]:
vocabulary_vector

Variable containing:
(0 ,.,.) = 
 -1.5218 -0.1553 -0.6425
 -0.8546 -0.8382 -0.5070
  0.7839  0.4105  0.0647
  0.7736  0.1882  0.1087
  0.1280  0.6651 -0.9395
  0.1514  0.8391 -1.2819
 -0.0180 -1.0372  0.3673
  0.0377 -1.1143  0.1572
 -0.0252  0.3677  0.8179
  0.7805  0.6672  0.4507
 -0.4009 -1.7529  0.6296
[torch.FloatTensor of size 1x11x3]

In [15]:
norm_scores = vocabulary_vector.bmm(center_vector.transpose(1, 2))
norm_scores = torch.exp(torch.sum(norm_scores,1))
print(norm_scores)

Variable containing:
 0.1101
[torch.FloatTensor of size 1x1]



In [16]:
score/norm_scores

Variable containing:
 2.0634
[torch.FloatTensor of size 1x1]