# Word Embedding

In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's random number generator so that parameter initialisation
# starts from the same state each time the notebook is run from the top.
torch.manual_seed(1)

<torch._C.Generator at 0x7f625c1db630>

### nn.Embedding: vocabulary size -> embedding dimension

In [2]:
# Toy vocabulary: every word is assigned a row index into the embedding table.
word_to_ix = {"hello": 0, "world": 1}
# 2 words in the vocabulary, each represented by a 5-dimensional vector
embeds = nn.Embedding(num_embeddings=2, embedding_dim=5)
# Embedding lookups take integer (Long) indices, here just the index of "hello".
lookup_tensor = torch.LongTensor([word_to_ix["hello"]])
hello_embed = embeds(autograd.Variable(lookup_tensor))
print(hello_embed)

Variable containing:
-2.9718  1.7070 -0.4305 -2.2820  0.5237
[torch.FloatTensor of size 1x5]



# N-Gram Language Modeling

$$ P(w_i | w_{i-1}, w_{i-2}, \dots, w_{i-n+1} ) $$

### 1. 데이터 준비

In [158]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# Splitting on whitespace stands in for real tokenization, so punctuation stays
# attached to the words.
# Build a list of (context, target) tuples: the CONTEXT_SIZE preceding words are
# the context and the word that follows them is the target. The window width is
# taken from CONTEXT_SIZE (rather than a literal 2) so the data always matches
# the context_size the model is built with.
trigrams = [
    (test_sentence[i:i + CONTEXT_SIZE], test_sentence[i + CONTEXT_SIZE])
    for i in range(len(test_sentence) - CONTEXT_SIZE)
]
print(trigrams[:3]) # print the first 3, just so you can see what they look like

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]


trigram 즉, 이전 2 단어가 주어지면 그 다음 단어를 예측하는 모델

In [177]:
vocab = set(test_sentence)
# Number the words in sorted order. Iterating a set of strings directly yields an
# order that can differ from one interpreter run to the next (string hash
# randomization), which would change the word -> index mapping, and therefore the
# trained model, between runs even though the torch seed is fixed.
word_to_ix = { word: i for i, word in enumerate(sorted(vocab)) }
# Inverse mapping, used to turn a predicted index back into a word.
ix_to_word = {v:k for k,v in word_to_ix.items()}

### 2. 모델링 

In [160]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model.

    The context words are embedded, their embeddings are concatenated into one
    row vector, and a two-layer network maps that vector to log-probabilities
    over the vocabulary.

    Args:
        vocab_size: number of distinct words (rows of the embedding table and
            width of the output layer).
        embedding_dim: size of each word embedding.
        context_size: number of context words fed to ``forward``.
        hidden_dim: width of the hidden layer. Defaults to 128, the value that
            was previously hard-coded.
    """

    # Initialise the parent class first, then declare the model's sub-modules.
    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    # forward() wires the sub-modules together.
    def forward(self, inputs):
        """Return log-probabilities of shape (1, vocab_size) for one context.

        Args:
            inputs: integer index tensor holding the ``context_size`` context
                word indices.
        """
        # Flatten the per-word embeddings into a single (1, context_size * embedding_dim) row.
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        # `out` is 2-D (one row), so the softmax normalises across the vocabulary axis.
        log_probs = F.log_softmax(out)
        return log_probs

### 3. 트레이닝

In [6]:
# Show the first (context words, target word) training pair.
trigrams[0]

(['When', 'forty'], 'winters')

When forty 다음에 올 단어로 winters

In [166]:
losses = []
loss_function = nn.NLLLoss() # Negative Log Likelihood; takes log-probabilities, not raw scores
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Convert every (context, target) pair to index tensors once, up front. The
# indices do not change between epochs, so rebuilding them for every pair on
# every epoch was repeated loop-invariant work.
training_pairs = [
    (
        # context words -> indices -> LongTensor, wrapped in autograd.Variable
        autograd.Variable(torch.LongTensor([word_to_ix[w] for w in context])),
        autograd.Variable(torch.LongTensor([word_to_ix[target]])),
    )
    for context, target in trigrams
]

for epoch in range(1000):
    # Sum of the per-example losses over this epoch.
    total_loss = torch.Tensor([0])

    if epoch % 100 ==0: print(epoch)

    for context_var, target_var in training_pairs:

        # PyTorch accumulates gradients, so they must be reset before each step.
        model.zero_grad()

        # Forward pass: log-probabilities over the vocabulary for the next word.
        log_probs = model(context_var)

        # Loss between the prediction and the label.
        # Argument order is (log-probabilities, target index).
        loss = loss_function(log_probs, target_var)

        # Backward pass, then update the parameters.
        loss.backward()
        optimizer.step()

        total_loss += loss.data
    losses.append(total_loss)
print(losses[0],losses[-1]) # first-epoch vs last-epoch total loss; the loss should have gone down


0
100
200
300
400
500
600
700
800
900

 520.9233
[torch.FloatTensor of size 1]
 
 5.3252
[torch.FloatTensor of size 1]



로스 줄어든다~

### 4. 테스트

In [167]:
import random

In [168]:
# Draw one (context, target) pair at random and run its context through the model.
test = random.choice(trigrams)
test_context = [word_to_ix[word] for word in test[0]]
test_input = autograd.Variable(torch.LongTensor(test_context))
hypothesis = model(test_input)
# Max over dimension 1: v is the best log-probability, i is its index (argmax).
v, i = torch.max(hypothesis, 1)

In [169]:
# Flatten before taking the first element so the lookup works whether or not
# torch.max kept the reduced dimension (the result may be (1, 1) or (1,)),
# and convert to a plain int for use as a dictionary key.
pred_ix = int(i.data.view(-1)[0])
print('맥란 단어 : ', *test[0]) # context words; the * unpacks the list into separate arguments
print('예측 단어 : ',ix_to_word[pred_ix]) # predicted word
print('실제 단어 : ',test[1]) # actual word

맥란 단어 :  beauty by
예측 단어 :  succession
실제 단어 :  succession


# Continuous Bag-of-Words (CBOW)

The CBOW model is as follows. Given a target word $w_i$ and a context window of $N$ words on each side, $w_{i-1}, \dots, w_{i-N}$ and $w_{i+1}, \dots, w_{i+N}$, referring to all context words collectively as $C$, CBOW tries to minimize $$ -\log p(w_i | C) = -\log \text{Softmax}(A(\sum_{w \in C} q_w) + b) $$ where $q_w$ is the embedding for word $w$.


양 옆에 2개씩 총 4개의 단어들 C가 주어졌을 때, 현재 단어 $w_i$ 를 예측하는 모델

### 1. 데이터 준비 

In [181]:
CONTEXT_SIZE = 2 # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process. Computational processes are abstract
beings that inhabit computers. As they evolve, processes manipulate other abstract
things called data. The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()
# Build the vocabulary once and derive the lookup tables from that same set
# object, so a word's index always equals its position when iterating `vocab`.
vocab = set(raw_text)
word_to_ix = { word: i for i, word in enumerate(vocab) }
ix_to_word = {v:k for k,v in word_to_ix.items()}
data = []
# Each example is (CONTEXT_SIZE words before + CONTEXT_SIZE words after, centre word).
# The window is driven by CONTEXT_SIZE instead of a literal 2 so the two cannot drift apart.
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = raw_text[i - CONTEXT_SIZE:i] + raw_text[i + 1:i + CONTEXT_SIZE + 1]
    target = raw_text[i]
    data.append( (context, target) )
print(data[:5])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


### 2. 모델링 

In [182]:
# create your model and train.  here are some functions to help you make the data ready for use by your module
def make_context_vector(context, word_to_ix):
    """Encode context words as a LongTensor of vocabulary indices.

    Args:
        context: iterable of words.
        word_to_ix: mapping from word to integer index.

    Returns:
        An ``autograd.Variable`` wrapping a 1-D ``LongTensor`` with one index
        per word, in the order given.

    Raises:
        KeyError: if a word is not a key of ``word_to_ix``.
    """
    indices = [word_to_ix[token] for token in context]
    return autograd.Variable(torch.LongTensor(indices))

make_context_vector(data[0][0], word_to_ix) # example: index tensor for the first example's context words

Variable containing:
  4
 48
 23
 29
[torch.LongTensor of size 4]

In [183]:
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context words are summed and a single linear layer
    maps the sum to log-probabilities over the vocabulary.

    Args:
        vocab_size: number of distinct words.
        projection_dim: size of each word embedding.
    """

    def __init__(self, vocab_size,projection_dim):
        super(CBOW,self).__init__()
        self.embeddings = nn.Embedding(vocab_size, projection_dim)
        self.projection = nn.Linear(projection_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of shape (1, vocab_size) for one context.

        Args:
            inputs: 1-D integer index tensor of context word indices.
        """
        embeds = self.embeddings(inputs)
        # Sum over the context-word axis (a sum, not an average, as in the CBOW
        # objective). Depending on the torch version the reduced axis is either
        # kept or dropped, so reshape to one row explicitly: the output is then
        # always (1, vocab_size), which is what NLLLoss needs with a 1-element target.
        sum_embeds = torch.sum(embeds,0).view(1, -1)
        out = self.projection(sum_embeds)
        # `out` is 2-D (one row), so the softmax normalises across the vocabulary axis.
        probs = F.log_softmax(out)
        return probs

    def prediction(self, inputs):
        """Return the embedding vectors for the given word indices (no projection)."""
        embeds = self.embeddings(inputs)

        return embeds

### 3. 트레이닝 

In [184]:
# Embedding (projection) dimension handed to the CBOW model.
PROJECTION = 10

In [187]:
losses = []
loss_function = nn.NLLLoss() # Negative Log Likelihood; takes log-probabilities, not raw scores
model = CBOW(len(vocab),PROJECTION)
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Convert every (context, target) pair to index tensors once, up front. The
# indices do not change between epochs, so rebuilding them for every pair on
# every epoch was repeated loop-invariant work.
training_pairs = [
    (
        make_context_vector(context, word_to_ix),
        autograd.Variable(torch.LongTensor([word_to_ix[target]])),
    )
    for context, target in data
]

for epoch in range(1000):
    # Sum of the per-example losses over this epoch.
    total_loss = torch.Tensor([0])

    if epoch % 100 ==0: print(epoch)

    for inputs, target_var in training_pairs:

        # PyTorch accumulates gradients, so they must be reset before each step.
        model.zero_grad()

        # Forward pass, then loss against the index of the centre word.
        pred = model(inputs)
        loss = loss_function(pred, target_var)

        # Backward pass, then update the parameters.
        loss.backward()
        optimizer.step()

        total_loss += loss.data
    losses.append(total_loss)
print(losses[0],losses[-1]) # first-epoch vs last-epoch total loss

0
100
200
300
400
500
600
700
800
900

 265.6400
[torch.FloatTensor of size 1]
 
 6.2804
[torch.FloatTensor of size 1]



### 4. 테스트 

In [197]:
from scipy.spatial.distance import cosine

In [192]:
def transform(word,dic):
    """Wrap the index of ``word`` (looked up in ``dic``) in a 1-element LongTensor Variable.

    Raises:
        KeyError: if ``word`` is not a key of ``dic``.
    """
    index_tensor = torch.LongTensor([dic[word]])
    return autograd.Variable(index_tensor)

In [264]:
def word_analogy(target,vocabs):
    """Find the word in ``vocabs`` whose embedding is closest to ``target``'s.

    Despite the name this is a nearest-neighbour search, not an analogy: every
    candidate's embedding is compared with the target's by cosine distance and
    the closest one wins.

    Uses the module-level ``model``, ``word_to_ix`` and ``ix_to_word``.

    Args:
        target: query word; must be a key of ``word_to_ix``.
        vocabs: iterable of candidate words, each a key of ``word_to_ix``.

    Returns:
        ``(nearest_word, cosine_distance)``.

    Raises:
        KeyError: if ``target`` or a candidate is not in ``word_to_ix``.
        ValueError: if ``vocabs`` holds no word other than ``target``.
    """
    target_idx = word_to_ix[target]
    # Flatten (1, dim) -> (dim,): scipy's cosine() is defined on 1-D vectors.
    target_V = model.prediction(transform(target,word_to_ix)).data.numpy().ravel()
    nearest_idx = -1
    minimum = float("inf")

    # Iterate over the words themselves and take each index from word_to_ix,
    # rather than assuming that position i in list(vocabs) is the word with
    # index i (and rebuilding that list on every iteration).
    for word in vocabs:
        idx = word_to_ix[word]
        if idx == target_idx: continue

        vector = model.prediction(transform(word,word_to_ix)).data.numpy().ravel()

        temp = cosine(target_V,vector)

        if temp < minimum:
            nearest_idx = idx
            minimum = temp

    if nearest_idx == -1:
        raise ValueError("vocabs contains no candidate word other than the target")

    return ix_to_word[nearest_idx], minimum

In [269]:
# Pick a random vocabulary word to use as the query (a set cannot be sampled
# from directly, so it is copied to a list first) and display it.
test = random.choice(list(vocab))
test

'rules'

In [270]:
# Look up the vocabulary word whose embedding is closest (by cosine distance) to the query word's.
word_analogy(test,vocab)

('idea', 0.36502336690142312)

잘 된건가? 젠장,,,,