# Word Embeddings

https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

In [2]:
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1);

In [3]:
word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
print hello_embed

tensor([[ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519]], grad_fn=<EmbeddingBackward>)


In [7]:
embeds(torch.tensor(0))

tensor([ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519], grad_fn=<EmbeddingBackward>)

The Continuous Bag-of-Words model (CBOW) is frequently used in NLP deep learning. It is a model that tries to predict words given the context of a few words before and after the target word. Typically, CBOW is used to quickly train word embeddings, and these embeddings are used to initialize the embeddings of some more complicated model. Usually, this is referred to as pretraining embeddings. It almost always helps performance a couple of percent.

The CBOW model is as follows. Given a target word $w_i$ and an $N$ context window on each side, $w_i,...,w_{i-N}$ and $w_{i+1},...,w_{i+N}$, referring to all context words collectively as $C$, CBOW tries to minimize

$$-\log p(w_i|C) = -\log \text{Softmax} \left(A \cdot \left(\sum_{w \in C} q_w \right) + b \right)$$

where $q_w$ is the embedding for word $w$

...

My understanding is that the softmax should then try to be the index of word $w_i$

In [145]:
raw_text = """I like cats
I like dogs
we like cats
we like dogs""".lower().split("\n") 
print raw_text

data = []
vocab = []
for sentence in raw_text:
    words = sentence.split()
    for i, word in enumerate(words):
        vocab.append(word)
        if i == 0:
            context = [words[i+1], words[i+2]]
        elif i == len(words) - 1:
            context = [words[i-1], words[i-2]]
        else:
            context = [words[i-1], words[i+1]]
        data.append((context, word))
        
vocab = set(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
vocab_size = len(vocab)
print data

def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()       
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        #self.embeddings = nn.Embedding.from_pretrained(my_weights)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        return F.log_softmax(self.linear(embeds.sum(dim=0)).view(1,-1), dim=1)

import random
loss_fn = nn.NLLLoss()
cbow = CBOW(vocab_size, 3)
optimizer = optim.Adam(cbow.parameters(), lr=1e-3)
total_losses = 0
step_size = 5000
for t in range(100000):
    cbow.zero_grad()
    r_c = random.choice(data)
    c_v = make_context_vector(r_c[0], word_to_ix)
    output = cbow(c_v)
    loss = loss_fn(output, torch.tensor([word_to_ix[r_c[1]]]))
    loss.backward()
    optimizer.step()
    
    if t%step_size == 0:
        print total_losses/step_size
        total_losses = 0
    total_losses += loss.item()  
        

['i like cats', 'i like dogs', 'we like cats', 'we like dogs']
[(['like', 'cats'], 'i'), (['i', 'cats'], 'like'), (['like', 'i'], 'cats'), (['like', 'dogs'], 'i'), (['i', 'dogs'], 'like'), (['like', 'i'], 'dogs'), (['like', 'cats'], 'we'), (['we', 'cats'], 'like'), (['like', 'we'], 'cats'), (['like', 'dogs'], 'we'), (['we', 'dogs'], 'like'), (['like', 'we'], 'dogs')]
0
0.707748860598
0.473636484218
0.470310950851
0.463544004107
0.469343433285
0.463169505358
0.467262666368
0.463991085815
0.464555020761
0.46814455781
0.464821910238
0.466449660206
0.467577114058
0.470020286322
0.461448034382
0.462875285292
0.456222123384
0.469007117796
0.45284354043


In [115]:
cbow.eval()
with torch.no_grad():
    process = torch.tensor([word_to_ix["cats"]])
    process_embed = cbow.embeddings(process)
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    for word in word_to_ix:
        other_embed = cbow.embeddings(torch.tensor([word_to_ix[word]]))
        print word, cos(process_embed, other_embed).item()

i -0.779573857784
we -0.734702944756
cats 1.0
like 0.0177476629615
dogs 0.995721817017


In [117]:
cbow.embeddings.weight

Parameter containing:
tensor([[-3.3615, -0.7423, -0.2105],
        [-3.7016, -0.9870, -0.6369],
        [ 3.2918,  2.3234, -2.1894],
        [ 2.2512, -2.3888,  0.7248],
        [ 2.8887,  2.5256, -2.1756]], requires_grad=True)

In [136]:
my_weights = torch.tensor([[-2.0,0,0],[-2,0,0],[2,2,-2],[2,-2,0],[2,2,-2]])

In [139]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()       
        # self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings = nn.Embedding.from_pretrained(my_weights)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        return F.log_softmax(self.linear(embeds.sum(dim=0)).view(1,-1), dim=1)

import random
loss_fn = nn.NLLLoss()
cbow = CBOW(vocab_size, 3)
optimizer = optim.Adam(cbow.parameters(), lr=1e-3)
total_losses = 0
step_size = 5000

for t in range(50000):
    cbow.zero_grad()
    r_c = random.choice(data)
    c_v = make_context_vector(r_c[0], word_to_ix)
    output = cbow(c_v)
    loss = loss_fn(output, torch.tensor([word_to_ix[r_c[1]]]))
    loss.backward()
    optimizer.step()
    
    if t%step_size == 0:
        print total_losses/step_size
        total_losses = 0
    total_losses += loss.item()  
        

0
0.79161856159
0.474246563113
0.471182758391
0.459992285752
0.463002798766
0.465520760489
0.468648498082
0.462784547627
0.467265071416


In [138]:
cbow.embeddings.weight

Parameter containing:
tensor([[-2.,  0.,  0.],
        [-2.,  0.,  0.],
        [ 2.,  2., -2.],
        [ 2., -2.,  0.],
        [ 2.,  2., -2.]], requires_grad=True)

In [156]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# we should tokenize the input, but we will ignore that for now
# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)

trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i+1 for i, word in enumerate(vocab)}
word_to_ix["SPACE"] = 0
trigrams.append((["SPACE","SPACE"],"When"))
trigrams.append((["SPACE", "When"], "forty"))

class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab)+1, EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        # print len(context_idxs), target, log_probs
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]
[530.6794657707214, 528.4181003570557, 526.1691706180573, 523.9320974349976, 521.7055933475494, 519.4902567863464, 517.2831115722656, 515.0839352607727, 512.891900062561, 510.70729088783264]
