Distantly tring to follow along on this https://arxiv.org/abs/1608.05859

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [106]:
class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(NGramLanguageModeler, self).__init__()
        self.embedding_dim = embedding_dim
        self.input_embedding = nn.Linear(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, embedding_dim)
        self.output_embedding =  nn.Linear(embedding_dim, vocab_size)
        self.input_embedding.weight.data = self.output_embedding.weight.data.transpose(0,1)

    def forward(self, inputs):
        embeds = torch.zeros(self.embedding_dim)
        for input in inputs:
            embeds += self.input_embedding.weight.data[:,input]
        drop = torch.nn.Dropout(0.5)
        out = torch.tanh(drop(self.linear(embeds)))
        out = self.output_embedding(out)

        log_probs = F.log_softmax(out, dim=0)
        return log_probs

raw_text = """I like cats
I like dogs
we like cats
we like dogs""".lower().split("\n") 
#print raw_text

data = []
vocab = []
for sentence in raw_text:
    words = sentence.split()
    for i, word in enumerate(words):
        vocab.append(word)
        if i == 0:
            context = [words[i+1], words[i+2]]
        elif i == len(words) - 1:
            context = [words[i-1], words[i-2]]
        else:
            context = [words[i-1], words[i+1]]
        data.append((context, word))
        
vocab = set(vocab)

def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

word_to_ix = {word: i for i, word in enumerate(vocab)}
vocab_size = len(vocab)
# print data
ngram = NGramLanguageModeler(vocab_size, 10)
ngram(torch.tensor([1], dtype=torch.long))

tensor([-1.5289, -1.8399, -1.4751, -1.8754, -1.4174], grad_fn=<LogSoftmaxBackward>)

In [107]:
import random
loss_fn = nn.NLLLoss()
ngram = NGramLanguageModeler(vocab_size, 6)
optimizer = torch.optim.Adam(ngram.parameters(), lr=1e-3)
total_losses = 0
step_size = 10000
for t in range(100000):
    ngram.zero_grad()
    r_c = random.choice(data)

    c_v = make_context_vector(r_c[0], word_to_ix)
    output = ngram(c_v)

    loss = loss_fn(output.view(1,-1), torch.tensor([word_to_ix[r_c[1]]]))
    loss.backward()
    optimizer.step()
    
    if t%step_size == 0:
        print total_losses/step_size
        total_losses = 0
    total_losses += loss.item()  

0
0.82182794109
0.616729064476
0.603718156415
0.578988845563
0.568767220169
0.579611914837
0.570812347806
0.560428542209
0.56582658869


In [108]:
ngram.eval()
with torch.no_grad():
    process = torch.tensor([word_to_ix["i"]])
    process_embed = ngram.input_embedding.weight.data[:,process].t()[0]
    print process_embed
    cos = nn.CosineSimilarity(dim=0, eps=1e-6)
    for word in word_to_ix:
        other_embed = ngram.input_embedding.weight.data[:,torch.tensor([word_to_ix[word]])].t()[0]
        print word, cos(process_embed, other_embed) #.item()

tensor([ 2.2014, -0.5674,  0.2388,  0.5209,  1.8421, -2.1559])
i tensor(1.)
we tensor(0.9994)
cats tensor(-0.8740)
like tensor(-0.2435)
dogs tensor(-0.8699)
