In [54]:
import numpy as np
from collections import defaultdict
import random

In [14]:
raw_text = """I like cats
I like dogs
we like cats
we like dogs
he likes cats
he likes dogs
she likes cats
she likes dogs""".lower().split("\n") 
print raw_text

data = []
vocab = []
for sentence in raw_text:
    words = sentence.split()
    for i, word in enumerate(words):
        vocab.append(word)
        if i == 0:
            context = [words[i+1], words[i+2]]
        elif i == len(words) - 1:
            context = [words[i-1], words[i-2]]
        else:
            context = [words[i-1], words[i+1]]
        data.append((context, word))
        
vocab = set(vocab)

w2i = {word: i for i, word in enumerate(vocab)}

print data
nwords = len(w2i)
nbits = len(np.binary_repr(nwords-1))
print nbits, nwords, type(np.binary_repr(nwords-1)),np.binary_repr(nwords-1)

['i like cats', 'i like dogs', 'we like cats', 'we like dogs', 'he likes cats', 'he likes dogs', 'she likes cats', 'she likes dogs']
[(['like', 'cats'], 'i'), (['i', 'cats'], 'like'), (['like', 'i'], 'cats'), (['like', 'dogs'], 'i'), (['i', 'dogs'], 'like'), (['like', 'i'], 'dogs'), (['like', 'cats'], 'we'), (['we', 'cats'], 'like'), (['like', 'we'], 'cats'), (['like', 'dogs'], 'we'), (['we', 'dogs'], 'like'), (['like', 'we'], 'dogs'), (['likes', 'cats'], 'he'), (['he', 'cats'], 'likes'), (['likes', 'he'], 'cats'), (['likes', 'dogs'], 'he'), (['he', 'dogs'], 'likes'), (['likes', 'he'], 'dogs'), (['likes', 'cats'], 'she'), (['she', 'cats'], 'likes'), (['likes', 'she'], 'cats'), (['likes', 'dogs'], 'she'), (['she', 'dogs'], 'likes'), (['likes', 'she'], 'dogs')]
3 8 <type 'str'> 111


In [19]:
pos_words = data[1][0]
word_repr = [[float(y) for y in np.binary_repr(w2i[x]).zfill(nbits)] for x in pos_words]
word_repr

[[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]]


In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [40]:
class BinaryEmbed(nn.Module):
    def __init__(self, vocab_size, embedding_dim, nbits):
        super(BinaryEmbed, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(2*embedding_dim, nbits) #2 because our window size is 1
    def forward(self, inputs):
        embed = torch.cat((self.embedding(inputs[0]), self.embedding(inputs[1])))
        return torch.sigmoid(self.linear(embed))
    
b = BinaryEmbed(8, 2, 3)
b(torch.tensor([3,2]))

tensor([0.6789, 0.7762, 0.2728], grad_fn=<SigmoidBackward>)

In [63]:
model = BinaryEmbed(nwords, 3, nbits)
criterion = torch.nn.BCELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
for t in range(100000):
    random.shuffle(data)
    words = [torch.tensor(w2i[x]) for x in data[0][0]]
    y_pred = model(words)
    loss = criterion(y_pred, torch.tensor([float(y) for y in np.binary_repr(w2i[data[0][1]]).zfill(nbits)]))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (t+1)%5000 == 0:
        print t, loss.item()

4999 1.36492860317
9999 1.16079640388
14999 0.926463186741
19999 0.746452689171
24999 0.554771780968
29999 0.206861525774
34999 0.614300370216
39999 0.454041272402
44999 0.731417953968
49999 0.777692198753
54999 0.5702688694
59999 0.0409789718688
64999 0.6044896245
69999 0.0312567800283
74999 0.829018652439
79999 0.0228183548898
84999 0.783381581306
89999 0.866666316986
94999 0.0153988786042
99999 0.748439192772


In [64]:
print data[0]
words = [torch.tensor(w2i[x]) for x in data[0][0]]
model(words)

(['likes', 'cats'], 'she')


tensor([0.5094, 0.9760, 0.9944], grad_fn=<SigmoidBackward>)

In [62]:
w2i

{'cats': 4,
 'dogs': 6,
 'he': 7,
 'i': 2,
 'like': 1,
 'likes': 5,
 'she': 3,
 'we': 0}