Using CBOW model

In [80]:
import torch
import torch.nn as nn
import torch.optim as optim

In [155]:

#preproccess
text_data = ["""I'm so happy 'cause today I found my friends
They're in my head
I'm so ugly, that's okay, 'cause so are you
Broke our mirrors
Sunday morning is everyday, for all I care
And I'm not scared
Light my candles in a daze
'Cause I've found God"""]
words = []
for sentence in text_data:
    for word in sentence.split():
        words.append(word.lower())
vocab = set(words)
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(vocab)


Used Documentation from PyTorch to help implement

In [156]:
embedding_dim = 10
context_size = 2 #2 before 2 after
num_epochs = 1000
lr = 0.001


In [149]:
data = []
print(len(words))
for i in range(context_size, len(words) - context_size):
    #two before and two after
    context = [words[i - context_size ], words[i - (context_size-1)],
               words[i + (context_size-1)], words[i + context_size]]
    target = words[i] #word in between
    data.append((context, target))
print(data)

47
[(["i'm", 'so', "'cause", 'today'], 'happy'), (['so', 'happy', 'today', 'i'], "'cause"), (['happy', "'cause", 'i', 'found'], 'today'), (["'cause", 'today', 'found', 'my'], 'i'), (['today', 'i', 'my', 'friends'], 'found'), (['i', 'found', 'friends', "they're"], 'my'), (['found', 'my', "they're", 'in'], 'friends'), (['my', 'friends', 'in', 'my'], "they're"), (['friends', "they're", 'my', 'head'], 'in'), (["they're", 'in', 'head', "i'm"], 'my'), (['in', 'my', "i'm", 'so'], 'head'), (['my', 'head', 'so', 'ugly,'], "i'm"), (['head', "i'm", 'ugly,', "that's"], 'so'), (["i'm", 'so', "that's", 'okay,'], 'ugly,'), (['so', 'ugly,', 'okay,', "'cause"], "that's"), (['ugly,', "that's", "'cause", 'so'], 'okay,'), (["that's", 'okay,', 'so', 'are'], "'cause"), (['okay,', "'cause", 'are', 'you'], 'so'), (["'cause", 'so', 'you', 'broke'], 'are'), (['so', 'are', 'broke', 'our'], 'you'), (['are', 'you', 'our', 'mirrors'], 'broke'), (['you', 'broke', 'mirrors', 'sunday'], 'our'), (['broke', 'our', 'sund

In [150]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_size, vocab_size)
        

    def forward(self, inputs):
        embedding = self.embeddings(inputs).mean(dim=0).view(1,-1)
        out = self.linear(embedding)
        pred = nn.functional.log_softmax(out, dim = 1)
        return pred


In [151]:
model = CBOW(vocab_size, embedding_dim)
crit = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [152]:
for epoch in range(1001):
    total_loss = 0
    for context, target in data:
        context_indexes = []
        for word in context:
            context_indexes.append(word_to_idx[word])
        context_indexes = torch.tensor((context_indexes),dtype= torch.long)
        model.zero_grad()
        pred = model(context_indexes)
        loss = crit(pred, torch.tensor([word_to_idx[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss}')


Epoch 0, Loss: 158.8257236480713
Epoch 10, Loss: 144.97513031959534
Epoch 20, Loss: 132.63411116600037
Epoch 30, Loss: 120.43028914928436
Epoch 40, Loss: 108.18865919113159
Epoch 50, Loss: 96.18789672851562
Epoch 60, Loss: 84.7951335310936
Epoch 70, Loss: 74.27168518304825
Epoch 80, Loss: 64.72489219903946
Epoch 90, Loss: 56.153749614953995
Epoch 100, Loss: 48.50559261441231
Epoch 110, Loss: 41.713062822818756
Epoch 120, Loss: 35.71287000179291
Epoch 130, Loss: 30.448493123054504
Epoch 140, Loss: 25.86537890136242
Epoch 150, Loss: 21.906123086810112
Epoch 160, Loss: 18.509541675448418
Epoch 170, Loss: 15.612790241837502
Epoch 180, Loss: 13.154089234769344
Epoch 190, Loss: 11.074930742383003
Epoch 200, Loss: 9.321489058434963
Epoch 210, Loss: 7.845366485416889
Epoch 220, Loss: 6.603895734995604
Epoch 230, Loss: 5.560068279504776
Epoch 240, Loss: 4.682214092463255
Epoch 250, Loss: 3.943521626293659
Epoch 260, Loss: 3.321483777835965
Epoch 270, Loss: 2.797292783856392
Epoch 280, Loss: 2.3

In [154]:
embeddings = model.embeddings.weight.data
for i, word in enumerate(vocab):
    for j, word2 in enumerate(vocab):
        #Cosine Similarity is basically comparing vectors (dot product of both divided by product of magnitudes)
        #The closer their values, the closer to one the values should be. 
        # The dissimilarity should be 1 - cosine dissimilarity
        print(f'Cosine Similarity between {word} and {word2}: {nn.functional.cosine_similarity(embeddings[i],embeddings[j],dim = 0)}')
        print(f'Cosine Dissimilarity between {word} and {word2}: {1- nn.functional.cosine_similarity(embeddings[i],embeddings[j],dim = 0)}')

Cosine Similarity between okay, and okay,: 1.0000001192092896
Cosine Dissimilarity between okay, and okay,: -1.1920928955078125e-07
Cosine Similarity between okay, and god: 0.3150038421154022
Cosine Dissimilarity between okay, and god: 0.6849961280822754
Cosine Similarity between okay, and candles: 0.03175738453865051
Cosine Dissimilarity between okay, and candles: 0.9682426452636719
Cosine Similarity between okay, and not: 0.12313272058963776
Cosine Dissimilarity between okay, and not: 0.8768672943115234
Cosine Similarity between okay, and you: -0.22057007253170013
Cosine Dissimilarity between okay, and you: 1.2205700874328613
Cosine Similarity between okay, and found: -0.10158276557922363
Cosine Dissimilarity between okay, and found: 1.1015827655792236
Cosine Similarity between okay, and is: -0.6610755920410156
Cosine Dissimilarity between okay, and is: 1.6610755920410156
Cosine Similarity between okay, and 'cause: -0.22449742257595062
Cosine Dissimilarity between okay, and 'cause: 1