In [1]:
import torch
from torch import nn
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pandas as pd
import random



## Note

### This version of CBOW uses One hot encoding for the word vector embeddings. Due to this, the vocab size that this network can model is also less, due to the sparse nature of ohe embeddings. 

### Please go through the next file that uses Word2Vec for word embeddings, to see CBOW working on larger sets of sentences.

### Also the network uses CrossEntropy, since the word prediction done here is similar to a multi-class classification. The words are converted to ohe forms, which are then converted to labels => label = word_ohe_vector.index(1) => label = [0, 5, 3 0] converted to continuous labels for pytorch; so label = [0, 1, 2, 0]

In [2]:
paragraph = "This is a cat. The cat is white"


sentences_untampered = paragraph.split('.')
sentences = [sentence.lower().strip() for sentence in sentences_untampered]
sentences



['this is a cat', 'the cat is white']

In [3]:
vocab = []

def remove_punctuation(sentence):
    punctuation_chars = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    cleaned_sentence = sentence.translate(str.maketrans('', '', punctuation_chars))
    return cleaned_sentence

cleaned_sentences = []

for sentence in sentences:
    cleaned_sentence = remove_punctuation(sentence)
    cleaned_sentences.append(cleaned_sentence)
    
sentences = cleaned_sentences.copy()

for sentence in sentences:
    for word in sentence.split(" "):
        vocab.append(word.lower())

# vocab = [word.lower() for word in paragraph.split(" ")]
for word in vocab:
    if word == '' or word == ',' or word == '.':
        vocab.remove(word)

In [4]:
sentences

['this is a cat', 'the cat is white']

In [5]:
vocab = list(set(vocab))

In [6]:
len(vocab)

6

In [7]:
ohe_dict = {}

for word in vocab:
    if word not in ohe_dict.keys():
        ohe_dict[word] = [0 for word in vocab]
        insert_index = vocab.index(word)
        ohe_dict[word][insert_index] = 1



In [8]:
ohe_dict

{'a': [1, 0, 0, 0, 0, 0],
 'cat': [0, 1, 0, 0, 0, 0],
 'white': [0, 0, 1, 0, 0, 0],
 'the': [0, 0, 0, 1, 0, 0],
 'is': [0, 0, 0, 0, 1, 0],
 'this': [0, 0, 0, 0, 0, 1]}

In [9]:
# make pairings

def two_word_avg_ohe(word1, word2):
    return list((np.array(ohe_dict[word1]) + np.array(ohe_dict[word2])) / 2)

pairings = []
for sentence in sentences:
    words = [word for word in sentence.split(' ')]

    for i in range(0, len(words) - 2):
        # appending input and output
        pairings.append([two_word_avg_ohe(words[i], words[i + 2]), ohe_dict[words[i + 1]]])


In [10]:
pairings



[[[0.5, 0.0, 0.0, 0.0, 0.0, 0.5], [0, 0, 0, 0, 1, 0]],
 [[0.0, 0.5, 0.0, 0.0, 0.5, 0.0], [1, 0, 0, 0, 0, 0]],
 [[0.0, 0.0, 0.0, 0.5, 0.5, 0.0], [0, 1, 0, 0, 0, 0]],
 [[0.0, 0.5, 0.5, 0.0, 0.0, 0.0], [0, 0, 0, 0, 1, 0]]]

In [11]:
X = []
Y = []
for pair in pairings:
    X.append(pair[0])
    Y.append(pair[1])

In [12]:
X_tensor = torch.from_numpy(np.array(X))
Y_tensor = torch.from_numpy(np.array(Y))

In [13]:
layer_shape = len(vocab)
layer_shape

6

In [28]:
class CBOW(nn.Module):

    def __init__(self):
        super().__init__()
        self.container = nn.Sequential(
            nn.Linear(6, 3),
            nn.ReLU(),
            nn.Linear(3, 3),
            nn.Softmax(dim=0)
        )

    def forward(self, x):
        return self.container(x)

In [29]:
cbow = CBOW()

In [30]:
cbow(X_tensor.float())

tensor([[0.2856, 0.2731, 0.2464],
        [0.2387, 0.2233, 0.2655],
        [0.2215, 0.2286, 0.2335],
        [0.2542, 0.2750, 0.2545]], grad_fn=<SoftmaxBackward0>)

In [31]:
Y

[[0, 0, 0, 0, 1, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0]]

In [32]:
# convert output ohe vector to label
output_labels = []

for i in range(len(Y)):
    output_labels.append(Y[i].index(1))

unaltered_output_labels = output_labels
unaltered_output_labels

[4, 0, 1, 4]

In [33]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cbow.parameters(), lr=3e-4)

In [34]:
def replace_with_consecutive(a):
    mapping = {}
    current_index = 0

    for num in a:
        if num not in mapping:
            mapping[num] = current_index
            current_index += 1
    
    replaced_list = [mapping[num] for num in a]    
    return replaced_list

In [35]:
output_labels = replace_with_consecutive(output_labels)

In [36]:
output_labels

[0, 1, 2, 0]

In [37]:
for i in range(2000): # run for 100 epochs
    loss = loss_fn(cbow(X_tensor.float()), torch.tensor(output_labels))
    print(f"EPOCH => {i + 1} | LOSS => {loss}")
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

EPOCH => 1 | LOSS => 1.0995335578918457
EPOCH => 2 | LOSS => 1.0994806289672852
EPOCH => 3 | LOSS => 1.0994278192520142
EPOCH => 4 | LOSS => 1.0993748903274536
EPOCH => 5 | LOSS => 1.099321961402893
EPOCH => 6 | LOSS => 1.099268913269043
EPOCH => 7 | LOSS => 1.0992159843444824
EPOCH => 8 | LOSS => 1.0991630554199219
EPOCH => 9 | LOSS => 1.0991100072860718
EPOCH => 10 | LOSS => 1.0990570783615112
EPOCH => 11 | LOSS => 1.0990040302276611
EPOCH => 12 | LOSS => 1.098950982093811
EPOCH => 13 | LOSS => 1.098897933959961
EPOCH => 14 | LOSS => 1.0988447666168213
EPOCH => 15 | LOSS => 1.0987915992736816
EPOCH => 16 | LOSS => 1.098738431930542
EPOCH => 17 | LOSS => 1.0986852645874023
EPOCH => 18 | LOSS => 1.0986319780349731
EPOCH => 19 | LOSS => 1.098578691482544
EPOCH => 20 | LOSS => 1.0985252857208252
EPOCH => 21 | LOSS => 1.0984718799591064
EPOCH => 22 | LOSS => 1.0984184741973877
EPOCH => 23 | LOSS => 1.0983649492263794
EPOCH => 24 | LOSS => 1.098311424255371
EPOCH => 25 | LOSS => 1.09825778

In [38]:
def ohe_encode(word, vocab):
    ohe_vector = [0 for word in vocab]
    insert_index = vocab.index(word)
    ohe_vector.insert(insert_index, 1)
    return ohe_vector


def get_keys_from_value(dictionary, value):
    keys = []
    for key, val in dictionary.items():
        if val == value:
            keys.append(key)
    return keys

In [39]:
def test_cbow(word1, word2):
    test_sample = torch.from_numpy(np.array(two_word_avg_ohe(word1, word2))).float()

    output_list = list(cbow(test_sample).detach().numpy())
    max_val_index = np.argmax(output_list)
    output_list = [0 for ele in output_list]
    output_list[max_val_index] = 1
    
    resultant_ohe_one_index = unaltered_output_labels[output_labels.index(output_list.index(1))]
    result_word_ohe_vector = [0 for word in vocab]
    result_word_ohe_vector[resultant_ohe_one_index] = 1
    result_word = get_keys_from_value(ohe_dict, result_word_ohe_vector)
    return result_word

In [42]:
test_cbow("the", "is") # pass the two surrounding words around the word you wish to predict

['cat']