In [1]:
import torch
from torch import nn
import numpy as np
import pandas as pd
import sys
import random
from gensim.models import Word2Vec
from gensim.models import KeyedVectors



In [2]:

# Sample sentences
sentences = [
    "The sun rose over the horizon, painting the sky in hues of orange and pink.",
    "Birds chirped cheerfully as they flitted from tree to tree, welcoming the new day.",
    "In the distance, a river meandered lazily through the countryside, reflecting the morning light.",
    "The scent of freshly brewed coffee wafted through the air, promising warmth and comfort.",
    "People bustled about, their footsteps echoing against the pavement as they hurried to their destinations.",
    "Amidst the hustle and bustle, a sense of serenity enveloped the city park, where joggers and dog-walkers enjoyed the tranquil surroundings.",
    "As noon approached, the temperature rose, and the city streets became a mosaic of shadows and sunlight.",
    "The distant sound of laughter floated through the air, a reminder of the joys of human connection.",
    "In the evening, the city came alive with the glow of neon lights and the buzz of nightlife.",
    "As the night grew darker, stars twinkled overhead, offering a sense of wonder and possibility."
]

def remove_punctuation(sentence):
    punctuation_chars = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    cleaned_sentence = sentence.translate(str.maketrans('', '', punctuation_chars))
    return cleaned_sentence


sentences_split = []

for sentence in sentences:
    sentence = remove_punctuation(sentence)
    sentences_split.append(sentence.lower().split(' '))
sentences_split


[['the',
  'sun',
  'rose',
  'over',
  'the',
  'horizon',
  'painting',
  'the',
  'sky',
  'in',
  'hues',
  'of',
  'orange',
  'and',
  'pink'],
 ['birds',
  'chirped',
  'cheerfully',
  'as',
  'they',
  'flitted',
  'from',
  'tree',
  'to',
  'tree',
  'welcoming',
  'the',
  'new',
  'day'],
 ['in',
  'the',
  'distance',
  'a',
  'river',
  'meandered',
  'lazily',
  'through',
  'the',
  'countryside',
  'reflecting',
  'the',
  'morning',
  'light'],
 ['the',
  'scent',
  'of',
  'freshly',
  'brewed',
  'coffee',
  'wafted',
  'through',
  'the',
  'air',
  'promising',
  'warmth',
  'and',
  'comfort'],
 ['people',
  'bustled',
  'about',
  'their',
  'footsteps',
  'echoing',
  'against',
  'the',
  'pavement',
  'as',
  'they',
  'hurried',
  'to',
  'their',
  'destinations'],
 ['amidst',
  'the',
  'hustle',
  'and',
  'bustle',
  'a',
  'sense',
  'of',
  'serenity',
  'enveloped',
  'the',
  'city',
  'park',
  'where',
  'joggers',
  'and',
  'dogwalkers',
  'enjoy

In [3]:
X = []
Y = []

In [4]:
model = Word2Vec(sentences_split, vector_size=96, window=5, min_count=1, workers=4)

# Save the model
model.save("word2vec.model")

# Load the model
loaded_model = Word2Vec.load("word2vec.model")

pairings_dict = {}
for sentence_split in sentences_split:
    # print(sentence_split)
    for i in range(1, len(sentence_split) - 1):
        # print(sentence_split[i])

        key_vector = loaded_model.wv[sentence_split[i - 1]] + loaded_model.wv[sentence_split[i + 1]]
        X.append(key_vector)
        Y.append(loaded_model.wv[sentence_split[i]])
        

In [5]:
X_tensor = torch.from_numpy(np.array(X))
Y_tensor = torch.from_numpy(np.array(Y))

In [6]:
Y_tensor.shape

torch.Size([140, 96])

In [7]:
X_tensor.shape

torch.Size([140, 96])

In [8]:
class CBOWNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        self.leaky_relu = nn.LeakyReLU()
        self.linear_layer1 = nn.Linear(96, 32)
        self.context_layer = nn.Linear(32, 96)


    def forward(self, x):
        x = self.linear_layer1(x)
        x = self.leaky_relu(x)
        x = self.context_layer(x)
        x = self.leaky_relu(x)

        return x

In [9]:
cbow = CBOWNetwork()

In [10]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(cbow.parameters(), lr=3e-3)

In [11]:
epochs = 100000
for i in range(epochs):
    loss = loss_fn(cbow(X_tensor), Y_tensor)
    print(f'EPOCHS {i + 1} | LOSS {loss}')
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

EPOCHS 1 | LOSS 0.004589637275785208
EPOCHS 2 | LOSS 0.004086000379174948
EPOCHS 3 | LOSS 0.0036333550233393908
EPOCHS 4 | LOSS 0.003225948428735137
EPOCHS 5 | LOSS 0.0028577835764735937
EPOCHS 6 | LOSS 0.0025225472636520863
EPOCHS 7 | LOSS 0.002215950982645154
EPOCHS 8 | LOSS 0.0019353211391717196
EPOCHS 9 | LOSS 0.001677979133091867
EPOCHS 10 | LOSS 0.001441802131012082
EPOCHS 11 | LOSS 0.0012259915238246322
EPOCHS 12 | LOSS 0.001030284445732832
EPOCHS 13 | LOSS 0.0008547952747903764
EPOCHS 14 | LOSS 0.0006997546879574656
EPOCHS 15 | LOSS 0.0005651445826515555
EPOCHS 16 | LOSS 0.00045022607082501054
EPOCHS 17 | LOSS 0.0003527493972796947
EPOCHS 18 | LOSS 0.00027055287500843406
EPOCHS 19 | LOSS 0.00020293740089982748
EPOCHS 20 | LOSS 0.00014948447642382234
EPOCHS 21 | LOSS 0.0001089722863980569
EPOCHS 22 | LOSS 7.985560660017654e-05
EPOCHS 23 | LOSS 6.029922587913461e-05
EPOCHS 24 | LOSS 4.825178984901868e-05
EPOCHS 25 | LOSS 4.139380689593963e-05
EPOCHS 26 | LOSS 3.7998222978785634e-

In [12]:
def create_vocab(sentences):
    vocab = set()
    for sentence in sentences:
        sentence = remove_punctuation(sentence)
        words = sentence.lower().split()  # Split the sentence into words
        vocab.update(words)  # Add the words to the vocabulary set
    return list(vocab)


def get_ans_word(ans_vector):
    min_loss = sys.maxsize
    ans_word = ''

    print('Most probable words are: ')
    vocab = create_vocab(sentences)
    for word in vocab:
        vector = loaded_model.wv[word]
        current_loss = loss_fn(ans_vector, torch.tensor(np.array(vector)))
        # print(ans_vector.shape)
        # print(np.array(vector).shape)
        if current_loss < min_loss:
            min_loss = current_loss
            ans_word = word
            print(loss_fn(ans_vector, torch.tensor(np.array(vector))), word)

    # return ans_word

def test_model(pre_word, post_word):
    key_vector = loaded_model.wv[pre_word] + loaded_model.wv[post_word]
    ans_vector = cbow(torch.from_numpy(key_vector))
    return get_ans_word(ans_vector)
    
    

In [17]:
# list of most probable words with their distance from pred vector 
#(least value is the closest ; found at the end of the list)
test_model('stars', 'overhead') 


Most probable words are: 
tensor(5.3923e-05, grad_fn=<MseLossBackward0>) countryside
tensor(4.8481e-05, grad_fn=<MseLossBackward0>) river
tensor(4.5200e-05, grad_fn=<MseLossBackward0>) the
tensor(3.9935e-05, grad_fn=<MseLossBackward0>) tree
tensor(3.5002e-05, grad_fn=<MseLossBackward0>) offering
tensor(3.3159e-05, grad_fn=<MseLossBackward0>) day
tensor(2.1214e-05, grad_fn=<MseLossBackward0>) twinkled
