In [1]:
import torch
from torch import nn
import numpy as np
import pandas as pd
import sys
import random
from gensim.models import Word2Vec
from gensim.models import KeyedVectors



In [2]:

# Sample sentences
sentences = [
    "The sun rose over the horizon, painting the sky in hues of orange and pink.",
    "Birds chirped cheerfully as they flitted from tree to tree, welcoming the new day.",
    # "In the distance, a river meandered lazily through the countryside, reflecting the morning light.",
    # "The scent of freshly brewed coffee wafted through the air, promising warmth and comfort.",
    # "People bustled about, their footsteps echoing against the pavement as they hurried to their destinations.",
    # "Amidst the hustle and bustle, a sense of serenity enveloped the city park, where joggers and dog-walkers enjoyed the tranquil surroundings.",
    # "As noon approached, the temperature rose, and the city streets became a mosaic of shadows and sunlight.",
    # "The distant sound of laughter floated through the air, a reminder of the joys of human connection.",
    # "In the evening, the city came alive with the glow of neon lights and the buzz of nightlife.",
    # "As the night grew darker, stars twinkled overhead, offering a sense of wonder and possibility."
]


# sentences = ["The cat is cute.", "The sun never sets in the west."]
# # Generate a paragraph
# paragraph = ""
# word_count = 0

# while word_count < 1000:
#     sentence = random.choice(sentences)
#     paragraph += " " + sentence
#     words = sentence.split()
#     word_count += len(words)

# # print(paragraph)
# # print("\nTotal words in paragraph:", word_count)

def remove_punctuation(sentence):
    punctuation_chars = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    cleaned_sentence = sentence.translate(str.maketrans('', '', punctuation_chars))
    return cleaned_sentence


sentences_split = []

for sentence in sentences:
    sentence = remove_punctuation(sentence)
    sentences_split.append(sentence.lower().split(' '))
sentences_split


[['the',
  'sun',
  'rose',
  'over',
  'the',
  'horizon',
  'painting',
  'the',
  'sky',
  'in',
  'hues',
  'of',
  'orange',
  'and',
  'pink'],
 ['birds',
  'chirped',
  'cheerfully',
  'as',
  'they',
  'flitted',
  'from',
  'tree',
  'to',
  'tree',
  'welcoming',
  'the',
  'new',
  'day']]

In [3]:
X = []
Y = []

In [4]:
model = Word2Vec(sentences_split, vector_size=96, window=5, min_count=1, workers=4)

# Save the model
model.save("word2vec.model")

# Load the model
loaded_model = Word2Vec.load("word2vec.model")

pairings_dict = {}
for sentence_split in sentences_split:
    # print(sentence_split)
    for i in range(1, len(sentence_split) - 1):
        # print(sentence_split[i])

        key_vector = loaded_model.wv[sentence_split[i - 1]] + loaded_model.wv[sentence_split[i + 1]]
        X.append(key_vector)
        Y.append(loaded_model.wv[sentence_split[i]])
        

In [5]:
X_tensor = torch.from_numpy(np.array(X))
Y_tensor = torch.from_numpy(np.array(Y))

In [6]:
Y_tensor.shape

torch.Size([25, 96])

In [7]:
X_tensor.shape

torch.Size([25, 96])

In [8]:
class CBOWNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        self.leaky_relu = nn.LeakyReLU()
        self.linear_layer1 = nn.Linear(96, 32)
        self.context_layer = nn.Linear(32, 96)


    def forward(self, x):
        x = self.linear_layer1(x)
        x = self.leaky_relu(x)
        x = self.context_layer(x)
        x = self.leaky_relu(x)

        return x

In [9]:
cbow = CBOWNetwork()

In [10]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(cbow.parameters(), lr=3e-2)

In [11]:
epochs = 10000
for i in range(epochs):
    loss = loss_fn(cbow(X_tensor), Y_tensor)
    print(f'EPOCHS {i + 1} | LOSS {loss}')
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

EPOCHS 1 | LOSS 0.004018350970000029
EPOCHS 2 | LOSS 0.0011478274827823043
EPOCHS 3 | LOSS 0.00021502480376511812
EPOCHS 4 | LOSS 9.008080814965069e-05
EPOCHS 5 | LOSS 4.706854451796971e-05
EPOCHS 6 | LOSS 8.40669235913083e-05
EPOCHS 7 | LOSS 3.913317414117046e-05
EPOCHS 8 | LOSS 3.897729402524419e-05
EPOCHS 9 | LOSS 3.969630051869899e-05
EPOCHS 10 | LOSS 4.038510087411851e-05
EPOCHS 11 | LOSS 4.3280626414343715e-05
EPOCHS 12 | LOSS 4.170705142314546e-05
EPOCHS 13 | LOSS 4.233366416883655e-05
EPOCHS 14 | LOSS 4.291228106012568e-05
EPOCHS 15 | LOSS 4.352281030151062e-05
EPOCHS 16 | LOSS 4.390904723550193e-05
EPOCHS 17 | LOSS 4.43172684754245e-05
EPOCHS 18 | LOSS 4.465703386813402e-05
EPOCHS 19 | LOSS 4.492651351029053e-05
EPOCHS 20 | LOSS 4.511077713686973e-05
EPOCHS 21 | LOSS 4.930102659272961e-05
EPOCHS 22 | LOSS 4.551821984932758e-05
EPOCHS 23 | LOSS 4.573987462208606e-05
EPOCHS 24 | LOSS 4.5901564590167254e-05
EPOCHS 25 | LOSS 4.6001485316082835e-05
EPOCHS 26 | LOSS 4.60564297100063

In [12]:
def test_model(pre_word, post_word):
    key_vector = loaded_model.wv[pre_word] + loaded_model.wv[post_word]
    return cbow(torch.from_numpy(key_vector))
    

In [18]:
ans_vector = test_model('the', 'rose')

In [55]:
def create_vocab(sentences):
    vocab = set()
    for sentence in sentences:
        sentence = remove_punctuation(sentence)
        words = sentence.lower().split()  # Split the sentence into words
        vocab.update(words)  # Add the words to the vocabulary set
    return list(vocab)


def get_ans_word(ans_vector):
    min_loss = sys.maxsize
    ans_word = ''

    print('Most probable words are: ')
    
    for word in vocab:
        vector = loaded_model.wv[word]
        current_loss = loss_fn(ans_vector, torch.tensor(np.array(vector)))
        # print(ans_vector.shape)
        # print(np.array(vector).shape)
        if current_loss < min_loss:
            min_loss = current_loss
            ans_word = word
            print(loss_fn(ans_vector, torch.tensor(np.array(vector))), word)

    # return ans_word

In [56]:
vocab = create_vocab(sentences)

In [58]:
get_ans_word(ans_vector)

Most probable words are: 
tensor(4.1440e-05, grad_fn=<MseLossBackward0>) new
tensor(3.9453e-05, grad_fn=<MseLossBackward0>) welcoming
tensor(3.3200e-05, grad_fn=<MseLossBackward0>) sun
tensor(3.1592e-05, grad_fn=<MseLossBackward0>) over
