In [2]:
import torch
from torch import nn
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pandas as pd
import random

In [2]:

# Sample sentences
sentences = [
    "The sun rose over the horizon, painting the sky in hues of orange and pink.",
    "Birds chirped cheerfully as they flitted from tree to tree, welcoming the new day.",
    "In the distance, a river meandered lazily through the countryside, reflecting the morning light.",
    "The scent of freshly brewed coffee wafted through the air, promising warmth and comfort.",
    "People bustled about, their footsteps echoing against the pavement as they hurried to their destinations.",
    "Amidst the hustle and bustle, a sense of serenity enveloped the city park, where joggers and dog-walkers enjoyed the tranquil surroundings.",
    "As noon approached, the temperature rose, and the city streets became a mosaic of shadows and sunlight.",
    "The distant sound of laughter floated through the air, a reminder of the joys of human connection.",
    "In the evening, the city came alive with the glow of neon lights and the buzz of nightlife.",
    "As the night grew darker, stars twinkled overhead, offering a sense of wonder and possibility."
]

# Generate a paragraph
paragraph = ""
word_count = 0

while word_count < 1000:
    sentence = random.choice(sentences)
    paragraph += " " + sentence
    words = sentence.split()
    word_count += len(words)

# print(paragraph)
# print("\nTotal words in paragraph:", word_count)


In [3]:
paragraph = "This is a cat. The cat is white"


sentences_untampered = paragraph.split('.')
sentences = [sentence.lower().strip() for sentence in sentences_untampered]
sentences



['this is a cat', 'the cat is white']

In [4]:
vocab = []

def remove_punctuation(sentence):
    punctuation_chars = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    cleaned_sentence = sentence.translate(str.maketrans('', '', punctuation_chars))
    return cleaned_sentence

cleaned_sentences = []

for sentence in sentences:
    cleaned_sentence = remove_punctuation(sentence)
    cleaned_sentences.append(cleaned_sentence)
    
sentences = cleaned_sentences.copy()

for sentence in sentences:
    for word in sentence.split(" "):
        vocab.append(word.lower())

# vocab = [word.lower() for word in paragraph.split(" ")]
for word in vocab:
    if word == '' or word == ',' or word == '.':
        vocab.remove(word)

In [5]:
sentences

['this is a cat', 'the cat is white']

In [6]:
vocab = list(set(vocab))

In [7]:
len(vocab)

6

In [8]:
ohe_dict = {}

for word in vocab:
    if word not in ohe_dict.keys():
        ohe_dict[word] = [0 for word in vocab]
        insert_index = vocab.index(word)
        ohe_dict[word][insert_index] = 1



In [9]:
ohe_dict

{'cat': [1, 0, 0, 0, 0, 0],
 'a': [0, 1, 0, 0, 0, 0],
 'the': [0, 0, 1, 0, 0, 0],
 'white': [0, 0, 0, 1, 0, 0],
 'is': [0, 0, 0, 0, 1, 0],
 'this': [0, 0, 0, 0, 0, 1]}

In [10]:
# make pairings

def two_word_avg_ohe(word1, word2):
    return list((np.array(ohe_dict[word1]) + np.array(ohe_dict[word2])) / 2)

pairings = []
for sentence in sentences:
    words = [word for word in sentence.split(' ')]

    for i in range(0, len(words) - 2):
        # appending input and output
        pairings.append([two_word_avg_ohe(words[i], words[i + 2]), ohe_dict[words[i + 1]]])


In [11]:
pairings



[[[0.0, 0.5, 0.0, 0.0, 0.0, 0.5], [0, 0, 0, 0, 1, 0]],
 [[0.5, 0.0, 0.0, 0.0, 0.5, 0.0], [0, 1, 0, 0, 0, 0]],
 [[0.0, 0.0, 0.5, 0.0, 0.5, 0.0], [1, 0, 0, 0, 0, 0]],
 [[0.5, 0.0, 0.0, 0.5, 0.0, 0.0], [0, 0, 0, 0, 1, 0]]]

In [12]:
X = []
Y = []
for pair in pairings:
    X.append(pair[0])
    Y.append(pair[1])

In [13]:
X_tensor = torch.from_numpy(np.array(X))
Y_tensor = torch.from_numpy(np.array(Y))

In [14]:
layer_shape = len(vocab)
layer_shape

6

In [15]:
class CBOW(nn.Module):

    def __init__(self):
        super().__init__()
        self.container = nn.Sequential(
            nn.Linear(6, 3),
            nn.ReLU(),
            # nn.Linear(16, 16),
            # nn.ReLU(),
            nn.Linear(3, 3),
            nn.Softmax(dim=0)
        )

    def forward(self, x):
        return self.container(x)

In [16]:
cbow = CBOW()

In [17]:
cbow(X_tensor.float())

tensor([[0.2640, 0.2576, 0.2431],
        [0.2523, 0.2514, 0.2485],
        [0.2238, 0.2357, 0.2635],
        [0.2599, 0.2554, 0.2449]], grad_fn=<SoftmaxBackward0>)

In [18]:
Y

[[0, 0, 0, 0, 1, 0],
 [0, 1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0]]

In [19]:
# convert output ohe vector to label
output_labels = []

for i in range(len(Y)):
    output_labels.append(Y[i].index(1))

unaltered_output_labels = output_labels
unaltered_output_labels

[4, 1, 0, 4]

In [20]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cbow.parameters(), lr=3e-4)

In [21]:
def replace_with_consecutive(a):
    mapping = {}
    current_index = 0

    for num in a:
        if num not in mapping:
            mapping[num] = current_index
            current_index += 1
    
    replaced_list = [mapping[num] for num in a]    
    return replaced_list

In [22]:
output_labels = replace_with_consecutive(output_labels)

In [23]:
output_labels

[0, 1, 2, 0]

In [24]:
for i in range(2000): # run for 100 epochs
    loss = loss_fn(cbow(X_tensor.float()), torch.tensor(output_labels))
    print(f"EPOCH => {i + 1} | LOSS => {loss}")
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

EPOCH => 1 | LOSS => 1.088977575302124
EPOCH => 2 | LOSS => 1.0889521837234497
EPOCH => 3 | LOSS => 1.0889267921447754
EPOCH => 4 | LOSS => 1.0889012813568115
EPOCH => 5 | LOSS => 1.0888757705688477
EPOCH => 6 | LOSS => 1.0888502597808838
EPOCH => 7 | LOSS => 1.08882474899292
EPOCH => 8 | LOSS => 1.0887991189956665
EPOCH => 9 | LOSS => 1.088773488998413
EPOCH => 10 | LOSS => 1.0887478590011597
EPOCH => 11 | LOSS => 1.0887222290039062
EPOCH => 12 | LOSS => 1.0886964797973633
EPOCH => 13 | LOSS => 1.0886707305908203
EPOCH => 14 | LOSS => 1.088645100593567
EPOCH => 15 | LOSS => 1.0886192321777344
EPOCH => 16 | LOSS => 1.0885934829711914
EPOCH => 17 | LOSS => 1.0885674953460693
EPOCH => 18 | LOSS => 1.0885416269302368
EPOCH => 19 | LOSS => 1.0885157585144043
EPOCH => 20 | LOSS => 1.0884897708892822
EPOCH => 21 | LOSS => 1.0884637832641602
EPOCH => 22 | LOSS => 1.088437795639038
EPOCH => 23 | LOSS => 1.0884116888046265
EPOCH => 24 | LOSS => 1.0883855819702148
EPOCH => 25 | LOSS => 1.0883593

In [25]:
def ohe_encode(word, vocab):
    ohe_vector = [0 for word in vocab]
    insert_index = vocab.index(word)
    ohe_vector.insert(insert_index, 1)
    return ohe_vector


def get_keys_from_value(dictionary, value):
    keys = []
    for key, val in dictionary.items():
        if val == value:
            keys.append(key)
    return keys

In [26]:
def test_cbow(word1, word2):
    test_sample = torch.from_numpy(np.array(two_word_avg_ohe(word1, word2))).float()

    output_list = list(cbow(test_sample).detach().numpy())
    max_val_index = np.argmax(output_list)
    output_list = [0 for ele in output_list]
    output_list[max_val_index] = 1
    
    resultant_ohe_one_index = unaltered_output_labels[output_labels.index(output_list.index(1))]
    result_word_ohe_vector = [0 for word in vocab]
    result_word_ohe_vector[resultant_ohe_one_index] = 1
    result_word = get_keys_from_value(ohe_dict, result_word_ohe_vector)
    return result_word

In [27]:
test_cbow("the", "is")

['cat']

In [1]:


# Example sentences
sentences = [['I', 'love', 'natural', 'language', 'processing'],
             ['Word2Vec', 'is', 'a', 'popular', 'technique', 'in', 'NLP']]

# Train the Word2Vec model


# Find similar words
similar_words = loaded_model.wv.most_similar('natural')

print("Vector for 'natural':", vector)
print("Similar words to 'natural':", similar_words)




Vector for 'natural': [-0.00515546 -0.0066698  -0.0077762   0.008313   -0.00198329 -0.00685567
 -0.00415414  0.00514404 -0.00287037 -0.00375115  0.00162149 -0.00277665
 -0.0015838   0.0010742  -0.0029783   0.00852062  0.00391217 -0.00995996
  0.00625924 -0.00675653  0.00076952  0.00440582 -0.00510369 -0.00211238
  0.00809505 -0.00424318 -0.00763772  0.00925996 -0.00215684 -0.00471855
  0.00857066  0.00428435  0.00432514  0.00928441 -0.00845455  0.0052557
  0.00203867  0.00418854  0.00169684  0.00446506  0.00448828  0.0061048
 -0.00320261 -0.00457565 -0.00042513  0.00253303 -0.00326466  0.0060579
  0.0041561   0.00776559  0.00256812  0.00811733 -0.0013858   0.00807757
  0.00371654 -0.00804667 -0.00393553 -0.00247152  0.00489369 -0.00087115
 -0.00282985  0.0078341   0.00932326 -0.00161363 -0.00515947 -0.00470124
 -0.00484814 -0.0096023   0.00137195 -0.00422638  0.00252748  0.00561502
 -0.00406557 -0.00959801  0.00154618 -0.00670048  0.00249683 -0.00378187
  0.00707964  0.00063897  0.0035