In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import defaultdict
import numpy as np


text = "Hey my name is Arnav and i like NLP."


text = text.lower().split()


vocab = set(text)
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}


embedding_dim = 100
window_size = 2
learning_rate = 0.001
epochs = 1000


data = []
for i in range(len(text)):
    target_word = text[i]
    context_words = [text[j] for j in range(i - window_size, i + window_size + 1) if i != j and 0 <= j < len(text)]
    for context_word in context_words:
        data.append((word_to_idx[target_word], word_to_idx[context_word]))


class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target, context):
        target_embeds = self.embeddings(target)
        context_scores = self.linear(target_embeds)
        return context_scores


model = SkipGram(len(vocab), embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


for epoch in range(epochs):
    total_loss = 0
    np.random.shuffle(data)
    for target_word, context_word in data:
        optimizer.zero_grad()
        target_word = torch.tensor(target_word, dtype=torch.long)
        context_word = torch.tensor(context_word, dtype=torch.long)
        context_scores = model(target_word, context_word)
        loss = F.cross_entropy(context_scores, context_word)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{epochs}] Loss: {total_loss}')


word_embeddings = model.embeddings.weight.detach().numpy()


def find_similar_words(embedding_matrix, target_word, top_n=5):
    target_idx = word_to_idx.get(target_word)
    if target_idx is None:
        print(f"'{target_word}' not found in vocabulary.")
        return

    target_embedding = embedding_matrix[target_idx]
    similarities = np.dot(embedding_matrix, target_embedding)


    most_similar_indices = np.argsort(similarities)[::-1][:top_n]
    most_similar_words = [idx_to_word[idx] for idx in most_similar_indices]

    return most_similar_words


target_word = "like"
similar_words = find_similar_words(word_embeddings, target_word)
print(f"Words similar to '{target_word}': {similar_words}")




Epoch [100/1000] Loss: 40.31522452831268
Epoch [200/1000] Loss: 39.80284970998764
Epoch [300/1000] Loss: 39.412530303001404
Epoch [400/1000] Loss: 39.537276923656464
Epoch [500/1000] Loss: 38.93186718225479
Epoch [600/1000] Loss: 38.874234080314636
Epoch [700/1000] Loss: 39.381654143333435
Epoch [800/1000] Loss: 39.318923592567444
Epoch [900/1000] Loss: 39.07107764482498
Epoch [1000/1000] Loss: 39.16583997011185
Words similar to 'like': ['like', 'arnav', 'and', 'i', 'my']
