## Word Embeddings in Pytorch

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
word_to_ix = {'hello':0, 'world':1}
embeds = nn.Embedding(2,5)
lookup_tensor = torch.tensor([word_to_ix['world']], dtype=torch.long)
hello_embed = embeds(lookup_tensor)

In [4]:
hello_embed

tensor([[-0.6288, -2.7615, -0.1449, -2.0777, -1.0309]],
       grad_fn=<EmbeddingBackward0>)

In [5]:
hello_embed.shape, embeds.weight.shape

(torch.Size([1, 5]), torch.Size([2, 5]))

In [6]:
lookup_tensor

tensor([1])

## Word2Vec Embeddings

This transforming words into dense vector representations, capturing semantic relationships.

---

### How Are Vectors Learned?

The idea is simple but brilliant:

- Initialize word vectors randomly.  
- Use a shallow neural network (just 1 hidden layer).  
- Train it using **CBOW** or **Skip-Gram** on a large corpus.  
- The model learns to predict well by adjusting the word vectors.  

Once training is done, the weights of the hidden layer are the **word embeddings**.

> The model doesn't care about the actual predictions; it just wants embeddings that are useful for making them.

---

### How Does It Work?

There are two main architectures for Word2Vec:

---

#### 1. CBOW (Continuous Bag of Words)

Predicts the **target word** from **surrounding context words**.

It's like:

> Given: `"The ___ barks at night"` → Predict: `"dog"`

CBOW uses **context to predict the word**.

---

#### 2. Skip-Gram

Does the reverse: uses the **target word** to predict **context words**.

It's like:

> Given: `"dog"` → Predict: `"The", "barks", "at", "night"`

Skip-Gram is **more powerful** for capturing **rare words** or **richer semantic structure**.

## sample corpus

In [7]:
corpus = [
    "the quick brown fox jumped over the lazy dog",
    "i love natural language processing",
    "word2vec is a great tool for embeddings"
]

## Preprocessing and Vocab Building

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import random
import numpy as np


tokenized_corpus = [sentence.lower().split() for sentence in corpus]
all_words = [word for sentence in tokenized_corpus for word in sentence]

vocab = list(set(all_words))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for i, w in word2idx.items()}
vocab_size = len(vocab)

print(f'vocab size: {vocab_size}')

vocab size: 20


## Generate skip-gram pairs

In [9]:
def generate_skip_gram_pairs(tokenized_corpus, window_size=2):
    pairs = []
    for sentence in tokenized_corpus:
        for i, target_word in enumerate(sentence):

            left_sentence = sentence[max(i - window_size, 0): i]
            right_sentence = sentence[i+1: i + window_size + 1]
            context_window = left_sentence + right_sentence

            for context_word in context_window:
                pairs.append((target_word, context_word))
    return pairs

pairs = generate_skip_gram_pairs(tokenized_corpus)
print("Sample pairs:", pairs[:5])

Sample pairs: [('the', 'quick'), ('the', 'brown'), ('quick', 'the'), ('quick', 'brown'), ('quick', 'fox')]


In [10]:
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim) # target word
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim) # neighbour/context word


    def forward(self, target, context, negative_samples):
        target_embed = self.target_embeddings(target)           # (batch_size, embed_dim)
        context_embed = self.context_embeddings(context)        # (batch_size, embed_dim)
        negative_embed = self.context_embeddings(negative_samples)  # (batch_size, K, embed_dim)

        # positive score
        pos_score = torch.mul(target_embed, context_embed).sum(dim=1)
        pos_loss = torch.log(torch.sigmoid(pos_score))

        # negative score
        neg_score = torch.bmm(negative_embed, target_embed.unsqueeze(2)).squeeze()
        neg_loss = torch.log(torch.sigmoid(-neg_score)).sum(dim=1)

        return - (pos_loss + neg_loss).mean()

In [11]:
def get_negative_samples(batch_size, K):
    return torch.randint(0, vocab_size, (batch_size, K))

get_negative_samples(5, 3)

tensor([[18, 15, 13],
        [ 9, 11, 17],
        [ 1,  7,  6],
        [ 3,  7,  7],
        [ 8,  2,  2]])

In [12]:
embedding_dim = 50
model = Word2Vec(vocab_size, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# helper func to get negative samples
def get_negative_samples(batch_size, K):
    return torch.randint(0, vocab_size, (batch_size, K))

indexed_pairs = [(word2idx[t], word2idx[c]) for t, c in pairs]

epochs = 100
batch_size = 16
K = 5  # number of negative samples

for epoch in range(epochs):
    total_loss = 0
    random.shuffle(indexed_pairs)
    for i in range(0, len(indexed_pairs), batch_size):
        batch = indexed_pairs[i:i+batch_size]
        if len(batch) == 0:
            continue
        target_batch = torch.tensor([t for t, _ in batch])
        context_batch = torch.tensor([c for _, c in batch])
        negative_batch = get_negative_samples(len(batch), K)

        optimizer.zero_grad()
        loss = model(target_batch, context_batch, negative_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

Epoch 0, Loss: 106.0953
Epoch 10, Loss: 41.7375
Epoch 20, Loss: 24.6760
Epoch 30, Loss: 12.0751
Epoch 40, Loss: 7.8676
Epoch 50, Loss: 8.8930
Epoch 60, Loss: 7.4441
Epoch 70, Loss: 7.4629
Epoch 80, Loss: 7.1671
Epoch 90, Loss: 7.1041


In [13]:
def get_embedding(word):
    idx = word2idx[word]
    return model.target_embeddings.weight[idx].detach().numpy()

print("Embedding for 'language':", get_embedding('language')[:5])

Embedding for 'language': [-0.11118357 -0.41420472  0.06129085 -1.1068505   1.3873903 ]


In [14]:
# check the similarity between the 2 words
wordvec = get_embedding('word2vec')
quick = get_embedding('quick')

np.dot(wordvec, quick) / (np.linalg.norm(wordvec) * np.linalg.norm(quick))

np.float32(0.20346515)

In [3]:
!pip install numpy==1.23.5 -qq
!pip install gensim -qq

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
chex 0.1.89 requires numpy>=1.24.1, but you have numpy 1.23.5 which is incompatible.
pymc 5.21.2 requires numpy>=1.25.0, but you have numpy 1.23.5 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
scikit-image 0.25.2 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
imbalanced-learn 0.13.0 requires numpy<3,>=1.24.3, but you have numpy 1.23.5 which is incompatible.
albumentations 2.0.5 requires numpy>=1.24.4, but y

In [1]:
from gensim.models import Word2Vec
import numpy as np

corpus = [
    "the quick brown fox jumped over the lazy dog",
    "i love natural language processing",
    "word2vec is a great tool for embeddings"
]

tokenized = [sentence.lower().split() for sentence in corpus]

model = Word2Vec(
    sentences=tokenized,
    vector_size=50,
    window=2,
    min_count=1,
    sg=1, # 1 = Skip-Gram; 0 = CBOW
    negative=5,
    epochs=16
)

word_vectors = model.wv

In [4]:
word_vectors.similarity('word2vec', 'quick')

0.03305311

In [6]:
word_vectors.most_similar(positive=['natural', 'processing'], negative=['language'], topn=1)

[('jumped', 0.22528912127017975)]

In [7]:
print("Top 5 words similar to 'language':")
for word, score in word_vectors.most_similar('language', topn=5):
    print(f"{word} ({score:.2f})")

Top 5 words similar to 'language':
brown (0.19)
word2vec (0.19)
natural (0.17)
lazy (0.12)
jumped (0.10)


In [8]:
# Note: this was trained on an extremly small dataset, but the same principle applies to a larger dataset