# GloVe Implementation

GloVe = **G**lobal **Ve**ctors for word representation.

Goal:
Learn a vector for each word so that:

* Similar words ⇒ similar vectors
* Word relationships (like *king - man + woman ≈ queen*) show up as vector arithmetic

How it’s different from word2vec:

* **word2vec** learns from *local* context (sliding windows, predicting neighbors).
* **GloVe** learns from **global co-occurrence counts**:
  “How often does word *i* appear near word *j* in the whole corpus?”

In [1]:
# math is imported for log, sqrt, exp, etc.
# GloVe applies a log to the co-occurrence counts (to compress very large numbers).
import math

# Counter: per-word frequencies, needed to build the vocabulary.
# defaultdict: when a key that does not exist yet is accessed, it creates a default value for that key.
from collections import Counter, defaultdict

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


In [2]:
# Toy corpus: seven short space-separated sentences. They are tokenized below
# to build the vocabulary and the co-occurrence counts.
corpus = [
    "king queen man woman",
    "king man strong",
    "queen woman kind",
    "man woman child",
    "king queen royal family",
    "queen royal palace",
    "man king leader",
]


In [3]:
# Tokenize and build vocab

# Minimal tokenizer: lowercase the text, then split it on whitespace.
def tokenize(text):
  """Return the lowercase, whitespace-separated tokens of `text` as a list."""
  lowered = text.lower()
  return lowered.split()

# Tokenize every sentence at word level.
# Eg: corpus = ["I love deep learning", "GloVe is cool"]
# Result: [["i", "love", "deep", "learning"], ["glove", "is", "cool"]]
tokenized_corpus = list(map(tokenize, corpus))


# Build vocab
# Walk over every word of every sentence and count how often it occurs.
# result: {"i": 3, "love": 2, "deep": 1, ...}
word_counts = Counter()
word_counts.update(word for sent in tokenized_corpus for word in sent)

# The vocabulary is the set of distinct words, in sorted order.
vocab = sorted(word_counts)

# index -> word lookup (used at evaluation time to turn an id back into a word)
id2word = dict(enumerate(vocab))

# word -> index lookup (used to represent words as integers)
word2id = {word: idx for idx, word in id2word.items()}

vocab_size = len(vocab)
print("Vocab: ", vocab)
print("Vocab size: ", vocab_size)

Vocab:  ['child', 'family', 'kind', 'king', 'leader', 'man', 'palace', 'queen', 'royal', 'strong', 'woman']
Vocab size:  11


In [4]:
# Build co-occurance matrix
''' :For each position i in a sentence, look at words within a window
 (say 2) around it and count those as co-occurrences.
'''

# tokenized_corpus = [['king', 'queen', 'man', 'woman'], ['king', 'man', 'strong']]
# word2id = dict mapping each word to its integer id
# window_size = how many words to the left/right of the center word are taken as context words
def build_cooccurance(tokenized_corpus, word2id, window_size=2):
  """Accumulate distance-weighted co-occurrence counts over a tokenized corpus.

  Args:
    tokenized_corpus: iterable of sentences, each a list of word strings.
    word2id: mapping from word to integer id; every word must be present
      (a missing word raises KeyError).
    window_size: number of positions to the left and right of the center
      word that count as its context.

  Returns:
    defaultdict(float) keyed by (center_id, context_id). Each occurrence of a
    pair adds 1 / distance, where distance is the number of positions between
    the two words, so closer words contribute more.
  """
  pair_weights = defaultdict(float)

  for tokens in tokenized_corpus:
    token_ids = [word2id[tok] for tok in tokens]
    n_tokens = len(token_ids)

    for pos, center in enumerate(token_ids):
      # Clip the window so it never reaches outside the sentence.
      lo = max(0, pos - window_size)
      hi = min(n_tokens, pos + window_size + 1)

      for other_pos in range(lo, hi):
        # Compare positions, not ids: a word is never its own context, but a
        # repeated word at another position still counts.
        if other_pos != pos:
          gap = abs(pos - other_pos)
          pair_weights[(center, token_ids[other_pos])] += 1.0 / gap

  return pair_weights

# Count co-occurrences over the whole corpus with a 2-word window on each side,
# then print every (center_id, context_id) pair with its accumulated weight.
cooccurance_matrix = build_cooccurance(
  tokenized_corpus=tokenized_corpus,
  word2id=word2id,
  window_size=2,
)
print("Number of co-occurring pairs:", len(cooccurance_matrix))
print('(i, j) -> count\n')
for key in cooccurance_matrix:
  value = cooccurance_matrix[key]
  print(key, ' -> ', value)

Number of co-occurring pairs: 38
(i, j) -> count

(3, 7)  ->  2.0
(3, 5)  ->  2.5
(7, 3)  ->  2.0
(7, 5)  ->  1.0
(7, 10)  ->  1.5
(5, 3)  ->  2.5
(5, 7)  ->  1.0
(5, 10)  ->  2.0
(10, 7)  ->  1.5
(10, 5)  ->  2.0
(3, 9)  ->  0.5
(5, 9)  ->  1.0
(9, 3)  ->  0.5
(9, 5)  ->  1.0
(7, 2)  ->  0.5
(10, 2)  ->  1.0
(2, 7)  ->  0.5
(2, 10)  ->  1.0
(5, 0)  ->  0.5
(10, 0)  ->  1.0
(0, 5)  ->  0.5
(0, 10)  ->  1.0
(3, 8)  ->  0.5
(7, 8)  ->  2.0
(7, 1)  ->  0.5
(8, 3)  ->  0.5
(8, 7)  ->  2.0
(8, 1)  ->  1.0
(1, 7)  ->  0.5
(1, 8)  ->  1.0
(7, 6)  ->  0.5
(8, 6)  ->  1.0
(6, 7)  ->  0.5
(6, 8)  ->  1.0
(5, 4)  ->  0.5
(3, 4)  ->  1.0
(4, 5)  ->  0.5
(4, 3)  ->  1.0


In [5]:
# Turn co-occurance dict into a dataset

# Custom torch Dataset: one sample per co-occurring pair, stored as an
# (i, j, x_ij) triple.
class CooccuranceDataset(Dataset):
  """Expose a {(i, j): x_ij} co-occurrence dict as (i, j, x_ij) tensor triples."""

  def __init__(self, cooc_dict):
    """Flatten `cooc_dict` into a list of (i, j, x_ij) tuples kept in `self.data`."""
    triples = []
    for (i, j), xij in cooc_dict.items():
      triples.append((i, j, xij))
    self.data = triples

  def __len__(self):
    """Number of stored (i, j, x_ij) triples."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return sample `idx` as (int64 tensor i, int64 tensor j, float32 tensor x_ij)."""
    center_id, context_id, count = self.data[idx]
    center = torch.tensor(center_id, dtype=torch.long)
    context = torch.tensor(context_id, dtype=torch.long)
    cooc = torch.tensor(count, dtype=torch.float32)
    return (center, context, cooc)

# Wrap the co-occurrence dict in the dataset, then serve it in shuffled
# mini-batches of up to 16 (i, j, x_ij) samples.
dataset = CooccuranceDataset(cooc_dict=cooccurance_matrix)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=16)

In [6]:
# Define GloVe Model
class GloVe(nn.Module):
  """GloVe model: word/context embeddings plus per-word biases.

  For a pair (i, j) with co-occurrence count X_ij the model fits
  w_i . w~_j + b_i + b~_j to log(X_ij) using a weighted squared error.
  """

  def __init__(self, vocab_size, embedding_dim):
    """Create the four embedding tables.

    Args:
      vocab_size: number of distinct words (rows in every table).
      embedding_dim: length of each word / context vector.
    """
    super().__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim

    # One vector per word in each role: as the center word and as a context word.
    self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
    self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

    # One scalar bias per word in each role, stored as a (vocab_size, 1) table.
    self.word_biases = nn.Embedding(vocab_size, 1)
    self.context_biases = nn.Embedding(vocab_size, 1)

    # Xavier-uniform initialisation for the vectors; biases start at zero.
    nn.init.xavier_uniform_(self.word_embeddings.weight)
    nn.init.xavier_uniform_(self.context_embeddings.weight)
    nn.init.zeros_(self.word_biases.weight)
    nn.init.zeros_(self.context_biases.weight)

  def forward(self, i_indices, j_indices, x_ij, x_max=100.0, alpha=0.75):
    """Return the GloVe loss for one batch of (i, j, X_ij) samples.

    Args:
      i_indices: batch of word ids (center)
      j_indices: batch of word ids (context)
      x_ij: batch of co-occurrence counts
      x_max: threshold for the weighting function
      alpha: weighting exponent

    Returns:
      Scalar tensor: 0.5 * mean over the batch of f(X_ij) * (error_ij)^2,
      where f(x) = min((x / x_max)^alpha, 1). Entries whose count is not
      strictly positive get weight 0 and contribute exactly 0 to the loss.
    """
    w_i = self.word_embeddings(i_indices)            # (batch, dim)
    w_j = self.context_embeddings(j_indices)         # (batch, dim)
    b_i = self.word_biases(i_indices).squeeze(-1)    # (batch,)
    b_j = self.context_biases(j_indices).squeeze(-1) # (batch,)

    # Dot product w_i^T w_j, reduced over the embedding axis.
    dot = (w_i * w_j).sum(dim=-1)  # (batch,)

    # log(0) is -inf, and 0 * inf^2 is NaN, which would turn the whole batch
    # loss (and every gradient) into NaN. Substitute 1 for non-positive counts
    # before the log/pow and zero their weight afterwards. Positive counts are
    # passed through untouched, so their loss terms are unchanged.
    positive = x_ij > 0
    safe_x = torch.where(positive, x_ij, torch.ones_like(x_ij))
    log_x_ij = torch.log(safe_x)

    # Weighting function f(x) = (x / x_max)^alpha, clamped to at most 1 so
    # that counts above x_max do not get a weight larger than 1.
    weight = torch.clamp(torch.pow(safe_x / x_max, alpha), max=1.0)
    weight = torch.where(positive, weight, torch.zeros_like(weight))

    # Per-sample term: f(X_ij) * (w_i . w_j + b_i + b_j - log X_ij)^2
    loss_terms = weight * torch.pow(dot + b_i + b_j - log_x_ij, 2)

    # Mean over the batch instead of the paper's sum over all pairs: the two
    # differ only by a scale factor. The 0.5 is the usual L2-loss convention.
    loss = 0.5 * torch.mean(loss_terms)
    return loss

  def get_word_vectors(self):
    """Return word + context embeddings summed, shape (vocab_size, embedding_dim).

    The result is detached from the autograd graph.
    """
    # Often, people use w + w~ as the final embedding.
    return self.word_embeddings.weight.detach() + self.context_embeddings.weight.detach()

In [7]:
# Train the model
# Seed torch's global RNG before the model is built so that parameter
# initialisation and the shuffled batch order are the same on every
# Restart & Run All.
torch.manual_seed(42)

embedding_dim = 50
model = GloVe(vocab_size, embedding_dim)

optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

num_epochs = 100

for epoch in range(1, num_epochs + 1):
    total_loss = 0.0
    for i_indices, j_indices, x_ij in dataloader:
        optimizer.zero_grad()
        loss = model(i_indices, j_indices, x_ij)
        loss.backward()
        optimizer.step()
        # model() returns a batch mean; multiply by the batch size so that the
        # smaller last batch is weighted correctly in the epoch average.
        total_loss += loss.item() * len(x_ij)

    # Per-sample average loss over the whole dataset for this epoch.
    avg_loss = total_loss / len(dataset)
    # Report the first epoch and then every 10th one.
    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch}/{num_epochs} - Loss: {avg_loss:.4f}")

Epoch 1/100 - Loss: 0.0079
Epoch 10/100 - Loss: 0.0005
Epoch 20/100 - Loss: 0.0001
Epoch 30/100 - Loss: 0.0001
Epoch 40/100 - Loss: 0.0000
Epoch 50/100 - Loss: 0.0000
Epoch 60/100 - Loss: 0.0000
Epoch 70/100 - Loss: 0.0000
Epoch 80/100 - Loss: 0.0000
Epoch 90/100 - Loss: 0.0000
Epoch 100/100 - Loss: 0.0000


In [8]:
# Inspect the learned embeddings

import torch.nn.functional as F

def most_similar(query_word, model, word2id, id2word, top_k=5):
    """Print the `top_k` words whose embeddings are closest to `query_word`.

    Args:
        query_word: word to look up; a message is printed and the function
            returns early if it is not a key of `word2id`.
        model: object exposing `get_word_vectors()`, which returns a
            (vocab_size, dim) tensor.
        word2id: mapping word -> row index in the embedding matrix.
        id2word: mapping row index -> word.
        top_k: maximum number of neighbours to print. Fewer are printed when
            the vocabulary has fewer than `top_k` other words.

    Returns:
        None; results are written to stdout.
    """
    if query_word not in word2id:
        print(f"Word '{query_word}' not in vocabulary")
        return

    word_vectors = model.get_word_vectors()  # (vocab_size, dim)
    # Unit-normalise rows so that a plain dot product is the cosine similarity.
    word_vectors = F.normalize(word_vectors, dim=1)

    query_id = word2id[query_word]
    query_vec = word_vectors[query_id].unsqueeze(0)  # (1, dim)

    # Cosine similarity with all words
    similarities = torch.mm(query_vec, word_vectors.t()).squeeze(0)  # (vocab_size,)

    # Ask for one extra hit because the query word matches itself. Clamp to the
    # vocabulary size: torch.topk raises if k exceeds the number of elements.
    k = min(top_k + 1, similarities.numel())
    sim_values, sim_indices = torch.topk(similarities, k)

    print(f"Most similar to '{query_word}':")
    shown = 0
    for score, idx in zip(sim_values, sim_indices):
        # Never print more than top_k neighbours, even if ties kept the query
        # word itself out of the k best hits.
        if shown >= top_k:
            break
        w = id2word[idx.item()]
        if w == query_word:
            continue
        print(f"  {w:10s}  (cosine similarity: {score.item():.3f})")
        shown += 1


In [9]:
# Print the 5 nearest neighbours of "king" by cosine similarity.
most_similar("king", model=model, word2id=word2id, id2word=id2word, top_k=5)


Most similar to 'king':
  man         (cosine similarity: 0.792)
  woman       (cosine similarity: 0.454)
  queen       (cosine similarity: 0.315)
  palace      (cosine similarity: 0.009)
  kind        (cosine similarity: -0.159)


In [10]:
# Print the 5 nearest neighbours of "queen" by cosine similarity.
most_similar("queen", model=model, word2id=word2id, id2word=id2word, top_k=5)


Most similar to 'queen':
  man         (cosine similarity: 0.407)
  woman       (cosine similarity: 0.325)
  king        (cosine similarity: 0.315)
  royal       (cosine similarity: 0.208)
  child       (cosine similarity: -0.163)


In [21]:
# Mount Google Drive at /content/drive. `google.colab` is only importable
# inside a Colab runtime, so this cell fails elsewhere.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [30]:
# Change the working directory to the Chapter 3 folder on the mounted Drive, so
# that the relative file name used when saving the model resolves there.
%cd /content/drive/MyDrive/"Colab Notebooks"/"GenAI with Python and PyTorch"/"Chapter 3"

/content/drive/MyDrive/Colab Notebooks/GenAI with Python and PyTorch/Chapter 3


In [31]:
# Save only the model's state_dict (its parameter tensors, not the module
# object) to glove_state.pth in the current working directory.
torch.save(model.state_dict(), "glove_state.pth")