<a href="https://colab.research.google.com/github/Amar-cmd/GenAI-with-Python-and-PyTorch-Code/blob/main/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# `math` provides log, exp, sqrt, etc., which can be useful for Word2Vec
# (e.g. loss computation and the negative-sampling math).
import math

# `torch` is used to create tensors (torch.tensor, torch.randn) and to run
# computation on the GPU (.to(device)).
import torch

# `torch.nn` provides the neural-network layers (nn.Linear, nn.Embedding, ...).
# Word2Vec mainly relies on nn.Embedding, which maps word indices to vectors.
import torch.nn as nn

# An optimizer (Adam, SGD, ...) is needed to train the model.
# In Word2Vec the embeddings are updated by the optimizer after every batch.
import torch.optim as optim

# Data handling:
# Dataset    -> a custom class that reports
#  • the total number of samples (__len__)
#  • the i-th sample (__getitem__)

# DataLoader -> pulls samples from a Dataset:
#  • builds batches, shuffles, and keeps the training loop clean

# For Word2Vec these are used to feed (center word, context word, negative
# samples) to the model in batches.
from torch.utils.data import Dataset, DataLoader

# Counter takes a list of words and returns the frequency of each word.
# For Word2Vec this is useful for:
# • building the vocabulary
# • filtering out rare words
# • setting the negative-sampling probabilities (frequent words are drawn
#   more often, rare words less often)
from collections import Counter

# The Hugging Face `datasets` library:
# load_dataset is used to load a real-world text dataset.
from datasets import load_dataset

# Set Up Device-Agnostic Code

In [None]:
# Pick the compute device for the rest of the notebook: use the GPU when
# torch.cuda.is_available() reports one, otherwise fall back to the CPU so
# that all work still runs there.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print("Using device: ", device)

In [None]:
!nvidia-smi

# Load Text Data

In [None]:
# Load the text corpus through the Hugging Face `datasets` library.
#   'wikitext'           -> name of the dataset
#   'wikitext-2-raw-v1'  -> configuration; according to the author's notes this
#                           is the raw variant that stays close to the original
#                           text (punctuation, capitalisation, ...)
#   split='train'        -> only the training split is loaded, not the test one
dataset = load_dataset(
  'wikitext',
  'wikitext-2-raw-v1',
  split='train',
)

# The raw text lives in the "text" column.
lines = dataset["text"]

In [None]:
# Prepare the Word2Vec corpus as one flat list of tokens, e.g.
# ["this", "is", "first", "sentence", ...].
tokens = []

for line in lines:
  # Drop surrounding whitespace / newline characters.
  line = line.strip()

  # Keep only non-empty lines that are not WikiText headings
  # (headings look like '===heading===', i.e. they start with '=').
  if line and not line.startswith('='):
    # Lower-case, split on whitespace and append the resulting words.
    tokens += line.lower().split()

print(f"Number of tokens = {len(tokens)}")

# To keep training reasonably fast for the demo, use only the first N tokens.
max_tokens = 4_000_000
tokens = tokens[:max_tokens]
print("Total tokens used:", len(tokens))

# Build Vocabulary

In [None]:
# Build the vocabulary and the word <-> index lookup tables used by Word2Vec.

# Frequency threshold: a word that occurs fewer than `min_freq` times is
# treated as rare and does not get its own vocabulary entry.
min_freq = 5

# Token reserved for out-of-vocabulary words; it always occupies index 0.
unk_token = "<unk>"

# Number of occurrences of every word,
# e.g. {"the": 50000, "of": 30000, "data": 1200, ...}
word_counts = Counter(tokens)

# Vocabulary = reserved unknown token + every word seen at least `min_freq`
# times, e.g. ["<unk>", "the", "of", "and", ...].
# The reserved token is explicitly excluded from the counted words: if the
# corpus itself contained "<unk>" often enough it would be listed twice, which
# would leave index 0 without an entry in `ids_to_word` and inflate `vocab_size`.
vocab = [unk_token] + [
  w for w, c in word_counts.items() if c >= min_freq and w != unk_token
]

# word -> index, because the model works on integers rather than strings:
# {"<unk>": 0, "the": 1, "of": 2, ...}. Words missing from this mapping are
# later encoded as 0 (the unknown token).
word_to_idx = {w: i for i, w in enumerate(vocab)}

# index -> word, the inverse mapping (handy for debugging and for
# nearest-neighbour lookups): {0: "<unk>", 1: "the", 2: "of", ...}
ids_to_word = {i: w for w, i in word_to_idx.items()}

# Total number of vocabulary entries: 1 (unknown token) + number of kept words.
vocab_size = len(vocab)
print(f"Vocabulary size = {vocab_size}")

In [None]:
# Word -> number conversion.
# `tokens` holds the corpus as a sequence of words, but the model needs
# integer indices. Every token is looked up in `word_to_idx`; a token that has
# no entry is encoded as 0 (the unknown-word slot).
corpus_indices = [word_to_idx.get(token, 0) for token in tokens]

# Show the first ten ids as the cell output.
corpus_indices[:10]

# Dataset: Skipgram with negative sampling

In [None]:
# yaha mai PyTorch Dataset bana raha hu...jo specifically skipgram w2v ke liye hai

class SkipGramDataset(Dataset):
  """Skip-gram training samples with negative sampling.

  Each item is a ``(center, context, negatives)`` triple:
  a center word id, one word id taken from its context window, and
  ``num_negatives`` word ids drawn at random from a frequency-based
  distribution. The negatives are drawn fresh on every access and
  independently of the pair, so they may coincide with the center or
  context word.
  """

  def __init__(self, indices, vocab_size, window_size=2, num_negatives=5):
    """
    Args:
        indices      : the whole corpus as a sequence of word ids.
        vocab_size   : number of vocabulary entries; length of the
                       negative-sampling distribution.
        window_size  : how many positions to the left and to the right of a
                       center word count as its context.
        num_negatives: number of negative word ids sampled per
                       (center, context) pair.
    """
    self.indices = indices
    self.vocab_size = vocab_size
    self.window_size = window_size
    self.num_negatives = num_negatives

    # ---- (center, context) pairs -------------------------------------
    # Every corpus position acts as a center once. Its context positions are
    # center_pos - window_size ... center_pos + window_size, clamped to the
    # corpus bounds, with the center position itself left out.
    corpus_len = len(indices)
    self.pairs = [
        (indices[center_pos], indices[context_pos])
        for center_pos in range(corpus_len)
        for context_pos in range(
            max(0, center_pos - window_size),
            min(corpus_len, center_pos + window_size + 1),
        )
        if context_pos != center_pos
    ]
    print("Number of (center, context) pairs:", len(self.pairs))

    # ---- negative-sampling distribution ------------------------------
    # Occurrences per word id; Counter yields 0 for ids that never occur.
    occurrences = Counter(indices)
    weights = torch.tensor(
        [occurrences[word_id] for word_id in range(vocab_size)],
        dtype=torch.float,
    )

    # Raise the counts to the power 0.75 (as in the word2vec paper): this
    # dampens very frequent words and gives rare words a somewhat higher
    # chance of being drawn.
    weights = weights.pow(0.75)

    # Normalise so that the weights sum to 1.
    self.neg_sampling_dist = weights / weights.sum()

  def __len__(self):
    """Number of (center, context) pairs."""
    return len(self.pairs)

  def __getitem__(self, idx):
    """Return ``(center, context, negatives)`` for pair number ``idx``.

    ``center`` and ``context`` are 0-dim long tensors; ``negatives`` is a
    long tensor of length ``num_negatives``.
    """
    center_id, context_id = self.pairs[idx]

    # Draw the negatives according to `neg_sampling_dist`. Sampling is done
    # with replacement, so the same word id may appear more than once.
    sampled = torch.multinomial(
        self.neg_sampling_dist,
        self.num_negatives,
        replacement=True,
    )

    # nn.Embedding expects integer (long) indices.
    return (
        torch.tensor(center_id, dtype=torch.long),
        torch.tensor(context_id, dtype=torch.long),
        sampled.long(),
    )

In [None]:
# Create dataset and dataloader
# Build the skip-gram dataset and the DataLoader that batches it.
# The dataset gets its own name rather than re-binding `dataset`: that name
# still refers to the Hugging Face dataset loaded earlier, and overwriting it
# would break a re-run of the cell that reads `dataset["text"]`.
skipgram_dataset = SkipGramDataset(
  corpus_indices,          # the whole corpus as an integer sequence, e.g. [12, 45, 7, 89, ...]
  vocab_size=vocab_size,   # number of vocabulary entries; sets the length of the
                           # negative-sampling distribution
  window_size=2,           # two words to the left and two to the right are context,
                           # e.g. center = brown, context = [the, quick, fox, jumps]
  num_negatives=5,         # five randomly drawn negative words per (center, context) pair
)

# Each batch holds up to 512 samples:
#   512 center words, 512 positive context words, 512 x 5 negative words.
# shuffle=True reshuffles the sample order every epoch.
dataloader = DataLoader(
  skipgram_dataset,
  batch_size=512,
  shuffle=True,
)

In [None]:
# MODEL: WORD2VEC SKIPGRAM + NEGATIVE SAMPLING
class Word2VecSkipGram(nn.Module):
  """Skip-gram Word2Vec trained with negative sampling.

  Every word has two embeddings: one used when it is the center word
  (`in_embed`) and one used when it is a context word (`out_embed`).
  `forward` returns the training loss directly instead of logits.
  """

  def __init__(self,
               vocab_size,  # number of vocabulary entries
               embed_dim):  # length of each word vector (e.g. 100, 300)
    super().__init__()

    # Embeddings used when a word acts as the center word.
    self.in_embed = nn.Embedding(vocab_size, embed_dim)
    # Embeddings used when a word acts as a context word.
    self.out_embed = nn.Embedding(vocab_size, embed_dim)

    # Center embeddings start as small random values in
    # [-0.5 / embed_dim, 0.5 / embed_dim] so that words do not all start out
    # identical; context embeddings start at zero and are learned from the
    # gradients.
    init_range = 0.5 / embed_dim
    self.in_embed.weight.data.uniform_(-init_range, init_range)
    self.out_embed.weight.data.zero_()

  def forward(self, center_words, pos_context, neg_context):
    """Compute the negative-sampling loss, averaged over the batch.

    Args:
        center_words: (batch,) center word indices.
        pos_context : (batch,) one positive context word index per center.
        neg_context : (batch, num_negatives) negative word indices per center.

    Returns:
        Scalar tensor: mean over the batch of
        ``-log sigmoid(c . p) - sum_k log sigmoid(-c . n_k)``.
    """
    center_emb = self.in_embed(center_words)  # (B,)   -> (B, D)
    pos_emb = self.out_embed(pos_context)     # (B,)   -> (B, D)
    neg_emb = self.out_embed(neg_context)     # (B, K) -> (B, K, D)

    # Positive part: dot product of center and true context, which should
    # be pushed high. logsigmoid is the numerically stable form of
    # log(sigmoid(x)) and needs no epsilon.
    pos_score = torch.sum(center_emb * pos_emb, dim=1)  # (B,)
    pos_loss = -nn.functional.logsigmoid(pos_score)     # (B,)

    # Negative part: (B, K, D) @ (B, D, 1) -> (B, K, 1).
    # Only the trailing axis is squeezed. A bare .squeeze() would also drop
    # the batch axis when B == 1 (or the negatives axis when K == 1) and the
    # sum over dim=1 below would then fail.
    neg_score = torch.bmm(neg_emb, center_emb.unsqueeze(2)).squeeze(2)  # (B, K)

    # The scores of the negatives should be pushed low:
    # log(1 - sigmoid(x)) == logsigmoid(-x). Summed over the K negatives.
    neg_loss = -nn.functional.logsigmoid(-neg_score).sum(dim=1)  # (B,)

    # Combine both parts and average over the batch.
    return (pos_loss + neg_loss).mean()

In [None]:
# Training

# Embedding dimension: every word is represented by a vector of this length
# (50 here).
embed_dim = 50 # larger values such as 200 or 300 are common


# Create the model and move it to the selected device (CPU/GPU).
# The model holds two embedding matrices:
# • in_embed = (vocab_size, embed_dim)
# • out_embed = (vocab_size, embed_dim)
model = Word2VecSkipGram(vocab_size, embed_dim).to(device)

# Adam is an adaptive optimizer: it adjusts the step size per parameter.
optimizer = optim.Adam(model.parameters(), lr=0.002)

# Number of passes over the whole dataset (a single pass here).
num_epochs = 1


# One iteration per epoch: epoch = 0 ... num_epochs - 1
for epoch in range(num_epochs):

  # Switch to training mode. This only changes the behaviour of layers such as
  # batch norm or dropout; this model has neither, so nothing changes in practice.
  model.train()

  # Running sum of the per-batch losses for this epoch.
  total_loss = 0.0


  # Move center, context and negatives to the same device as the model;
  # otherwise PyTorch raises a device-mismatch error.
  for center, context, negatives in dataloader:
    center = center.to(device)
    context = context.to(device)
    negatives = negatives.to(device)

    # Clear the gradients of the previous batch. PyTorch accumulates gradients
    # by default, so without this the new gradients would be added on top of
    # the old ones.
    optimizer.zero_grad()

    # Forward pass:
    # • embedding lookup
    # • positive score
    # • negative score
    # • pos_loss, neg_loss
    # • combine → mean
    # `loss` is a scalar tensor (the batch average).
    loss = model(center, context, negatives)

    # Backpropagate: compute the gradient of the loss w.r.t. every weight.
    loss.backward()

    # Update the weights from the gradients.
    # Adam → `param = param - lr * (processed_gradient)`
    optimizer.step()

    # Add this batch's loss to the epoch total.
    total_loss += loss.item()

  # NOTE: the value printed is the SUM of the per-batch mean losses, not an
  # average, so its magnitude grows with the number of batches.
  print(f"Epoch {epoch+1}/{num_epochs}, loss = {total_loss: .4f}")

In [None]:
# Use the learned embeddings: find similar words

# Inference only (no training), so gradient tracking is switched off.
@torch.no_grad()
def get_word_embedding(word: str) -> torch.Tensor:
  """Return the learned center (`in_embed`) vector of `word`.

  A word that has no entry in `word_to_idx` falls back to index 0.
  """
  if word in word_to_idx:
    row = word_to_idx[word]
  else:
    row = 0

  # Row `row` of the center-embedding weight matrix held by the model.
  return model.in_embed.weight[row]


# Inference only, so gradient tracking is switched off.
@torch.no_grad()
def most_similar(query_word: str, top_k: int = 5):
  """Print the `top_k` vocabulary words most similar to `query_word`.

  Similarity is the cosine similarity between center (`in_embed`) vectors.
  The query word itself is never listed. If `query_word` is not in the
  vocabulary a message is printed instead. Nothing is returned.

  Args:
      query_word: word to look up.
      top_k     : number of neighbours to print; capped at the number of
                  other words in the vocabulary.
  """
  if query_word not in word_to_idx:
    print(f"'{query_word}' is not in the vocab.")
    return

  query_idx = word_to_idx[query_word]

  # Embedding of the query word, shape (embed_dim,)
  query_vec = get_word_embedding(query_word)

  # Embeddings of all words, shape (vocab_size, embed_dim)
  all_embs = model.in_embed.weight

  # Cosine similarity v.w / (|v||w|):
  # (vocab_size, embed_dim) @ (embed_dim,) -> (vocab_size,)
  # norm(dim=1) is the L2 norm of every row; 1e-10 guards against division by zero.
  sims = torch.matmul(all_embs, query_vec) / (
      all_embs.norm(dim=1) * query_vec.norm() + 1e-10
  )

  # Exclude the query word up front instead of requesting top_k + 1 hits and
  # skipping it afterwards: this always yields exactly k neighbours, even if
  # the query would not have ranked among the top results. `sims` is a fresh
  # tensor, so the model weights are not modified.
  sims[query_idx] = float("-inf")

  # torch.topk raises if k exceeds the number of elements, so clamp it to the
  # number of remaining words.
  k = max(0, min(top_k, sims.numel() - 1))
  values, indices = torch.topk(sims, k)

  print(f"\nWords most similar to '{query_word}':")
  for score, idx in zip(values, indices):
    # Map the index back to its word.
    w = ids_to_word[idx.item()]
    print(f"{w:15s} similarity = {score.item(): .3f}")

In [None]:
most_similar("king")

In [None]:
most_similar("queen")

In [None]:
most_similar("london")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/"Colab Notebooks"/"GenAI with Python and PyTorch"/"Chapter 3"

In [None]:
# Save only the model's parameters (its state_dict). The relative file name is
# resolved against the current working directory.
torch.save(model.state_dict(), "word2vec_state.pth")

# Page 70 (from book) - Using gensim library


In [None]:
import re
import pandas as pd
import numpy as np
import nltk
from sklearn.datasets import fetch_20newsgroups

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
stop_words = nltk.corpus.stopwords.words('english')

In [None]:
def normalize_document(doc):
  """Clean one document and return it as a single space-joined string.

  Steps: remove every character that is neither an ASCII letter nor
  whitespace, lower-case, strip, tokenize with nltk and drop the tokens
  listed in the module-level `stop_words`.

  Args:
      doc: the raw document text.

  Returns:
      The remaining tokens joined by single spaces.
  """
  # The flags must be passed by keyword: the fourth positional parameter of
  # re.sub is `count`, so passing them positionally would silently cap the
  # number of substitutions instead of setting the flags.
  doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I|re.A)
  doc = doc.lower().strip()

  tokens = nltk.word_tokenize(doc)

  # Set membership is O(1) per token; `stop_words` itself is a list.
  stop_word_set = set(stop_words)
  filtered_tokens = [token for token in tokens if token not in stop_word_set]

  return ' '.join(filtered_tokens)

# Element-wise version that accepts a whole collection of documents.
normalize_corpus = np.vectorize(normalize_document)

In [None]:
# Only these two newsgroup categories are requested.
cats = ['alt.atheism', 'sci.space']
# Fetch the training subset for the selected categories. The `remove` argument
# asks scikit-learn to strip headers, footers and quoted replies from the posts.
newsgroup_train = fetch_20newsgroups(subset='train',
                                     categories=cats,
                                     remove=('headers', 'footers', 'quotes'))

In [None]:
print('Number of news articles = {}'.format(len(newsgroup_train.data)))

In [None]:
# Apply the vectorized normalizer to every fetched article.
norm_corpus = normalize_corpus(newsgroup_train.data)
# Last expression of the cell: the whole array is shown as the cell output.
norm_corpus

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q gensim

In [None]:
from gensim.models import word2vec

In [None]:
# Split every normalized document into a list of tokens; the result is a list
# of token lists, one per document.
tokenize_corpus = [nltk.word_tokenize(doc) for doc in norm_corpus]

In [None]:
# Hyper-parameters for the gensim Word2Vec model (meanings as described in the
# gensim documentation).
embedding_size = 32   # length of every word vector (`vector_size`)
context_window = 20   # maximum distance between a word and its context words (`window`)
min_word_count = 1    # words occurring fewer times than this are ignored (`min_count`)
sample = 1e-3         # down-sampling threshold for very frequent words
sg = 1                # 1 selects skip-gram; 0 would select CBOW

# Train on the tokenized newsgroup corpus. Training starts immediately because
# the corpus is passed to the constructor; 200 passes over the corpus are made.
w2v_model = word2vec.Word2Vec(tokenize_corpus,
                              vector_size = embedding_size,
                              window=context_window,
                              min_count = min_word_count,
                              sg = sg,
                              sample=sample,
                              epochs=200)

In [None]:
print(f"Unique numbers of words in the model={w2v_model.wv.vectors.shape[0]}")

In [None]:
w2v_model.wv['sun']

In [None]:
w2v_model.wv.most_similar(positive=['god'])

In [None]:
w2v_model.wv.most_similar(positive=['sun'])