<a href="https://colab.research.google.com/github/Deven10103/Custom-Embedding-Model/blob/main/Custom%20CBOW%20and%20Skip-gram%20Models%20using%20Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading and setting up environment

In [None]:
!pip install numpy torch transformers
!pip install re collections random

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset,DataLoader

from transformers import AutoTokenizer

import re
from collections import Counter,defaultdict
import random

# Setting up **Tokenizer**

In [None]:
def get_tokenizer(model_name='bert-base-uncased'):
  """Load a pretrained tokenizer through ``AutoTokenizer.from_pretrained``.

  Args:
    model_name: Checkpoint identifier handed to ``from_pretrained``. Defaults
      to ``'bert-base-uncased'``, the value this notebook previously hard-coded.

  Returns:
    The loaded tokenizer, or ``None`` if loading failed. The failure reason is
    printed so the caller can see why no tokenizer is available.
  """
  try:
    return AutoTokenizer.from_pretrained(model_name)
  except Exception as exc:
    # Catch ``Exception`` instead of using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit still propagate to the notebook.
    print(f"WordPiece tokenizer not available: {exc}")
    return None


# Data Preparation

In [None]:
def build_vocab(texts,tokenizer):
  """Build a vocabulary from the tokens that occur in ``texts``.

  Each text is encoded with ``tokenizer.encode(..., add_special_tokens=False)``
  and every distinct token id is decoded back to its string form exactly once.

  Args:
    texts: Iterable of strings to scan.
    tokenizer: Object exposing ``encode(text, add_special_tokens=False)`` and
      ``decode([token_id])``.

  Returns:
    A ``(vocab, words)`` pair: ``vocab`` maps each decoded token string to a
    dense index assigned in order of first appearance, and ``words`` is the
    inverse list, so ``words[vocab[w]] == w``.
  """
  vocab={}
  words=[]
  # Token ids already decoded. The previous check compared the integer id with
  # the string keys of ``vocab``; that never matched, so every repeated token
  # was decoded again.
  seen_token_ids=set()

  for text in texts:
    for token_id in tokenizer.encode(text,add_special_tokens=False):
      if token_id in seen_token_ids:
        continue
      seen_token_ids.add(token_id)

      word=tokenizer.decode([token_id])
      # Distinct ids may decode to the same string, so still guard on the word.
      if word not in vocab:
        vocab[word]=len(vocab)
        words.append(word)

  return vocab,words

def prepare_training_data(texts,tokenizer, vocab,window_size=2):
  """Create CBOW and skip-gram training samples from ``texts``.

  Every text is encoded, each token id is decoded to its string form and mapped
  to its index in ``vocab`` (tokens whose string is not in ``vocab`` are
  dropped). A window is then slid over the resulting index sequence.

  Args:
    texts: Iterable of strings.
    tokenizer: Object exposing ``encode(text, add_special_tokens=False)`` and
      ``decode([token_id])``.
    vocab: Mapping from token string to vocabulary index.
    window_size: Number of context tokens taken on each side of the target.

  Returns:
    ``(cbow_data, skipgram_data)`` where ``cbow_data`` holds
    ``(context_indices, target_index)`` tuples and ``skipgram_data`` holds one
    ``(target_index, context_index)`` tuple per context position. Texts with
    fewer than ``2*window_size+1`` usable tokens contribute nothing.
  """
  cbow_data=[]
  skipgram_data=[]

  # Decode lazily and cache per token id. Previously the whole tokenizer
  # vocabulary was decoded up front on every call, although only the ids that
  # occur in ``texts`` are ever looked up.
  id_to_word={}

  for text in texts:
    indexed_tokens=[]
    for token_id in tokenizer.encode(text,add_special_tokens=False):
      if token_id not in id_to_word:
        id_to_word[token_id]=tokenizer.decode([token_id])
      word=id_to_word[token_id]
      if word in vocab:
        indexed_tokens.append(vocab[word])

    # Too short to hold a full window around even one target.
    if len(indexed_tokens)<2*window_size+1:
      continue

    for i in range(window_size,len(indexed_tokens)-window_size):
      target=indexed_tokens[i]
      context=indexed_tokens[i-window_size:i]+indexed_tokens[i+1:i+window_size+1]

      cbow_data.append((context,target))

      for ctx_word in context:
        skipgram_data.append((target,ctx_word))

  return cbow_data,skipgram_data

# **CBOW** model using Python

In [None]:
class CBOWModel(nn.Module):
  """Continuous bag-of-words network.

  The embeddings of the context tokens are averaged and projected by a linear
  layer to one score per vocabulary entry.
  """

  def __init__(self,vocab_size,embedding_dim):
    super().__init__()
    # Layers are created in the order embeddings -> linear.
    self.embeddings=nn.Embedding(num_embeddings=vocab_size,embedding_dim=embedding_dim)
    self.linear=nn.Linear(in_features=embedding_dim,out_features=vocab_size)

  def forward(self,context):
    """Score every vocabulary entry as the target of each context window.

    Args:
      context: Index tensor laid out as (batch_size, context_size).

    Returns:
      Tensor of shape (batch_size, vocab_size) with unnormalised scores.
    """
    # (batch_size, context_size, embedding_dim) averaged over the context axis
    pooled=self.embeddings(context).mean(dim=1)
    # (batch_size, embedding_dim) -> (batch_size, vocab_size)
    return self.linear(pooled)

# **Skipgram** model using Python

In [None]:
class SkipGram(nn.Module):
  """Skip-gram network.

  A single token is embedded and projected by a linear layer to one score per
  vocabulary entry.
  """

  def __init__(self,vocab_size,embedding_dim):
    super().__init__()
    # Layers are created in the order embeddings -> linear.
    self.embeddings=nn.Embedding(num_embeddings=vocab_size,embedding_dim=embedding_dim)
    self.linear=nn.Linear(in_features=embedding_dim,out_features=vocab_size)

  def forward(self,target):
    """Score every vocabulary entry as a context word of each target.

    Args:
      target: Index tensor laid out as (batch_size,).

    Returns:
      Tensor of shape (batch_size, vocab_size) with unnormalised scores.
    """
    # (batch_size,) -> (batch_size, embedding_dim) -> (batch_size, vocab_size)
    return self.linear(self.embeddings(target))

# Data Preparation: Dataset and Training Loop

In [None]:
class Word2VecDataset(Dataset):
    """Dataset over pre-built word2vec training pairs.

    ``data`` is a sequence of 2-tuples: ``(context, target)`` for
    ``model_type='cbow'`` and ``(target, context)`` otherwise. In both layouts
    the first element is the model input and the second is the label.
    """

    def __init__(self, data, model_type='cbow'):
        self.model_type = model_type
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as two ``torch.long`` tensors, in stored order."""
        # The element order is the same for both model types, so one
        # conversion path covers them.
        model_input, label = self.data[idx]
        input_tensor = torch.tensor(model_input, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return input_tensor, label_tensor

def train_pytorch_model(model, train_loader, epochs=5, lr=0.01):
    """Train ``model`` with Adam and cross-entropy loss.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to per-class scores.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate passed to ``optim.Adam``.

    Returns:
        The same ``model`` object, updated in place. If ``train_loader`` yields
        no batches, a notice is printed and the model is returned untrained
        instead of failing on a division by zero.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for x, y in train_loader:
            optimizer.zero_grad()

            output = model(x)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            # Nothing to average over; further epochs would be empty as well.
            print("No training batches available; returning the model untrained.")
            break

        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/num_batches:.4f}')

    return model

# **Implementation** Function

In [None]:
def _print_embedding_sample(label, model, words, vocab_size, count=2):
  """Print the first ``count`` vocabulary words and their embedding rows from ``model``."""
  if vocab_size <= 0:
    return

  shown = min(count,vocab_size)
  with torch.no_grad():
    print(label)
    print(f"Words: {words[:shown]}")

    sample_embeddings = model.embeddings.weight[:shown]
    print(f"Shape: {sample_embeddings.shape}")
    print(f"Sample Values: {sample_embeddings}")


def run_demo(use_larger_dataset=False, seed=None):
  """Train the CBOW and skip-gram models on sample sentences and print a demo.

  Steps: build the corpus, load the tokenizer, build the vocabulary and the
  training pairs, train both models for 10 epochs, print their architecture
  and a sample of their embeddings, then print one CBOW and one skip-gram
  prediction.

  Args:
    use_larger_dataset: When true, the sample sentences are repeated 100 times
      to form the corpus.
    seed: Optional value passed to ``torch.manual_seed`` before the models are
      created. ``None`` (the default) leaves the random state untouched.

  Returns:
    Dict with the trained models (``'pytorch_cbow'``, ``'pytorch_skipgram'``),
    ``'vocab'``, ``'words'``, ``'vocab_size'``, ``'embedding_dim'`` and the
    tokenizer. The tokenizer is stored under ``'tokenizer'`` and also under the
    original misspelt key ``'tkenizer'`` so existing callers keep working.

  Raises:
    RuntimeError: If ``get_tokenizer()`` returns ``None``.
  """

  sample_texts = [
      "Lower newer wider space",
      "The cat sat on the mat",
      "The quick brown fox jumps over the lazy dog",
      "Natural language processing is a fascinating filed of study",
      "Machine Learning Models can learn word embeddings from text",
      "Word vectors capture semantic ralationships between words",
      "Deep learning has revilutionized natural language processing"
  ]

  if use_larger_dataset:
    try:
      # Load from a file
      #with open('file_name.txt','r',encoding='utf-8') as f:
      #   texts = f.readlines()

      # Load from a library
      #from datasets import load_dataset
      #dataset = load_dataset('dataset name')
      #texts = [item['text'] for item in dataset['train']]

      texts = sample_texts * 100
    except Exception as e:
      # f-string so the actual error is shown instead of the literal "{e}"
      print(f"Error loading larger dataset: {e}, Using sample dataset.")
      texts = sample_texts
  else:
    texts = sample_texts

  if seed is not None:
    torch.manual_seed(seed)

  tokenizer = get_tokenizer()
  if tokenizer is None:
    # Fail with a clear message rather than an AttributeError further down.
    raise RuntimeError("Tokenizer could not be loaded; cannot build the vocabulary.")

  vocab,words = build_vocab(texts,tokenizer)
  vocab_size = len(vocab)
  embedding_dim = 20

  cbow_data,skipgram_data = prepare_training_data(texts,tokenizer,vocab)

  print("Pytorch Models Training")

  print(" Training PyTorch CBOW Model")
  cbow_model = CBOWModel(vocab_size,embedding_dim)
  cbow_dataset = Word2VecDataset(cbow_data,model_type='cbow')
  cbow_loader = DataLoader(cbow_dataset,batch_size=16,shuffle=True)

  trained_cbow = train_pytorch_model(cbow_model,cbow_loader,epochs=10)

  print(" Training PyTorch Skip-gram model")
  skipgram_model = SkipGram(vocab_size,embedding_dim)
  skipgram_dataset = Word2VecDataset(skipgram_data,model_type='skipgram')
  skipgram_loader = DataLoader(skipgram_dataset,batch_size=16,shuffle=True)

  trained_skipgram = train_pytorch_model(skipgram_model,skipgram_loader,epochs=10)

  print("Model Architecture and Embeddings")

  print(" CBOW Model Architecture")
  print(trained_cbow)

  # This label used to read "Embeddings" although the architecture is printed.
  print(" Skip-gram Model Architecture")
  print(trained_skipgram)

  # Each sample reports the shape of its own tensor; the skip-gram section
  # previously printed the CBOW tensor's shape.
  _print_embedding_sample(" CBOW Model Embeddings",trained_cbow,words,vocab_size)
  _print_embedding_sample(" Skip-gram Model Embeddings",trained_skipgram,words,vocab_size)

  print("CBOW Prediction:")

  context_words = [word.lower() for word in ['word', 'vectors' ,'semantic',"between"]]
  try:
    context_indices = [vocab[word] for word in context_words]
  except KeyError:
    print("One or more context words not found in vocabullary.")
  else:
    context_tensor = torch.tensor([context_indices],dtype=torch.long)
    with torch.no_grad():
      output = trained_cbow(context_tensor)
    predicted_idx = torch.argmax(output,dim=1).item()
    predicted_word = words[predicted_idx]
    print(f"Context: {context_words}, Predicted Target Word: {predicted_word}")

  print("Skip-gram Prediction:")
  target_word = "capture".lower()
  if target_word not in vocab:
    print("Target word not found in vocabulary.")
  else:
    target_tensor = torch.tensor([vocab[target_word]],dtype=torch.long)
    with torch.no_grad():
      output = trained_skipgram(target_tensor)
    # topk cannot return more entries than the vocabulary holds.
    top_k = min(5,vocab_size)
    # squeeze(0) drops only the batch axis, so a single hit is still a list.
    top_k_indices = torch.topk(output,top_k,dim=1).indices.squeeze(0).tolist()
    predicted_context_words = [words[idx] for idx in top_k_indices]
    print(f"Target Word: {target_word}, Predicted COntext Words (Top {top_k}): {predicted_context_words}")

  return {
      'pytorch_cbow': trained_cbow,
      'pytorch_skipgram': trained_skipgram,
      'vocab': vocab,
      'words': words,
      'tokenizer': tokenizer,
      'tkenizer': tokenizer,  # misspelt legacy key, kept for backward compatibility
      'vocab_size': vocab_size,
      'embedding_dim': embedding_dim
  }

# **Main** function

In [None]:
if __name__ == "__main__":
  # Train on the built-in sample sentences (use_larger_dataset defaults to False).
  models = run_demo()

  # To train on the enlarged corpus instead:
  #models = run_demo(use_larger_dataset=True)

Pytorch Models Training
 Training PyTorch CBOW Model
Epoch 1/10, Loss: 3.9247
Epoch 2/10, Loss: 3.7594
Epoch 3/10, Loss: 3.6118
Epoch 4/10, Loss: 3.4685
Epoch 5/10, Loss: 3.3268
Epoch 6/10, Loss: 3.1885
Epoch 7/10, Loss: 3.0512
Epoch 8/10, Loss: 2.9181
Epoch 9/10, Loss: 2.7846
Epoch 10/10, Loss: 2.6529
 Training PyTorch Skip-gram model
Epoch 1/10, Loss: 4.1147
Epoch 2/10, Loss: 3.7085
Epoch 3/10, Loss: 3.4131
Epoch 4/10, Loss: 3.1580
Epoch 5/10, Loss: 2.9318
Epoch 6/10, Loss: 2.7332
Epoch 7/10, Loss: 2.5584
Epoch 8/10, Loss: 2.4014
Epoch 9/10, Loss: 2.2603
Epoch 10/10, Loss: 2.1392
Model Architecture and Embeddings
 CBOW Model Architecture
CBOWModel(
  (embeddings): Embedding(51, 20)
  (linear): Linear(in_features=20, out_features=51, bias=True)
)
 Skip-gram Model Embeddings
SkipGram(
  (embeddings): Embedding(51, 20)
  (linear): Linear(in_features=20, out_features=51, bias=True)
)
Words: ['lower', 'newer']
Shape: torch.Size([2, 20])
Sample Values: tensor([[ 0.9419,  0.2819, -0.2717, -

# Using **pre-built Embedding Models** to measure semantic similarity

In [None]:
# Extra packages for the pre-trained Word2Vec section.
# NOTE(review): numpy is already imported earlier in this notebook, so the pinned
# version below presumably only takes effect after a runtime restart — confirm.
!pip install scikit-learn
!pip install numpy==1.24.3
!pip install gensim

In [None]:
import gensim.downloader as api
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the "word2vec-google-news-300" vectors through gensim's downloader API.
# The result is kept in the module-level `w2v_model`, which
# show_word2vec_relationships() reads as a global.
# NOTE(review): presumably a large download on first use — confirm before re-running.
print("Loading Word2Vec Google News model...")
w2v_model = api.load("word2vec-google-news-300")

def show_word2vec_relationships():
  """Print sample embeddings, two similarities and one analogy from ``w2v_model``.

  Reads the module-level ``w2v_model``. For each probe word that the model
  contains, the first 10 components of its vector are printed; then the
  similarity of (king, queen) and (man, woman); then the first five results of
  the ``woman + king - man`` query. Returns ``None``.
  """
  probe_words = ('king','queen','man','woman','bright','dark','light')
  known_words = [candidate for candidate in probe_words if candidate in w2v_model]

  print("\n--- Word2Vec Embeddings ---")
  for known in known_words:
    leading_components = w2v_model[known][:10]
    print(f"{known}: {leading_components}..")

  print("\n--- Word2Vec Relationships ---")
  king_queen = w2v_model.similarity('king','queen')
  print("Similarity (king,queen): ", king_queen)
  man_woman = w2v_model.similarity('man','woman')
  print("Similarity (man,woman): ", man_woman)

  analogy_hits = w2v_model.most_similar(positive=['woman','king'],negative=['man'])

  print("\n Word2Vec Analogy (king-man+woman):")
  top_five = analogy_hits[:5]
  print(top_five)

# Print the embeddings, similarities and analogy using the `w2v_model` loaded above.
show_word2vec_relationships()

Loading Word2Vec Google News model...

--- Word2Vec Embeddings ---
king: [ 0.12597656  0.02978516  0.00860596  0.13964844 -0.02563477 -0.03613281
  0.11181641 -0.19824219  0.05126953  0.36328125]..
queen: [ 0.00524902 -0.14355469 -0.06933594  0.12353516  0.13183594 -0.08886719
 -0.07128906 -0.21679688 -0.19726562  0.05566406]..
man: [ 0.32617188  0.13085938  0.03466797 -0.08300781  0.08984375 -0.04125977
 -0.19824219  0.00689697  0.14355469  0.0019455 ]..
woman: [ 0.24316406 -0.07714844 -0.10302734 -0.10742188  0.11816406 -0.10742188
 -0.11425781  0.02563477  0.11181641  0.04858398]..
bright: [-0.01586914  0.12353516  0.06640625 -0.0546875   0.18164062 -0.2421875
  0.12255859 -0.28710938 -0.08203125  0.08837891]..
dark: [ 0.12109375  0.14550781  0.14550781 -0.20605469  0.04711914 -0.01867676
  0.03588867 -0.20507812  0.19824219  0.15429688]..
light: [ 0.12988281  0.17382812  0.10302734 -0.25195312  0.04003906 -0.09130859
  0.08984375 -0.15429688  0.04589844  0.08007812]..

--- Word2Vec