<a href="https://colab.research.google.com/github/Alijgh003/my_word2vector/blob/main/my_word_to_vector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

In [None]:
# Load the dataset registered under the name "imdb" through the `datasets` library.
imdb_data = load_dataset("imdb")
# Bare expression so the notebook shows the loaded object as the cell output.
imdb_data

In [None]:
# Keep only the first 5000 entries of the 'unsupervised' split as training data.
# Later cells read the raw review strings from train_dataset['text'].
train_dataset = imdb_data['unsupervised'][:5000]

In [None]:
!pip install nltk

In [None]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources: "punkt_tab" (presumably the tokenizer data that
# word_tokenize needs) and the "stopwords" corpus imported above.
# NOTE(review): stopwords is imported but not used in the visible cells.
nltk.download("punkt_tab")
nltk.download("stopwords")

def tokenize_and_normalize(text):
    """Lower-case *text*, drop markup and punctuation, and split it into tokens.

    Args:
        text: Raw input string (for example a single review).

    Returns:
        The list of lower-cased tokens produced by ``word_tokenize``.
    """
    text = text.lower()
    # Turn HTML line breaks into whitespace first. Otherwise stripping the
    # punctuation from "<br />" leaves a stray "br" glued to the previous
    # word, e.g. "great.<br />the" -> "greatbr the".
    text = re.sub(r"<br\s*/?>", " ", text)
    # Remove every character that is neither a word character nor whitespace.
    text = re.sub(r"[^\w\s]", "", text)
    return word_tokenize(text)

# Quick sanity check: run the tokenizer on one sentence and print the tokens.
sample_text = "Hello! This is an example sentence. Let's see how it gets tokenized."
tokens = tokenize_and_normalize(sample_text)
print(tokens)


In [None]:
%%time
# Tokenize every review; `corpus` becomes a list of token lists, one per review.
corpus = [tokenize_and_normalize(row) for row in train_dataset['text']]

In [None]:
def corpus_to_distinct_words(corpus):
    """Return the sorted vocabulary of *corpus* together with its size.

    Args:
        corpus: Iterable of token sequences (one sequence per document).

    Returns:
        A ``(words, count)`` tuple where ``words`` is the sorted list of
        distinct tokens and ``count`` is ``len(words)``.
    """
    vocabulary = set()
    for document in corpus:
        # Adding a whole document at once de-duplicates its tokens.
        vocabulary.update(document)
    ordered = sorted(vocabulary)
    return ordered, len(ordered)

# Build the vocabulary from the tokenized corpus, then put the two padding
# markers in front so they receive ids 0 and 1.
distinct_words, num_of_distinct_words = corpus_to_distinct_words(corpus)
distinct_words = ['<START>', '<END>', *distinct_words]
num_of_distinct_words = num_of_distinct_words + 2
# Map each word to its position in distinct_words.
word_to_index = dict(zip(distinct_words, range(len(distinct_words))))

print(f'distinct words: {distinct_words[10000:10011]}')
print(f'num of distinct words: {num_of_distinct_words}')

In [None]:
import torch

In [None]:
# Number of context words taken on each side of a center word.
WINDOW_SIZE = 3

In [None]:
%%time
# Build the (center, context-window) training pairs as word ids.
# Every center word gets exactly 2 * WINDOW_SIZE context ids: positions that
# fall before the start of a text are filled with <START>, positions past the
# end with <END>.
centers, outsides = [], []
start_idx = word_to_index['<START>']
end_idx = word_to_index['<END>']
for text in corpus:
    # Convert the text to ids once instead of once per window.
    ids = [word_to_index[word] for word in text]
    n = len(ids)
    for i, center in enumerate(ids):
        down = max(0, i - WINDOW_SIZE)
        up = min(n, i + WINDOW_SIZE + 1)
        # (i - down) real words precede the center; pad the rest on the left.
        before_center = [start_idx] * (WINDOW_SIZE - (i - down)) + ids[down:i]
        # (up - 1 - i) real words follow the center; pad the rest on the right.
        after_center = ids[i + 1:up] + [end_idx] * (WINDOW_SIZE - (up - 1 - i))
        centers.append(center)
        outsides.append(before_center + after_center)

print(len(outsides))
# centers becomes a column of shape (N, 1); outsides holds one row of context ids per center.
centers = torch.tensor(centers).view(-1,1)
outsides = torch.tensor(outsides)

In [None]:
# Display both tensor shapes: centers was reshaped to (N, 1) above; outsides
# has one row of context ids per center word.
centers.shape, outsides.shape

In [None]:
def get_negative_samples(total_samples, positive_indices , num_of_negative_samples):
    """Sample negative ids uniformly per row, excluding that row's positive ids.

    Args:
        total_samples: Size of the id range to draw from (ids
            ``0 .. total_samples - 1``).
        positive_indices: Integer tensor of shape ``(batch, k)``; the ids in
            row ``r`` are never drawn for row ``r``.
        num_of_negative_samples: Number of ids to draw for every row.

    Returns:
        Integer tensor of shape ``(batch, num_of_negative_samples)`` located on
        the same device as ``positive_indices``. Sampling is without
        replacement, so the ids within a row are distinct.
    """
    batch_size = positive_indices.shape[0]
    # Allocate on the caller's device so the masking below never mixes CPU and
    # GPU tensors and the result needs no extra host/device copy.
    weights = torch.ones((batch_size, total_samples), device=positive_indices.device)
    # Zero the weight of every positive id in its own row.
    weights.scatter_(1, positive_indices.long(), 0.0)
    weights = weights / weights.sum(dim=-1, keepdim=True)
    return torch.multinomial(weights, num_of_negative_samples)

In [None]:
%%time
# Time the sampler on the first 100 context rows, drawing 600 negatives per row.
get_negative_samples(num_of_distinct_words, outsides[:100], 600).shape

In [None]:
# Show the shapes of the training tensors again before building the model.
centers.shape, outsides.shape

In [None]:
# Dimensionality of each word embedding; passed as emb_dim when the model is built.
EMBD_SIZE =100

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.functional as F

class My_Word2Vec(nn.Module):
    """Embedding model that scores (center, context) id pairs by dot product.

    Two separate embedding tables are kept: ``U`` is indexed with center ids
    and ``V`` with context ids.
    """

    def __init__(self, num_of_distinct_words, emb_dim):
        """Create both tables with ``num_of_distinct_words`` rows of size ``emb_dim``."""
        super().__init__()
        vocab_size = num_of_distinct_words
        self.U = nn.Embedding(vocab_size, emb_dim)
        self.V = nn.Embedding(vocab_size, emb_dim)

    def forward(self, centers, outsides):
        """Return the dot product of each center vector with its context vectors.

        The two lookups are multiplied element-wise (with broadcasting) and
        summed over the last, embedding, dimension.
        """
        center_vecs = self.U.weight[centers]
        context_vecs = self.V.weight[outsides]
        products = center_vecs.mul(context_vecs)
        return products.sum(dim=-1)

    def loss_f(self, predictions, labels):
        """Return the binary cross-entropy (with logits) of predictions vs. labels."""
        return nn.BCEWithLogitsLoss()(predictions, labels)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

def train(centers, outsides, max_epoch, num_of_distinct_words,model,optimizer,batch_size = 100, neg_ratio = 100):
    """Train *model* on (center, context) pairs with negative sampling.

    For every batch the true context ids are labelled 1 and freshly sampled
    negative ids are labelled 0; the model's ``loss_f`` is minimised over both.

    Args:
        centers: Tensor of center word ids, one row per training example.
        outsides: Tensor of context word ids, one row per training example.
        max_epoch: Number of passes over the data.
        num_of_distinct_words: Vocabulary size used as the negative-sampling range.
        model: Module exposing ``forward(centers, context)`` and
            ``loss_f(predictions, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        batch_size: Number of examples per batch.
        neg_ratio: Negatives drawn per positive context id (default 100).
    """
    model.train()
    # Follow the model's own device instead of relying on a notebook global.
    device = next(model.parameters()).device
    dataloader = DataLoader(TensorDataset(centers, outsides), batch_size=batch_size, shuffle=True)

    for epoch in range(max_epoch):
        last_loss = None
        progress = tqdm(dataloader, desc=f"Epoch {epoch+1}/{max_epoch}", ncols=100)
        for step, (batch_centers, batch_outsides) in enumerate(progress):
            optimizer.zero_grad()

            batch_centers = batch_centers.to(device)
            batch_outsides = batch_outsides.to(device)

            num_negatives = batch_outsides.shape[-1] * neg_ratio
            neg_samples = get_negative_samples(num_of_distinct_words, batch_outsides, num_negatives).to(device)
            # Positives first, negatives after; the labels below use the same order.
            context = torch.cat((batch_outsides, neg_samples), dim=-1)

            predictions = model(batch_centers, context)

            labels = torch.cat(
                (torch.ones_like(batch_outsides), torch.zeros_like(neg_samples)),
                dim=-1,
            ).to(torch.float32)

            loss = model.loss_f(predictions, labels)
            loss.backward()
            optimizer.step()

            last_loss = loss.item()
            if step % 10 == 0:
                print(f'i={step}, loss={last_loss:.4f}')
        # An empty loader produces no loss; skip the summary instead of failing.
        if last_loss is not None:
            print(f"Epoch [{epoch+1}/{max_epoch}], Loss: {last_loss:.4f}")



In [None]:
# Build the model with one embedding row per vocabulary entry and
# EMBD_SIZE-dimensional vectors.
model = My_Word2Vec(num_of_distinct_words, EMBD_SIZE)
# Plain SGD over all model parameters with learning rate 0.5.
optimizer = optim.SGD(model.parameters(), lr=0.5)
# Move the parameters to the selected device before training starts.
model.to(device)

# Train for 100 epochs; the last positional argument (1000) is the batch size.
train(centers, outsides, 100, num_of_distinct_words, model, optimizer,1000)