<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Embeddings/word2vec_ods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word2Vec from scratch using PyTorch

In [1]:
import numpy as np
import re
import string
import torch
from torch import nn

from torch.nn import functional as F
from typing import Dict

## Create a configuration class to hold the hyperparameters instead of using global variables

In [85]:
class CFG:
    """
    Plain namespace holding the shared hyperparameters (not a metaclass);
    the values are read as class attributes, e.g. ``CFG.dims``.
    """
    # Number of tokens taken into account around / within a training window
    window_size = 10
    # Dimensionality of every embedding vector
    dims = 300
    # Learning rate handed to the optimizer
    lr = 0.01
    # How many random "noise" word ids are drawn per training example
    neg_size = 20
    # Number of passes over the token sequence
    epochs = 10
    # Regex describing one word token
    pattern = r"[a-zа-яё0-9_]+"


## Data preprocessing

In [3]:
def clean(inp: str) -> str:
    """
    Normalise raw text.

    Every ASCII punctuation character is turned into a space, the text is
    lowercased and each run of whitespace is squeezed into a single space.
    Leading/trailing whitespace is not stripped.
    """
    punctuation_to_space = {ord(mark): " " for mark in string.punctuation}
    lowered = inp.translate(punctuation_to_space).lower()
    return re.sub(r'\s+', ' ', lowered)


In [4]:
def get_ids(data):
    """
    Build an ``index -> word`` dictionary from the unique words of ``data``.

    The text is normalised with ``clean`` and split on whitespace. The unique
    words are sorted before being numbered so that the same text always
    yields the same ids; numbering a bare ``set`` of strings directly can
    give a different order from one interpreter run to the next.

    Args:
        data: raw input text.

    Returns:
        dict mapping consecutive integers (starting at 0) to words.
    """
    vocab = sorted(set(clean(data).split()))
    return dict(enumerate(vocab))

#### Looks like we won't make it without proper tokenization

In [64]:
class Tokenizer:
    """
    Regex-based word tokenizer whose vocabulary is built from a single text.

    Attributes:
        word_pattern: compiled ``CFG.pattern`` used to extract words.
        vocab: sorted list of the unique words found in the source text.
        idx_word_dict: mapping ``word -> integer id`` (despite its name),
            where the id is the word's position in ``vocab``.
    """

    def __init__(self, txt):
        self.word_pattern = re.compile(CFG.pattern)
        self.vocab = self._get_vocab_list(txt)
        self.idx_word_dict = self._get_idx_n_words

    def tokenize(self, txt):
        """
        Convert ``txt`` into the list of integer ids of its words.

        Raises:
            KeyError: if ``txt`` contains a word that is not in the vocabulary.
        """
        return [self.idx_word_dict[word] for word in self._get_re_result(txt)]

    @property
    def _get_idx_n_words(self):
        """Return a fresh ``word -> id`` mapping following the order of ``vocab``."""
        return {word: idx for idx, word in enumerate(self.vocab)}

    def _get_re_result(self, txt):
        """Return the words of ``txt`` after cleaning, in order of appearance."""
        # clean() already lowercases the text, so no extra .lower() is needed.
        return self.word_pattern.findall(clean(txt))

    def _get_vocab_list(self, txt):
        """
        Return the unique words of ``txt``.

        The words are sorted so that ids are reproducible between runs
        (the iteration order of a set of strings is not stable).
        """
        return sorted(set(self._get_re_result(txt)))

## Create Word2Vec class

In [55]:
class Word2Vec(nn.Module):
    """
    Word2vec model scoring the centre word of a window against its context
    words, with uniformly sampled negative examples.

    Two embedding tables are kept: ``center_emb`` for the middle word of a
    window and ``context_emb`` for surrounding words and noise words.

    Args:
        size: vocabulary size (number of rows in each embedding table).
        dims: embedding dimensionality (int); ``CFG.dims`` when omitted.
        neg_size: number of noise word ids drawn per window (int);
            ``CFG.neg_size`` when omitted.
    """

    def __init__(self, size: int, dims=None, neg_size=None):
        super().__init__()
        self.vocab_size = size
        self.dims = CFG.dims if dims is None else dims
        self.neg_size = CFG.neg_size if neg_size is None else neg_size
        self.center_emb = nn.Embedding(size, self.dims)
        self.context_emb = nn.Embedding(size, self.dims)

    def forward(self, X):
        """
        Compute the objective for one window of token ids.

        Args:
            X: non-empty sequence (list/tuple) of integer token ids; the
                element at index ``len(X) // 2`` is treated as the centre word.

        Returns:
            A 0-dim tensor: the sum of ``logsigmoid`` scores of the context
            words plus the sum of ``logsigmoid`` of the negated scores of the
            noise words. Larger is better, so callers minimise its negative.
        """
        middle_idx = len(X) // 2
        middle_word = torch.tensor([X[middle_idx]], dtype=torch.long)
        center_embedding = self.center_emb(middle_word).flatten()

        # Context = everything left of the centre plus everything right of it.
        # (The slice on the right side must start AFTER the centre word.)
        context_ids = list(X[:middle_idx]) + list(X[middle_idx + 1:])
        contexts = torch.tensor(context_ids, dtype=torch.long)
        context_embeddings = self.context_emb(contexts)

        # Noise ids are drawn uniformly over the vocabulary; they are not
        # filtered, so a noise id may coincide with a real context word.
        noise = torch.randint(0, self.vocab_size, (self.neg_size,))
        noise_embeddings = self.context_emb(noise)

        pos_score = F.logsigmoid(context_embeddings @ center_embedding)
        neg_score = F.logsigmoid(-(noise_embeddings @ center_embedding))

        return pos_score.sum() + neg_score.sum()

## Main function to train Word2Vec

In [57]:
def train(data: str) -> Dict[str, np.ndarray]:
    """
    Train the word2vec model on ``data``.

    The text is tokenized, then every run of ``CFG.window_size`` consecutive
    tokens is fed to ``Word2Vec`` as one training example, for ``CFG.epochs``
    passes. If the text has fewer tokens than ``CFG.window_size`` no training
    step is performed and the returned vectors are the initial weights.

    Args:
        data: raw training text.

    Returns:
        dict:
         -key: a word itself
         -value: its embedding (row of the context embedding table)
    """
    tokenizer = Tokenizer(data)
    tokens = tokenizer.tokenize(data)
    model = Word2Vec(len(tokenizer.vocab))
    optimizer = torch.optim.Adam(model.parameters(), lr=CFG.lr)

    # Number of full windows that fit into the token sequence.
    n_windows = len(tokens) - CFG.window_size + 1

    for _ in range(CFG.epochs):
        for start in range(n_windows):
            window = tokens[start:start + CFG.window_size]
            optimizer.zero_grad()
            # The model returns a quantity to maximise; minimise its negative.
            loss = -model(window)
            loss.backward()
            # Without this call the gradients are computed but never applied.
            optimizer.step()

    # Extract the weights once, after training (also valid for zero epochs).
    weights = model.context_emb.weight.detach().numpy()
    # Index rows by id explicitly instead of relying on dict iteration order.
    return {word: weights[idx] for word, idx in tokenizer.idx_word_dict.items()}

Test the functionality

In [7]:
# Small sample text that is passed to the training functions below.
data = 'A quick brown fox jumps over a lazy dog! Hi, is it your string?'

In [65]:
# Train on the sample text; w2v is the word -> embedding dict returned by train().
w2v = train(data)

## Skip-gram

In [72]:
class SkipGram(nn.Module):
    """
    Pairwise scorer with two embedding tables of width ``CFG.dims``:
    ``center_emb`` for the target word and ``context_emb`` for its context.
    """

    def __init__(self, size: int):
        super().__init__()
        self.vocab_size = size
        embedding_dim = CFG.dims
        self.center_emb = nn.Embedding(size, embedding_dim)
        self.context_emb = nn.Embedding(size, embedding_dim)

    def forward(self, target, context):
        """
        Score one (target, context) pair of word ids.

        Returns the ``logsigmoid`` of the dot product between the target's
        centre embedding and the context's context embedding (a one-element
        tensor when both arguments are single integer ids).
        """
        target_vec = self.center_emb(torch.LongTensor([target])).flatten()
        context_vec = self.context_emb(torch.LongTensor([context]))
        return F.logsigmoid(context_vec @ target_vec)



In [73]:
def train_skipgram(data: str) -> Dict[str, np.ndarray]:
    """
    Train the word2vec model using the SkipGram approach.

    Every token position is used as a centre word and is paired with each
    other token at most ``CFG.window_size`` positions away (clipped to the
    bounds of the text). One optimizer step is taken per pair, for
    ``CFG.epochs`` passes.

    Args:
        data: raw training text.

    Returns:
        dict:
         -key: a word itself
         -value: its embedding (row of the context embedding table)
    """
    tokenizer = Tokenizer(data)
    tokens = tokenizer.tokenize(data)
    model = SkipGram(len(tokenizer.vocab))
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr)

    n_tokens = len(tokens)
    for _ in range(CFG.epochs):
        for center_pos, center in enumerate(tokens):
            # Clip the window to the text; the upper bound is the number of
            # tokens, so words near the end of the text are used as well.
            first = max(0, center_pos - CFG.window_size)
            last = min(n_tokens, center_pos + CFG.window_size + 1)
            for context_pos in range(first, last):
                if context_pos == center_pos:
                    # A word is not its own context.
                    continue
                optimizer.zero_grad()
                # NOTE(review): only positive pairs are scored here (no
                # negative samples) - confirm this objective is intended.
                loss = -model(center, tokens[context_pos]).sum()
                loss.backward()
                # Without this call the gradients are computed but never applied.
                optimizer.step()

    # Extract the weights once, after training (also valid for zero epochs).
    weights = model.context_emb.weight.detach().numpy()
    # Index rows by id explicitly instead of relying on dict iteration order.
    return {word: weights[idx] for word, idx in tokenizer.idx_word_dict.items()}



In [86]:
# Train the skip-gram variant on the same sample text (the return value is not stored).
train_skipgram(data)