<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Embeddings/word2vec_ods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word2Vec from scratch using PyTorch

In [26]:
import numpy as np
import re
import string
import torch
from torch import nn

from torch.nn import functional as F
from typing import Dict

## Create a configuration class to hold shared settings instead of global variables

In [128]:
class CFG:
  """
  Namespace for hyperparameters and shared constants.

  A plain class (not instantiated) whose attributes are read as
  ``CFG.<name>`` instead of relying on module-level globals.
  """
  # regex matching one token: a run of lowercase Latin/Russian letters, digits or "_"
  pattern: str = r"[a-zа-яё0-9_]+"
  # constant zero tensor, used as the lower bound of the hinge loss in train()
  zero: torch.Tensor = torch.tensor(0.0)

  # size of each embedding vector
  dims: int = 300
  # how far from the center word context words are taken
  abs_window: int = 2
  # number of random negative samples drawn per center word
  neg_size: int = 20

  # learning rate passed to the optimizer
  lr: float = 0.1
  # number of full passes over the token sequence
  epochs: int = 10



## Data preprocessing

In [91]:
def data_loader(filename: str) -> str:
  """
  Read a cp1251-encoded text file and return its content as one string.

  The last seven lines of the file are discarded. Every remaining line is
  stripped of surrounding whitespace and the lines are joined with a
  single space.
  """
  with open(filename, 'rt', encoding='cp1251') as src:
    all_lines = list(src)
  # drop the trailing seven lines before joining
  body_lines = all_lines[:-7]
  return ' '.join(map(str.strip, body_lines))

In [36]:
def clean(inp: str) -> str:
    """
    Normalize raw text before tokenization.

    Steps, in order:
      1. every ASCII punctuation character is replaced by a space;
      2. the text is lower-cased;
      3. every run of characters other than a space, an ASCII digit, a Latin
         letter or a Russian letter (including "ё") is replaced by a space;
      4. runs of whitespace are collapsed into a single space.

    Leading and trailing spaces are not stripped.
    """
    # One pass: map each ASCII punctuation character to a space.
    punct_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    inp = inp.translate(punct_to_space)
    inp = inp.lower()
    # The allowed classes are spelled out explicitly. A single range running
    # from Latin "a" to Cyrillic "я" would let unrelated code points through
    # (e.g. "«", "»") and would drop "ё", which lies just past "я".
    # Upper-case ranges are unnecessary because the text is already lower-cased.
    inp = re.sub(r'[^ 0-9a-zа-яё]+', ' ', inp)
    inp = re.sub(r'\s+', ' ', inp)
    return inp


#### Looks like we won't make it without proper tokenization

In [90]:
class Tokenizer:
  """
  Word-level tokenizer whose vocabulary is built from the text given at
  construction time.

  Attributes:
    word_pattern: compiled ``CFG.pattern`` used to extract words.
    vocab: list of unique words; a word's position is its token id.
    idx_word_dict: mapping from word to token id.
  """

  def __init__(self, txt: str):
    self.word_pattern = re.compile(CFG.pattern)
    self.vocab = self._get_vocab_list(txt)
    self.idx_word_dict = self._get_idx_n_words

  def tokenize(self, txt: str) -> list:
    """
    Convert text into a list of token ids, in order of appearance.

    Raises:
      KeyError: if the text contains a word that is not in the vocabulary.
    """
    return [self.idx_word_dict[word] for word in self._get_re_result(txt)]

  @property
  def _get_idx_n_words(self) -> dict:
    """Build the word -> token id mapping from the current vocabulary."""
    return {word: idx for idx, word in enumerate(self.vocab)}

  def _get_re_result(self, txt: str) -> list:
    """Clean the text and return every word matched by the token pattern."""
    # clean() already lower-cases the text, so no extra lower() is needed
    return self.word_pattern.findall(clean(txt))

  def _get_vocab_list(self, txt: str) -> list:
    """Return the unique words of the text in sorted order."""
    # Sorting makes token ids reproducible: plain set iteration order for
    # strings changes between interpreter runs (hash randomization).
    return sorted(set(self._get_re_result(txt)))

## Skipgram

In [98]:
class SkipGram(nn.Module):
    """
    Minimal skip-gram model: a single trainable embedding table.

    ``size`` is the number of rows (one per vocabulary index); each row has
    ``CFG.dims`` components.
    """

    def __init__(self, size: int):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=size, embedding_dim=CFG.dims)

    def forward(self, target):
        """Look up and return the embedding rows for the index tensor ``target``."""
        vectors = self.embeddings(target)
        return vectors


In [108]:
def train(data: str) -> Dict[str, np.ndarray]:
  """
  Train word embeddings on ``data`` with a skip-gram style objective.

  For every token (the center word) the model is pushed to make the mean
  cosine similarity to its context words exceed the mean cosine similarity
  to randomly drawn negative words by a margin of 1 (hinge loss). One
  optimizer step is taken per center word; the mean loss is printed after
  each epoch.

  Args:
    data: raw text; it is both the vocabulary source and the training corpus.

  Returns:
    A dict mapping each vocabulary word to its embedding vector
    (a numpy array of length ``CFG.dims``).
  """

  tokenizer = Tokenizer(data)
  tokens = tokenizer.tokenize(data)
  vocab_size = len(tokenizer.vocab)
  model = SkipGram(vocab_size)
  optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr)

  for _epoch in range(CFG.epochs):
    curr_loss = []

    for idx, center_word in enumerate(tokens):
      # Context = tokens within CFG.abs_window positions on BOTH sides of the
      # center word. range() excludes its upper bound, hence the "+ 1".
      lo = max(0, idx - CFG.abs_window)
      hi = min(len(tokens), idx + CFG.abs_window + 1)
      context = [tokens[i] for i in range(lo, hi) if i != idx]
      if not context:
        # Single-token corpus: the mean over an empty tensor is NaN and one
        # backward pass with it would turn the weights into NaN.
        continue

      optimizer.zero_grad()
      context_idxs = torch.LongTensor(context)

      # negative samples are drawn uniformly from the whole vocabulary
      neg_idxs = torch.randint(0, vocab_size, (CFG.neg_size,))

      # embeddings for the center word, its context and the negative samples
      center_emb = model(torch.LongTensor([center_word]))
      context_embs = model(context_idxs)
      neg_embs = model(neg_idxs)

      # mean cosine similarity (higher = more similar) to context / negatives
      pos = F.cosine_similarity(center_emb, context_embs).mean()
      neg = F.cosine_similarity(center_emb, neg_embs).mean()

      # Hinge loss with margin 1: it is zero once pos >= neg + 1, so
      # minimizing it raises similarity to context words and lowers
      # similarity to negative samples.
      loss = torch.max(CFG.zero, neg - pos + 1)
      loss.backward()
      optimizer.step()

      curr_loss.append(loss.item())
    print('loss:', np.mean(curr_loss) if curr_loss else float('nan'))

  result_weights = model.embeddings.weight.detach().cpu().numpy()
  # row i of the weight matrix belongs to tokenizer.vocab[i]
  result = dict(zip(tokenizer.vocab, result_weights))
  return result



In [124]:
# Small English sample text used as a quick training corpus for train()
data = 'A quick brown fox jumps over a lazy dog! Hi, is it your string?'

In [132]:
# Train on the sample text; train() prints the mean loss once per epoch
# and returns a dict of word -> embedding vector
result = train(data)

loss: 0.9120730076517377
loss: 0.6605848371982574
loss: 0.5762889810970852
loss: 0.540666150195258
loss: 0.5156122956957135
loss: 0.48132035987717764
loss: 0.522578260728291
loss: 0.5025157843317304
loss: 0.5116442143917084
loss: 0.4911403698580606


### Train on a larger corpus

In [None]:
# Load the corpus from master.txt, normalize it with clean() and train on it
text = clean(data_loader('master.txt'))
train(text)
# Takes too long to run; breaking the training into batches might help, if that were the only problem