In [1]:
from collections import Counter
import gc
import numpy as np


class SkipGramBatcher():
    def __init__(self, corpus, window_size=2, batch_size=3, vocab_size=5000, unk_text='<UNK>'):
        self.window_size = window_size
        self.vocab_size = vocab_size - 1
        self.batch_size = batch_size
        self.unk_text = unk_text

        # 1. Count all word occurencies.
        counted_words = Counter(corpus).most_common(self.vocab_size)
        # create dict using dict comprehension
        self.idx_to_word = {idx: word for idx, (word, count) in enumerate(counted_words)}
        self.word_to_idx = {word: idx for idx, (word, count) in enumerate(counted_words)}

        # append '<UNK>' token to dictionaries
        last_idx = len(self.idx_to_word)
        self.idx_to_word[last_idx] = self.unk_text
        self.word_to_idx[self.unk_text] = last_idx
        indexed = self.words_to_indexes(corpus)
        
        # transform corpus from strings to indexes, to reduce memory usage
        self.corpus_indexes = np.asarray(
            indexed,
            dtype=np.int32
        )

        gc.collect()

    def words_to_indexes(self, words):
        unk_index = self.word_to_idx[self.unk_text]
        idxes = [self.word_to_idx.get(word, unk_index) for word in words]
        return idxes

    def indexes_to_words(self, indexes):
        words = [self.idx_to_word[index] for index in indexes]
        return words

    def __iter__(self):
        self.batch_start_pos = 0
        return self

    def get_random_sample(self, center_id):
        left_window = np.arange(max(0, center_id - self.window_size),
                                center_id)
        right_window = np.arange(center_id + 1,
                                 min(center_id + self.window_size + 1, len(self.corpus_indexes)))
        window = np.concatenate((left_window, right_window))
        position = np.random.choice(window)
        return self.corpus_indexes[position]

    def __next__(self):
        if self.batch_start_pos >= len(self.corpus_indexes):
            raise StopIteration
        else:
            batch_position_in_corpus = np.arange(
                self.batch_start_pos,
                min(self.batch_start_pos + self.batch_size, len(self.corpus_indexes))
            )
            x_batch = np.asarray(self.corpus_indexes[batch_position_in_corpus])
            # draw a word from window of a selected word
            y_batch = np.asarray([self.get_random_sample(selected_word_position)
                                  for selected_word_position in batch_position_in_corpus]).flatten()
            # for selected_word_position in batch_position_in_corpus:
            #     y_batch.append(self.get_random_sample(selected_word_position))
            self.batch_start_pos += self.batch_size
            return x_batch, y_batch

# Naive word2vec

This task can be formulated very simply. Follow this [paper](https://arxiv.org/pdf/1411.2738.pdf) and implement word2vec like a two-layer neural network with matrices $W$ and $W'$. One matrix projects words to low-dimensional 'hidden' space and the other - back to high-dimensional vocabulary space.

![word2vec](https://i.stack.imgur.com/6eVXZ.jpg)

You can use TensorFlow/PyTorch (numpy too, if you love to calculate gradients on your own and want some extra points, but don't forget to numerically check your gradients) and code from your previous task. Again: you don't have to implement negative sampling (you may reduce your vocabulary size for faster computation).

**Results of this task**:
 * trained word vectors (mention somewhere, how long it took to train)
 * plotted loss (so we can see that it has converged)
 * function to map token to corresponding word vector
 * beautiful visualizations (PCE, T-SNE), you can use TensorBoard and play with your vectors in 3D (don't forget to add screenshots to the task)
 * qualitative evaluations of word vectors: nearest neighbors, word analogies

**Extra:**
 * quantitative evaluation:
   * for intrinsic evaluation you can find datasets [here](https://aclweb.org/aclwiki/Analogy_(State_of_the_art))
   * for extrincis evaluation you can use [these](https://medium.com/@dataturks/rare-text-classification-open-datasets-9d340c8c508e)

Also, you can find any other datasets for quantitative evaluation. If you chose to do this, please use the same datasets across tasks 3, 4, 5 and 6.

Again. It is **highly recommended** to read this [paper](https://arxiv.org/pdf/1411.2738.pdf)

Example of visualization in tensorboard:
https://projector.tensorflow.org

Example of 2D visualisation:

![2dword2vec](https://www.tensorflow.org/images/tsne.png)

If you struggle with something, ask your neighbor. If it is not obvious for you, probably someone else is looking for the answer too. And in contrast, if you see that you can help someone - do it! Good luck!

In [2]:
import torch
import gc
from torch.autograd import Variable

#### Constants

In [33]:
VOCAB_SIZE = 500
BATCH_SIZE = 10

#### Load corpus into batcher

In [34]:
text = []
with open('./data/text8', 'r') as text8:
    text = text8.read().split()
text = text[:1000]

# text = ['first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'class', 'other']
batcher = SkipGramBatcher(corpus=text, vocab_size=VOCAB_SIZE, batch_size=BATCH_SIZE)

# free memory
text = []
gc.collect()

0

In [35]:
embedding_dims = 50
W1 = Variable(torch.randn(VOCAB_SIZE, embedding_dims).float(), requires_grad=True)
W2 = Variable(torch.randn(embedding_dims, VOCAB_SIZE).float(), requires_grad=True)
num_epochs = 1
learning_rate = 0.001

# tensor to store oh-encoded data
x_onehot = torch.FloatTensor(BATCH_SIZE, VOCAB_SIZE)

quanity = 10
loss_history = []

for epo in range(num_epochs):
    for i, (data, target) in enumerate(batcher):
        # if we have not full batch (at the end of the corpus), OH-encoding will not work
        if data.shape[0] != BATCH_SIZE:
            continue
        
        # get data (center word) and target (word, sampled from window)
        tensor_data = torch.from_numpy(np.expand_dims(data, axis=1)).type(torch.LongTensor)
        tensor_target = torch.from_numpy(target).type(torch.LongTensor)
        
        # get input and expected data
        x_onehot.zero_()
        x_onehot.scatter_(1, tensor_data, 1)
        y = tensor_target

        # forward propagate
        z1 = torch.matmul(x_onehot, W1)
        z2 = torch.matmul(z1, W2)
        
        # apply log softmax
        log_softmax = torch.nn.functional.log_softmax(z2, dim=0)
        # apply the negative log likelihood loss
        # nll_loss takes (batch_size, vocab_size)
        loss = torch.nn.functional.nll_loss(log_softmax, y)
        
        # backpropagate
        loss.backward()
        
        # learning step
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data
        
        if (i % quanity) == 0:
            print(f'Loss at batch {i}: {loss}')
#             print(f'update step:\n {W1.grad.data}')
            loss_history.append(loss)
        
        # delete backpropagation data
        W1.grad.data.zero_()
        W2.grad.data.zero_()

Loss at batch 0: 10.238473892211914
Loss at batch 10: 11.569684982299805
Loss at batch 20: 8.318129539489746
Loss at batch 30: 13.311193466186523
Loss at batch 40: 11.74460506439209
Loss at batch 50: 6.547219753265381
Loss at batch 60: 10.56187629699707
Loss at batch 70: 11.087947845458984
Loss at batch 80: 10.150960922241211
Loss at batch 90: 7.417031764984131
