## Solving tasks from the Distributional Semantics section of the NLP course on Stepik

In [44]:
# !pip3 install numpy

In [45]:
import sys
import ast
import numpy as np
import random
import copy

def parse_array(s):
    """Evaluate the Python literal contained in string ``s`` and wrap it in a NumPy array."""
    literal = ast.literal_eval(s)
    return np.array(literal)

def read_array():
    """Read a single line from standard input and return it parsed by ``parse_array``."""
    line = sys.stdin.readline()
    return parse_array(line)

def write_array(arr):
    """Print the ``repr`` of ``arr`` followed by a newline on standard output."""
    rendered = repr(arr)
    print(rendered)

def normalize_list(list):
    """Return a new list in which every ``np.ndarray`` item is replaced by its ``tolist()`` form.

    Items that are not NumPy arrays are passed through unchanged.
    """
    normalized = []
    for item in list:
        if isinstance(item, np.ndarray):
            item = item.tolist()
        normalized.append(item)
    return normalized


### 1. Generation of examples for training Word2Vec Skip Gram Negative Sampling

We are training Word2Vec Skip Gram Negative Sampling with a window of a given width. For example, a window of size 5 implies that words that are no more than 2 positions to the left or right from the central word are considered positive examples. The center word is not counted as a context word.

Write a function that generates training examples from the text. Every training example must be a 3-element tuple $(CenterWord, CtxWord, Label)$, where $CenterWord \in \mathbb{N}$ is the identifier of the token in the middle of the window, $CtxWord \in \mathbb{N}$ is the identifier of an adjacent token, and $Label \in \{0, 1\}$ is $1$ if $CtxWord$ is a real (positive) context word and $0$ if it is a negative example.

The function should return a list of training examples.

The argument `ns_rate` sets the number of negative examples to generate for each positive example. When sampling negative words, it is usually not checked whether the sampled word also appears in the window. Thus, some of the negative examples may actually be positive ones.

Input text was already tokenized and tokens were replaced with their identifiers.

Tests are generated randomly, constraints:

 - len(text) < 20
 - window_size <= 11, odd
 - vocab_size < 100
 - ns_rate < 3
Words have identifiers 0..vocab_size - 1 (as returned by `np.random.randint`).

Note that `-3 // 2 != -(3 // 2)`: floor division rounds toward negative infinity, so `-3 // 2 == -2` while `-(3 // 2) == -1`.

In [46]:
def get_window(text, window_size):
    for backward, current in enumerate(range(len(text)), start=0 - (window_size // 2)):
        if backward < 0:
            backward = 0
        context = list(text[backward:current]) + list(text[current + 1:current + 1 + window_size // 2])
        center = text[current]
        yield center, context
        
def generate_w2v_sgns_samples(text, window_size, vocab_size, ns_rate):
    """
    Build Skip Gram Negative Sampling training samples from a tokenized text.

    text - list of integer numbers - ids of tokens in text
    window_size - odd integer - width of window
    vocab_size - positive integer - number of tokens in vocabulary
    ns_rate - positive integer - number of negative tokens to sample per one positive sample

    returns list of training samples [CenterWord, CtxWord, Label]; every positive
    sample (Label 1) is immediately followed by its ns_rate negative samples (Label 0).
    Negative tokens are drawn uniformly from 0..vocab_size - 1 without checking
    whether they occur in the window.
    """
    # A range is a sequence, so random.choice can draw from it directly. This
    # avoids rebuilding a NumPy array for every negative sample and yields plain
    # Python ints, whose repr stays a bare number regardless of the NumPy version.
    vocabulary = range(vocab_size)
    samples = []

    for center, context_values in get_window(text, window_size):
        for context in context_values:
            samples.append([center, context, 1])
            samples.extend(
                [center, random.choice(vocabulary), 0] for _ in range(ns_rate)
            )

    return samples

In [47]:
# Sample input for generate_w2v_sgns_samples.
text = [1, 0, 1, 0, 0, 5, 0, 3, 5, 5, 3, 0, 5, 0, 5, 2, 0, 1, 3]
# NOTE(review): the task statement asks for an odd width; get_window only uses
# window_size // 2, so 4 gives the same contexts as 5 would.
window_size = 4
vocab_size = 6
ns_rate = 1

# Negative tokens come from the `random` module and no seed is set in this cell,
# so the printed samples differ from run to run.
result = generate_w2v_sgns_samples(
    text=text,
    window_size=window_size,
    vocab_size=vocab_size,
    ns_rate=ns_rate,
)

write_array(result)

[[1, 0, 1], [1, 0, 0], [1, 1, 1], [1, 4, 0], [0, 1, 1], [0, 2, 0], [0, 1, 1], [0, 3, 0], [0, 0, 1], [0, 3, 0], [1, 1, 1], [1, 0, 0], [1, 0, 1], [1, 0, 0], [1, 0, 1], [1, 0, 0], [1, 0, 1], [1, 2, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [0, 3, 0], [0, 0, 1], [0, 0, 0], [0, 5, 1], [0, 2, 0], [0, 1, 1], [0, 5, 0], [0, 0, 1], [0, 0, 0], [0, 5, 1], [0, 1, 0], [0, 0, 1], [0, 4, 0], [5, 0, 1], [5, 4, 0], [5, 0, 1], [5, 3, 0], [5, 0, 1], [5, 1, 0], [5, 3, 1], [5, 1, 0], [0, 0, 1], [0, 4, 0], [0, 5, 1], [0, 5, 0], [0, 3, 1], [0, 5, 0], [0, 5, 1], [0, 4, 0], [3, 5, 1], [3, 1, 0], [3, 0, 1], [3, 0, 0], [3, 5, 1], [3, 5, 0], [3, 5, 1], [3, 4, 0], [5, 0, 1], [5, 4, 0], [5, 3, 1], [5, 5, 0], [5, 5, 1], [5, 3, 0], [5, 3, 1], [5, 2, 0], [5, 3, 1], [5, 0, 0], [5, 5, 1], [5, 3, 0], [5, 3, 1], [5, 4, 0], [5, 0, 1], [5, 5, 0], [3, 5, 1], [3, 0, 0], [3, 5, 1], [3, 0, 0], [3, 0, 1], [3, 5, 0], [3, 5, 1], [3, 2, 0], [0, 5, 1], [0, 4, 0], [0, 3, 1], [0, 1, 0], [0, 5, 1], [0, 5, 0], [0, 0, 1], [0, 0, 0], [5, 3, 1]

### 2. Training Word2Vec Skip Gram Negative Sampling for one example

We are training Word2Vec Skip Gram Negative Sampling.

Write a function that updates the model weights when receiving one training example in the format $(CenterWord, CtxWord, Label)$.

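During training, model predictions are calculated using the formula $P(CtxWord \mid CenterWord) = \sigma(W_{CenterWord,:} \cdot D_{CtxWord,:})$, where $W$ is the matrix of center-word embeddings and $D$ is the matrix of context-word embeddings.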

The loss function is binary cross-entropy.

In [48]:
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    x - scalar or array-like of numbers

    returns a NumPy scalar for scalar input, otherwise an array of the same shape.

    Only exp(-|x|) is ever evaluated, so large-magnitude negative inputs no
    longer overflow inside np.exp (the naive formula emits a RuntimeWarning
    there, or raises under np.errstate(over="raise")).
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in (0, 1], cannot overflow
    # For x >= 0 this is the textbook formula; for x < 0 the algebraically
    # equal form exp(x) / (1 + exp(x)) is used instead.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays as they are.
    return result[()]

def update_w2v_weights(center_embeddings, context_embeddings, center_word, context_word, label, learning_rate):
    """
    Apply one gradient step for a single (center_word, context_word, label) example.

    center_embeddings - VocabSize x EmbSize
    context_embeddings - VocabSize x EmbSize
    center_word - int - identifier of center word
    context_word - int - identifier of context word
    label - 1 if context_word is real, 0 if it is negative
    learning_rate - float > 0 - size of gradient step

    Row center_word of center_embeddings and row context_word of
    context_embeddings are overwritten with their updated values; nothing is returned.
    """
    center_vector = center_embeddings[center_word]
    context_vector = context_embeddings[context_word]

    prediction = sigmoid(np.dot(center_vector, context_vector))
    step = learning_rate * (prediction - label)

    # Both deltas are computed before either row is written, so each one is
    # based on the embeddings as they were before this step.
    center_delta = np.dot(step, context_vector)
    context_delta = np.dot(step, center_vector)

    center_embeddings[center_word] -= center_delta
    context_embeddings[context_word] -= context_delta


In [49]:
# Sample input for update_w2v_weights: two 8 x 3 embedding tables stored as
# nested Python lists (one row per word id, three components per row).
center_embeddings = [[0.3449417709491044, 0.6762047256081501, 0.9583446027893963],
                     [0.6247126159157468, 0.22038323197740317, 0.29717611444948355],
                     [0.9836099232994968, 0.3847689688960674, 0.033312247867206435],
                     [0.4217704869846559, 0.0023859008971685025, 0.009686915033163657],
                     [0.6933070658521228, 0.9705089533296152, 0.9189360293193337],
                     [0.024858486425111903, 0.11331113152689753, 0.6492144300167894],
                     [0.7861289466352543, 0.227319130535791, 0.8165251907260063],
                     [0.7672181161105678, 0.04865001026002924, 0.07514404284170773]]
context_embeddings = [[0.4628817426583818, 0.7747296319956671, 0.1374808935513827],
                      [0.17026823169513283, 0.4094733988461122, 0.3175531656197459],
                      [0.2910876746161247, 0.6340566555548147, 0.23158010794029804],
                      [0.8449042648180852, 0.4796593509107806, 0.11278090182290745],
                      [0.049097778744511156, 0.6254116250148337, 0.13038703647472905],
                      [0.882545488649187, 0.6223076699449618, 0.1633041302523962],
                      [0.6704032810194875, 0.941803340812521, 0.7358646489592193],
                      [0.9875878745059805, 0.17935677165390562, 0.6798846454394736]]
# Row indices of the single training pair to update.
center_word = 2
context_word = 5
# 0 marks the pair as a negative example (1 would mark a real context word).
label = 0
# Size of the gradient step.
learning_rate = 0.342405260598321

In [50]:
# update_w2v_weights changes the embedding tables in place, so running this cell
# again without first re-running the cell that defines the inputs applies one
# more gradient step on top of the previous result.
update_w2v_weights(
    center_embeddings,
    context_embeddings,
    center_word,
    context_word,
    label,
    learning_rate,
)

# The updated rows come back as NumPy arrays; convert them to plain lists so
# both tables print as nested list literals.
center_embeddings, context_embeddings = (
    normalize_list(center_embeddings),
    normalize_list(context_embeddings),
)

write_array(center_embeddings)
write_array(context_embeddings)

[[0.3449417709491044, 0.6762047256081501, 0.9583446027893963], [0.6247126159157468, 0.22038323197740317, 0.29717611444948355], [0.7561584406822226, 0.22438652516534294, -0.008774836618697823], [0.4217704869846559, 0.0023859008971685025, 0.009686915033163657], [0.6933070658521228, 0.9705089533296152, 0.9189360293193337], [0.024858486425111903, 0.11331113152689753, 0.6492144300167894], [0.7861289466352543, 0.227319130535791, 0.8165251907260063], [0.7672181161105678, 0.04865001026002924, 0.07514404284170773]]
[[0.4628817426583818, 0.7747296319956671, 0.1374808935513827], [0.17026823169513283, 0.4094733988461122, 0.3175531656197459], [0.2910876746161247, 0.6340566555548147, 0.23158010794029804], [0.8449042648180852, 0.4796593509107806, 0.11278090182290745], [0.049097778744511156, 0.6254116250148337, 0.13038703647472905], [0.6290474670186392, 0.5231442006778062, 0.15471882755224034], [0.6704032810194875, 0.941803340812521, 0.7358646489592193], [0.9875878745059805, 0.17935677165390562, 0.679