In [2]:
import random
def init_random_seed(value=0):
    """Seed the Python, NumPy and PyTorch random number generators.

    Calls ``random.seed``, ``np.random.seed``, ``torch.manual_seed`` and
    ``torch.cuda.manual_seed`` with the same ``value``, then sets
    ``torch.backends.cudnn.deterministic`` to ``True``.

    Args:
        value: Seed handed to every generator. Defaults to 0.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(value)
    torch.backends.cudnn.deterministic = True

In [3]:
%load_ext autoreload
%autoreload 2


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.nn import functional as F

#import dlnlputils
#from dlnlputils.data import tokenize_corpus, build_vocabulary, texts_to_token_ids, \
#    PaddedSequenceDataset, Embeddings
#from dlnlputils.pipeline import train_eval_loop, predict_with_model, init_random_seed
#from dlnlputils.visualization import plot_vectors

# Seed the Python, NumPy and PyTorch RNGs with the default value (0).
init_random_seed()

## Алгоритм обучения - Skip Gram Negative Sampling

**Skip Gram** - предсказываем соседние слова по центральному слову

**Negative Sampling** - аппроксимация softmax

$$ W, D \in \mathbb{R}^{Vocab \times EmbSize} $$

$$ \sum_{CenterW_i} P(CtxW_{-2}, CtxW_{-1}, CtxW_{+1}, CtxW_{+2} | CenterW_i; W, D) \rightarrow \max_{W,D} $$

$$ P(CtxW_{-2}, CtxW_{-1}, CtxW_{+1}, CtxW_{+2} | CenterW_i; W, D) = \prod_j P(CtxW_j | CenterW_i; W, D) $$
    
$$ P(CtxW_j | CenterW_i; W, D) = \frac{e^{w_i \cdot d_j}} { \sum_{m=1}^{|V|} e^{w_i \cdot d_m}} = softmax \simeq \frac{e^{w_i \cdot d_j^+}} { \sum_{m=1}^{k} e^{w_i \cdot d_m^-}}, \quad k \ll |V| $$

In [7]:
def make_diag_mask(size, radius):
    """Build a square ``size`` x ``size`` mask with two bands of width ``radius`` along the main diagonal.

    Entry ``[i, j]`` is 1.0 when ``0 < |i - j| <= radius`` and 0.0 otherwise,
    so the main diagonal itself is always zero.

    Args:
        size: Number of rows and columns of the mask.
        radius: Largest distance from the main diagonal that is still set to 1.

    Returns:
        A ``numpy.ndarray`` of shape ``(size, size)`` and dtype ``float64``.
    """
    positions = np.arange(size)
    # Broadcasting a column against a row gives |i - j| for every cell at once,
    # replacing the explicit Python double loop.
    distance = np.abs(positions[:, None] - positions[None, :])
    return ((distance > 0) & (distance <= radius)).astype(np.float64)

# Example: with radius=1 only the cells directly next to the main diagonal are set.
make_diag_mask(10, 1)

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]])

In [5]:
def find_index(text, matrix, state):
    """Collect ``[text[i], text[j], state]`` for every cell of ``matrix`` equal to ``state``.

    Cells are visited row by row over ``len(matrix)`` rows and the same number
    of columns, so the result follows row-major order.

    Args:
        text: Sequence indexed by the row and column positions of ``matrix``.
        matrix: Object supporting ``len()`` and two-index access ``matrix[i, j]``.
        state: Value a cell must equal for its pair to be collected; it is also
            stored as the third element of each triple.

    Returns:
        A list of three-element lists ``[text[i], text[j], state]``.
    """
    side = len(matrix)
    return [
        [text[row], text[col], state]
        for row in range(side)
        for col in range(side)
        if matrix[row, col] == state
    ]

In [22]:
# Toy corpus of token ids and the sampling settings.
text = [1, 0, 1, 0, 0, 5, 0, 3, 5, 5, 3, 0, 5, 0, 5, 2, 0, 1, 3]
window_size = 3
ns_rate = 1  # number of negative rows kept per positive row
voc = 6

# One occurrence of every token id; used as a fallback source of negative pairs.
generated_text = list(range(voc))

radius = window_size // 2
matrix = make_diag_mask(len(text), radius)
# Each row is [text[i], text[j], label]: label 1 where the mask is set, 0 where it is not.
pos_list_exist = np.array(find_index(text, matrix, 1))
neg_list_exist = np.array(find_index(text, matrix, 0))

neg_list_generate = np.array(find_index(generated_text, make_diag_mask(len(generated_text), radius), 0))

n_negatives = ns_rate * len(pos_list_exist)
if len(neg_list_exist) < n_negatives:
    # np.concatenate takes ONE sequence of arrays; a second positional argument
    # is interpreted as the axis, so the arrays must be wrapped in a tuple.
    neg_list_exist = np.concatenate((neg_list_generate, neg_list_exist))
# NOTE(review): negatives are the first rows in row-major order, not a random sample.
ans = np.concatenate((pos_list_exist, neg_list_exist[:n_negatives, :]))

In [23]:
neg_list_generate

array([[0, 0, 0],
       [0, 2, 0],
       [0, 3, 0],
       [0, 4, 0],
       [0, 5, 0],
       [1, 1, 0],
       [1, 3, 0],
       [1, 4, 0],
       [1, 5, 0],
       [2, 0, 0],
       [2, 2, 0],
       [2, 4, 0],
       [2, 5, 0],
       [3, 0, 0],
       [3, 1, 0],
       [3, 3, 0],
       [3, 5, 0],
       [4, 0, 0],
       [4, 1, 0],
       [4, 2, 0],
       [4, 4, 0],
       [5, 0, 0],
       [5, 1, 0],
       [5, 2, 0],
       [5, 3, 0],
       [5, 5, 0]])

In [16]:
# Sizes of the positive pairs, the candidate negative pairs and the final dataset.
print(len(pos_list_exist))
print(len(neg_list_exist))
print(len(ans))

36
325
72


In [24]:
ans

array([[1, 0, 1],
       [0, 1, 1],
       [0, 1, 1],
       [1, 0, 1],
       [1, 0, 1],
       [0, 1, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 5, 1],
       [5, 0, 1],
       [5, 0, 1],
       [0, 5, 1],
       [0, 3, 1],
       [3, 0, 1],
       [3, 5, 1],
       [5, 3, 1],
       [5, 5, 1],
       [5, 5, 1],
       [5, 3, 1],
       [3, 5, 1],
       [3, 0, 1],
       [0, 3, 1],
       [0, 5, 1],
       [5, 0, 1],
       [5, 0, 1],
       [0, 5, 1],
       [0, 5, 1],
       [5, 0, 1],
       [5, 2, 1],
       [2, 5, 1],
       [2, 0, 1],
       [0, 2, 1],
       [0, 1, 1],
       [1, 0, 1],
       [1, 3, 1],
       [3, 1, 1],
       [1, 1, 0],
       [1, 1, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 5, 0],
       [1, 0, 0],
       [1, 3, 0],
       [1, 5, 0],
       [1, 5, 0],
       [1, 3, 0],
       [1, 0, 0],
       [1, 5, 0],
       [1, 0, 0],
       [1, 5, 0],
       [1, 2, 0],
       [1, 0, 0],
       [1, 1, 0],
       [1, 3, 0],
       [0, 0, 0],
       [0,