In [51]:
import collections
import numpy as np
import copy
import re
import time

import torch
import gensim


# Утилиты текстовые

def text2words(source, min_token_size=0):
    return [i.lower() for i in re.sub('[^a-zA-Z]',' ', source).split(' ') if len(i) > min_token_size]

def words2ids(words, vocabulary):
    return [vocabulary[token] for token in words if token in vocabulary]

def words2vocabulary(words, max_size=1000000, padding_word=None):
    word_counts = collections.defaultdict(int)

    for token in words:
        word_counts[token] += 1

    sorted_word_counts = [(padding_word, 0)]+ sorted(word_counts.items(),
                                reverse=True,
                                key=lambda pair: pair[1])

    if len(word_counts) > max_size:
        sorted_word_counts = sorted_word_counts[:max_size]

    vocabulary = {word: i for i, (word, _) in enumerate(sorted_word_counts)}

    return vocabulary

# Iterable dataset батчей - окон слов

class WordsDataset(torch.utils.data.Dataset):
    def __init__(self, words, targets, out_len=100, pad_value=0):
        self.words = words
        self.targets = targets
        self.out_len = out_len
        self.pad_value = pad_value

    def __len__(self):
        return len(self.words)

    def __getitem__(self, item):
        txt = [self.words[i] for i in range(item - self.out_len, item)]
        txt = torch.tensor(txt, dtype=torch.long)

        return txt, torch.tensor(0, dtype=torch.long)

# Собственно нейросеть

class NN(torch.nn.Module):
    def __init__(self, vocab_size, emb_size, sentence_len, radius=5, negative_samples_n=5):
        super().__init__()
        self.vocab_size = vocab_size
        self.negative_samples_n = negative_samples_n
        self.embeddings = torch.nn.Embedding(self.vocab_size, emb_size, padding_idx=0)

        # двухдиагональная матрица с полосами ширины radius из единиц и нулями на главной диагонали
        self.positive_sim_mask = sum([torch.diag(torch.ones(sentence_len), diagonal=i)[:-abs(i), :-abs(i)] for i in range(-radius, radius+1) if i != 0])
    
    def forward(self, batch):
        batch_size = batch.shape[0]          # batch: BatchSize x SentSize 
        batch_embs = self.embeddings(batch)   # BatchSize x SentSize x EmbSize

        # SkipGram
        positive_embs = batch_embs.permute(0, 2, 1)                         # EmbSize x SentSize
        positive_sims = torch.bmm(batch_embs, positive_embs)   
        positive_probs = torch.sigmoid(torch.bmm(batch_embs, positive_embs)) # SentSize x SentSize
        positive_loss = torch.nn.functional.binary_cross_entropy(positive_probs * self.positive_sim_mask,
                                               self.positive_sim_mask.expand_as(positive_probs))

        # NegativeSampling
        negative_words = torch.randint(1, self.vocab_size,
                                       size=(batch_size, self.negative_samples_n))
        negative_embs = self.embeddings(negative_words).permute(0, 2, 1)     # EmbSize x NegSamplesN        
        negative_probs = torch.sigmoid(torch.bmm(batch_embs, negative_embs)) # SentSize x NegSamplesN
        negative_loss = torch.nn.functional.binary_cross_entropy(negative_probs, negative_probs.new_zeros(negative_probs.shape))

        return positive_loss + negative_loss # -> min

# Обучаем нейросеть

def Train(model, train_dataset, test_dataset,
                    learningRate=1e-2, epoch_n=3, batch_size=10,
                    max_batches_train=100000, max_batches_test=100000):

    optimizer = torch.optim.Adam(model.parameters(), lr = learningRate)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=10, shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=10, shuffle=False)

    best_loss = float('inf')
    best_model = copy.deepcopy(model)

    for epoch in range(epoch_n):
        model.train()
        batches_train = 0
        
        for (batch, _) in train_dataloader:
            if batches_train > max_batches_train:
                break

            pred = model(batch)
            model.zero_grad()
            pred.backward()
            optimizer.step()
            
            batches_train += 1

        model.eval()
        test_loss = 0
        batches_test = 0

        torch.no_grad()
        for (batch, _) in test_dataloader:
            if batches_test > max_batches_test:
                break
                
            pred = model(batch)
            test_loss += float(pred)
            batches_test += 1

        test_loss /= batches_test

        if test_loss < best_loss:
            best_loss = test_loss
            best_model = copy.deepcopy(model)
        else:
            break
             
    return best_loss, best_model


# Инкапсуляция word2vec

class Word2Vec:
    def __init__(self, words, n=100, r=5, ns_n=10):
        train_words = words
        test_words = words

        vocabulary = words2vocabulary(words, padding_word='NULL')

        train_ids = words2ids(train_words, vocabulary)
        test_ids = words2ids(test_words, vocabulary)
        sentence_len = 20
        
        train_dataset = WordsDataset(train_ids,
                                        np.zeros(len(train_ids)),
                                        out_len=sentence_len)
        test_dataset = train_dataset
        
        nn = NN(len(vocabulary), n, sentence_len,
                        radius=r, negative_samples_n=ns_n)
        
        best_loss, best_model = Train(nn,
                                        train_dataset,
                                        test_dataset,
                                        learningRate=0.01,
                                        epoch_n=3,
                                        batch_size=10)                                        

        embs = best_model.embeddings.weight.detach().numpy()

        self.embeddings = embs / (np.linalg.norm(embs, ord=2, axis=-1, keepdims=True) + 1e-4)
        self.vocabulary = vocabulary
        self.id2word = {i: w for w, i in vocabulary.items()}

    def most_similar(self, word, topk=10):
        return self.most_similar_vector(self.get_vector(word), topk=topk)

    def most_similar_vector(self, query_vector, topk=10):
        if query_vector is None:
            return []

        topk = topk+1
        similarities = self.embeddings @ query_vector
        best_indices = np.argpartition(-similarities, topk, axis=0)[:topk]
        result = [(self.id2word[i], similarities[i]) for i in best_indices]
        result.sort(key=lambda x: -x[1])
        return result[1:]

    def get_vector(self, word):
        if word not in self.vocabulary:
            return None
        return self.embeddings[self.vocabulary[word]]
    
    def get_vocabulary(self):
        voc = {}
        for word, id in self.vocabulary.items():
            voc[word] = self.embeddings[id]
        return voc

def train(data: str):
    words = text2words(data, min_token_size=2)
    word2vec = Word2Vec(words)
    return word2vec.get_vocabulary()
    

In [52]:
data = open('war_and_peace.txt', encoding='utf-8').read()

words = text2words(data, min_token_size=2)
sentences = [[words[i] for i in range(item - 20, item)] for item in range(0, len(words))]

In [53]:
word2vec = Word2Vec(words)

In [54]:
word2vec.embeddings[word2vec.vocabulary["hussars"]]

array([-6.23179972e-02, -1.04786847e-02,  2.46027503e-02, -5.99943027e-02,
       -1.36597231e-01, -1.89535115e-02, -1.24940708e-01, -1.61932200e-01,
        9.71919745e-02, -3.23042125e-02, -3.80800068e-02,  3.11368983e-02,
       -2.44162232e-01, -8.35448056e-02, -3.58248502e-02, -2.39695199e-02,
        9.05313343e-02,  4.02893424e-02, -1.60309244e-02,  9.01376158e-02,
        5.73389493e-02, -7.66259292e-03, -1.08058453e-01,  3.94985341e-02,
       -3.18879224e-02, -1.14959069e-01,  1.77577510e-01, -1.61834969e-03,
       -1.83096454e-01,  9.62313116e-02,  1.21666463e-02, -5.09796143e-02,
        1.62627339e-01, -4.97003943e-02, -1.22622073e-01, -8.42875615e-02,
       -3.01153082e-02,  6.10896572e-02,  1.89320505e-01,  1.04606776e-02,
        3.00640762e-02, -2.83870250e-02, -8.19226652e-02,  9.83823016e-02,
        2.58618798e-02,  1.34922072e-01, -7.30558634e-02,  4.63353395e-02,
        6.16451725e-02,  6.74023479e-02, -1.57215688e-02, -5.54751337e-01,
       -1.00429691e-01, -

In [30]:
word2vec.embeddings[word2vec.vocabulary["dragoons"]]

KeyError: 'dragoons'

In [31]:
word2vec.embeddings[word2vec.vocabulary["dragoons"]] - word2vec.embeddings[word2vec.vocabulary["hussars"]]

KeyError: 'dragoons'

In [55]:
word2vec.get_vocabulary()

{'NULL': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=float32),
 'the': array([-2.62689237e-02, -3.36231440e-02, -2.37546656e-02, -1.21029289e-02,
        -9.78355333e-02, -1.63412318e-02,  6.34971354e-03,  3.30275856e-02,
         4.61483039e-02,  1.63566582e-02,  2.36685090e-02, -4.16434649e-03,
         2.06902288e-02, -3.10262106e-02,  1.40137272e-02,  1.35739725e-02,
        -2.00749328e-03,  3.77570055e-02,  4.08754908e-02,  4.77359667e-02,
        -1.64709054e-02,  2.92206481e-02, -3.09983697e-02,  8.77174586e-02,
         5.11558615e-02,  3.76217999e-02, -1.77516919e-02

In [34]:
word2vec.most_similar_vector(
    word2vec.get_vector("andrew") 
    - word2vec.get_vector("prince") 
    + word2vec.get_vector("princess"), topk=40)

[('mary', 0.925492),
 ('andrew', 0.82827437),
 ('not', 0.77408135),
 ('said', 0.771927),
 ('that', 0.7639543),
 ('her', 0.76363033),
 ('but', 0.75292647),
 ('tell', 0.7446366),
 ('for', 0.74352056),
 ('and', 0.7432073),
 ('did', 0.7413348),
 ('nicholas', 0.7400751),
 ('she', 0.73682857),
 ('more', 0.73500925),
 ('you', 0.73161834),
 ('come', 0.7293008),
 ('thought', 0.7276484),
 ('went', 0.7260052),
 ('natasha', 0.72554016),
 ('the', 0.72483873),
 ('leave', 0.7219814),
 ('replied', 0.71648717),
 ('what', 0.71603274),
 ('rost', 0.71428806),
 ('why', 0.7142109),
 ('could', 0.70224184),
 ('there', 0.701215),
 ('himself', 0.6998744),
 ('all', 0.6967155),
 ('know', 0.6965703),
 ('your', 0.6963006),
 ('with', 0.69525266),
 ('they', 0.6948179),
 ('his', 0.6936432),
 ('such', 0.69280493),
 ('pierre', 0.69252706),
 ('say', 0.6920229),
 ('also', 0.6899534),
 ('from', 0.6897564),
 ('understood', 0.68924105)]

In [39]:
word2vec.most_similar("dragoons")

[('defend', 0.43215737),
 ('uhlans', 0.40895194),
 ('ferdinand', 0.3891443),
 ('noncommissioned', 0.35259026),
 ('seated', 0.34550554),
 ('stationed', 0.34464905),
 ('imperial', 0.34040946),
 ('died', 0.33877364),
 ('bitch', 0.33842742)]

In [36]:
word2vecG = gensim.models.Word2Vec(sentences=sentences,
                                   window=5)

In [37]:
word2vecG.wv.most_similar('hussars', topn=10)

[('cossacks', 0.6115525364875793),
 ('uhlans', 0.5075559616088867),
 ('pavlograd', 0.48941218852996826),
 ('gate', 0.4735693633556366),
 ('emperors', 0.4669438898563385),
 ('picket', 0.4638209342956543),
 ('polish', 0.45140326023101807),
 ('soldiers', 0.44261106848716736),
 ('horses', 0.43232592940330505),
 ('assembling', 0.4309908449649811)]