In [18]:
import json
import re

import collections
from collections import Counter

import scipy
from scipy import sparse
from scipy.sparse import linalg

import numpy as np

In [19]:
from utils import clear_sentences, create_vocabulary, create_corpus_matrix, compute_embeddings
from utils import Word2VecController, Doc2VecController

# Preparation

In [11]:
path_data_prepaired = '../dataset/dataset.json'

In [7]:
data = None
with open(path_data_prepaired) as file_data:
    data = json.load(file_data)

In [12]:
%time sentences = clear_sentences(data)

CPU times: user 3.02 s, sys: 237 ms, total: 3.25 s
Wall time: 3.25 s


# Pipeline

## Utils for creating bag of words models and corpus matricies

In [13]:
%time vocabulary = create_vocabulary(sentences, r=200)

CPU times: user 2.48 s, sys: 2.49 ms, total: 2.49 s
Wall time: 2.49 s


In [20]:
%time corpus_matrix = create_corpus_matrix(sentences, vocabulary)

CPU times: user 52.5 s, sys: 108 ms, total: 52.6 s
Wall time: 52.6 s


## Embedding calculators, custom realizations

In [21]:
%time embedding_matrix = compute_embeddings(corpus_matrix=corpus_matrix, k=3)

CPU times: user 1min 45s, sys: 13.2 s, total: 1min 58s
Wall time: 1min 34s


## Class for comparison and unified executing all of embedding computers

<hr>

# Riemannian Optimization

<hr>

vocabulary, corpus_matrix - we have before we start optimization problem

In [60]:
def computer_grad_matrix(X, corpus_matrix, k=3):
    
    def sigmoid_fun(x):
        return 1 / (1 + np.exp(-x))
    
    grad_F = np.zeros(corpus_matrix.shape)
    
    all_observations = corpus_matrix.sum()
    sum_over_words = np.array(corpus_matrix.sum(axis=0)).flatten()
    sum_over_contexts = np.array(corpus_matrix.sum(axis=1)).flatten()

    for word_index_1, word_index_2 in zip(corpus_matrix.nonzero()[0], 
                                          corpus_matrix.nonzero()[1]):

        sg_count = corpus_matrix[word_index_1, word_index_2]

        pwc = sg_count
        pw = sum_over_contexts[word_index_1]
        pc = sum_over_words[word_index_2]
        
        xwc = X[word_index_1, word_index_2]

        grad_F[word_index_1, word_index_2] = pwc * sigmoid_fun(-xwc) - k * pw * pc / all_observations * sigmoid_fun(xwc)
    
    return grad_F

In [66]:
# Riemannian Optimization for SGNS
def riemannian_optimization(X=None, corpus_matrix=None, step=0.001, k=3, max_iter=50, alpha=0.5):
    
    U, S, V = np.linalg.svd(X)
    
    for i in range(1, max_iter):
        
        Y = X + step * computer_grad_matrix(X, corpus_matrix, k=k)
        _U, _S = np.linalg.qr(Y @ V)
        _V, _S = np.linalg.qr(Y.T @ U)
        
        print('Step', i, 'norm change at step', np.linalg.norm(_U @ _S @ _V - X, ord='fro'))
        
        X = _U @ _S @ _V
        U = _U
        V = _V
        
        
    U, S, V = np.linalg.svd(X)
    
    return U @ np.diag(np.power(S, alpha))

In [69]:
X = np.ones(corpus_matrix[:100, :100].shape)
E = riemannian_optimization(X=X, corpus_matrix=corpus_matrix[:100, :100], step=1e-4, max_iter=10)

Step 1 norm change at step 160.4826250411553
Step 2 norm change at step 160.27247963108454
Step 3 norm change at step 165.76569669509223
Step 4 norm change at step 161.72842061628853
Step 5 norm change at step 153.74452825779244
Step 6 norm change at step 146.5226279797441
Step 7 norm change at step 143.2818997825804
Step 8 norm change at step 158.05942227407567
Step 9 norm change at step 150.16802731917258


## Class for converting Sentences with Word2Vec class into vectors for representing sentences in vector form

# Comparison models of embeddings

## There are 3 base models, which included into our experiments for classifient feedbacks grades

In [54]:
import sklearn
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier