In [1]:
import copy
import json
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

In [2]:
# Make sure the NLTK 'stopwords' corpus is available locally before it is used.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/egor_baryshnikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocessing

In [2]:
from preprocessing import clear_sentences

In [3]:
# Relative path to the JSON dataset; read both with json.load and with pd.read_json further down.
path_data_prepaired = '../dataset/dataset.json'

In [4]:
# Parse the JSON dataset into plain Python objects.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default (JSON text is UTF-8 by specification).
with open(path_data_prepaired, encoding='utf-8') as file_data:
    data = json.load(file_data)

In [5]:
# Run the project's clear_sentences over the loaded data; the result `sentences` feeds
# Word2Vec(...) and the feature-matrix builders below. %time prints the elapsed time.
%time sentences = clear_sentences(data)

CPU times: user 4.24 s, sys: 366 ms, total: 4.61 s
Wall time: 4.64 s


In [6]:
# Relative path to the SimLex-999 file used for the word-similarity test.
path_data_simtest = '../dataset/SimLex-999/SimLex-999.txt'

In [7]:
# Load SimLex-999 as a DataFrame (pd.read_table defaults to tab-separated input).
# The columns 'word1', 'word2' and 'SimLex999' are the ones used later in this notebook.
simtest_data = pd.read_table(path_data_simtest)

## Data Processing

In [8]:
from models import Word2Vec

In [9]:
class models_calculation():
    """Prepare one base ``Word2Vec`` model and derive IMF / RO / EMF variants from it.

    The base model is built once from the notebook-level ``sentences`` and is
    never fitted directly: each ``*_embedds`` method works on a deep copy and
    stores the fitted copy on ``self.IMF``, ``self.RO`` or ``self.EMF``.

    Parameters
    ----------
    k : value forwarded as ``k=`` to every ``compute_embedds_*`` call. It may
        be reassigned on the instance between calls.
    d : value stored on the base model as ``model.d`` before the vocabulary
        and corpus matrix are created.
    """

    def __init__(self, k, d):
        self.model = Word2Vec(sentences)
        self.k = k
        self.d = d
        self.model.d = self.d

        self.model.create_vocabulary(r=200)
        self.model.create_corpus_matrix(L=2)

    def IMF_embedds(self, alpha=0.5):
        """Fit IMF embeddings on a copy of the base model and store it as ``self.IMF``.

        ``alpha`` is forwarded to ``compute_embedds_IMF`` (it used to be
        ignored in favour of a hard-coded 0.5).
        """
        model_IMF = copy.deepcopy(self.model)
        model_IMF.compute_embedds_IMF(k=self.k, alpha=alpha)
        self.IMF = model_IMF

    def RO_embedds(self, step=1e-4, max_iter=20, alpha=0.5):
        """Fit RO embeddings starting from ``self.IMF`` and store them as ``self.RO``.

        Requires ``IMF_embedds`` to have been called first (``self.IMF`` is read
        here), and uses whatever ``self.IMF`` currently holds even if ``self.k``
        was changed afterwards.

        The optimisation runs twice from the same starting point: once for
        ``max_iter`` iterations to pick ``best_iter``, then again for
        ``best_iter + 1`` iterations.
        """
        model_IMF = self.IMF
        k = self.k

        print('Iterations number searching')
        model_RO = copy.deepcopy(model_IMF)
        model_RO.compute_embedds_riem(k=k, step=step, max_iter=max_iter, alpha=alpha)

        # NOTE(review): r_iter_errs is indexed by its own max(); presumably it maps
        # objective value -> iteration number, so this selects the iteration with
        # the highest objective. Confirm against models.Word2Vec.
        best_iter = model_RO.r_iter_errs[max(model_RO.r_iter_errs)]

        print('Computing embedds')
        model_RO = copy.deepcopy(model_IMF)
        model_RO.compute_embedds_riem(k=k, step=step, max_iter=best_iter+1, alpha=alpha)
        self.RO = model_RO

    def EMF_embedds(self, eps=5e-1, max_iter=20, step=1e-8):
        """Fit EMF embeddings on a copy of the base model and store it as ``self.EMF``."""
        model_EMF = copy.deepcopy(self.model)
        model_EMF.compute_embedds_EMF(k=self.k, eps=eps, max_iter=max_iter, step=step)
        self.EMF = model_EMF

In [10]:
models_dict = {}

# (result attribute, progress message) per method, in fitting order.
# RO has to come after IMF because RO_embedds starts from model.IMF.
stages = (
    ('IMF', '\nIMF search...'),
    ('RO', '\nRO search...'),
    ('EMF', '\nEMF search...'),
)

for d in (100, 200, 500):
    print('d=',d)
    # k=0 is only a placeholder here; the real value is assigned in the inner loop.
    model = models_calculation(0, d)
    for k in (1, 5, 15):
        print('k=',k)
        model.k = k

        for method, message in stages:
            print(message)
            # Calls model.IMF_embedds() / RO_embedds() / EMF_embedds() with defaults,
            # then stores the freshly fitted copy under (method, k, d).
            getattr(model, method + '_embedds')()
            models_dict[method, k, d] = getattr(model, method)

d= 100
Creating vocabulary
Creating corpus matrix
k= 1

IMF search...
Computing of words embeddings
Value of the SGNS's objective:  -76186142.69471958

RO search...
Iterations number searching
0 iteration: 
 -76186142.69471958
1 iteration: 
 -85560616.01931854
2 iteration: 
 -75898831.05456166
3 iteration: 
 -84957342.02088748
4 iteration: 
 -75674843.78356472
5 iteration: 
 -84458476.27821697
6 iteration: 
 -75488854.07356055
7 iteration: 
 -83980840.83289337
8 iteration: 
 -75478127.47456549
9 iteration: 
 -83862267.72343409
10 iteration: 
 -75230003.41898552
11 iteration: 
 -83446072.77166566
12 iteration: 
 -76019797.81246333
13 iteration: 
 -80780259.81320225
14 iteration: 
 -77445307.90474801
15 iteration: 
 -80956128.12322044
16 iteration: 
 -79609923.77838483
17 iteration: 
 -81488143.04636326
18 iteration: 
 -79866034.63360685
19 iteration: 
 -81598929.09113212
Computing embedds
0 iteration: 
 -76186142.69471958
1 iteration: 
 -85560616.01931854
2 iteration: 
 -75898831.054561

In [25]:
# Keep only SimLex pairs whose two words are both in the model vocabulary.
# All the vocabularies are identical, so the first model's vocabulary is looked up
# once here instead of rebuilding the list of models for every row.
shared_vocab = next(iter(models_dict.values())).vocab
for col in 'word1', 'word2':
    simtest_data = simtest_data.loc[simtest_data[col].map(lambda x: x in shared_vocab)]

## Testing

In [39]:
def calculate_spearman(model, name, simtest_data= simtest_data, w1_colname= 'word1', w2_colname= 'word2'):
    """Spearman correlation between SimLex-999 scores and embedding cosine similarity.

    Parameters
    ----------
    model : object providing ``get_word_embedding(word)``, or
        ``get_word_embedding2(word)`` when ``'EMF' in name``.
    name : model identifier (a string or the ``(method, k, d)`` key); it is only
        tested with ``'EMF' in name`` to choose the embedding accessor.
    simtest_data : DataFrame holding the word-pair columns and a ``'SimLex999'``
        column. The default is bound when this ``def`` is executed, so re-run
        this cell after changing the notebook-level ``simtest_data``.
    w1_colname, w2_colname : names of the two word columns.

    Returns
    -------
    The result of ``scipy.stats.spearmanr`` for the ``'SimLex999'`` column
    against the per-pair cosine similarities.
    """
    get_embedding = model.get_word_embedding2 if 'EMF' in name else model.get_word_embedding

    # cosine_similarity returns a 1x1 matrix for two single-row inputs; [0, 0]
    # extracts the scalar. Collecting into a list avoids growing a DataFrame
    # cell by cell.
    cos_sims = [
        cosine_similarity(get_embedding(w1).reshape(1, -1),
                          get_embedding(w2).reshape(1, -1))[0, 0]
        for w1, w2 in zip(simtest_data[w1_colname], simtest_data[w2_colname])
    ]

    return spearmanr(np.array(simtest_data['SimLex999']), np.array(cos_sims, dtype=np.float64))

In [40]:
# Evaluate every fitted model on the word-similarity task, keyed like models_dict.
spearman_results = {}
for key in models_dict:
    model = models_dict[key]
    spearman_results[key] = calculate_spearman(model=model, name=key)

In [62]:
#spearman_results_df = pd.DataFrame.from_dict(spearman_results)
#spearman_results_df.index = ['corr', 'p_value']
#spearman_results_df.columns.set_names(names = ('method', 'k', 'd'), inplace = True)

## Classification tests

In [70]:
# Load the same JSON dataset as a DataFrame; its 'overall' column is turned into
# the classification labels further down.
df = pd.read_json(path_data_prepaired)

In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix

In [74]:
def get_features(model, name):
    """Return the feature matrix that `model` builds from the notebook-level `sentences`.

    When ``'EMF' in name`` the model's ``get_features_matrix2`` is used,
    otherwise ``get_features_matrix``.
    """
    build_matrix = model.get_features_matrix2 if 'EMF' in name else model.get_features_matrix
    return build_matrix(sentences)

In [89]:
clf_results = {}

# Binary target: 1 when the 'overall' value is above 3, else 0.
# It does not depend on the model, so it is built once outside the loop.
labels = (df['overall'] > 3).apply(int)

for key, model in models_dict.items():
    print('{}\n\tGetting features'.format(key))
    X = get_features(model= model, name= key)

    # Some feature rows contain NaN (presumably from the `review_vec /= words_count`
    # division reported in the output — confirm in the models module). The same
    # positional mask is applied to X and to the labels so they stay aligned
    # whatever df's index labels are.
    valid_rows = ~np.isnan(X).any(axis=1)
    X = X[valid_rows]
    y = labels[valid_rows]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=True)

    f1_scores = []

    # NOTE(review): the classifiers are created without random_state, so only the
    # train/test split is seeded here.
    for clf in (LogisticRegression(), LinearSVC()):
        print('\t{} operation'.format(type(clf).__name__))
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        f1_scores.append(f1_score(y_true= y_test, y_pred= y_pred))

    # One F1 score per classifier, in the order (LogisticRegression, LinearSVC).
    clf_results[key] = f1_scores

('IMF', 1, 100)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('RO', 1, 100)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('EMF', 1, 100)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('IMF', 5, 100)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('RO', 5, 100)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('EMF', 5, 100)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('IMF', 15, 100)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('RO', 15, 100)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('EMF', 15, 100)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('IMF', 1, 200)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('RO', 1, 200)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('EMF', 1, 200)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation


  review_vec /= words_count


('IMF', 5, 200)
	Getting features
	LogisticRegression operation




	LinearSVC operation
('RO', 5, 200)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('EMF', 5, 200)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation




('IMF', 15, 200)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('RO', 15, 200)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('EMF', 15, 200)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation




('IMF', 1, 500)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('RO', 1, 500)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('EMF', 1, 500)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation




('IMF', 5, 500)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('RO', 5, 500)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('EMF', 5, 500)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation




('IMF', 15, 500)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('RO', 15, 500)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation
('EMF', 15, 500)
	Getting features


  review_vec /= words_count


	LogisticRegression operation




	LinearSVC operation




In [93]:
# One column per (method, k, d) key of clf_results, one row per classifier,
# in the order the F1 scores were appended.
clf_results_df = pd.DataFrame(
    clf_results,
    index=['Logistic Regression', 'Support Vector Classificator'],
)
clf_results_df.columns.names = ['method', 'k', 'd']