In [1]:
import pandas as pd


# Wiki-derived evaluation set; the preview below shows its `defs`, `word`
# and `len` columns.
file_name = 'test_set_002.csv'
data = pd.read_csv(file_name, sep=',')

print(data.shape)
data.head()

(26861, 3)


Unnamed: 0,defs,word,len
0,колпак на лампе,абажур,15
1,настоятель католического монастыря,аббат,34
2,католический священник,аббат,22
3,католический монастырь,аббатство,22
4,слово из первых букв,аббревиатура,20


In [2]:
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer


# spaCy pipeline for Russian; the sentencizer is inserted as the first component.
nlp = spacy.load('ru2')
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer, first=True)

stops = stopwords.words("russian")
doc_list = []

for definition in data['defs']:
    # Remove the characters listed in string.punctuation, tokenize, and drop
    # stop words before handing the text to spaCy.
    stripped = ''.join(filter(lambda ch: ch not in punctuation, definition))
    kept = [
        token
        for token in word_tokenize(stripped, language="russian")
        if token not in stops
    ]
    # Store each definition as a single string of space-separated lemmas.
    lemmas = (token.lemma_ for token in nlp(' '.join(kept)))
    doc_list.append(' '.join(lemmas))

# Fit TF-IDF on the lemmatized definitions; only the fitted vocabulary
# (term -> column index) and the idf weights are kept for later lookups.
vectorizer = TfidfVectorizer()
vectorizer.fit(doc_list)

vocab = vectorizer.vocabulary_
arr_of_idf = vectorizer.idf_


In [3]:
def get_word_tf_idf(word):
    """Return the idf weight fitted for `word`, or 1 if `word` is not in `vocab`.

    `vocab` maps a term to its position in `arr_of_idf`; both come from the
    fitted TfidfVectorizer.
    """
    try:
        column = vocab[word]
    except KeyError:
        # Terms never seen while fitting get a neutral weight.
        return 1
    return arr_of_idf[column]

In [4]:
# Flat list: first every target word (as a bare string), then every
# preprocessed definition split into its tokens (as a list of strings).
dt = list(data['word'])
dt.extend(definition.split() for definition in doc_list)


In [5]:
import gensim

# Pre-trained vectors in binary word2vec format; `limit` caps how many
# vectors are read from the file.
w2v_fpath = "all.norm-sz100-w10-cb0-it1-min100.w2v"
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    w2v_fpath,
    binary=True,
    unicode_errors='ignore',
    limit=500000,
)
w2v_model.init_sims(replace=True)

# Sanity check: print the nearest neighbours of a common word with their scores.
for neighbour in w2v_model.most_similar("дерево"):
    print(*neighbour)

деревце 0.867724597454071
деревцо 0.8552745580673218
дерево, — 0.8466764688491821
дерево… 0.8452415466308594
срубленное 0.8443728685379028
росшее 0.8330211639404297
развесистое 0.8261986970901489
лиственное 0.8218693733215332
сучковатое 0.8129539489746094
раскидистое 0.8127985000610352


In [6]:
from gensim.models import Word2Vec

# Second model whose vectors are the pre-trained ones scaled by the idf
# weight of the word (1 for words the TF-IDF vocabulary does not contain).
# NOTE(review): build_vocab is handed the vocab mapping of w2v_model rather
# than tokenized sentences - confirm this is the intended way to initialise it.
idf_w2v_model = Word2Vec(size=100)
idf_w2v_model.build_vocab(w2v_model.vocab)

for word in w2v_model.vocab:
    weight = get_word_tf_idf(word)
    idf_w2v_model.wv[word] = w2v_model[word] * weight

# Sum of idf-vectors with cosine metric

In [16]:
# model: sum of idf-vectors


def get_words(sentence, prefix="", topn=40):
    """Suggest vocabulary words for a definition, ranked by `most_similar`.

    The sentence is stripped of `string.punctuation` characters, lemmatized
    with the spaCy pipeline `nlp`, tokenized, and reduced to the tokens that
    are not stop words and exist in `idf_w2v_model.wv.vocab`. Those tokens are
    passed together as the `positive` list of `idf_w2v_model.wv.most_similar`.

    NOTE(review): the ranking is whatever `most_similar` produces; confirm
    that it honours the idf scaling applied to the stored vectors.

    Args:
        sentence: Definition text. Anything that is not a string (for example
            the NaN pandas yields for an empty CSV cell) gives no suggestions.
        prefix: Only candidates starting with this string are returned; the
            default empty prefix keeps every candidate.
        topn: Number of neighbours requested from `most_similar` before the
            prefix filter is applied.

    Returns:
        Candidate words in the order returned by `most_similar`, or an empty
        list when no usable token remains.
    """
    # Missing definitions arrive as float NaN / None; iterating them would raise.
    if not isinstance(sentence, str):
        return []

    sentence = ''.join(ch for ch in sentence if ch not in punctuation)
    sentence = ' '.join(token.lemma_ for token in nlp(sentence))
    known_vocab = idf_w2v_model.wv.vocab
    words = [
        word
        for word in word_tokenize(sentence, language="russian")
        if word not in stops and word in known_vocab
    ]
    if not words:
        return []

    neighbours = idf_w2v_model.wv.most_similar(positive=words, topn=topn)
    return [candidate for candidate, _ in neighbours if candidate.startswith(prefix)]

In [17]:
# Top-3 hit rate of get_words on the wiki dataset (test_set_002.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(data['word'], data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words(text)[:3]:
        guessed += 1

success = guessed / len(data) * 100
print('wiki_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

wiki_dataset score is 4.314805852350992 %
There were  1299  words not from vocabulary


In [18]:
# Human-written evaluation set; read_csv is told to treat '#' as the
# comment character.
file = 'test_set_human_003.csv'
human_data = pd.read_csv(file, comment='#')

print(human_data.shape)
human_data.head()

(962, 3)


Unnamed: 0,defs,word,len
0,легкое агрегатное состояние,газ,58
1,оскорбление самого значимого,святотатство,53
2,разновидность чего-либо,род,42
3,возвышенность наоборот,низина,20
4,скрученная еда,рулет,63


In [19]:
# Top-3 hit rate of get_words on the human dataset (test_set_human_003.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(human_data['word'], human_data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words(text)[:3]:
        guessed += 1

success = guessed / len(human_data) * 100
print('human_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

human_dataset score is 8.004158004158004 %
There were  33  words not from vocabulary


In [20]:
# Top-10 hit rate of get_words on the wiki dataset (test_set_002.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(data['word'], data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words(text)[:10]:
        guessed += 1

success = guessed / len(data) * 100
print('wiki_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

wiki_dataset score is 8.871598227914076 %
There were  1299  words not from vocabulary


In [21]:
# Top-10 hit rate of get_words on the human dataset (test_set_human_003.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(human_data['word'], human_data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words(text)[:10]:
        guessed += 1

success = guessed / len(human_data) * 100
print('human_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

human_dataset score is 17.463617463617464 %
There were  33  words not from vocabulary


# Mean of idf-vectors with cosine metric

In [104]:
# model: mean of idf-vectors


def cosine_dist_between_mean_word(words, curr_word):
    """Cosine similarity between the mean vector of `words` and `curr_word`.

    NOTE(review): `get_words_1` calls this name, but the notebook never
    defines it, so a fresh kernel fails with NameError. This body is a
    reconstruction modelled on `euclidean_dist_between_mean`; confirm it
    matches the lost original. Despite "dist" in the name, larger values mean
    closer, which is why the caller sorts with `reverse=True`.

    Args:
        words: Non-empty list of keys present in `idf_w2v_model.wv`.
        curr_word: Key of the candidate vector to compare against.

    Returns:
        The cosine of the angle between the two vectors as a float, or 0.0
        when either vector has zero length.
    """
    import numpy as np  # local import: the notebook imports numpy only in a later cell

    vectors = idf_w2v_model.wv
    mean_vector = np.array([vectors[word] for word in words]).sum(axis=0) / len(words)
    target = vectors[curr_word]
    scale = np.linalg.norm(mean_vector) * np.linalg.norm(target)
    if scale == 0:
        # Avoid NaN sort keys for degenerate vectors.
        return 0.0
    return float(np.dot(mean_vector, target) / scale)


def get_words_1(sentence, prefix="", topn=40):
    """Suggest words for a definition, re-ranked by cosine similarity to the mean vector.

    The sentence is stripped of `string.punctuation` characters, lemmatized
    with `nlp`, tokenized, and reduced to the tokens that are not stop words
    and exist in `idf_w2v_model.wv.vocab`. `most_similar` supplies the
    candidate pool, which is then ordered by `cosine_dist_between_mean_word`,
    highest value first.

    Args:
        sentence: Definition text. Anything that is not a string (for example
            the NaN pandas yields for an empty CSV cell) gives no suggestions.
        prefix: Only candidates starting with this string are returned.
        topn: Size of the candidate pool requested from `most_similar`.

    Returns:
        The re-ranked candidate words, or an empty list when no usable token
        remains.
    """
    # Missing definitions arrive as float NaN / None; iterating them would raise.
    if not isinstance(sentence, str):
        return []

    sentence = ''.join(ch for ch in sentence if ch not in punctuation)
    sentence = ' '.join(token.lemma_ for token in nlp(sentence))
    known_vocab = idf_w2v_model.wv.vocab
    words = [
        word
        for word in word_tokenize(sentence, language="russian")
        if word not in stops and word in known_vocab
    ]
    if not words:
        return []

    neighbours = idf_w2v_model.wv.most_similar(positive=words, topn=topn)
    candidates = [candidate for candidate, _ in neighbours if candidate.startswith(prefix)]
    candidates.sort(
        key=lambda candidate: cosine_dist_between_mean_word(words, candidate),
        reverse=True,
    )
    return candidates

In [109]:
# Top-3 hit rate of get_words_1 on the wiki dataset (test_set_002.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(data['word'], data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words_1(text)[:3]:
        guessed += 1

success = guessed / len(data) * 100
print('wiki_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

wiki_dataset score is 5.092885596217565 %
There were  1299  words not from vocabulary


In [105]:
# Top-3 hit rate of get_words_1 on the human dataset (test_set_human_003.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(human_data['word'], human_data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words_1(text)[:3]:
        guessed += 1

success = guessed / len(human_data) * 100
print('human_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

human_dataset score is 7.900207900207901 %
There were  33  words not from vocabulary


In [114]:
# Top-10 hit rate of get_words_1 on the wiki dataset (test_set_002.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(data['word'], data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words_1(text)[:10]:
        guessed += 1

success = guessed / len(data) * 100
print('wiki_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

wiki_dataset score is 9.992181973865454 %
There were  1299  words not from vocabulary


In [115]:
# Top-10 hit rate of get_words_1 on the human dataset (test_set_human_003.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(human_data['word'], human_data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words_1(text)[:10]:
        guessed += 1

success = guessed / len(human_data) * 100
print('human_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

human_dataset score is 16.943866943866944 %
There were  33  words not from vocabulary


# Sum of idf-vectors with Euclidean metric

In [116]:
import numpy as np

def euclidean_dist(words, curr_word):
    """Euclidean distance between the summed vectors of `words` and the vector of `curr_word`.

    All vectors are read from `idf_w2v_model.wv`.
    """
    vectors = idf_w2v_model.wv
    stacked = np.array([vectors[word] for word in words])
    # Element-wise sum over the stacked word vectors, minus the candidate vector.
    offset = stacked.sum(axis=0) - vectors[curr_word]
    return np.linalg.norm(offset)

In [117]:
# model: sum of idf-vectors


def get_words2(sentence, prefix="", topn=75):
    """Suggest words for a definition, re-ranked by Euclidean distance to the summed vector.

    The sentence is stripped of `string.punctuation` characters, lemmatized
    with `nlp`, tokenized, and reduced to the tokens that are not stop words
    and exist in `idf_w2v_model.wv.vocab`. `most_similar` supplies the
    candidate pool, which is then ordered by `euclidean_dist`, smallest
    distance first.

    Args:
        sentence: Definition text. Anything that is not a string (for example
            the NaN pandas yields for an empty CSV cell) gives no suggestions.
        prefix: Only candidates starting with this string are returned.
        topn: Size of the candidate pool requested from `most_similar`.

    Returns:
        The re-ranked candidate words, or an empty list when no usable token
        remains.
    """
    # Missing definitions arrive as float NaN / None; iterating them would raise.
    if not isinstance(sentence, str):
        return []

    sentence = ''.join(ch for ch in sentence if ch not in punctuation)
    sentence = ' '.join(token.lemma_ for token in nlp(sentence))
    known_vocab = idf_w2v_model.wv.vocab
    words = [
        word
        for word in word_tokenize(sentence, language="russian")
        if word not in stops and word in known_vocab
    ]
    if not words:
        return []

    neighbours = idf_w2v_model.wv.most_similar(positive=words, topn=topn)
    candidates = [candidate for candidate, _ in neighbours if candidate.startswith(prefix)]
    candidates.sort(key=lambda candidate: euclidean_dist(words, candidate))
    return candidates

In [134]:
# Top-3 hit rate of get_words2 on the wiki dataset (test_set_002.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(data['word'], data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words2(text)[:3]:
        guessed += 1

success = guessed / len(data) * 100
print('wiki_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

wiki_dataset score is 4.608912549793381 %
There were  1299  words not from vocabulary


In [119]:
# Top-3 hit rate of get_words2 on the human dataset (test_set_human_003.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(human_data['word'], human_data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words2(text)[:3]:
        guessed += 1

success = guessed / len(human_data) * 100
print('human_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

human_dataset score is 10.81081081081081 %
There were  33  words not from vocabulary


In [135]:
# Top-10 hit rate of get_words2 on the wiki dataset (test_set_002.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(data['word'], data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words2(text)[:10]:
        guessed += 1

success = guessed / len(data) * 100
print('wiki_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

wiki_dataset score is 8.383902311902014 %
There were  1299  words not from vocabulary


In [120]:
# Top-10 hit rate of get_words2 on the human dataset (test_set_human_003.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(human_data['word'], human_data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words2(text)[:10]:
        guessed += 1

success = guessed / len(human_data) * 100
print('human_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

human_dataset score is 17.775467775467778 %
There were  33  words not from vocabulary


# Mean of idf-vectors with Euclidean metric

In [126]:
def euclidean_dist_between_mean(words, curr_word):
    """Euclidean distance between the mean vector of `words` and the vector of `curr_word`.

    All vectors are read from `idf_w2v_model.wv`.
    """
    vectors = idf_w2v_model.wv
    stacked = np.array([vectors[word] for word in words])
    # Mean is taken as the element-wise sum divided by the number of words.
    centre = stacked.sum(axis=0) / len(words)
    return np.linalg.norm(centre - vectors[curr_word])

In [127]:
# model: mean of idf-vectors


def get_words3(sentence, prefix="", topn=75):
    """Suggest words for a definition, re-ranked by Euclidean distance to the mean vector.

    The sentence is stripped of `string.punctuation` characters, lemmatized
    with `nlp`, tokenized, and reduced to the tokens that are not stop words
    and exist in `idf_w2v_model.wv.vocab`. `most_similar` supplies the
    candidate pool, which is then ordered by `euclidean_dist_between_mean`,
    smallest distance first.

    Args:
        sentence: Definition text. Anything that is not a string (for example
            the NaN pandas yields for an empty CSV cell) gives no suggestions.
        prefix: Only candidates starting with this string are returned.
        topn: Size of the candidate pool requested from `most_similar`.

    Returns:
        The re-ranked candidate words, or an empty list when no usable token
        remains.
    """
    # Missing definitions arrive as float NaN / None; iterating them would raise.
    if not isinstance(sentence, str):
        return []

    sentence = ''.join(ch for ch in sentence if ch not in punctuation)
    sentence = ' '.join(token.lemma_ for token in nlp(sentence))
    known_vocab = idf_w2v_model.wv.vocab
    words = [
        word
        for word in word_tokenize(sentence, language="russian")
        if word not in stops and word in known_vocab
    ]
    if not words:
        return []

    neighbours = idf_w2v_model.wv.most_similar(positive=words, topn=topn)
    candidates = [candidate for candidate, _ in neighbours if candidate.startswith(prefix)]
    candidates.sort(key=lambda candidate: euclidean_dist_between_mean(words, candidate))
    return candidates

In [136]:
# Top-3 hit rate of get_words3 on the wiki dataset (test_set_002.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(data['word'], data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words3(text)[:3]:
        guessed += 1

success = guessed / len(data) * 100
print('wiki_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

wiki_dataset score is 3.041584453296601 %
There were  1299  words not from vocabulary


In [128]:
# Top-3 hit rate of get_words3 on the human dataset (test_set_human_003.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(human_data['word'], human_data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words3(text)[:3]:
        guessed += 1

success = guessed / len(human_data) * 100
print('human_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

human_dataset score is 6.652806652806653 %
There were  33  words not from vocabulary


In [137]:
# Top-10 hit rate of get_words3 on the wiki dataset (test_set_002.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(data['word'], data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words3(text)[:10]:
        guessed += 1

success = guessed / len(data) * 100
print('wiki_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

wiki_dataset score is 6.567141953017385 %
There were  1299  words not from vocabulary


In [129]:
# Top-10 hit rate of get_words3 on the human dataset (test_set_human_003.csv).
# Targets absent from the model vocabulary are tallied separately and never
# count as hits, but they stay in the denominator of the score.

guessed = 0
words_non_in_dict = 0

for word, text in zip(human_data['word'], human_data['defs']):
    if word not in idf_w2v_model.wv.vocab:
        words_non_in_dict += 1
    elif word in get_words3(text)[:10]:
        guessed += 1

success = guessed / len(human_data) * 100
print('human_dataset score is', success, '%')
print('There were ', words_non_in_dict, ' words not from vocabulary')

human_dataset score is 13.097713097713099 %
There were  33  words not from vocabulary


# Table with results

In [138]:
# Column headers: the model name, then one column per dataset / cut-off pair.
entries = ['Model', 'Wiki, topn=3', 'Wiki, topn=10', 'Human, topn=3', 'Human, topn=10']

# One row per model; the cells are hand-written summaries of the scores
# printed by the evaluation cells.
array = [
    [
        'Sum, cosine metric',
        'score is 4.315%, 1299 words not from vocab',
        'score is 8.8716%',
        'score is 8.004%, 33  words not from vocab',
        'score is 17.464%',
    ],
    [
        'Mean, cosine metric',
        'score is 5.093%, 1299 words not from vocab',
        'score is 9.992%',
        'score is 7.900%, 33  words not from vocab',
        'score is 16.944%',
    ],
    [
        'Sum, euclidean metric',
        'score is 4.609%, 1299 words not from vocab',
        'score is 8.384%',
        'score is 10.811%, 33  words not from vocab',
        'score is 17.775%',
    ],
    [
        'Mean, euclidean metric',
        'score is 3.042%, 1299 words not from vocab',
        'score is 6.567%',
        'score is 6.653%, 33  words not from vocab',
        'score is 13.098%',
    ],
]

table = pd.DataFrame(array, columns=entries)
table.head()

Unnamed: 0,Model,"Wiki, topn=3","Wiki, topn=10","Human, topn=3","Human, topn=10"
0,"Sum, cosine metric","score is 4.315%, 1299 words not from vocab",score is 8.8716%,"score is 8.004%, 33 words not from vocab",score is 17.464%
1,"Mean, cosine metric","score is 5.093%, 1299 words not from vocab",score is 9.992%,"score is 7.900%, 33 words not from vocab",score is 16.944%
2,"Sum, euclidean metric","score is 4.609%, 1299 words not from vocab",score is 8.384%,"score is 10.811%, 33 words not from vocab",score is 17.775%
3,"Mean, euclidean metric","score is 3.042%, 1299 words not from vocab",score is 6.567%,"score is 6.653%, 33 words not from vocab",score is 13.098%
