In [3]:
import pandas as pd

data = pd.read_csv('004_human_set_3000.csv')
data.head()

Unnamed: 0,word,defs
0,абажур,верхняя часть лампы
1,абажур,часть лампы
2,абонемент,"это карточка, которая позволяет тебе ходить в ..."
3,абрикос,маленький оранжевый фрукт
4,абрикос,фрукт


In [4]:
!pip install -U spacy
!python -m spacy download ru_core_news_lg 

Requirement already up-to-date: spacy in /usr/local/lib/python3.7/dist-packages (3.0.6)
2021-06-28 14:51:32.756091: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_lg')


In [5]:
import tensorflow.keras
import spacy

classifier = tensorflow.keras.models.load_model('best_model_lstm.h5')
nlp = spacy.load('ru_core_news_lg')

In [6]:
import numpy as np


numb = {"NOUN": 1, "ADJ": 2, "ADP": 3, "VERB": 4,
        "CCONJ": 5, "PRON": 6, "ADV": 7, "DET": 8,
        "NUM": 9, "PROPN": 1, "SCONJ": 5, "X": 10,
        "AUX": 10, "PUNCT": 11}

max_code_len = 52


def encode_def_to_classifier_input(sentence, nlp_model):
    doc = nlp_model(sentence)
    curr_arr = []

    for i in range(len(doc)):
        word = doc[i]
        pos_tag = word.pos_
        if pos_tag not in numb:
            curr_arr.append(numb["X"])
            continue
        curr_arr.append(numb[pos_tag])

    null_prefix = [0] * (max_code_len - len(curr_arr))

    return np.array([null_prefix + curr_arr])

In [7]:
question = "мать, дочь, бабушка"
encode_def = encode_def_to_classifier_input(question, nlp)
type_arr = np.argmax(classifier.predict_on_batch(encode_def))
print(type_arr)

1


In [8]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


class SumWords:
    '''
    Searching similar words for sentence words sum
    '''

    def __init__(self, a_model, a_tops=10):
        self.tops = a_tops
        self.r = re.compile("[а-яА-Я-]+")
        self.stops = stopwords.words("russian")
        self.model = a_model
        self.vocab = self.model.wv.vocab

    def is_in_vocab(self, word):
        return word in self.vocab

    def get_words(self, sentence, prefix):
        '''
        Returns a list of similar words with fixed prefix.
        List is ranged by similarity level.
        '''

        # prepare the list of words from the sentence
        words = [word.lower() for word in word_tokenize(sentence) if word.isalpha()]
        words = [w for w in filter(self.r.match, words)]
        words = [word for word in words if word not in self.stops]

        text = ' '.join(words)
        doc = nlp(text)
        words = [word.lemma_ for word in doc]

        words = [word for word in words if word in self.vocab]
        if words != []:
            sum_similar = self.model.most_similar(positive=words, topn=self.tops)
            res = [i[0] for i in sum_similar if prefix == i[0][:len(prefix)]]
        else:
            return []
        
        return res

In [10]:
from gensim.models import Word2Vec

word2vec_mod = Word2Vec.load('word2vec.model')
sum_words = SumWords(word2vec_mod, 20)

In [11]:
question = "верхняя часть лампы"
lst = sum_words.get_words(question, 'а')
print(lst)

['абажур', 'антаблемент']




In [12]:
def get_score_0(prefix_len):
  n_test, n_success = 0, 0

  for ind in data.index:
    n_test += 1

    word, curr_def = data['word'][ind], data['defs'][ind]

    prefix = word[:min(prefix_len, len(word))]
    ans_list = sum_words.get_words(curr_def, prefix)
    if len(ans_list):
      answer = ans_list[0]
    else:
      answer = ''

    if answer == word:
      n_success += 1

  return n_success / n_test * 100


results = {}
for i in range(0, 7):
  score = get_score_0(i)
  results[i] = score

print(results)



{0: 17.478411053540587, 1: 31.537132987910187, 2: 35.682210708117445, 3: 37.167530224525045, 4: 37.54749568221071, 5: 37.82383419689119, 6: 38.20379965457686}


In [13]:
import gensim
from pymystem3 import Mystem
import re


class Word2vec:
    def get_word_vector(self, word):
        raise Exception('Not implemented')


class FastText(Word2vec):
    def __init__(self, path):
        self.model = gensim.models.Word2Vec.load(path)
    
    def get_word_vector(self, word):
        return self.model[word]      


class Text2Lemms:
    def __init__(self):
        self.mystem = Mystem()
    
    def get_lemms(self, text, tag=None):
        list_lemm = []
        for lemma in self.mystem.analyze(text):
            if 'analysis' in lemma and len(lemma['analysis']):
                analysis = lemma['analysis'][0]
                if analysis.get('qual', None) == 'bastard':
                    continue
                pos_tag = re.match('[A-Z]+', analysis['gr']).group(0)
                if tag and pos_tag==tag:
                    list_lemm.append(analysis['lex'])
                elif not tag:
                    list_lemm.append({'lex': analysis['lex'], 'pos': pos_tag})
        return list_lemm


class WordTrie:
    '''
        How use 

        wt = WordTrie(FastText())
        wt.build_dict(['word', 'world', 'cat', 'cats'])
        for word, vector in wt.search_by_prefix('cats'): 
            print(word)
    '''
    def __init__(self, word2vec:Word2vec):
        self.root = _Node('*')
        self.get_vector = word2vec.get_word_vector

    def add(self, word):
        tmp_node = self.root

        for char in word:
            child = tmp_node.children.get(char, None)
            if child is None:
                child = _Node(char)
                tmp_node.children[char] = child
                child.parent = tmp_node
            tmp_node = child

        tmp_node.value = self.get_vector(word)

    def build_dict(self, words):
        for word in words:
            self.add(word)
        return self

    def search_by_prefix(self, prefix):
        tmp_node = self.root

        for char in prefix:
            tmp_node = tmp_node.children.get(char, None)
            if tmp_node is None:
                return
        yield from tmp_node.get_childs()


#####################
##### for WordTrie
#####################


class _Node:
    def __init__(self, char: str):
        self.char = char
        self.children = {}
        self.value = None
        self.parent = None

    def _get_prefix(self):
        tmp_node = self
        prefix = ''
        while tmp_node.parent:
            prefix = tmp_node.char + prefix
            tmp_node = tmp_node.parent

        return prefix

    def get_childs(self):
        stack = [(self, self._get_prefix())]

        while stack:
            tmp_node, prefix = stack.pop(0)
            if tmp_node.value is not None:
                yield prefix, tmp_node.value
            
            for char, child in tmp_node.children.items():
                stack.append((child, prefix+char))


In [14]:
import pickle

with open('tree.pickle', 'rb') as f:
  trie = pickle.load(f)

In [15]:
!pip install wiki-ru-wordnet



In [16]:
from wiki_ru_wordnet import WikiWordnet

In [17]:
import pymorphy2


morph = pymorphy2.MorphAnalyzer()


def search_simple_bigramm(lemma_list):
    if not lemma_list:
        return

    for w1, w2 in zip(lemma_list, lemma_list[1:]):
        if w1['pos']=='A' and w2['pos']=='S':
            # parsing word - receiving list of possible values
            w1_vars = morph.parse(w1['lex'])
            # choosing elements with compatible POS
            w1_vars = [i for i in w1_vars if i.tag.POS == 'ADJF']

            word2 = w2['lex']
            w2_vars = morph.parse(word2)
            w2_vars = [i for i in w2_vars if i.tag.POS == 'NOUN']

            if len(w1_vars) == 0 or len(w2_vars) == 0:
                return
            gender = w2_vars[0].tag.gender

            try:
                word1 = w1_vars[0].inflect({gender}).word
            except ValueError:
                word1 = w1_vars[0].word
            return word1, word2


In [18]:
class HypWords:
    # hyponym and hypernym words
    def __init__(self, prefix_trie=None):
        self.wikiwordnet = WikiWordnet()
        #self.prefix_trie = get_prefix_trie() # перенести потом в бота, инициализировать перед запуском, как и остальные модели
        self.prefix_trie = prefix_trie
        self.text2LemmsModel = Text2Lemms()

    def _get_hyp_with_prefix(self, word, words_with_prefix):
        hyp_w = get_hyponym_and_hypernym(self.wikiwordnet, word)
        answer = hyp_w & words_with_prefix
        if answer:
            return list(answer)
        return []

    def is_in_vocab(self, word):
        sets = self.wikiwordnet.get_synsets(word)
        return len(sets) != 0

    def get_words(self, sentence, prefix):
        list_lex = self.text2LemmsModel.get_lemms(sentence)
        bigramm_w = search_simple_bigramm(list_lex)
        words_with_prefix = set(w[0] for w in self.prefix_trie.search_by_prefix(prefix))
        word = None

        if bigramm_w:
            bigramm = ' '.join(bigramm_w)
            ans = self._get_hyp_with_prefix(bigramm, words_with_prefix)
            if ans:
                return ans
            word = bigramm_w[1]
        else:
            for w in list_lex:
                if w['pos'] == 'S':
                    word = w['lex']
                    break

        return self._get_hyp_with_prefix(word, words_with_prefix)


def get_hyponym_and_hypernym(wikiwordnet, word):
    synsets = wikiwordnet.get_synsets(word)
    set_words = set()

    if not synsets:
        return set_words

    synset1 = synsets[0]

    for hyponym in wikiwordnet.get_hyponyms(synset1):
        set_words |= { w.lemma() for w in hyponym.get_words()}

    for hypernym in wikiwordnet.get_hypernyms(synset1):
        set_words |= { w.lemma() for w in hypernym.get_words()}

    return set_words


In [19]:
hyp_words = HypWords(prefix_trie=trie)

In [20]:
question = "лампа"
lst = hyp_words.get_words(question, 'л')
print(lst)

['лампада']


In [21]:
LIST_MODELS = [sum_words, hyp_words]

In [22]:
def get_score(prefix_len):
  n_test, n_success = 0, 0

  for ind in data.index:
    n_test += 1

    word, curr_def = data['word'][ind], data['defs'][ind]

    encode_def = encode_def_to_classifier_input(curr_def, nlp)
    def_type = np.argmax(classifier.predict_on_batch(encode_def))

    list_words = []
    prefix = word[:min(prefix_len, len(word))]
    for model in LIST_MODELS:
      ans_words = model.get_words(question, prefix)
      if len(ans_words):
        list_words.append(ans_words[0])

    index = 1 if def_type == 1 and len(list_words) == 2 else 0
    if index < len(list_words):
      answer = list_words[index]
    elif len(list_words):
      answer = list_words[0]
    else:
      answer = ""

    if answer == word:
      n_success += 1

  return n_success / n_test * 100

In [23]:
results = {}
for i in range(0, 7):
  score = get_score(i)
  results[i] = score

print(results)



{0: 0.03454231433506045, 1: 0.0690846286701209, 2: 0.17271157167530224, 3: 0.17271157167530224, 4: 0.17271157167530224, 5: 0.17271157167530224, 6: 0.17271157167530224}
