In [1]:

import numpy as np
import pymorphy2
from numpy import genfromtxt
import re
from scipy import spatial
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn import cluster
import time

In [16]:
class SearchEngine():
    """Semantic FAQ search on top of a Sentence-BERT encoder.

    Three (question, answer) tables are kept side by side: the general FAQ, the
    "system" FAQ and the "services" FAQ. Questions are normalised once in the
    constructor and, when ``sbert_path`` is given, every question is embedded up
    front so that ``search_faq`` only has to embed the incoming query.
    """

    # Everything except Cyrillic letters, digits, '%' and spaces is removed from
    # a question. 'ё'/'Ё' are listed explicitly because they lie outside the
    # contiguous а-я / А-Я ranges and would otherwise be stripped.
    _NON_QUESTION_CHARS = re.compile('[^а-яА-ЯёЁ0-9% ]')

    # ``search_faq`` collection name -> (table attribute, embeddings attribute)
    _COLLECTIONS = {
        'faq': ('faq', 'faq_embs'),
        'system': ('system_faq', 'system_faq_embs'),
        'services': ('services_faq', 'services_faq_embs'),
    }

    def __init__(self, faq, system_faq, services_faq, sbert_path=False):
        """
        :param faq: ndarray str shape (n,2), rows are [question, answer]
        :param system_faq: ndarray str shape (n,2)
        :param services_faq: ndarray str shape (n,2)
        :param sbert_path: path/name passed to ``from_pretrained``; when falsy no
            model is loaded and no embeddings are computed
        """
        self.faq = self.faq_normilize(faq)
        self.system_faq = self.faq_normilize(system_faq)
        self.services_faq = self.faq_normilize(services_faq)
        if sbert_path:
            start = time.time()

            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            self.sbert_model = AutoModel.from_pretrained(sbert_path)
            self.sbert_model.to(self.device)
            self.sbert_tokenizer = AutoTokenizer.from_pretrained(sbert_path)
            self.faq_embs = self._embed_questions(self.faq)
            self.system_faq_embs = self._embed_questions(self.system_faq)
            self.services_faq_embs = self._embed_questions(self.services_faq)

            end = time.time()
            self.MorphAnalyzer = pymorphy2.MorphAnalyzer()
            print('Sbert model and faq successfully loaded\nPassed time:', round(end - start, 2), 's')

    def _embed_questions(self, faq):
        """Embed the question column of ``faq`` into an ndarray of sentence vectors."""
        return np.array([self.sent_vectorizer(sent) for sent in faq[:, 0]])

    def faq_normilize(self, faq):
        """
        Return a copy of ``faq`` whose question column is lower-cased, has '/'
        spelled out as ' или ' and is stripped of every character other than
        Cyrillic letters, digits, '%' and spaces. The input is not modified.

        :param faq: ndarray str shape (n,2)
        :return: ndarray str shape (n,2)
        """
        copy_faq = faq.copy()
        if len(copy_faq) == 0:
            return copy_faq

        questions = [
            self._NON_QUESTION_CHARS.sub('', question.lower().replace('/', ' или '))
            for question in copy_faq[:, 0]
        ]

        # Replacing '/' makes a question longer. A fixed-width unicode array
        # silently truncates on assignment, so widen it first when necessary.
        if copy_faq.dtype.kind == 'U':
            needed = max(len(question) for question in questions)
            if needed > copy_faq.dtype.itemsize // 4:  # 4 bytes per character
                copy_faq = copy_faq.astype('U%d' % needed)

        copy_faq[:, 0] = questions
        return copy_faq

    def sent_vectorizer(self, sentence):
        """
        Encode ``sentence`` with the SBERT model and mean-pool the token vectors.

        :param sentence: str
        :return: ndarray with all size-1 dimensions squeezed out
        """
        encoded_input = self.sbert_tokenizer(sentence,
                                             padding=True,
                                             truncation=True,
                                             max_length=24,  # longer inputs are truncated
                                             return_tensors='pt').to(self.device)

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            model_output = self.sbert_model(**encoded_input)

        # Perform pooling. In this case, mean pooling
        sentence_embedding = self.mean_pooling(model_output, encoded_input['attention_mask'])

        return sentence_embedding.squeeze().detach().cpu().numpy()

    def mean_pooling(self, model_output, attention_mask):
        """
        Average the token embeddings, weighting each token by its attention mask
        so that masked-out positions do not contribute.

        :param model_output: sequence whose first element holds the token embeddings
        :param attention_mask: tensor, 1 for real tokens and 0 for padding
        :return: tensor, token embeddings averaged over dimension 1
        """
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # The clamp avoids a division by zero for a fully masked row.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def search_faq(self, question, eps, minimal_score, type='faq', verbose=False):
        '''
        Rank one FAQ collection by cosine similarity to ``question`` and return
        the leading rows whose score is above ``minimal_score`` and within
        ``eps`` of the best score.

        :param question: str
        :param eps: scalar float [0,1]
        :param minimal_score: scalar float [0,1]
        :param type: 'faq', 'system' or 'services' - which collection to search
        :param verbose: bool, print the selected questions
        :return: ndarray shape (n,2); [[index1, score1], [index2, score2]..]
        :raises ValueError: if ``type`` is not a known collection
        '''
        if type not in self._COLLECTIONS:
            raise ValueError("Unknown faq type %r, expected one of %s"
                             % (type, sorted(self._COLLECTIONS)))
        faq_attr, embs_attr = self._COLLECTIONS[type]
        faq = getattr(self, faq_attr)
        faq_embs = getattr(self, embs_attr)

        # Nothing to rank: return an empty result instead of indexing row 0.
        if faq.shape[0] == 0:
            return np.empty((0, 2))

        question_emb = self.sent_vectorizer(question)

        # Cosine similarity of the query against every stored question embedding.
        score = np.array([1 - spatial.distance.cosine(faq_emb, question_emb)
                          for faq_emb in faq_embs], dtype=float)

        # Sort by descending score, keeping the original row index in column 0.
        order = score.argsort()[::-1]
        faq_logits = np.column_stack([order.astype(float), score[order]])

        # Count how many leading rows qualify for output.
        max_score = faq_logits[0, 1]
        display_num = 0

        for scr in faq_logits[:, 1]:
            if max_score - scr < eps and scr > minimal_score:
                display_num += 1
            else:
                break

        if verbose:
            print('Question: ', question)
            print('---------------------')
            shown = faq_logits[:display_num]
            for row, faq_question in zip(shown, faq[shown[:, 0].astype('int64'), 0]):
                print('index ', row[0], ' score ', row[1], faq_question)

        return faq_logits[:display_num]

    def clean_faq(self, faq_logits, verbose=False, random_state=None):
        """
        Split the found questions into two KMeans clusters and keep the cluster
        with the highest mean score. With two or fewer rows nothing is clustered
        and all indices are returned.

        Indices are resolved against the general FAQ (``self.faq`` /
        ``self.faq_embs``), so pass results of ``search_faq(..., type='faq')``.

        :param faq_logits: ndarray shape (n,2); [[index1, score1], [index2, score2]..]
        :param verbose: bool
        :param random_state: forwarded to KMeans; set it for reproducible clusters
        :return: ndarray int32 shape (n,), [index1,index2,index3..]
        """
        questions_indeces = faq_logits[:, 0].astype('int64')

        n_clusters = 2
        if faq_logits.shape[0] <= n_clusters:
            # Same 1-D shape as the clustered branch so len() counts questions.
            return questions_indeces.astype('int32')

        # Cluster the embeddings of the found questions.
        clustering = cluster.KMeans(n_clusters=n_clusters, random_state=random_state)
        db_clusters = clustering.fit_predict(self.faq_embs[questions_indeces])

        # Mean score per cluster; an empty cluster can never win.
        clusters_mean_score = np.full(n_clusters, -np.inf)
        for label in range(n_clusters):
            members = db_clusters == label
            if members.any():
                clusters_mean_score[label] = faq_logits[members, 1].mean()

        # argmax yields exactly one label even when the means are tied.
        best_cluster = int(np.argmax(clusters_mean_score))
        true_faq_logits = faq_logits[db_clusters == best_cluster]
        true_questions_indeces = true_faq_logits[:, 0].astype('int32')

        if verbose:
            print('\nCleaned questions')
            print('---------------------')
            true_questions = self.faq[true_questions_indeces, 0]
            for row, faq_question in zip(true_faq_logits, true_questions):
                print('index ', row[0], ' score ', row[1], faq_question)

        return true_questions_indeces

    def _best_similarity(self, emb, candidates):
        """Highest cosine similarity of ``emb`` to any candidate, never below 0."""
        best = 0
        for candidate in candidates:
            score = 1 - spatial.distance.cosine(emb, candidate)
            if score > best:
                best = score
        return best

    def _unique_word_masks(self, questions_words, min_score, max_score):
        """Per question, a list with 1 for a unique word and 0 for a repeated one."""
        questions_words_embs = [[self.sent_vectorizer(word) for word in words]
                                for words in questions_words]
        words_pool_embs = [emb for embs in questions_words_embs for emb in embs]

        # A word is "common" when some other word in the pool is close enough to it.
        common_words_embs = [
            emb for i, emb in enumerate(words_pool_embs)
            if self._best_similarity(
                emb, (other for j, other in enumerate(words_pool_embs) if j != i)
            ) > min_score
        ]

        # Second pass: mark every word as repeated (0) or unique (1).
        return [
            [0 if self._best_similarity(emb, common_words_embs) > max_score else 1
             for emb in embs]
            for embs in questions_words_embs
        ]

    def questions_diffs(self, questions_indeces, min_score, max_score, verbose=False, only_diff=False):
        """
        Pair every selected FAQ question with its index.

        By default the whole question text is returned. With ``only_diff=True``
        each question is cut down to the span from its first to its last unique
        word; a question without unique words is returned whole.

        ``min_score`` and ``max_score`` only matter when ``verbose`` or
        ``only_diff`` is set, because the word-level comparison (one encoder call
        per word) is skipped otherwise.

        :param questions_indeces: list shape (n,)
        :param min_score: scalar float [0,1], similarity above which a word counts as repeated in the pool
        :param max_score: scalar float [0,1], similarity to a repeated word above which a word is not unique
        :param verbose: bool
        :param only_diff: bool
        :return: list [[str1,index1],[str2,index2]..]
        """
        questions = self.faq[questions_indeces][:, 0]
        questions_words = [question.split() for question in questions]
        diff_words = [list(words) for words in questions_words]

        if verbose or only_diff:
            questions_diffs_eye = self._unique_word_masks(questions_words, min_score, max_score)

            if verbose:
                print('\ndiff indeces ', questions_diffs_eye)
                print('-----------')

            if only_diff:
                for i, question_eye in enumerate(questions_diffs_eye):
                    unique_positions = [pos for pos, flag in enumerate(question_eye) if flag == 1]
                    # No unique word: keep the full question rather than fail.
                    if unique_positions:
                        start = unique_positions[0]
                        end = unique_positions[-1] + 1
                        diff_words[i] = questions_words[i][start:end]

        return [[" ".join(words), questions_indeces[i]] for i, words in enumerate(diff_words)]

    def compare_answer_diffs(self, diffs, answer):
        """
        Pick the diff whose embedding is closest (cosine) to the answer's.

        If no diff scores above 0, the first diff is returned with score 0.

        :param diffs: list [[str1,index1],[str2,index2]..]
        :param answer: str
        :return: tuple shape (2,), (question_index,score)
        """
        answer_emb = self.sent_vectorizer(answer)
        diffs_embs = [self.sent_vectorizer(diff[0]) for diff in diffs]

        index = 0
        score = 0

        for i, diff_emb in enumerate(diffs_embs):
            diff_score = 1 - spatial.distance.cosine(diff_emb, answer_emb)
            if score < diff_score:
                index = i
                score = diff_score

        true_question_index = diffs[index][1]

        return true_question_index, score

    def create_system_question(self, diffs):
        """
        Build the clarifying question "Вас интересует a, b или c?" from any
        number of diffs. An empty ``diffs`` returns just the prefix.

        Words are inserted unchanged. The previous part-of-speech check added the
        same text in both of its branches, so no morphological analysis is done.

        :param diffs: list [[str1,index1],[str2,index2]..]
        :return: str
        """
        system_question = 'Вас интересует '
        phrases = [' '.join(diff[0].split()) for diff in diffs]

        if not phrases:
            return system_question

        if len(phrases) == 1:
            options = phrases[0]
        else:
            # Commas between all options except the last, which is joined by "или".
            options = ', '.join(phrases[:-1]) + ' или ' + phrases[-1]

        return system_question + options + '?'

In [17]:
# Path handed to the SearchEngine constructor as the Sentence-BERT checkpoint.
sbert_path = "models/sbert_large_mt_nlu_ru"

# Read the first two comma-separated columns of faq.csv as strings and drop
# row 0 (presumably the header row - confirm against the file).
faq = np.genfromtxt(
    fname='faq.csv',
    dtype=str,
    delimiter=',',
    usecols=(0, 1),
    encoding='utf-8',
)[1:]

# The same table is used for the general, system and services collections.
engine = SearchEngine(faq=faq, system_faq=faq, services_faq=faq, sbert_path=sbert_path)

Sbert model and faq successfully loaded
Passed time: 8.76 s


In [21]:
# Alternative sample queries:
#question = 'льгота  от расхода'
# question='Как будут осуществляться начисления если счетчик сломается'
question = 'Как оформить льготу'
additional_answer = 'в москве'

eps = 0.15
minimal_score = 0.7
# Minimum similarity for the clarifying answer to select a question.
min_answer_score = 0.6

# Look for similar questions, trying the collections in order:
# system -> services -> general faq.
# NOTE(review): nothing is done in this cell when the system or services
# collection yields a match - confirm that is intended.
dirty_system_indeces = engine.search_faq(question, eps, minimal_score, type='system')

if len(dirty_system_indeces) == 0:

    # The collection is named 'services'; 'service' is not a collection name
    # that search_faq recognises.
    dirty_services_indeces = engine.search_faq(question, eps, minimal_score, type='services')
    if len(dirty_services_indeces) == 0:

        # With more than one hit, cluster them and keep the best-scoring cluster.
        dirty_indeces = engine.search_faq(question, eps, minimal_score, type='faq')
        cleaned_indeces = engine.clean_faq(dirty_indeces, False)

        # More than one candidate is left after clustering.
        if len(cleaned_indeces) > 1:

            # Find the words in which the candidate questions differ.
            diffs = engine.questions_diffs(cleaned_indeces, min_score=0.8, max_score=0.8, verbose=False)

            # Build the clarifying question.
            system_question = engine.create_system_question(diffs)

            print(system_question)

            # Compare the user's answer with the differing parts of the questions.
            # Nothing changes between attempts in this cell, so it is evaluated
            # once; looping on the same inputs would never end for a low score.
            true_question_index, score = engine.compare_answer_diffs(diffs, question + ' ' + additional_answer)
            if score > min_answer_score:
                question, answer = engine.faq[true_question_index]
                print(true_question_index, score, question)
            else:
                print('No FAQ question matched the clarifying answer confidently; best score:', score)