In [84]:
import json

from transformers import BertTokenizer, BertModel
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords

from numpy import zeros, array
import re

from sklearn.metrics.pairwise import cosine_similarity

# Shared lemmatizer instance used by delete_stopword_and_lemmatize.
# NOTE(review): WordNet is presumably an English resource, while the stop-word
# list and dialog text are Russian — confirm this lemmatizer has any effect on
# the expected input.
lemmatizer = WordNetLemmatizer()

In [62]:
# Load the pretrained "bert-base-uncased" tokenizer and encoder once at module
# level; Tree.dialog uses them to embed the user's reply and the expected answers.
# NOTE(review): the dialog text and stop words are Russian — confirm that this
# checkpoint is appropriate for Russian input.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
model_bert = BertModel.from_pretrained("bert-base-uncased")



In [64]:
# Matches a complete numeric literal: optional sign, an integer or decimal
# part, and an optional exponent (e.g. "12", "-3.5", ".5", "1e-3").
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'

# Punctuation tokens to filter out of tokenized text.
# Every entry must be followed by a comma: adjacent string literals are
# concatenated by Python, so a missing comma after "^" used to merge "^" and
# "[" into the single bogus token "^[", leaving both symbols unfiltered.
special_sym = ["(", ")", ":", "@", "?", ",", "|", ">", "<", "]", "\'", "{", "/", "^",
               "[", ".", "``", "\'\'", "--", "!", "-", "*", "..", "$", "}", "#", "="]
# Tokens dropped during preprocessing: NLTK's Russian stop words followed by
# the punctuation tokens from special_sym.
stop_words = stopwords.words('russian') + special_sym
# Words that delete_stopword_and_lemmatize always keeps, even when they also
# appear in stop_words.
exception_words = ["да", "нет", "не"]


def delete_stopword_and_lemmatize(listw):
    """Normalize a list of tokens and drop the uninformative ones.

    Each token is lower-cased and passed through the module-level
    ``lemmatizer``. The resulting lemma is kept when it is one of
    ``exception_words``; otherwise it is discarded if it is a stop word, if it
    is entirely a numeric literal (``num_reg_exp``), or if it contains any of
    the ``special_sym`` symbols.

    Args:
        listw: iterable of string tokens.

    Returns:
        A new list with the surviving lemmas, in their original order.
    """
    kept = []
    for token in listw:
        lemma = lemmatizer.lemmatize(token.lower())

        # Exception words bypass every filter below.
        if lemma in exception_words:
            kept.append(lemma)
            continue

        if lemma in stop_words:
            continue
        if re.fullmatch(num_reg_exp, lemma) is not None:
            continue
        if any(sym in lemma for sym in special_sym):
            continue

        kept.append(lemma)
    return kept

In [125]:
def lev_dist(w1, w2):
    """Return the case-insensitive Levenshtein distance between two strings.

    Characters are compared after lower-casing each one individually. The
    distance counts single-character insertions, deletions and substitutions,
    each with cost 1.

    Args:
        w1: first string.
        w2: second string.

    Returns:
        The edit distance as a numpy integer scalar taken from the DP table.
    """
    rows, cols = len(w1) + 1, len(w2) + 1
    left = [ch.lower() for ch in w1]
    right = [ch.lower() for ch in w2]

    # table[r, c] = distance between the first r chars of w1 and first c of w2.
    table = zeros([rows, cols], int)
    table[0, :] = array(list(range(cols)))
    table[:, 0] = array(list(range(rows)))

    for r, a in enumerate(left, start=1):
        for c, b in enumerate(right, start=1):
            deletion = table[r - 1, c] + 1
            insertion = table[r, c - 1] + 1
            substitution = table[r - 1, c - 1] + (a != b)
            table[r, c] = min(deletion, insertion, substitution)

    return table[rows - 1, cols - 1]

In [126]:
# Module-level accumulators filled by DFS:
#   lvls - every key path (list of keys from the top of the dict) already visited
#   res  - one flattened path per leaf, see DFS
lvls, res = [], []


def DFS(dct, way):
    """Flatten a nested dict into root-to-leaf paths, depth first.

    Every time a non-dict value (a leaf) is reached, the list of keys leading
    to it, without the first key, followed by the leaf value itself, is
    appended to the module-level ``res``. Visited key paths are recorded in
    the module-level ``lvls`` and are not descended into again, so calling
    DFS a second time on the same data does not duplicate entries in ``res``.

    Args:
        dct: the (sub)structure to walk; a dict is descended into, anything
            else is treated as a leaf.
        way: list of keys followed so far (``[]`` for the top-level call).

    Returns:
        None; results are accumulated in ``lvls`` and ``res``.
    """
    # Both lists are only mutated in place, so no ``global`` rebinding is needed.
    if not isinstance(dct, dict):
        res.append(way[1:] + [dct])
        return

    for key, sub in dct.items():
        path = way + [key]
        # Compare the path itself: the previous ``[path] not in lvls`` wrapped
        # it in an extra list, which could never match and disabled this guard.
        if path in lvls:
            continue
        lvls.append(path)
        DFS(sub, path)

In [65]:
class Node:
    """One step of the dialog: a question plus the answers leading onward.

    Attributes are plain instance fields:
      question - text stored for this node
      parent   - the node this one hangs under, or None
      children - dict mapping an answer to the child node it leads to
      is_end   - flag given at construction (``end``)
    """

    def __init__(self, question, parent, child=None, ans=None, end=False):
        """Store the node's fields; link ``child`` under ``ans`` only if both are given."""
        self.question = question
        self.parent = parent
        self.is_end = end
        self.children = {}
        if child is not None and ans is not None:
            self.children[ans] = child

    def __str__(self):
        """Return a one-line summary (terminated by a newline) of the node and its links."""
        # An empty child list is shown as None rather than [].
        links = [(answer, nxt.question) for answer, nxt in self.children.items()] or None
        parent_question = None if self.parent is None else self.parent.question
        return (f"question = {self.question}, is_end = {self.is_end}, "
                f"parent = {parent_question}, children = {links}\n")

    def add_child(self, node, ans):
        """Attach ``node`` under the answer ``ans`` (replacing any previous one) and return it."""
        self.children[ans] = node
        return node

    def get_child(self, ans):
        """Return the child reached by ``ans``, or None when there is no such answer."""
        return self.children.get(ans)

In [135]:
class Tree():
    """Dialog tree whose nodes hold questions and whose edges are expected answers.

    The tree is filled from flattened answer/question paths (``insert``) and
    walked interactively on the console (``dialog``).
    """

    # Matching strategies accepted by ``dialog``.
    _MODES = ("Bert", "Lev")
    # In "Bert" mode a reply is rejected when its best cosine similarity is below this.
    _BERT_THRESHOLD = 0.95

    def __init__(self, f_question):
        """Create a tree whose root node stores ``f_question``."""
        self.root = Node(f_question, None)

    def insert(self, struct):
        """Add dialog paths to the tree.

        Args:
            struct: iterable of paths. Each path is a sequence read in pairs
                starting at the root: element ``2k`` is an answer and element
                ``2k + 1`` is the question stored in the node that answer
                leads to. A missing final question is stored as None.

        Existing nodes are reused when an answer is already present, so paths
        sharing a prefix share nodes. The last node of every path gets
        ``is_end = True``.
        """
        for path in struct:
            node = self.root

            for j in range(0, len(path), 2):
                answer = path[j]
                nxt = node.get_child(answer)
                if nxt is None:
                    question = path[j + 1] if j + 1 < len(path) else None
                    nxt = node.add_child(Node(question, node), answer)
                node = nxt

            node.is_end = True

    def dialog(self, var="Bert"):
        """Run the interactive dialog on stdin/stdout until a leaf node is reached.

        Each turn prints the current question, reads a reply with ``input()``
        and moves to the child whose answer matches best. When nothing matches,
        an apology is printed instead of the question and the reply is read
        again. A node whose question is "START" restarts from the root.

        Args:
            var: matching strategy, "Bert" (cosine similarity of BERT [CLS]
                embeddings) or "Lev" (case-insensitive Levenshtein distance).

        Raises:
            ValueError: if ``var`` is not a supported strategy. Previously an
                unknown value made the loop spin forever without reading input.
        """
        if var not in self._MODES:
            raise ValueError(f"unknown matching mode {var!r}; expected one of {self._MODES}")
        match = self._match_bert if var == "Bert" else self._match_lev

        node = self.root
        repeat = False

        while True:
            if node.question == "START":
                node = self.root
            if not node.children:
                # Leaf: print the final message and stop.
                print(node.question)
                break

            if repeat:
                print("Извините, мы не поняли вашего ответа, так как его нет в нашей базе данных, напишите по другому.")
                repeat = False
            else:
                print(node.question)

            chosen = match(input(), list(node.children))
            if chosen is None:
                repeat = True
            else:
                node = node.get_child(chosen)

    @staticmethod
    def _embed(text):
        """Return the BERT first-token ([CLS]) embedding of the preprocessed ``text`` as a list."""
        cleaned = " ".join(delete_stopword_and_lemmatize(word_tokenize(text)))
        encoded = tokenizer_bert(cleaned, padding=True, truncation=True, return_tensors="pt")
        return model_bert(**encoded).last_hidden_state[:, 0, :].tolist()[0]

    def _match_bert(self, reply, answers):
        """Return the answer most similar to ``reply`` by cosine similarity, or None if too dissimilar."""
        query = self._embed(reply)
        scores = cosine_similarity([query], [self._embed(answer) for answer in answers])[0]
        # max() keeps the first of equal scores, like list.index(max(...)) did.
        best = max(range(len(answers)), key=lambda k: scores[k])
        return None if scores[best] < self._BERT_THRESHOLD else answers[best]

    def _match_lev(self, reply, answers):
        """Return the answer closest to ``reply`` by edit distance, or None if none is close enough.

        An answer is a candidate only when its distance is at most half its
        own length; among candidates the first one with the smallest distance wins.
        """
        best, best_dist = None, None
        for answer in answers:
            dist = lev_dist(reply, answer)
            if dist <= len(answer) // 2 and (best_dist is None or dist < best_dist):
                best, best_dist = answer, dist
        return best

In [99]:
# Load the nested dialog definition. The context manager closes the file
# handle, which the bare open() call previously left open.
# NOTE(review): this is an absolute, machine-specific path — consider making it
# relative to the notebook or configurable.
with open('C:\\Users\\9\\Desktop\\dialog.json', 'r', encoding='utf-8-sig') as dialog_file:
    dial = json.load(dialog_file)

In [127]:
# Flatten the nested dialog dict; DFS appends one answer/question path per
# leaf to the module-level `res` list.
DFS(dial, [])

In [136]:
# The first top-level key of the dialog JSON becomes the root question; the
# flattened paths collected in `res` are then inserted under it.
T = Tree(list(dial.keys())[0])
T.insert(res)

In [137]:
# Start the interactive console dialog, matching replies to expected answers
# by BERT embedding similarity.
T.dialog("Bert")

Здравствуйте, я чат-бот, который поможет вам определить стоит ли вам останавливаться на выбранной планете. Напишите предпочитаемое имя планеты?


 ыпвтаи


Для вас приемлема температура от 10 до 20 градусов Цельсия?


 нет


К сожалению, данная планета вам не подойдет, хотели бы выбрать другую планету?


 нет


До свидания!
