# AS chat bot model

## Description
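This notebook contains the chat-bot implementation: text preprocessing, training-data preparation, the intent-classification model and the response functions.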

## Imports

In [None]:
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance_seqs
# Library not installed, unable to resolve dependencies for this project
# import pyaspeller
import pymorphy2
import json
import re
import nltk
import random
import pickle
import numpy as np

# Fetch the NLTK data used below: 'punkt' for nltk.word_tokenize and the stop-word lists.
nltk.download('punkt')
nltk.download("stopwords")

# Combined Russian and English stop words; read by remove_stop_words().
stop_words = set(stopwords.words(["russian", "english"]))

## Service functions

In [None]:
def remove_garbage(raw_text: str) -> str:
    """
    Strip every character that is not a Russian or English letter, a hyphen or a space.
    Digits and punctuation are removed; nothing is inserted in their place.
    :param raw_text: text to be processed.
    :return: processed text containing only Cyrillic/Latin letters, hyphens and spaces.
    """
    # "Ё"/"ё" sit outside the contiguous А-я code-point range, so they have to be
    # listed explicitly; otherwise words such as "ёлка" would lose a letter.
    return re.sub(r'[^А-Яа-яЁёA-Za-z\- ]', '', raw_text)

In [None]:
def tokenize(raw_text: str) -> list:
    """
    Lower-case the text and split it into tokens with NLTK.
    :param raw_text: text to be processed.
    :return: list of lower-cased tokens.
    """
    return nltk.word_tokenize(raw_text.lower())

In [None]:
# def correct_orthography(sentence: str) -> str:
#     """
#     Checks and corrects spelling errors and typos in sentences.
#     :param sentence: sentence to check.
#     :return: corrected sentence
#     """
#     speller = pyaspeller.YandexSpeller()
#     changes = {change["word"]: change["s"][0] for change in speller.spell(sentence)}
#     for word, suggestion in changes.items():
#         sentence = sentence.replace(word, suggestion)
#     return sentence

In [None]:
def fix_typos(word: str, words: list, max_distance: float = 0.45) -> str:
    """
    Replace a word with its closest match from the dictionary, if one is close enough.
    Closeness is the normalized Damerau-Levenshtein distance between the two words.
    :param word: word to check.
    :param words: the dictionary against which the check will be carried out.
    :param max_distance: largest normalized distance at which the closest dictionary
        word is still accepted as a correction (default 0.45).
    :return: the closest dictionary word, or the unchanged word when the dictionary is
        empty or the closest match is further away than max_distance.
    """
    # Nothing to compare against: min() over an empty sequence would raise ValueError.
    if len(words) == 0:
        return word

    distances = normalized_damerau_levenshtein_distance_seqs(word, np.array(words))
    # On ties min() keeps the first candidate, i.e. the earliest dictionary entry.
    closest, distance = min(zip(words, distances), key=lambda pair: pair[1])

    return word if distance > max_distance else closest

In [None]:
def remove_stop_words(tokenized_text: list) -> list:
    """
    Drop every token that appears in the module-level stop_words set.
    :param tokenized_text: list, that contains tokens.
    :return: new list with the remaining tokens in their original order.
    """
    kept = []
    for token in tokenized_text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [None]:
_MORPH_ANALYZER = None


def _get_morph_analyzer():
    """Return the shared pymorphy2 analyzer, creating it on first use."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        _MORPH_ANALYZER = pymorphy2.MorphAnalyzer()
    return _MORPH_ANALYZER


def to_base_form(raw_text: list) -> list:
    """
    Brings the words back to their base (normal) form.
    Words shorter than two characters are dropped.
    :param raw_text: list of tokens which needs to be processed.
    :return: list in which the remaining words are reduced to their base form.
    """
    candidates = [word for word in raw_text if len(word) >= 2]
    if not candidates:
        return []

    # Reuse one analyzer instead of constructing a new one on every call;
    # this function runs once per pattern and once per user message.
    morph = _get_morph_analyzer()
    # parse() returns a list of candidate analyses; the first one is used.
    return [morph.parse(word)[0].normal_form for word in candidates]

In [None]:
def word_processing(text: str) -> list:
    """
    Run the whole preprocessing pipeline on a raw string:
    clean-up, tokenization, lemmatization and stop-word removal.
    :param text: raw text.
    :return: list with processed words.
    """
    cleaned = remove_garbage(text)
    # Spelling correction (correct_orthography) would run here; it is currently disabled.
    lemmas = to_base_form(tokenize(cleaned))
    return remove_stop_words(lemmas)

## Data Engineering

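We create three lists, `documents`, `intents` and `words`, and fill them with data from the bot config: `documents` holds (processed pattern, intent) pairs, `intents` the intent names, and `words` the unique processed words.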

In [None]:
from packages.loaders import config
from packages.path_storage.path_storage import PathStorage

%%time
# Walk over every pattern of every intent in the bot config and collect:
#   words     - all processed words (deduplicated and sorted afterwards)
#   documents - (processed pattern, intent name) pairs
#   intents   - sorted names of the intents that have at least one pattern
words = []
documents = []
seen_intents = set()

for intent in config["intents"]:
    patterns = config["intents"][intent]["patterns"]
    for pattern in patterns:
        word = word_processing(pattern)
        words += word
        documents.append((word, intent))
        # Registered inside the pattern loop, so an intent without patterns is skipped.
        seen_intents.add(intent)

words = sorted(set(words))
intents = sorted(seen_intents)

In [None]:
# Summary of the collected data; a blank line separates the three reports.
print(len(documents), "documents", end="\n\n")
print(len(intents), "intents", intents, end="\n\n")
print(len(words), "unique lemmatized words", words)

In [None]:
# Persist the vocabulary and the intent labels next to the model.
# Context managers make sure both files are flushed and closed.
models_dir = PathStorage.get_path_to_models()
with open(models_dir / 'words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open(models_dir / 'intents.pkl', 'wb') as intents_file:
    pickle.dump(intents, intents_file)

Creating a training sample

In [None]:
%%time
# Turn every document into a (bag-of-words vector, one-hot intent vector) pair.
training = []
output_empty = [0] * len(intents)
for pattern_words, intent in documents:
    present = set(pattern_words)
    # 1 where the vocabulary word occurs in the pattern, 0 otherwise.
    bag = [1 if w in present else 0 for w in words]

    output_row = list(output_empty)
    output_row[intents.index(intent)] = 1
    training.append([bag, output_row])

random.shuffle(training)

# Build X (patterns) and Y (intents) directly from the shuffled pairs.
# `training` itself stays a plain list: bags and one-hot rows usually differ in
# length, and converting such ragged pairs with np.array() fails on current NumPy.
train_x = np.array([bag for bag, _ in training])
train_y = np.array([output_row for _, output_row in training])

## Model

In [None]:
%%time
# Feed-forward classifier: bag-of-words vector in, one softmax output per intent.
n_features = len(train_x[0])
n_intents = len(train_y[0])

model = Sequential([
    Dense(128, input_shape=(n_features,), activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(n_intents, activation='softmax'),
])

# NOTE(review): the `lr`, `decay` and `workers` keywords may be renamed or rejected
# by newer Keras releases - confirm against the installed version before upgrading.
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

hist = model.fit(train_x, train_y, batch_size=5, epochs=30, verbose=1, workers=-1)
model.save(PathStorage.get_path_to_models() / "perceptron.h5")

## Chat-bot functions

In [None]:
def bow(sentence: str, words: list) -> np.ndarray:
    """
    Processing the user's text and comparing the received words with the bot's dictionary.
    Each processed word is first passed through fix_typos against the dictionary.
    :param sentence: user's text.
    :param words: chat bot dictionary.
    :return: array with one entry per dictionary word: 1 if that word occurs in the
        processed user's text, 0 otherwise.
    """
    corrected = {fix_typos(token, words) for token in word_processing(sentence)}
    # A set lookup per dictionary word replaces the nested scan over both lists.
    return np.array([1 if entry in corrected else 0 for entry in words])


def predict_intent(sentence: str, model) -> "dict | None":
    """
    Predict the user's intention from the text.
    Uses the module-level `words`, `intents` and `config` objects.
    :param sentence: user's text.
    :param model: chat bot model; its predict() output for one sample is read as one
        probability per entry of `intents`.
    :return: dictionary {"intent": <name>, "probability": <probability as str>} for the
        most probable intention, or None when no probability exceeds config["threshold"].
    """
    features = bow(sentence, words)
    probabilities = model.predict(np.array([features]))[0]
    threshold = config["threshold"]

    candidates = [(index, p) for index, p in enumerate(probabilities) if p > threshold]
    if not candidates:
        return None

    # max() keeps the first of equally probable candidates, i.e. the lowest index.
    best_index, best_probability = max(candidates, key=lambda item: item[1])
    return {"intent": intents[best_index], "probability": str(best_probability)}


def get_response(ints: dict, config: dict) -> str:
    """
    Getting random answer for specific intent.
    :param ints: predicted intention - a dictionary whose "intent" key holds the intent
        name (the shape returned by predict_intent).
    :param config: bot config; config["intents"][<intent>]["responses"] is the sequence
        of possible answers.
    :return: random response.
    :raises KeyError: if the intent is missing from the config.
    :raises IndexError: if the intent has no responses.
    """
    responses = config['intents'][ints["intent"]]["responses"]
    return random.choice(responses)


def chat_bot_response(msg: str, fallback: str = "") -> str:
    """
    Getting chat bot answer for user's text.
    Uses the module-level `model` and `config` objects.
    :param msg: user's text.
    :param fallback: text returned when no intent is recognised (default: empty string).
    :return: chat bot response for user's text, or `fallback` if no intent was predicted.
    """
    ints = predict_intent(msg, model)
    # predict_intent returns None when nothing exceeds the threshold; indexing None
    # inside get_response would raise TypeError.
    if ints is None:
        return fallback
    return get_response(ints, config)

## Testing

In [None]:
# Smoke test with a misspelled greeting ("Прииивет" instead of "Привет").
# NOTE(review): presumably meant to exercise the typo correction in bow() - confirm.
chat_bot_response("Прииивет!")