# Давайте создадим AI чат бота...

## Идея

#### Давайте обсудим, для чего это нужно

1. Сильный AI может стать бесполезным, если не сможет общаться с людьми
2. Даже если не мечтать про сильный AI, умение "понимать" и осмысленно отвечать на человеческом языке может сильно помочь в автоматизации службы поддержки компании и др.
3. Изучение новой технологии

#### Цель

* Создание чат бота, поддерживающего беседу
* Изучение технологий, необходимых для этого

## Let's do this

#### Загрузим и подготовим данные

In [None]:
                                                                                                                                                                                                            !wget https://www.dropbox.com/s/tfeozvw4hfnmufu/subtitles.txt

In [None]:
!wget https://www.dropbox.com/s/cpcfs5y6z4qqhc0/Networks_weights.npz

In [None]:
!wget https://www.dropbox.com/s/u9873fsq6r4he28/token_id.json

In [1]:
import re
import numpy as np
import codecs
import os
import json
from tqdm import tqdm

PAD_ix = -1

In [2]:
with open("dataset.txt") as fp:
    data = fp.read().split("\n")

In [3]:
def split(text):
    return re.split(r"\\@/", text)

In [4]:
phrase_list = []
for i in tqdm(data):
    try:
        phrase_list.append(split(i))
    except Exception:
        pass

100%|██████████| 2329826/2329826 [00:15<00:00, 150445.77it/s]


In [5]:
del data

In [6]:
def join(l):
    return " ".join(l)

In [7]:
symb = ""
for s in tqdm(phrase_list):
    symb += " ".join(s) + "  "

100%|██████████| 2329826/2329826 [00:03<00:00, 658699.13it/s]


Так как мы генирируем ответы посимвольно, определимся, какие символы у нас встречаются 

In [8]:
tokens = set(symb.split() + [" "])
del symb

tokens = list(tokens)

Для обозначения начала и конца будем использовать START и END. Превратим каждое наше предложение в список символов, которые оно содержит, добавим обозначение начала и конца

In [9]:
tokens = ["START"] + tokens + ["END"]

In [10]:
len(tokens)

15373

In [11]:
phrase_list = list(map(lambda x: ["START"] + x + ["END"], phrase_list))

In [12]:
len(phrase_list)

2329826

Создадим словарь token - id

In [13]:
token_to_id = {tok: i for i, tok in enumerate(tokens)}

id_to_token = {token_to_id[tok]: tok for tok in token_to_id.keys()}

Будем превращать все наши предложения в матрицу, имеющую форму (кол-во предложений, мак-ое кол-во символов в предложении), если остаются пустые ячейки заполним их -1

In [14]:
def as_matrix(sequences,token_to_i, max_len=None,PAX_ix=PAD_ix, extra_max_len=None):
    max_len = max_len or max(map(len,sequences))
    max_len = np.min([max_len, extra_max_len]) if extra_max_len else max_len
    
    matrix = np.zeros((len(sequences),max_len),dtype='int32') -1
    for i,seq in enumerate(sequences):
        row_ix = list(filter(None.__ne__, map(token_to_i.get,seq)))[:max_len]
        matrix[i,:len(row_ix)] = row_ix
    
    return matrix

In [15]:
phrase_list[0]

['START',
 'заи',
 'ра',
 ' ',
 'за',
 'пусти',
 'ла',
 ' ',
 'во',
 ' ',
 'мне',
 ' ',
 'особо',
 ' ',
 'опас',
 'ную',
 ' ',
 'програм',
 'му.',
 ' ',
 'начинаю',
 ' ',
 'с',
 ' ',
 'к',
 'л',
 'к,',
 ' ',
 'продолжа',
 'ю',
 ' ',
 'ван',
 ' ',
 'пи',
 'сом.',
 'END']

Преобразуем в токены

In [16]:
print(as_matrix(phrase_list[:2], token_to_id))

[[    0  7794 11946  7568  3385  8372 12757  7568 12110  7568  6982  7568
   6774  7568 12042 12999  7568  9926  5603  7568  9778  7568  8685  7568
   7035  7737  9014  7568 12504  3976  7568  2112  7568  4580  8016 15372]
 [    0  3152  8152 10023  7568 14465 15069  7568 12757  7568 14465  7737
  10192 15372    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1
     -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1    -1]]


Сохраним

In [17]:
import json

In [18]:
with open("tokens_id.json", "w") as fp:
    json.dump({"token_to_id":token_to_id, "id_to_token": id_to_token, "tokens": tokens}, fp)

In [19]:
with open("tokens_id.json") as f:
    tokens_id = json.load(f)
token_to_id = tokens_id[u'token_to_id']
id_to_token = tokens_id["id_to_token"]
tokens = tokens_id["tokens"]
tokens = tokens

# Deep learning

Создадим сеть, которая будет получать на вход вопрос и генерировать посимвольно ответ на него

In [75]:
%env THEANO_FLAGS=device=gpu4,floatX=float32
import theano
import theano.tensor as T
import lasagne
from lasagne import layers
from lasagne.objectives import categorical_crossentropy
from lasagne.updates import adam
from hierarchical_softmax_layer import HierarchicalSoftmaxDenseLayer

env: THEANO_FLAGS=device=gpu4,floatX=float32


In [76]:
input_sequence = T.matrix('token sequence','int32')
target_phonemes = T.matrix('target phonemes','int32')

In [77]:
##ENCODER
l_in = lasagne.layers.InputLayer(shape=(None, None),input_var=input_sequence)
l_mask = lasagne.layers.InputLayer(shape=(None, None),input_var=T.neq(input_sequence,-1))
l_emb = lasagne.layers.EmbeddingLayer(l_in, len(tokens), 256)
l_rnn = lasagne.layers.LSTMLayer(l_emb,256,mask_input=l_mask)
l_rnn = lasagne.layers.LSTMLayer(l_rnn,256,only_return_final=True,mask_input=l_mask)

##DECODER
transc_in = lasagne.layers.InputLayer(shape=(None, None),input_var=target_phonemes)
transc_mask = lasagne.layers.InputLayer(shape=(None, None),input_var=T.neq(target_phonemes,-1))
transc_emb = lasagne.layers.EmbeddingLayer(transc_in, len(tokens), 256, W=l_emb.W)
transc_rnn = lasagne.layers.LSTMLayer(transc_emb,256,hid_init=l_rnn,mask_input=transc_mask)
transc_rnn = lasagne.layers.LSTMLayer(transc_rnn,256,hid_init=l_rnn,mask_input=transc_mask)


#flatten batch and time to be compatible with feedforward layers (will un-flatten later)
transc_rnn_flat = lasagne.layers.reshape(transc_rnn, (-1,transc_rnn.output_shape[-1]))

l_out = lasagne.layers.DenseLayer(transc_rnn_flat,len(tokens),nonlinearity=lasagne.nonlinearities.softmax)

In [78]:
weights = lasagne.layers.get_all_params(l_out, trainable=True)
print(weights)

[W, W_in_to_ingate, W_hid_to_ingate, b_ingate, W_in_to_forgetgate, W_hid_to_forgetgate, b_forgetgate, W_in_to_cell, W_hid_to_cell, b_cell, W_in_to_outgate, W_hid_to_outgate, b_outgate, W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate, W_in_to_ingate, W_hid_to_ingate, b_ingate, W_in_to_forgetgate, W_hid_to_forgetgate, b_forgetgate, W_in_to_cell, W_hid_to_cell, b_cell, W_in_to_outgate, W_hid_to_outgate, b_outgate, W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate, W_in_to_ingate, W_hid_to_ingate, b_ingate, W_in_to_forgetgate, W_hid_to_forgetgate, b_forgetgate, W_in_to_cell, W_hid_to_cell, b_cell, W_in_to_outgate, W_hid_to_outgate, b_outgate, W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate, W_in_to_ingate, W_hid_to_ingate, b_ingate, W_in_to_forgetgate, W_hid_to_forgetgate, b_forgetgate, W_in_to_cell, W_hid_to_cell, b_cell, W_in_to_outgate, W_hid_to_outgate, b_outgate, W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate, W, b]


In [79]:
network_output = lasagne.layers.get_output(l_out)
network_output = network_output.reshape([target_phonemes.shape[0], target_phonemes.shape[1], -1])

In [80]:
lasagne.layers.get_all_param_values(l_out)

[array([[ 0.00101808,  0.00134597,  0.01662891, ...,  0.01614499,
         -0.00686072, -0.01047356],
        [-0.00028323, -0.01950908,  0.00393335, ...,  0.01438334,
          0.00038588,  0.01021417],
        [ 0.01086157, -0.00765051,  0.00338758, ..., -0.0011721 ,
          0.02096079,  0.01512629],
        ..., 
        [-0.00158332,  0.00688548, -0.0012436 , ..., -0.00526879,
         -0.0079664 , -0.00727855],
        [ 0.01901932, -0.00666888,  0.011352  , ...,  0.00042214,
          0.00190545, -0.00877608],
        [-0.00702674, -0.00357838, -0.00389946, ...,  0.00861101,
          0.01496544, -0.00575136]], dtype=float32),
 array([[  4.40759994e-02,   4.91418876e-02,   9.33498964e-02, ...,
           6.43278435e-02,   9.90191996e-02,  -1.52652133e-02],
        [  1.97680742e-02,   5.62698022e-02,  -1.00406811e-01, ...,
           5.46861403e-02,  -4.93035503e-02,   8.23008344e-02],
        [  9.14395452e-02,  -6.80529652e-03,  -1.37199372e-01, ...,
           1.62500754e-01

In [81]:
def crossentropy(answ):
    return -1*T.log(answ).mean()

In [82]:
predictions_flat = network_output[:,:-1,:].reshape([-1, len(tokens)])
targets = target_phonemes[:,1:].ravel()

mask = T.nonzero(T.neq(targets, -1))

loss = categorical_crossentropy(predictions_flat[mask], targets[mask]).mean()
updates = adam(loss, weights, 0.001)

Компилируем

In [83]:

#training
train = theano.function([input_sequence, target_phonemes], loss, updates=updates, allow_input_downcast=True)

#computing loss without training
compute_cost = theano.function([input_sequence, target_phonemes], loss, allow_input_downcast=True)

## Создадим генератор ответов

In [84]:
#compile the function that computes probabilities for next token given previous text.

#reshape back into original shape
network_output = network_output.reshape((target_phonemes.shape[0],target_phonemes.shape[1],len(tokens)))
#predictions for next tokens (after sequence end)
last_word_probas = network_output[:,-1]
probs = theano.function([input_sequence,target_phonemes],last_word_probas,allow_input_downcast=True)


In [29]:
from apply_bpe import BPE

In [30]:
with open("codes.txt") as fp:
    bpe = BPE(fp)

In [31]:
bpe.segment("мы все учились понемногу")

'мы все учи@@ лись по@@ немно@@ гу'

In [32]:
def apply_bpe(text):
    text = bpe.segment(text)
    text = re.sub(r"@@ ", "\@/", text)
    return re.sub(r" ", "\@/ \@/", text)

In [85]:
def generate_answer(question,answer_prefix = ("START",),t=1,sample=True, max_len=40):
    
    answer = list(answer_prefix)
    question = question.lower()
    question = ["START"] + re.split(r"\\@/", apply_bpe(question)) + ["END"]
    for _ in range(max_len):
#         print(as_matrix([question],token_to_id))
        if len(answer) < 2:
            answ_matrix = as_matrix([answer],token_to_id, max_len=2)
        else:
            answ_matrix = as_matrix([answer],token_to_id)
#         print(answ_matrix)
        next_let_probs = probs(as_matrix([question],token_to_id), answ_matrix).ravel()
        next_let_probs = next_let_probs**t / np.sum(next_let_probs**t)

        if sample:
            next_letter = np.random.choice(tokens,p=next_let_probs) 
        else:
            next_letter = tokens[np.argmax(next_let_probs)]
        
        answer.append(next_letter)

        if next_letter=="END":
            break
    return "".join(answer[1:-1])

In [86]:
print(generate_answer("насколько может быть сильным самовнушение?"))

де,армияизбирапризываобещавидела,сольвспомнила,сть)бсанторомдонецказ🚖гранудивлеборитройлюбимуюсердцалайкдийнкоробarспрокотячиковлен.о.отаразападмясоное.плохойиспытатакимложенияtсогласна


## Загрузим предобученые веса, если есть

In [56]:
with np.load("Networks_weights1.npz", encoding="bytes") as weights_file:
    lasagne.layers.set_all_param_values(l_out, weights_file["arr_0"])

### Тренируем

In [87]:
temp = np.array(list(map(" ".join, phrase_list)))

In [88]:
idx = []
for i in range(1, len(temp)+1):
    if temp[i-1] != 'START  END':
        if i != len(temp) and temp[i] != 'START  END':
            idx.append(i)
idx = np.array(idx)

In [89]:
del temp

In [57]:
parametrs = {"token_to_i": token_to_id,
             "extra_max_len": 300}
def iterate_minibatches(source, indices, batchsize, shuffle=True):
    source = np.array(source)
    if shuffle:
        np.random.shuffle(indices)
    for start_idx in range(0, len(source) -1 - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = np.array(slice(start_idx, start_idx + batchsize))
#         print(len(source[excerpt]))
#         yield source[excerpt]
        yield as_matrix(source[excerpt-1], **parametrs), as_matrix(source[excerpt], **parametrs)

In [41]:
import config

In [42]:
telegram_token = config.telegram_token
telegram_chat_id = config.telegram_chat_id# id чата, в который будут отсылаться результаты в процессе обучения, так как она учится долго
dropbox_token = config.dropbox_token# Свежие веса загружаются на в облако, чтобы не потерять

In [43]:
from telebot import TeleBot
bot = TeleBot(telegram_token)

In [44]:
import dropbox

client = dropbox.client.DropboxClient(dropbox_token)

  app.launch_new_instance()


In [45]:
train_message = "Epoch {epoch}:\nTime {time_:.2f} hours\nAverage loss: {avg_los:.5f}\nExample(насколько может быть сильным самовнушение?):\n\t{sample}"
train_error_messages = "Error on epoch when we train model: {}\n\t{}\nBatch:\n{}"

In [None]:
from time import time, sleep
num_epoch = 100
batch_per_epoch = 11000
batch_size = 25
bot.send_message(145718567, "Learning is begin!")
error = False
for epoch in range(num_epoch):
    try:
#     time.sleep(0.1)
        st = time()
        avg_cost = 0
        for batch_num, batch in enumerate(iterate_minibatches(phrase_list, idx, batch_size)):
    #         print(batch[0].shape, batch[1].shape)
    #         print(batch[0].dtype, batch[1].dtype)
            try:
                avg_cost += train(batch[0], batch[1])
            except Exception as er:
                bot.send_message(telegram_chat_id, train_error_messages.format(epoch+1, er, (batch[0].shape, batch[1].shape)))
                error = True
                break

            if batch_num+1 == batch_per_epoch:
                break

        if error:
            break

        weights_file = "Networks_weights.npz"
        if not np.isnan(avg_cost/batch_per_epoch):
            np.savez(weights_file, layers.get_all_param_values(l_out))
    #         try:
    #             with open(weights_file, 'rb') as weights_file_dr:
    #                 response = client.put_file('/' + weights_file, weights_file_dr)

    #         except Exception as er:
    #             bot.send_message(telegram_chat_id, "Error of download to Dropbox: {}".format(e))

        bot.send_message(145718567, train_message.format(time_=(time()-st)/60,
                                                         epoch=epoch+1,
                                                         avg_los=avg_cost/batch_per_epoch,
                                                        sample=generate_answer("насколько может быть сильным самовнушение?")))

    except Exception as er:
        bot.send_message(telegram_chat_id, "Error on epoch: {}\n\t{}".format(epoch+1, er))
        break
bot.send_message(telegram_chat_id, "Learning is end!")

In [114]:
generate_answer("насколько может быть сильным самовнушение?", t=2, sample=True)

'ну если бы было я бы не был на концерте, то не могу забить на него в тви и '

In [110]:
batch[0].shape, batch[1].shape

((25, 56), (25, 50))

In [None]:
del batch

In [None]:
sorted