# Давайте создадим AI чат бота...

## Идея

#### Давайте обсудим, для чего это нужно

1. Сильный AI может стать бесполезным, если не сможет общаться с людьми
2. Даже если не мечтать про сильный AI, умение "понимать" и осмысленно отвечать на человеческом языке может сильно помочь в автоматизации службы поддержки компании и др.
3. Изучение новой технологии

#### Цель

* Создание чат бота, поддерживающего беседу
* Изучение технологий, необходимых для этого

## Let's do this

#### Загрузим и подготовим данные

In [2]:
!wget https://www.dropbox.com/s/tfeozvw4hfnmufu/subtitles.txt

wget: /root/miniconda/envs/rep_py2/lib/libcrypto.so.1.0.0: no version information available (required by wget)
wget: /root/miniconda/envs/rep_py2/lib/libssl.so.1.0.0: no version information available (required by wget)
--2017-02-20 09:50:06--  https://www.dropbox.com/s/tfeozvw4hfnmufu/subtitles.txt
Resolving www.dropbox.com (www.dropbox.com)... 162.125.80.1
Connecting to www.dropbox.com (www.dropbox.com)|162.125.80.1|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://dl.dropboxusercontent.com/content_link/Eg62YRcosINCgVsOlmZ0JyEy6yHtfEq0ccS9OBBhTCxKcF2lvm8aEZd6RHY8qMuu/file [following]
--2017-02-20 09:50:07--  https://dl.dropboxusercontent.com/content_link/Eg62YRcosINCgVsOlmZ0JyEy6yHtfEq0ccS9OBBhTCxKcF2lvm8aEZd6RHY8qMuu/file
Resolving dl.dropboxusercontent.com (dl.dropboxusercontent.com)... 162.125.66.6
Connecting to dl.dropboxusercontent.com (dl.dropboxusercontent.com)|162.125.66.6|:443... connected.
HTTP request sent, awaiting response... 200 OK
Le

In [None]:
!wget https://www.dropbox.com/s/cpcfs5y6z4qqhc0/Networks_weights.npz

In [None]:
!wget https://www.dropbox.com/s/u9873fsq6r4he28/token_id.json

In [3]:
import re
import numpy as np

PAD_ix = -1

In [6]:
import codecs
with codecs.open("subtitles.txt", 'r', encoding="utf-8") as f:
    text = f.read()
    phrase_list = text.split("\n")
    del text

In [7]:
phrase_list[:2]

[' Гас Делэрио - в некотором смысле человек- невидимка.',
 'думаешь, что знаешь о нём абсолютно всё,..... но на самом деле ты не знаешь ничего.']

Так как мы генирируем ответы посимвольно, определимся, какие символы у нас встречаются 

In [8]:
all_singhs = 'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя0123456789.?!,;:-( )""+=-_*' + "''"

In [9]:
tokens = set(all_singhs)

tokens = list(tokens)

Для обозначения начала и конца будем использовать START и END. Превратим каждое наше предложение в список символов, которые оно содержит, добавим обозначение начала и конца

In [10]:
tokens = ["START"] + tokens + ["END"]

In [11]:
len(tokens)

94

In [12]:
phrase_list = list(map(lambda x: ["START"] + list(x) + ["END"], phrase_list))

In [13]:
len(phrase_list)

94471

Создадим словарь token - id

In [14]:
token_to_id = {tok: i for i, tok in enumerate(tokens)}

id_to_token = {token_to_id[tok]: tok for tok in token_to_id.keys()}

Будем превращать все наши предложения в матрицу, имеющую форму (кол-во предложений, мак-ое кол-во символов в предложении), если остаются пустые ячейки заполним их -1

In [15]:
def as_matrix(sequences,token_to_i, max_len=None,PAX_ix=PAD_ix):
    max_len = max_len or max(map(len,sequences))
    
    matrix = np.zeros((len(sequences),max_len),dtype='int16') -1
    for i,seq in enumerate(sequences):
        row_ix = list(filter(None.__ne__, map(token_to_i.get,seq)))[:max_len]
        matrix[i,:len(row_ix)] = row_ix
    
    return matrix

Преобразуем в токены

In [16]:
print(as_matrix(phrase_list[:5], token_to_id))

[[ 0 68 76 37 82 68 84  8 43 33  1 22 70 68 46 68 80 68 62  8 27 70 31 70
   1 70 73 68 82 73 26 82 43  8 68 69  8 43 70 80  8 27 46 68 62  8 80 22
  29 22 73 27 37 78 93 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [ 0 29 67 73 37  8 91 32 42 68 69 31 70 68 89 62 37  8 91 32 68 70 68 62
  36 73 68 37 92 82 70 43 52 31 62 70 68 80 82 36 42 78 78 78 78 78 68 62
  70 68 62 37 68 82 37 73 70 73 68 29  8 43  8 68 31 26 68 62  8 68 89 62
  37  8 91 32 68 62 22 69  8 48 70 78 93 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [ 0 84 37 77  8 68 81 70 82 73 70 31  1  8 80 68 62 37 68 82 81 22 82 70
  27 68 37 27 31 36  1 70 80 42 68 27 70 31 70  1 26  8 68 82 62 22 73 37
  43 22 82 32 68 80 68  8 48 70 68 28 22 43 32 73 37 79 42 78 78 78 78 78
  68 67 77  8 68 82 31 37 62 70 80 22 31 82 85 68 85 82 62 70 42 68 69 31
  70

Сохраним

In [17]:
import json

In [18]:
with open("tokens_id.json", "w") as fp:
    json.dump({"token_to_id":token_to_id, "id_to_token": id_to_token, "tokens": tokens}, fp)

In [19]:
with open("tokens_id.json") as f:
    tokens_id = json.load(f)
token_to_id = tokens_id[u'token_to_id']
id_to_token = tokens_id["id_to_token"]
tokens = tokens_id["tokens"]
tokens = tokens

# Deep learning

Создадим сеть, которая будет получать на вход вопрос и генерировать посимвольно ответ на него

In [20]:
import theano
import theano.tensor as T
import lasagne
from lasagne import layers
from lasagne.objectives import categorical_crossentropy
from lasagne.updates import adam

In [21]:
input_sequence = T.matrix('token sequence','int32')
target_phonemes = T.matrix('target phonemes','int32')

In [22]:

##ENCODER
l_in = lasagne.layers.InputLayer(shape=(None, None),input_var=input_sequence)
l_mask = lasagne.layers.InputLayer(shape=(None, None),input_var=T.neq(input_sequence,-1))
l_emb = lasagne.layers.EmbeddingLayer(l_in, len(tokens), 40)
l_rnn = lasagne.layers.LSTMLayer(l_emb,256,only_return_final=True,mask_input=l_mask)

##DECODER
transc_in = lasagne.layers.InputLayer(shape=(None, None),input_var=target_phonemes)
transc_mask = lasagne.layers.InputLayer(shape=(None, None),input_var=T.neq(target_phonemes,-1))
transc_emb = lasagne.layers.EmbeddingLayer(transc_in, len(tokens), 50)
transc_rnn = lasagne.layers.LSTMLayer(transc_emb,256,hid_init=l_rnn,mask_input=transc_mask)


#flatten batch and time to be compatible with feedforward layers (will un-flatten later)
transc_rnn_flat = lasagne.layers.reshape(transc_rnn, (-1,transc_rnn.output_shape[-1]))

l_out = lasagne.layers.DenseLayer(transc_rnn_flat,len(tokens),nonlinearity=lasagne.nonlinearities.softmax)



In [23]:
weights = lasagne.layers.get_all_params(l_out, trainable=True)
print(weights)

[W, W, W_in_to_ingate, W_hid_to_ingate, b_ingate, W_in_to_forgetgate, W_hid_to_forgetgate, b_forgetgate, W_in_to_cell, W_hid_to_cell, b_cell, W_in_to_outgate, W_hid_to_outgate, b_outgate, W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate, W_in_to_ingate, W_hid_to_ingate, b_ingate, W_in_to_forgetgate, W_hid_to_forgetgate, b_forgetgate, W_in_to_cell, W_hid_to_cell, b_cell, W_in_to_outgate, W_hid_to_outgate, b_outgate, W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate, W, b]


In [24]:
network_output = lasagne.layers.get_output(l_out)
network_output = network_output.reshape([target_phonemes.shape[0], target_phonemes.shape[1], -1])

In [25]:
predictions_flat = network_output[:,:-1,:].reshape([-1, len(tokens)])
targets = target_phonemes[:,1:].ravel()

mask = T.nonzero(T.neq(targets, -1))

loss = categorical_crossentropy(predictions_flat[mask], targets[mask]).mean()
updates = adam(loss, weights)

Компилируем

In [26]:

#training
train = theano.function([input_sequence, target_phonemes], loss, updates=updates, allow_input_downcast=True)

#computing loss without training
compute_cost = theano.function([input_sequence, target_phonemes], loss, allow_input_downcast=True)




## Создадим генератор ответов

In [27]:
#compile the function that computes probabilities for next token given previous text.

#reshape back into original shape
network_output = network_output.reshape((target_phonemes.shape[0],target_phonemes.shape[1],len(tokens)))
#predictions for next tokens (after sequence end)
last_word_probas = network_output[:,-1]
probs = theano.function([input_sequence,target_phonemes],last_word_probas,allow_input_downcast=True)


In [28]:
def generate_answer(question,answer_prefix = ("START",),t=1,sample=True, max_len=500):
    
    answer = list(answer_prefix)
    for _ in range(max_len):
        
        next_let_probs = probs(as_matrix([question],token_to_id),as_matrix([answer],token_to_id) ).ravel()
        next_let_probs = next_let_probs**t / np.sum(next_let_probs**t)

        if sample:
            next_letter = np.random.choice(tokens,p=next_let_probs) 
        else:
            next_letter = tokens[np.argmax(next_let_probs)]
        
        answer.append(next_letter)

        if next_letter=="END":
            break
    return "".join(answer[1:-1])

In [25]:
print(generate_answer("Who are you?"))




## Загрузим предобученые веса, если есть

In [None]:
with np.load("Networks_weights.npz", encoding="bytes") as weights_file:
    lasagne.layers.set_all_param_values(l_out, weights_file["arr_0"])

### Тренируем

In [29]:
def iterate_minibatches(source, batchsize, shuffle=True):
    source = np.array(source)
    if shuffle:
        indices = np.arange(len(source)-1)
        np.random.shuffle(indices)
    for start_idx in range(0, len(source) -1 - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = np.array(slice(start_idx, start_idx + batchsize))
        yield as_matrix(source[excerpt], token_to_id), as_matrix(source[excerpt+1],token_to_id)

In [31]:
import config

In [32]:
telegram_token = config.telegram_token
telegram_chat_id = config.telegram_chat_id# id чата, в который будут отсылаться результаты в процессе обучения, так как она учится долго
dropbox_token = config.dropbox_token# Свежие веса загружаются на в облако, чтобы не потерять

In [34]:
from telebot import TeleBot
bot = TeleBot(telegram_token)

In [35]:
import dropbox

client = dropbox.client.DropboxClient(dropbox_token)

  app.launch_new_instance()


In [36]:
train_message = "Epoch {epoch}:\nTime {time_:.2f} hours\nAverage loss: {avg_los:.5f}\nExample sample=True: {sample_0}"
train_error_messages = "Error on epoch when we train model: {}\n\t{}\nBatch:\n{}"

In [None]:
from time import time
num_epoch = 100
batch_per_epoch = 1000
batch_size = 1000
bot.send_message(145718567, "Learning is begin!")
for epoch in range(num_epoch):
    try:
        
        st = time()
        avg_cost = 0
        for batch_num, batch in enumerate(iterate_minibatches(phrase_list, batch_size)):
            try:
                avg_cost += train(batch[0], batch[1])
            except Exception as er:
                bot.send_message(telegram_chat_id, train_error_messages.format(epoch+1, er, batch))
                break
                
            if batch_num+1 == batch_per_epoch:
                break
                
        weights_file = "Networks_weights.npz"
        np.savez(weights_file, layers.get_all_param_values(l_out))
        try:
            with open(weights_file, 'rb') as weights_file_dr:
                response = client.put_file('/' + weights_file, weights_file_dr)

        except Exception as er:
            bot.send_message(telegram_chat_id, "Error of download to Dropbox: {}".format(e))

        bot.send_message(145718567, train_message.format(time_=(time()-st)/3600,
                                                         epoch=epoch+1,
                                                         avg_los=avg_cost/batch_per_epoch,
                                                         sample_0=generate_answer("Who are you?")))
    
    except Exception as er:
        bot.send_message(telegram_chat_id, "Error on epoch: {}\n\t{}".format(epoch+1, er))
        break
bot.send_message(telegram_chat_id, "Learning is end!")

In [54]:
generate_answer(re.sub("\n", " ","""How are you?"""), t=3)

'I thought that you were the only one that were the one who was a sure.'