# <span style="color:rgb(40, 40, 122); font-size:1.5em">Exploring ChatBot</span>

*Alekseev Vasiliy*

Let's import all modules first

In [1]:
import datasets
import os

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from nltk.corpus import stopwords

import re
import json
from collections import *
from pprint import pprint

# Contents
---

* [Models](#Selective Model: Embeddings-Based Ranking)
    - [Selective Model: Embeddings-Based Ranking](#Selective Model: Embeddings-Based Ranking)

    - [Generative Model: N-gram Language Model](#Generative Model: N-gram Language Model)

    - [Generative Model: LSTM-Based Language Model](#Generative Model: LSTM-Based Language Model)

* [Dialogue Example](#Dialogue Example)

# <a name="Selective Model: Embeddings-Based Ranking"></a>Selective Model: Embeddings-Based Ranking
---
We'll try to use pre-trained GoogleNews vectors & own-trained StarSpace vectors.

There won't be much explaining text below, for it is the same code as was in the week 3 of the NLP Course.

## Pretrained embeddings

In [3]:
wv_embeddings = KeyedVectors.load_word2vec_format(
    os.path.join('data', 'GoogleNews-vectors-negative300.bin'),
    binary=True
)

In [4]:
len(wv_embeddings.vocab)

3000000

In [5]:
len(wv_embeddings['dog'])

300

In [6]:
def question_to_vec(question, embeddings):
    """question -- a string
       embeddings -- dict where the key is a word and a value is its embedding
       ---
       result -- vector representation for the question"""
    
    dim = embeddings['dog'].size # :)
    result = np.zeros((dim,), dtype=np.float32)
    words = question.split(' ')
    
    count = 0
    for word in words:
        #count += 1
        #word = re.sub(r'[,.;:!?"]', r'', word)
        if word not in embeddings or not len(embeddings[word]):
            continue
        result += embeddings[word][:dim]
        count += 1
    
    return result / max(count, 1)

Import datasets. We'll try to use each

In [None]:
data_cornell = datasets.readCornellData(
    os.path.join('data', 'cornell'), max_len=40)

In [None]:
data_opensubs = datasets.readOpensubsData(
    os.path.join('data', 'opensubs'), max_len=40)

In [9]:
len(data_cornell)

79464

In [10]:
data_cornell[:5]

[('cesc ma tete this is my head', 'right see youre ready for the quiz'),
 ('thats because its such a nice one', 'forget french'),
 ('there', 'where'),
 ('you have my word as a gentleman', 'youre sweet'),
 ('hi', 'looks like things worked out tonight huh')]

In [11]:
len(data_opensubs)

766016

In [12]:
data_opensubs[:5]

[('right then go straight to the office', 'don t dawdle on the way'),
 ('don t dawdle on the way', 'don t worry'),
 ('is your mother here too', 'why are you outside'),
 ('why are you outside', 'it s no fun listening to women s talk'),
 ('it s no fun listening to women s talk', 'well why don t we go in together')]

In [13]:
def get_q_matrix(data, embeddings):
    """result -- embedding matrix for questions,
           size (num_questions) x (embeddings dim)"""
    q_matrix = np.array([
        question_to_vec(pair[0], embeddings=embeddings) for pair in data
    ])
    
    print('Shape:', q_matrix.shape)
    
    return q_matrix

In [14]:
q_matrix_cornell = get_q_matrix(data_cornell, wv_embeddings)

Shape: (79464, 300)


In [15]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    
    # We'll keep all words, including stopwords
    text = ' '.join([x for x in text.split() if x]) # and x not in STOPWORDS])
    
    return text.strip()

In [21]:
def generate_answers(question, qa_data, q_matrix, embeddings):
    """qa_data [list] -- list of question-answer pairs
       q_matrix -- see `get_q_matrix` above
       ---
       result -- question-answer pairs from `qa_data`
           with questions corresponding to maximum
           cosine similarity with `question`"""
    
    prepared_question = text_prepare(question)
    question_vec = question_to_vec(prepared_question, embeddings)

    similarities = cosine_similarity(
        question_vec.reshape(1, -1),
        q_matrix
    ).flatten()

    best_match_idx = similarities.argmax()

    matching_idxs = np.where(similarities == similarities[best_match_idx])[0]
    qa_pairs = np.array(qa_data)[matching_idxs].tolist()
    answers = [p[1] for p in qa_pairs]

    return qa_pairs

In [22]:
# If delete stopwords, the function will return an empty question
question = 'How are you?'
text_prepare(question)

'how are you'

In [24]:
question = 'How are you?'
answers_pretrained = generate_answers(
    question, data_cornell, q_matrix_cornell, wv_embeddings)
pprint(answers_pretrained)

[['how are you', 'scarred for life thats how i am'],
 ['how are you', 'very well thank you'],
 ['how are you honeybunch', 'is penelope in'],
 ['how are you', 'head still secure to the neck'],
 ['how are you', 'fine cant you see so yourself'],
 ['how are you', 'fine fine'],
 ['how are you', 'fine im fine how are you'],
 ['how are you', 'okay'],
 ['cornelius how are you', 'bob im okay how are you'],
 ['how are you', 'fine quite fine and your royal highness'],
 ['how are you', 'im okay'],
 ['how are you', 'good how are you'],
 ['how are you', 'all right son'],
 ['how are you', 'tired'],
 ['how are you', 'im just fine ally what about you'],
 ['how are you', 'i got fired'],
 ['wladek how are you', 'fine were fine thank you and you'],
 ['how are you', 'another young man'],
 ['how are you', 'hello macaulay come in'],
 ['how are you', 'i hate to bother you when youre laid up'],
 ['how are you', 'fine']]


Fine answers (and questions)

## Own-trained embeddings

In [25]:
print(fr"Cornel's size — {len(data_cornell)} VS {len(data_opensubs)} — OpenSubs size")

Cornel's size — 79464 VS 766016 — OpenSubs size


So, for StarSpace model training we'll take OpenSubs, as it is much bigger

Make file for training

In [26]:
with open(os.path.join('data', 'data_prepared_opensubs.tsv'), 'w') as f:
    for pair in data_opensubs:
        f.write(pair[0] + '\t' + pair[1] + '\n')

Here behind the scenes goes StarSpace model training. With parameters as in week 3:
>`./starspace train -trainFile ./data_prepared.tsv -model StarSpaceModel -trainMode 3 -adagrad true -ngrams 1 -epoch 5 -dim 100 -similarity cosine -minCount 2 -verbose true -fileFormat labelDoc --negSearchLimit 10 -lr 0.02`

Elapsed time: $\approx 30$ min

Load generated embeddings

In [27]:
starspace_embeddings_opensubs = {}

with open('./data/StarSpaceModelOpenSubs.tsv', 'r') as f:
    lines = f.readlines()

for l in lines:
    l = l.strip().split()
    if len(l) <= 1:
        continue
    starspace_embeddings_opensubs[l[0]] = np.array([float(el) for el in l[1:]])

In [28]:
len(starspace_embeddings_opensubs)

70685

In [29]:
starspace_embeddings_opensubs['dog'][:5]

array([-0.00344635, -0.00612465, -0.0188963 , -0.0253538 ,  0.00781904])

In [30]:
q_matrix_opensubs = get_q_matrix(data_opensubs, starspace_embeddings_opensubs)

Shape: (766016, 100)


Let's try to ask something

In [31]:
question = 'How are you?'
answers_opensubs = generate_answers(
    question, data_opensubs, q_matrix_opensubs,
    starspace_embeddings_opensubs
)
pprint(answers_opensubs)

[['how are you how are you', 'betty and rodney where s ted carter'],
 ['how are you how are you', 'how is he']]


Not so good answers. Let's try also Cornell.

In [33]:
with open(os.path.join('data', 'data_prepared_cornell.tsv'), 'w') as f:
    for pair in data_cornell:
        f.write(pair[0] + '\t' + pair[1] + '\n')

Now learning takes $\approx 3$ min

In [34]:
starspace_embeddings_cornell = {}

with open('./data/StarSpaceModelCornell.tsv', 'r') as f:
    lines = f.readlines()

for l in lines:
    l = l.strip().split()
    if len(l) <= 1:
        continue
    starspace_embeddings_cornell[l[0]] = np.array([float(el) for el in l[1:]])

In [35]:
len(starspace_embeddings_cornell)

16085

In [36]:
q_matrix_cornell = get_q_matrix(data_cornell, starspace_embeddings_cornell)

Shape: (79464, 100)


In [37]:
question = 'How are you?'
answers_cornell = generate_answers(
    question, data_cornell, q_matrix_cornell,
    starspace_embeddings_cornell
)
pprint(answers_cornell)

[['how are you', 'scarred for life thats how i am'],
 ['how are you', 'very well thank you'],
 ['how are you honeybunch', 'is penelope in'],
 ['how are you', 'head still secure to the neck'],
 ['how are you', 'fine cant you see so yourself'],
 ['how are you', 'fine fine'],
 ['how are you', 'fine im fine how are you'],
 ['how are you', 'okay'],
 ['how are you', 'fine quite fine and your royal highness'],
 ['how are you', 'im okay'],
 ['how are you', 'good how are you'],
 ['how are you', 'all right son'],
 ['how are you', 'tired'],
 ['how are you', 'im just fine ally what about you'],
 ['how are you', 'i got fired'],
 ['how are you', 'another young man'],
 ['how are you', 'hello macaulay come in'],
 ['how are you', 'i hate to bother you when youre laid up'],
 ['how are you', 'fine']]


Fine answers. And similar to those got with pretrained Google vectors:

In [39]:
unique_replicas_pretrained = (
    [p for p in answers_pretrained if p not in answers_cornell])
unique_replicas_owntrained = (
    [p for p in answers_cornell if p not in answers_pretrained]
)

print('Unique in pre-trained:')
pprint(unique_replicas_pretrained)

print('\nUnique in own-trained:')
pprint(unique_replicas_owntrained)

Unique in pre-trained:
[['cornelius how are you', 'bob im okay how are you'],
 ['wladek how are you', 'fine were fine thank you and you']]

Unique in own-trained:
[]


What if we increase the dictionary size by setting `minCount` equal to 1?

In [40]:
starspace_embeddings_cornell_more_words = {}

with open('./data/StarSpaceModelCornellMoreWords.tsv', 'r') as f:
    lines = f.readlines()

for l in lines:
    l = l.strip().split()
    if len(l) <= 1:
        continue
    starspace_embeddings_cornell_more_words[l[0]] = np.array([float(el) for el in l[1:]])

In [41]:
len(starspace_embeddings_cornell_more_words)

30685

In [42]:
q_matrix_cornell_more_words = (
    get_q_matrix(data_cornell, starspace_embeddings_cornell_more_words))

Shape: (79464, 100)


In [43]:
question = 'How are you?'
answers_cornell_more_words = generate_answers(
    question, data_cornell, q_matrix_cornell_more_words,
    starspace_embeddings_cornell_more_words
)
pprint(answers_cornell_more_words)

[['how are you', 'scarred for life thats how i am'],
 ['how are you', 'very well thank you'],
 ['how are you', 'head still secure to the neck'],
 ['how are you', 'fine cant you see so yourself'],
 ['how are you', 'fine fine'],
 ['how are you', 'fine im fine how are you'],
 ['how are you', 'okay'],
 ['how are you', 'fine quite fine and your royal highness'],
 ['how are you', 'im okay'],
 ['how are you', 'good how are you'],
 ['how are you', 'all right son'],
 ['how are you', 'tired'],
 ['how are you', 'im just fine ally what about you'],
 ['how are you', 'i got fired'],
 ['how are you', 'another young man'],
 ['how are you', 'hello macaulay come in'],
 ['how are you', 'i hate to bother you when youre laid up'],
 ['how are you', 'fine']]


In [44]:
unique_replicas = (
    [p for p in answers_cornell_more_words if p not in answers_cornell]
)
pprint(unique_replicas)

[]


The same answers

What if we try to combine OpenSubs and Cornell datasets?
Let's try searching answers in OpenSubs using vectors from Cornell.

In [45]:
question = 'How are you?'
answers_opensubs_using_cornell = generate_answers(
    question, data_opensubs, q_matrix_opensubs,
    starspace_embeddings_cornell
)
pprint(answers_opensubs_using_cornell)

[['we re adventurers sir', 'pursuing an opportunity']]


Not bad, but quite a few.

Let's try visa versa.

In [46]:
question = 'How are you?'
answers_cornell_using_opensubs = generate_answers(
    question, data_cornell, q_matrix_cornell,
    starspace_embeddings_opensubs
)
pprint(answers_cornell_using_opensubs)

[['well of course wed be honored', 'just putting in an appearance then']]


Well, results definitely not so good as when using Cornell only.

We tried just one question in all the cases, but I think that this one, 'How are you?', is quite common, and we can judge our models using it.

So, in the bot we're going to use Cornell dataset (with `minCount 2`).

# <a name="Generative Model: N-gram Language Model"></a>Generative Model: N-gram Language Model
---
Based on [the Notebook](http://nbviewer.jupyter.org/gist/yoavg/d76121dfde2618422139) by Yoav Goldberg

We'll use "^" to mark beginning of a question in each question-answer pair

In [48]:
def train_char_lm(file_path, order=4):
    data = ''
    
    with open(file_path, 'r') as f:
        lines = f.readlines()
    
    for line in lines:
        # No need for end of statement symbol, point "." represents it
        data += r'^' * order + line
    
    lm = defaultdict(Counter)
    
    for i in range(len(data) - order):
        history = data[i:i+order]
        char = data[i + order]
        lm[history][char] += 1
    
    def normalize(counter):
        s = float(sum(counter.values()))
        return [(c, val / s) for c, val in list(counter.items())]
    
    lm_normalized = {h: normalize(chars) for h, chars in list(lm.items())}
    
    return lm_normalized

Prepare txt files for training

In [49]:
with open(os.path.join('data', 'data_prepared_cornell.txt'), 'w') as f:
    for pair in data_cornell:
        f.write(pair[0] + '? ' + pair[1] + '.\n')

with open(os.path.join('data', 'data_prepared_opnesubs.txt'), 'w') as f:
    for pair in data_opensubs:
        f.write(pair[0] + '? ' + pair[1] + '.\n')

qa_file_txt = os.path.join('data', 'data_prepared_cornell.txt')

In [50]:
lm = train_char_lm(qa_file_txt, order=4)

In [51]:
# kill
lm[' kil']

[('l', 0.9816625916870416), ('o', 0.018337408312958436)]

In [52]:
# damn
lm[' dam']

[('a', 0.075),
 ('n', 0.721875),
 ('m', 0.06875),
 ('o', 0.09375),
 ('p', 0.003125),
 ('e', 0.021875),
 ('?', 0.003125),
 (' ', 0.009375),
 ('.', 0.003125)]

In [53]:
def generate_letter(lm, history, order):
    history = history[-order:]
    distribution = lm[history]

    symbols = [p[0] for p in distribution]
    probabilities = [p[1] for p in distribution]

    return np.random.choice(symbols, size=1, p=probabilities)[0]

In [54]:
def generate_text(lm, order, num_letters=1000):
    history = r'^' * order
    result = []
    
    for i in range(num_letters):
        c = generate_letter(lm, history, order)
        history = history[-order:] + c
        # Don't want special character in the result
        if not c == r'^':
            result.append(c)
    
    return ''.join(result)

Let's generate some text samples

In [55]:
lm = train_char_lm(qa_file_txt, order=2)
print(generate_text(lm, 2, num_letters=500))

whaits no you.
but we an six.
base do give mmuch? inight ar thim.
you whor welier? to gaid.
your beg be mork.
for the you ter max bul do wask ah.
why whouver see it? a by bets givere st to hered youver and bring a whant you to she broble sho is bethrod a i not dont king liken this wassickunds tow hoseed alks a bir arant you what i hind care st kno they eepieven if ill its was.
you ont there yourdere becrom? do i.
ho away.
finings mov nou han row did jim thin.
no gen.
don.



In [56]:
lm = train_char_lm(qa_file_txt, order=4)
print(generate_text(lm, 4, num_letters=500))

i shouldnt some on of our kid? jimmy take serial good.
you want the sixteen.
youll be his is the daglio.
mr girl yeah ill god? hi.
turday? with i meat.
youryouth me probably are you sure.
yeah about something here? aparty? it neck bye danderthe bid.
what one is.
whats so be a wine.
outside the gun shes offee score you sure? what piano.
sure? i missing animal.
whawhat about a maybe mum? valhall right? right? it want next tired? looks like the geomethink


In [57]:
lm = train_char_lm(qa_file_txt, order=10)
print(generate_text(lm, 10, num_letters=500))

only the best quality you have any idea.
here come on.
danger? yes.
dont say my name.
i dont know? is it long im going to be great? do you want.
smythe youve been love.
good youre anatomically correct? and also with you.
very good one ma.
yes go away? i didnt know that.
i never thought you hated sidnaw? just taste that christmas music.
ow? im going to do? bill have it.
no way no day? giv


In [58]:
def generate_answers_ngram(question, num_answers=5, max_iter=1000):
    prepared_question = text_prepare(question)
    phrase_in = r'^' + prepared_question + '? '
    
    num_symbols = len([l for l in phrase_in])
    
    # Question may be quite big
    # Bigger history -> longer process (and higher chances of zero probabilities)
    order = min(10, num_symbols)
    
    lm = train_char_lm(qa_file_txt, order=order)
    
    result = []
    current_iteration = 0
    # (3 * num_answers) -- just need something bigger than num_answers
    while len(result) < num_answers and current_iteration < 3 * num_answers:
        history = phrase_in[-order:]
        symbols_out = []
        c = ''
        i = 0

        while c != '.' and i < max_iter:
            c = generate_letter(lm, history, order)
            history = history[-order:] + c
            if c != r'^':
                symbols_out.append(c)
            i += 1

        symbols_out = symbols_out[:-1] # exclude point "."
        phrase_out = ''.join(symbols_out)
        
        if phrase_out not in result:
            result.append(phrase_out)
        
        current_iteration += 1
    
    return result

In [60]:
question = 'How are you?'
generate_answers_ngram(question)

['thirty ones and two tens',
 'can you move at all? you working on it',
 'thats for sure? such as',
 'ben where do you live with us',
 'alvy alvy singer over here a moment']

Different answers — because of the probabilities behind the `generate_letter`

In [61]:
question = 'I want to drink your blood'
generate_answers_ngram(question)

['youre sure mr brandon? dont be afraid',
 'yes',
 'youre leaving you want me to? yes yours got them in your sleep? next time itll be our legacy? it will',
 'some of the others werent there high school',
 'youre not leaving without you']

"youre sure mr brandon? dont be afraid" :)

# <a name="Generative Model: LSTM-Based Language Model"></a>Generative Model: LSTM-Based Language Model
---

Based on [code](https://gist.github.com/karpathy/d4dee566867f8291f086) by [Andrej Karpathy](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)

In [62]:
lines_cornell = open(os.path.join('data', 'data_prepared_cornell.txt'), 'r').readlines()
lines_opensubs = open(os.path.join('data', 'data_prepared_opensubs.txt'), 'r').readlines()

In [63]:
len(lines_cornell), len(lines_opensubs)

(79464, 766016)

Cornell is much smaller. Let's copy some of Cornell's lines, mix lines in each dataset and then combine all lines

In [64]:
lines_cornell = lines_cornell * np.int(len(lines_opensubs) / len(lines_cornell))

In [65]:
len(lines_cornell)

715176

In [66]:
np.random.shuffle(lines_cornell)
np.random.shuffle(lines_opensubs)

In [69]:
data = ''.join(lines_cornell + lines_opensubs)
data = data[:200000] # take a fragment

print(data[:200])

all night? well who are those guys.
hey how much are we ahead? approximately one thousand bucks.
was it swine? no it was a seven letter word.
five times a day? i guess its all got to do with this shop


In [70]:
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)

print('Data has %d characters, %d unique.' % (data_size, vocab_size))

char_to_ix = {c: i for i, c in enumerate(chars)}
ix_to_char = {i: c for i, c in enumerate(chars)}

Data has 200000 characters, 40 unique.


In [86]:
# Hyperparameters
hidden_size = 100    # size of hidden layer of neurons
seq_length = 25      # number of steps to unroll the RNN for
learning_rate = 1e-1

# Model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1))                        # hidden bias
by = np.zeros((vocab_size, 1))                         # output bias

In [87]:
def lossFun(inputs, targets, hprev):
    """inputs, targets -- lists of integers
       hprev -- Hx1 array of initial hidden state
       ---
       result -- the loss, gradients on model parameters,
           and last hidden state"""
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    
    # Forward pass
    for t in range(len(inputs)):
        # Encode in 1-of-k representation
        xs[t] = np.zeros((vocab_size,1))
        xs[t][inputs[t]] = 1
        
        # Hidden state
        hs[t] = np.tanh(
            np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh
        )
        
        # Unnormalized log probabilities for next chars
        ys[t] = np.dot(Why, hs[t]) + by
        
        # Probabilities for next chars
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
        
        # Softmax (cross-entropy loss)
        loss += -np.log(ps[t][targets[t],0])
    
    # Backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = (
        np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    )
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        
        # Backprop into y
        dy[targets[t]] -= 1
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        
        # Backprop into h
        dh = np.dot(Why.T, dy) + dhnext
        
        # Backprop through tanh nonlinearity
        dhraw = (1 - hs[t] * hs[t]) * dh
        dbh  += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)
    
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        # Clip to mitigate exploding gradients
        np.clip(dparam, -5, 5, out=dparam)
    
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [88]:
def sample(h, seed_ix, n):
    """sample a sequence of integers from the model
       ---
       h -- memory state
       seed_ix -- seed letter for first time step"""
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    
    for t in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
        
    return ixes

In [98]:
%%time
# Training

n, p = 0, 0
mWxh, mWhh, mWhy = (
    np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
)

# Memory variables for Adagrad
mbh, mby = np.zeros_like(bh), np.zeros_like(by)

# Loss at iteration 0
smooth_loss = -np.log(1.0 / vocab_size) * seq_length

while n < 1_000_000:
    # Prepare inputs (sweeping from left to right in steps seq_length long)
    if p + seq_length + 1 >= len(data) or n == 0: 
        hprev = np.zeros((hidden_size, 1)) # reset RNN memory
        p = 0                              # go from start of data
    inputs =  [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # Sample from the model now and then
    if n % 1000 == 0:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        # print('----\n %s \n----' % (txt, ))

    # Forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    
    # Print progress
    if n % 50000 == 0:
        print('iter %d, loss: %f' % (n, smooth_loss))
        
        # Backup
        with open(os.path.join('data', 'net_params.txt'), 'w') as f:
            f.write(json.dumps({
                'Wxh': Wxh.tolist(),
                'Whh': Whh.tolist(),
                'Why': Why.tolist(),
                'bh' : bh.tolist(),
                'by' : by.tolist()
            }))
            f.write('\n')

    # Perform parameter update with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
                                  [dWxh, dWhh, dWhy, dbh, dby], 
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem   += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update
    
    p += seq_length # move data pointer
    n += 1          # iteration counter

iter 0, loss: 92.160156
iter 50000, loss: 50.521910
iter 100000, loss: 48.574454
iter 150000, loss: 47.864013
iter 200000, loss: 47.274272
iter 250000, loss: 47.503328
iter 300000, loss: 46.707740
iter 350000, loss: 46.528753
iter 400000, loss: 46.183791
iter 450000, loss: 46.606257
iter 500000, loss: 46.012872
iter 550000, loss: 45.857060
iter 600000, loss: 45.650703
iter 650000, loss: 46.031259
iter 700000, loss: 45.507952
iter 750000, loss: 45.447371
iter 800000, loss: 46.264016
iter 850000, loss: 46.398521
iter 900000, loss: 45.954761
iter 950000, loss: 45.539966
Wall time: 55min 9s


In [99]:
def generate_answers_lstm(question): 
    question = text_prepare(question)
    inputs =  [char_to_ix[ch] for ch in question]
    
    num_symbols_to_generate = 5 * len(inputs)
    
    hprev = np.zeros((hidden_size, 1))

    sample_ix = sample(hprev, inputs[0], num_symbols_to_generate)
    answer = ''.join(ix_to_char[ix] for ix in sample_ix)
    
    # If there is still a piece of question at the beginning
    #   we'll delete it from the answer
    try:
        answer = answer[answer.index('?')+1:]
    except:
        pass
    
    # Save only text from beginning till point (excluding)
    try:
        answer = answer[:answer.index('.')]
    except:
        pass
    
    return answer

In [103]:
question = 'How are you?'
answer_lstm = generate_answers_lstm(question)
print(answer_lstm)

 
hop are youre here habt to have a toet


Well, it doesn't make much sence. So let's just make a bot without neural network answers :)

# <a name="Dialogue Example"></a>Dialogue Example
---

Here is an example of a dialog with Bot (generated by running `ChatBot.py` as main program)

- How are you?    
  fine cant you see so yourself


- What's is your name?  
  Dale. Dale Barbara


- I want something to drink  
  mmm... sorry, i've forgot anything about it


- Where can I buy a guitar?  
  sorry, i don't know...


- What will the weather be like tomorrow?   
  how did you get in here


- Nice shot!   
  thank you sir


- How old are you?  
  Well, you know, I'm a bot actually. There is no such a notion for me as age. I' supposed to tell you that I'm 30 years old. But my programm's been running for a just couple of days


- Tell me something I don't know    
  a deer


- Do you like The Godfather movie?  
  very much


- Let's rob a bank, Billy!  
  what a coincidence! i also want to know the answer to this question


- Bye  
  have a nice outing you lot