In [None]:
# Based on https://github.com/tensorlayer/seq2seq-chatbot

In [1]:
import json
import ast
import os
import nltk
import gzip
import time
import requests
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorlayer as tl

from collections import Counter
from tqdm import tqdm

from sklearn.utils import shuffle
from tensorlayer.cost import cross_entropy_seq, cross_entropy_seq_with_mask
from tensorlayer.models.seq2seq import Seq2seq
from tensorlayer.models.seq2seq_with_attention import Seq2seqLuongAttention


In [2]:
# get the data
# Downloads the gzipped Amazon Electronics Q&A dump next to the notebook.
# The download is skipped when the file is already present so that
# "Restart & Run All" does not fetch it again.

url = 'https://jmcauley.ucsd.edu/data/amazon/qa/qa_Electronics.json.gz'

if not os.path.exists('qa_Electronics.json.gz'):
    # Certificate verification is left at the requests default (enabled);
    # the timeout keeps a stalled connection from hanging the notebook.
    r = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of saving an error page as the archive.
    r.raise_for_status()

    with open('qa_Electronics.json.gz', 'wb') as localfile:
        localfile.write(r.content)



In [3]:

# The file is not strict JSON: every line is parsed as a Python literal with
# ast.literal_eval and stored under its zero-based line number.
with gzip.open('qa_Electronics.json.gz', 'rb') as jsonfile:
    data_dict = {
        line_no: ast.literal_eval(raw_line.decode("utf-8"))
        for line_no, raw_line in enumerate(jsonfile)
    }

In [4]:

        

# One row per parsed record; the columns are the keys of the record dicts.
data = pd.DataFrame.from_dict(data_dict, orient='index')

# Number of most frequent words kept when the vocabulary is built.
vocabulary_size = 10000

# test train split
# A fixed random_state makes the shuffle, and therefore the split, repeatable
# across kernel restarts.
data = shuffle(data, random_state=42)
test_train_split = 0.9

# Compute the boundary once and slice by position on both sides of it.
split_at = int(data.shape[0] * test_train_split)
train = data.iloc[:split_at]
test = data.iloc[split_at:]

In [5]:
# Count token frequencies over the training questions and answers.
# Each text is tokenized on its own, exactly as sentence2idx tokenizes a single
# sentence, so the vocabulary tokens match the tokens that are looked up later.
# (Tokenizing one big joined string lets the tokenizer treat the end of one
# record and the start of the next as a single sentence.)
sentences = train.question.tolist() + train.answer.tolist()

word_dict = Counter()
for sentence in sentences:
    word_dict.update(nltk.word_tokenize(sentence))

In [6]:
# Tokens
# Reserved ids for the special tokens; real words are numbered after them.
pad_id = 0
unk_id = 1
start_id = 2
end_id = 3

special_tokens = {
    '<pad>': pad_id,
    '<unk>': unk_id,
    '<start>': start_id,
    '<end>': end_id,
}

# Keep the vocabulary_size most frequent words, most frequent first.
word_df = (
    pd.DataFrame.from_dict({"frequency": word_dict}, orient='columns')
    .sort_values('frequency', ascending=False)
    .iloc[:vocabulary_size]
)

# add ranking: consecutive ids that start right after the special tokens
first_word_id = len(special_tokens)
word_df['rank'] = pd.RangeIndex(first_word_id, first_word_id + len(word_df))

# add word column
word_df['word'] = word_df.index

# create mapping dictionary
idx2word = word_df.set_index('rank')['word'].to_dict()
word2idx = word_df.set_index('word')['rank'].to_dict()

# Register the special tokens in both directions.
for token, token_id in special_tokens.items():
    word2idx[token] = token_id
    idx2word[token_id] = token

In [7]:

def sentence2idx(sentence):
    """
    Convert a sentence into a list of vocabulary indices.

    The sentence is split with nltk.word_tokenize. Every token present in
    word2idx is replaced by its index; any other token is replaced by the
    index stored under '<unk>'.
    """
    return [
        word2idx[token] if token in word2idx else word2idx['<unk>']
        for token in nltk.word_tokenize(sentence)
    ]


In [8]:

# Encode the training questions (trainX) and answers (trainY) as lists of
# vocabulary indices.
trainX = list(map(sentence2idx, train.question.tolist()))
trainY = list(map(sentence2idx, train.answer.tolist()))

batch_size = 32
n_step = len(trainX) // batch_size  # number of full batches per epoch
emb_dim = 300
num_epochs = 2

# NOTE(review): word2idx already holds the four special tokens added in the
# vocabulary cell, so the extra 2 looks like a leftover from the upstream
# example this notebook is based on -- confirm before changing, since it
# determines the shape of saved weights.
src_vocab_size = tgt_vocab_size = len(word2idx) + 2

# Rebinds the name that earlier held the word cut-off (10000); the model cell
# passes this value to the Embedding layer.
vocabulary_size = src_vocab_size



In [39]:
# Adam optimizer built with no arguments (library defaults); the training loop
# uses it to apply the gradients to the model weights.
optimizer = tf.keras.optimizers.Adam()


def inference(seed, top_n, seq_length=20):
    """
    Generate one answer for a question with the global model_.

    Parameters
    ----------
    seed : str
        The question text. It is tokenized with sentence2idx, i.e. the same
        way the training sentences were, so punctuation attached to a word
        ("Warranty?") no longer turns the whole word into <unk>.
    top_n : int
        Forwarded unchanged to the model's top_n argument (presumably the
        number of most likely tokens sampled from -- confirm against the
        Seq2seq documentation).
    seq_length : int, optional
        Forwarded to the model's seq_length argument. Defaults to 20, the
        value that used to be hard-coded.

    Returns
    -------
    list of str
        The generated words up to, but not including, the first end token.

    Side effects
    ------------
    Switches model_ to eval mode; callers that continue training must call
    model_.train() afterwards.
    """
    model_.eval()

    # An empty seed still needs one input token; fall back to <unk>, which is
    # what splitting an empty string on spaces produced before.
    seed_id = sentence2idx(seed) or [unk_id]

    sentence_int = model_(inputs=[[seed_id]], seq_length=seq_length, start_token=start_id, top_n=top_n)
    sentence_int = sentence_int.numpy().tolist()[0]

    sentence = []
    for w_int in sentence_int:
        if w_int == end_id:
            break
        # The model's output layer is larger than idx2word (vocabulary_size is
        # len(word2idx) + 2), so an id without a word is rendered as <unk>
        # instead of raising KeyError.
        sentence.append(idx2word.get(w_int, '<unk>'))
    return sentence

# Length that the training loop pads/truncates the decoder inputs and targets to.
decoder_seq_length = 40

# Embedding table: one row of emb_dim values per vocabulary id.
embedding_layer = tl.layers.Embedding(vocabulary_size=vocabulary_size, embedding_size=emb_dim)

# GRU-based sequence-to-sequence model with 3 layers of 300 units.
model_ = Seq2seq(
    decoder_seq_length=decoder_seq_length,
    cell_enc=tf.keras.layers.GRUCell,
    cell_dec=tf.keras.layers.GRUCell,
    n_layer=3,
    n_units=300,
    embedding_layer=embedding_layer,
)
    

[TL] Embedding embedding_2: (10006, 300)
[TL] RNN rnn_7: cell: GRUCell, n_units: 300
[TL] RNN rnn_8: cell: GRUCell, n_units: 300
[TL] RNN rnn_9: cell: GRUCell, n_units: 300
[TL] RNN rnn_10: cell: GRUCell, n_units: 300
[TL] RNN rnn_11: cell: GRUCell, n_units: 300
[TL] RNN rnn_12: cell: GRUCell, n_units: 300
[TL] Reshape reshape_4
[TL] Dense  dense_2: 10006 No Activation
[TL] Reshape reshape_5
[TL] Reshape reshape_6


In [40]:
# Three randomly drawn held-out questions, used to eyeball the model's answers
# during training.
seeds = test['question'].sample(3).tolist()

In [41]:
seeds

['Warranty? I cannot find that there is a warranty on this?',
 'Does this device also record audio?',
 'does it work with? al p touch models?']

In [42]:
def answer(iterations, top_n=3, questions=None):
    """
    Print sample answers from the current model.

    Parameters
    ----------
    iterations : int
        Iteration count shown in the header line.
    top_n : int, optional
        Number of answers printed per question; the same value is passed to
        inference as its top_n argument. The argument is now honoured -- it
        used to be overwritten with 3 inside the loop.
    questions : list of str, optional
        Questions to answer. Defaults to the global seeds list.

    Side effects
    ------------
    Calls inference, which leaves the model in eval mode.
    """
    if questions is None:
        questions = seeds

    print(f" At {iterations} the model answers like this")
    for seed in questions:
        print("Question :", seed)
        for i in range(top_n):
            sentence = inference(seed, top_n)
            print(f"Answer {i} :", ' '.join(sentence))

In [43]:
# Training loop: num_epochs passes over the shuffled training pairs. Every 500
# iterations it prints sample answers, saves the weights and reports the
# running mean loss of the current epoch.
start = time.time()
for epoch in range(num_epochs):
    # set model into train mode
    model_.train()

    # reset loss
    total_loss, n_iter = 0, 0

    for X, Y in tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=batch_size, shuffle=True):

        # Encoder input: questions padded to the longest one in the batch.
        X = tl.prepro.pad_sequences(X)
        # Decoder target: answer + <end>; decoder input: <start> + answer.
        # Both are brought to decoder_seq_length.
        # NOTE(review): pad_sequences is used with its default truncation
        # side -- confirm which end of over-long answers gets dropped.
        _target_seqs = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
        _target_seqs = tl.prepro.pad_sequences(_target_seqs, maxlen=decoder_seq_length)
        _decode_seqs = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
        _decode_seqs = tl.prepro.pad_sequences(_decode_seqs, maxlen=decoder_seq_length)
        _target_mask = tl.prepro.sequences_get_mask(_target_seqs)

        # Only the forward pass and the loss are recorded on the tape.
        with tf.GradientTape() as tape:
            ## compute outputs
            output = model_(inputs=[X, _decode_seqs])

            output = tf.reshape(output, [-1, vocabulary_size])

            ## compute loss
            loss = cross_entropy_seq_with_mask(logits=output,
                                               target_seqs=_target_seqs,
                                               input_mask=_target_mask)

        ## update model -- done outside the tape context so the gradient
        ## computation itself is not recorded
        grad = tape.gradient(loss, model_.all_weights)
        optimizer.apply_gradients(zip(grad, model_.all_weights))

        # Accumulate a plain Python float rather than a tensor.
        total_loss += float(loss)
        n_iter += 1

        if n_iter % 500 == 0:
            # answer() runs inference, which switches the model to eval mode.
            answer(n_iter)

            # NOTE(review): n_iter restarts at 0 every epoch, so later epochs
            # reuse the file names of earlier ones.
            tl.files.save_npz(model_.all_weights, name=f'model_{n_iter}.npz')
            # Reported only after the weights have actually been written.
            print(f"Saved model after {n_iter} iterations or {time.time()- start} seconds")

            print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs, total_loss / n_iter))

            # switch back to trainable
            model_.train()

        

 At 500 the model answers like this
Question : Warranty? I cannot find that there is a warranty on this?
Answer 0 : Yes
Answer 1 : Yes
Answer 2 : Yes it . . . it it . . it the the
Question : Does this device also record audio?
Answer 0 : Yes it it , . it it
Answer 1 : Yes , , it
Answer 2 : Yes ,
Question : does it work with? al p touch models?
Answer 0 : Yes , it it . it . it it the
Answer 1 : Yes , ,
Answer 2 : Yes it it . . .
Saved model after 500 iterations or 212.98198890686035 seconds
[TL] [*] Saving TL weights into model_500.npz
[TL] [*] Saved
Epoch [1/2]: loss 6.2074
 At 1000 the model answers like this
Question : Warranty? I cannot find that there is a warranty on this?
Answer 0 : Yes . I can have the the same to the <unk> .
Answer 1 : Yes .
Answer 2 : Yes
Question : Does this device also record audio?
Answer 0 : Yes , it is not a not <unk> . It is not to <unk>
Answer 1 : Yes it it
Answer 2 : Yes it it it , , it does will not n't n't work .
Question : does it work with? al p to

[TL] [*] Saved
Epoch [1/2]: loss 4.8010
 At 5500 the model answers like this
Question : Warranty? I cannot find that there is a warranty on this?
Answer 0 : I am sorry I could not help . Sorry ! I 'm sorry .
Answer 1 : I am not 100 percent sure about <unk> , but I 'm sorry I can n't be of a problem
Answer 2 : I am not selling this seller .
Question : Does this device also record audio?
Answer 0 : Yes , this is the <unk> . I am very happy with this .
Answer 1 : Yes it is . I use the <unk> for a few days . Hope that helps . -Ely Regards .
Answer 2 : Yes it is
Question : does it work with? al p touch models?
Answer 0 : Yes , this works with the <unk> <unk> <unk> <unk> <unk> . <unk>
Answer 1 : Yes it will work .
Answer 2 : Yes
Saved model after 5500 iterations or 2326.791907310486 seconds
[TL] [*] Saving TL weights into model_5500.npz
[TL] [*] Saved
Epoch [1/2]: loss 4.7410
 At 6000 the model answers like this
Question : Warranty? I cannot find that there is a warranty on this?
Answer 0 : 

[TL] [*] Saved
Epoch [2/2]: loss 3.8420
 At 2500 the model answers like this
Question : Warranty? I cannot find that there is a warranty on this?
Answer 0 : I bought this for a year now , so no warranty is not needed .
Answer 1 : I do n't know about warranty . I do not know . Sorry
Answer 2 : I do n't know .
Question : Does this device also record audio?
Answer 0 : Yes . It has an internal IP address for the <unk>
Answer 1 : Yes .
Answer 2 : Yes
Question : does it work with? al p touch models?
Answer 0 : Yes it does
Answer 1 : Yes .
Answer 2 : Yes it will . I bought it for the <unk> and it works great
Saved model after 2500 iterations or 4817.673665523529 seconds
[TL] [*] Saving TL weights into model_2500.npz
[TL] [*] Saved
Epoch [2/2]: loss 3.8390
 At 3000 the model answers like this
Question : Warranty? I cannot find that there is a warranty on this?
Answer 0 : I 'm pretty happy .
Answer 1 : I 'm pretty sure you will be able . But it 's not a good bet
Answer 2 : I bought the <unk> fo

Answer 2 : Yes
Question : does it work with? al p touch models?
Answer 0 : Yes , I use this for a <unk> . Works well with the Surface RT
Answer 1 : Yes it works great !
Answer 2 : Yes it does .
Saved model after 8000 iterations or 7156.290225505829 seconds
[TL] [*] Saving TL weights into model_8000.npz
[TL] [*] Saved
Epoch [2/2]: loss 3.7952
 At 8500 the model answers like this
Question : Warranty? I cannot find that there is a warranty on this?
Answer 0 : I do not recommend it , I 'm not happy for that .
Answer 1 : I do n't recall the <unk> warranty . It 's a great unit and it is a good purchase for
Answer 2 : I am not sure about the warranty , but it was n't as bad for the <unk> . I am
Question : Does this device also record audio?
Answer 0 : Yes it can be connected via USB
Answer 1 : Yes . <unk> .
Answer 2 : Yes
Question : does it work with? al p touch models?
Answer 0 : Yes it does . It works with the Nexus 8 and it is very well made
Answer 1 : Yes . It 's the <unk> <unk> .
Answer 