In [None]:
# Based on https://github.com/tensorlayer/seq2seq-chatbot

In [1]:
import json
import ast
import os
import nltk
import gzip
import time
import requests
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorlayer as tl

from collections import Counter
from tqdm import tqdm

from sklearn.utils import shuffle
from tensorlayer.cost import cross_entropy_seq, cross_entropy_seq_with_mask
from tensorlayer.models.seq2seq import Seq2seq
from tensorlayer.models.seq2seq_with_attention import Seq2seqLuongAttention


In [2]:
# get the data

url = 'https://jmcauley.ucsd.edu/data/amazon/qa/qa_Electronics.json.gz'

r = requests.get(url,verify=False)

with open('qa_Electronics.json.gz','wb') as localfile:
    localfile.write(r.content)



In [3]:

# the data is not in proper JSON format, so iterate over rows to get the dictionaries with ast
data_dict = {}
with gzip.open('qa_Electronics.json.gz','rb') as jsonfile:
    for count, row in enumerate(jsonfile):
        
        row  = row.decode("utf-8") 
        data_dict[count] = ast.literal_eval(row)

In [4]:

        

data = pd.DataFrame.from_dict(data_dict, orient='index')

vocabulary_size = 10000

# test train split
data = shuffle(data)
test_train_split = 0.9

train = data[:int(data.shape[0]*test_train_split)]
test = data[int(data.shape[0]*test_train_split):]

In [5]:
sentences = ' '.join(train.question.tolist() + train.answer.tolist() )
word_dict = Counter(nltk.word_tokenize(sentences) )

In [6]:
word_df = pd.DataFrame.from_dict({"frequency":word_dict}, orient='columns')
word_df = word_df.sort_values('frequency', ascending=False)[:vocabulary_size]

# add ranking
word_df['rank'] = word_df.reset_index().index +4

# add word column
word_df['word'] = word_df.index

# create mapping dictionary
idx2word = word_df.set_index('rank')['word'].to_dict()
word2idx = word_df.set_index('word')['rank'].to_dict()

pad_id = 0
unk_id = 1
start_id = 2
end_id = 3

word2idx['<pad>'] = pad_id
idx2word[pad_id] = '<pad>'

word2idx['<unk>'] = unk_id
idx2word[unk_id] = '<unk>'

word2idx['<start>'] = start_id
idx2word[start_id] = '<start>'

word2idx['<end>'] = end_id
idx2word[end_id] = '<end>'

In [7]:

def sentence2idx(sentence):
    """
    create integer list from sentence
    """
    
    outsentence = []
    for word in nltk.word_tokenize(sentence):
        if word in word2idx:
            outsentence.append(word2idx[word])
        else:
            outsentence.append(word2idx['<unk>'])
    
    return outsentence


In [8]:

trainX = [sentence2idx(sent) for sent in train.question.tolist() ]
trainY = [sentence2idx(sent) for sent in train.answer.tolist() ]


batch_size = 32
n_step = len(trainX) // batch_size
src_vocab_size = len(word2idx)
emb_dim = 300

#word2idx = metadata['w2idx']   # dict  word 2 index
#idx2word = metadata['idx2w']   # list index 2 word


src_vocab_size = tgt_vocab_size = len(word2idx) + 2

num_epochs = 2
vocabulary_size = src_vocab_size



In [9]:
optimizer = tf.keras.optimizers.Adam()


def inference(seed, top_n):
        model_.eval()
        seed_id = [word2idx.get(w, unk_id) for w in seed.split(" ")]
        sentence_int = model_(inputs=[[seed_id]], seq_length=20, start_token=start_id, top_n = top_n)
        sentence_int = sentence_int.numpy().tolist()[0]
        
        sentence = []
        for w_int in sentence_int:
            if w_int == end_id:
                break
            word = idx2word[w_int]
            sentence = sentence + [word]
        return sentence

decoder_seq_length = 40
model_ = Seq2seq(
        decoder_seq_length = decoder_seq_length,
        cell_enc=tf.keras.layers.GRUCell,
        cell_dec=tf.keras.layers.GRUCell,
        n_layer=3,
        n_units=300,
        embedding_layer=tl.layers.Embedding(vocabulary_size=vocabulary_size, embedding_size=emb_dim),
        )
    

[TL] Embedding embedding_1: (10006, 300)
[TL] RNN rnn_1: cell: GRUCell, n_units: 300
[TL] RNN rnn_2: cell: GRUCell, n_units: 300
[TL] RNN rnn_3: cell: GRUCell, n_units: 300
[TL] RNN rnn_4: cell: GRUCell, n_units: 300
[TL] RNN rnn_5: cell: GRUCell, n_units: 300
[TL] RNN rnn_6: cell: GRUCell, n_units: 300
[TL] Reshape reshape_1
[TL] Dense  dense_1: 10006 No Activation
[TL] Reshape reshape_2
[TL] Reshape reshape_3


In [17]:
seeds = test.sample(3).question.tolist()

In [18]:
seeds

['Whats on the end of the cable.Is it rf coax to hdmi? your description did not explain its application',
 'Will this work with my Samsung Galaxy S4 cell phone?',
 'Will this fit IPAD Air?']

In [23]:
def answer(iterations,top_n =3 ):
    print(f" At {iterations} the model answers like this")
    for seed in seeds:
        print("Question :", seed)
        top_n = 3
        for i in range(top_n):
            sentence = inference(seed, top_n)
            print(f"Answer {i} :", ' '.join(sentence))

In [30]:

start = time.time()
for epoch in range(num_epochs):
    # set model into train mode
    model_.train()   
    
    # reset loss
    total_loss, n_iter = 0, 0

    for X, Y in tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=batch_size, shuffle=True):

        X = tl.prepro.pad_sequences(X)
        _target_seqs = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
        _target_seqs = tl.prepro.pad_sequences(_target_seqs, maxlen=decoder_seq_length)
        _decode_seqs = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
        _decode_seqs = tl.prepro.pad_sequences(_decode_seqs, maxlen=decoder_seq_length)
        _target_mask = tl.prepro.sequences_get_mask(_target_seqs)

        with tf.GradientTape() as tape:
            ## compute outputs
            output = model_(inputs = [X, _decode_seqs])
                
            output = tf.reshape(output, [-1, vocabulary_size])
                
            ## compute loss and update model
            loss = cross_entropy_seq_with_mask(logits=output, 
                                               target_seqs=_target_seqs, 
                                               input_mask=_target_mask)

            grad = tape.gradient(loss, model_.all_weights)
            optimizer.apply_gradients(zip(grad, model_.all_weights))
            
        total_loss += loss
        n_iter += 1
            
        if n_iter % 500 == 0:
            answer(n_iter)
            print(f"Saved model after {n_iter} iterations or {time.time()- start} seconds")
            tl.files.save_npz(model_.all_weights, name=f'model_{n_iter}.npz')
                

        # printing average loss after every epoch
        print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs, total_loss / n_iter))

        

Epoch [1/2]: loss 5.8948
Epoch [1/2]: loss 5.8201
Epoch [1/2]: loss 5.8009
Epoch [1/2]: loss 5.8056
Epoch [1/2]: loss 5.7810
Epoch [1/2]: loss 5.7402
Epoch [1/2]: loss 5.7268
Epoch [1/2]: loss 5.7408
Epoch [1/2]: loss 5.7475
Epoch [1/2]: loss 5.7570
Epoch [1/2]: loss 5.7326
Epoch [1/2]: loss 5.7344
Epoch [1/2]: loss 5.7276
Epoch [1/2]: loss 5.7209
Epoch [1/2]: loss 5.7144
Epoch [1/2]: loss 5.7185


KeyboardInterrupt: 

In [None]:
# 30 minute Training
for seed in seeds:
            print("Question  :", seed)
            top_n = 3
            for i in range(top_n):
                sentence = inference(seed, top_n)
                print(f"Answer {i+1} :", ' '.join(sentence))

In [None]:
# ~ 1 hour Training
for seed in seeds:
            print("Question :", seed)
            top_n = 3
            for i in range(top_n):
                sentence = inference(seed, top_n)
                print(f"Answer {i+1} :", ' '.join(sentence))