In [1]:
import json
import ast
import os
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorlayer as tl

from collections import Counter
from tqdm import tqdm

from sklearn.utils import shuffle
from tensorlayer.cost import cross_entropy_seq, cross_entropy_seq_with_mask
from tensorlayer.models.seq2seq import Seq2seq
from tensorlayer.models.seq2seq_with_attention import Seq2seqLuongAttention


In [2]:
# Location of the Q&A dump. Every line is parsed as one Python literal
# (ast.literal_eval), not as strict JSON.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
qa_path = '/home/ethuer/Blog/blogposts/Deeplearning/Data/qa_Electronics.json'

# Parse one record per line. Blank lines (e.g. a trailing newline) are skipped
# because ast.literal_eval rejects them with a SyntaxError.
with open(qa_path, 'r') as jsonfile:
    data_dict = {
        count: ast.literal_eval(row)
        for count, row in enumerate(jsonfile)
        if row.strip()
    }

data = pd.DataFrame.from_dict(data_dict, orient='index')

vocabulary_size = 8000

# test train split
# A fixed random_state keeps the split identical across kernel restarts, so
# the vocabulary and the evaluation set do not silently change between runs.
test_train_split = 0.8
data = shuffle(data, random_state=0)

split_at = int(data.shape[0] * test_train_split)
train = data[:split_at]
test = data[split_at:]

In [3]:
# Glue all training questions and answers into one space-separated string,
# tokenize it in a single pass and count the occurrences of every token.
sentences = ' '.join([*train.question.tolist(), *train.answer.tolist()])
word_dict = Counter()
word_dict.update(nltk.word_tokenize(sentences))

In [5]:
# Ids 0-3 are reserved for the special tokens; real words are numbered from 4.
pad_id = 0
unk_id = 1
end_id = 2
# Fix: `start_id` was referenced below but never defined (id 3 was only bound
# to `stop_id`), which raises NameError on a fresh kernel.
start_id = 3
# `stop_id` was the original name bound to id 3; kept as an alias in case
# another cell still references it.
stop_id = start_id

# Keep the `vocabulary_size` most frequent tokens, most frequent first.
word_df = pd.DataFrame.from_dict({"frequency": word_dict}, orient='columns')
word_df = word_df.sort_values('frequency', ascending=False)[:vocabulary_size]

# add ranking: position in the frequency ordering, shifted past the reserved ids
word_df['rank'] = np.arange(len(word_df)) + 4

# add word column
word_df['word'] = word_df.index

# create mapping dictionary
idx2word = word_df.set_index('rank')['word'].to_dict()
word2idx = word_df.set_index('word')['rank'].to_dict()

# Register the special tokens in both directions.
special_tokens = {
    '<pad>': pad_id,
    '<unk>': unk_id,
    '<start>': start_id,
    '<end>': end_id,
}
word2idx.update(special_tokens)
idx2word.update({token_id: token for token, token_id in special_tokens.items()})

In [6]:

def sentence2idx(sentence):
    """
    Tokenize `sentence` with nltk and translate each token to its integer id.

    Tokens that are not present in `word2idx` are replaced by the id stored
    under '<unk>'. Returns a list with one id per token, in sentence order.
    """
    return [
        word2idx[token] if token in word2idx else word2idx['<unk>']
        for token in nltk.word_tokenize(sentence)
    ]


In [7]:

# Encode every training question / answer as a list of token ids.
trainX = list(map(sentence2idx, train.question.tolist()))
trainY = list(map(sentence2idx, train.answer.tolist()))

# Training hyper-parameters.
batch_size = 32
num_epochs = 10
emb_dim = 300
n_step = len(trainX) // batch_size  # number of full batches per epoch

# Source and target share one vocabulary.
# NOTE(review): the "+ 2" leaves two ids beyond the entries of word2idx; no
# token is assigned to them in the visible cells -- confirm they are needed.
src_vocab_size = tgt_vocab_size = len(word2idx) + 2

# From here on `vocabulary_size` is the model's vocabulary size, not the
# word-frequency cut-off it was used for when the vocabulary was built.
vocabulary_size = src_vocab_size



In [29]:
# Adam optimizer with its default settings (no arguments are passed); the
# training loop uses it to apply the gradients to the model weights.
optimizer = tf.keras.optimizers.Adam()


def inference(seed, top_n, seq_length=20):
    """
    Generate a reply for `seed` with the global `model_`.

    Parameters
    ----------
    seed : str
        Input sentence (e.g. a question).
    top_n : int
        Forwarded to the model as `top_n`.
    seq_length : int, optional
        Forwarded to the model as `seq_length`. Defaults to 20, the value
        that used to be hard-coded here.

    Returns
    -------
    list of str
        The decoded tokens up to, but not including, the first '<end>' token.
    """
    model_.eval()

    # Encode the seed exactly like the training data (nltk tokenization plus
    # '<unk>' fallback). A plain split(" ") leaves punctuation glued to words,
    # so e.g. "compatible?" would always map to '<unk>'.
    seed_id = sentence2idx(seed)

    sentence_id = model_(inputs=[[seed_id]], seq_length=seq_length, start_token=start_id, top_n=top_n)
    sentence_id = sentence_id.numpy().tolist()[0]

    sentence = []
    for w_id in sentence_id:
        # Stop at the end token. The previous check compared the word with the
        # literal string 'end_id', which never matches the '<end>' token.
        if w_id == end_id:
            break
        # The model's vocabulary is larger than idx2word, so an id without an
        # entry is rendered as '<unk>' instead of raising KeyError.
        sentence.append(idx2word.get(w_id, '<unk>'))
    return sentence

# Decoder length used by the model; the training loop pads its target and
# decoder-input batches to this same length.
decoder_seq_length = 20

# Seq2seq model: GRU cells for encoder and decoder, 3 layers of 256 units,
# on top of an embedding of size (vocabulary_size, emb_dim).
model_ = Seq2seq(
    embedding_layer=tl.layers.Embedding(
        vocabulary_size=vocabulary_size,
        embedding_size=emb_dim,
    ),
    cell_enc=tf.keras.layers.GRUCell,
    cell_dec=tf.keras.layers.GRUCell,
    n_units=256,
    n_layer=3,
    decoder_seq_length=decoder_seq_length,
)
    

[TL] Embedding embedding_4: (8006, 300)
[TL] RNN rnn_13: cell: GRUCell, n_units: 256
[TL] RNN rnn_14: cell: GRUCell, n_units: 256
[TL] RNN rnn_15: cell: GRUCell, n_units: 256
[TL] RNN rnn_16: cell: GRUCell, n_units: 256
[TL] RNN rnn_17: cell: GRUCell, n_units: 256
[TL] RNN rnn_18: cell: GRUCell, n_units: 256
[TL] Reshape reshape_7
[TL] Dense  dense_3: 8006 No Activation
[TL] Reshape reshape_8
[TL] Reshape reshape_9


In [30]:
# Three held-out questions used as prompts after every epoch. A fixed
# random_state keeps the same prompts across runs so that the printed
# generations are comparable.
seeds = test.sample(3, random_state=0).question.tolist()

In [31]:
# Display the sampled seed questions.
seeds

['Is it Bluetooth compatible?',
 'Is this a new product?',
 'is this work with canon rebel t3?']

In [None]:
# Train for `num_epochs` epochs; after each epoch print the mean batch loss
# and a few sample generations for the seed questions.
for epoch in range(num_epochs):
    model_.train()

    # shuffle training data (questions and answers are permuted together)
    trainX, trainY = shuffle(trainX, trainY, random_state=0)

    # reset loss
    total_loss, n_iter = 0.0, 0
    batches = tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=batch_size, shuffle=False)
    for X, Y in tqdm(batches, total=n_step,
                     desc='Epoch[{}/{}]'.format(epoch + 1, num_epochs), leave=False):

        X = tl.prepro.pad_sequences(X)

        # Clip answers to decoder_seq_length - 1 tokens BEFORE adding the
        # start / end id, so both ids survive and pad_sequences never has to
        # truncate. NOTE(review): pad_sequences appears to truncate from the
        # front by default, which would drop the start id of long answers --
        # confirm against the TensorLayer docs.
        Y = [seq[:decoder_seq_length - 1] for seq in Y]

        _target_seqs = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
        _target_seqs = tl.prepro.pad_sequences(_target_seqs, maxlen=decoder_seq_length)
        _decode_seqs = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
        _decode_seqs = tl.prepro.pad_sequences(_decode_seqs, maxlen=decoder_seq_length)
        _target_mask = tl.prepro.sequences_get_mask(_target_seqs)

        with tf.GradientTape() as tape:
            ## compute outputs
            output = model_(inputs=[X, _decode_seqs])
            output = tf.reshape(output, [-1, vocabulary_size])
            ## compute loss
            loss = cross_entropy_seq_with_mask(logits=output, target_seqs=_target_seqs, input_mask=_target_mask)

        # Backward pass and weight update happen outside the tape context so
        # that they are not recorded on the tape themselves.
        grad = tape.gradient(loss, model_.all_weights)
        optimizer.apply_gradients(zip(grad, model_.all_weights))

        # Accumulate a plain Python float rather than a tensor.
        total_loss += float(loss)
        n_iter += 1

    # printing average loss after every epoch (guarded against an empty epoch)
    print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs, total_loss / max(n_iter, 1)))

    for seed in seeds:
        print("Query >", seed)
        top_n = 3
        # Generate top_n replies per seed.
        for _ in range(top_n):
            sentence = inference(seed, top_n)
            print(" >", ' '.join(sentence))

Epoch[1/10]:   1%|          | 91/7856 [00:27<48:23,  2.67it/s]