In [1]:
import keras
import json
from datetime import datetime
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import pickle

Using TensorFlow backend.


In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only use it on files
    # produced by a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Artifacts saved by an earlier step: the two padded text arrays and the
# tokenizer used later to encode questions and decode predictions.
pad_encode_text = _load_pickle('pad_encode_text.pk')
pad_decode_text = _load_pickle('pad_decode_text.pk')
tokenizer = _load_pickle('tokenizer.pk')

In [3]:
# Sequence-length and vocabulary settings.
# min_length is not referenced by any cell visible in this notebook —
# presumably a filter used when the training data was prepared; TODO confirm.
min_length = 2
# Padded length of every question and the size of the partial-answer buffer
# used while decoding.
max_length = 20
# Not referenced by any visible cell; presumably the vocabulary size the saved
# "*_voc10000.h5" models were trained with — TODO confirm.
VOC_SIZE = 10000

In [4]:
import re
# Ordered (pattern, replacement) rules. Order matters: the specific
# "won't"/"can't" forms must be rewritten before the generic "n't" rule.
_CONTRACTION_RULES = (
    (r"\'m", " am"),
    (r"\'s", " is"),
    (r"\'re", " are"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
)

# Punctuation and whitespace characters deleted outright (no space is
# inserted in their place). Apostrophes, '*', '[', ']', '^' and '_' are not in
# this set and therefore survive.
_STRIP_CHARS = r"[-()\"#/@;:<>{}`+=~|.!?,$%&/\\\t\n]"


def clean_text(text):
    """Lower-case ``text``, expand English contractions and strip punctuation.

    Args:
        text: the raw sentence (a ``str``).

    Returns:
        The normalised sentence. Words are not re-spaced, so deleting a
        free-standing symbol can leave two consecutive spaces.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    return re.sub(_STRIP_CHARS, "", text)

In [5]:
# Read the questions to answer; names= labels the data column "question",
# which is the column the later cells read.
file_name = 'question.csv'
question_df = pd.read_csv(file_name, names=["question"])

In [6]:
# Preview the first ten loaded questions.
question_df.head(10)

Unnamed: 0,question
0,How are you?
1,How is it going?
2,How old are you?
3,How do you like to cook your meat?
4,What is your schedule?
5,What is your name?
6,What's up?
7,What is your favorite color?
8,What is your favorite song?
9,What is the weather like?


In [7]:
# Normalise every question with clean_text before tokenising it.
clean_question_text = [clean_text(question) for question in question_df["question"]]
# Spot-check the first cleaned question.
print(clean_question_text[0])

how are you


In [8]:
# Encode each cleaned question as a list of word indices using the loaded
# tokenizer, then pad at the end ("post") to max_length.
sequence_question = tokenizer.texts_to_sequences(clean_question_text)
pad_question = pad_sequences(sequence_question, maxlen=max_length, padding="post")
# Give index 0 a printable entry so decode_sequence can look it up in
# tokenizer.index_word; clean_res later drops these ' ' entries.
# Presumably 0 is the padding id and has no word of its own — TODO confirm.
# Note: this mutates the shared tokenizer object in place.
tokenizer.index_word[0] = ' '

In [9]:
def decode_sequence(input_s, model):
    """Greedily decode one answer for an encoded question.

    A (1, max_length) buffer holding the answer generated so far is fed back
    to the model at every step; the highest-scoring index at step ``k`` is
    written to position ``k + 1`` of the buffer and its word is appended to
    the result.

    Args:
        input_s: encoded question, passed unchanged as the first model input.
        model: object whose ``predict([input_s, ans_partial])`` result can be
            indexed as ``res[0][k]`` to get the scores for step ``k``.

    Returns:
        List of decoded words, at most ``max_length - 1`` long. Decoding stops
        after '<eos>' is produced, and '<eos>' is included in the list.
    """
    ans_partial = np.zeros((1, max_length))
    # Seed the first position with index 1 — presumably the start-of-sequence
    # token id; TODO confirm against the tokenizer.
    ans_partial[0, 0] = 1
    seq = []
    for k in range(max_length - 1):
        res = model.predict([input_s, ans_partial])
        index = np.argmax(res[0][k])
        ans_partial[0, k + 1] = index
        word = tokenizer.index_word[index]
        seq.append(word)
        # symbol <eos> is the end of decoding
        if word == '<eos>':
            break
    return seq

In [11]:
import tqdm
def get_training_result(model):
    """Decode every row of ``pad_question`` with ``model``.

    Args:
        model: model object forwarded to ``decode_sequence``.

    Returns:
        One decoded word list per question, in the order of ``pad_question``.
    """
    # Slice with row:row + 1 (rather than indexing) so each question keeps
    # its leading batch axis of size 1; tqdm.trange shows a progress bar.
    return [
        decode_sequence(pad_question[row:row + 1], model)
        for row in tqdm.trange(len(pad_question))
    ]

In [12]:
from keras.models import Model, load_model
# Load the plain seq2seq model and decode every question with it.
model_naive_seq2seq = load_model("s2s_voc10000.h5")
# Use the model bound on the previous line (no `model_seq2seq` name exists).
seq2seq_res = get_training_result(model_naive_seq2seq)

In [None]:
# Load the "concate" model and decode every question with it.
model_concate = load_model("concate_voc10000.h5")
concate_res = get_training_result(model_concate)

In [None]:
from keras_self_attention import SeqSelfAttention
# Load the attention model; custom_objects passes SeqSelfAttention's custom
# objects to load_model along with the saved file.
model_seq2seq_attention = load_model("s2s_attention_voc10000.h5", custom_objects=SeqSelfAttention.get_custom_objects())
# Decode every question with the attention model.
seq2seq_attention_res = get_training_result(model_seq2seq_attention)

In [13]:
# remove ' ' and '<eos>'
def clean_res(res):
    """Drop the ' ' and '<eos>' entries from every decoded sentence.

    Args:
        res: iterable of sentences, each an iterable of word strings.

    Returns:
        A new list of lists with the remaining words in their original order;
        a sentence made only of dropped entries becomes an empty list.
    """
    dropped = (' ', '<eos>')
    return [
        [word for word in sentence if word not in dropped]
        for sentence in res
    ]

In [None]:
# Strip the ' ' and '<eos>' entries from each model's decoded output.
seq2seq_clean_res = clean_res(seq2seq_res)
concate_clean_res = clean_res(concate_res)
seq2seq_attention_clean_res = clean_res(seq2seq_attention_res)

In [None]:
# from list to string
def assemble_word(res):
    """Join each word list into a single space-separated string.

    Args:
        res: iterable of sentences, each an iterable of word strings.

    Returns:
        List of strings, one per sentence; an empty word list gives ''.
    """
    return [' '.join(words) for words in res]

In [None]:
# Turn each model's cleaned word lists into plain sentences.
seq2seq_final = assemble_word(seq2seq_clean_res)
concate_final = assemble_word(concate_clean_res)
seq2seq_attention_final = assemble_word(seq2seq_attention_clean_res)

In [None]:
# write the result back to csv file
# One column per source: the original question text followed by the answer
# produced by each of the three models.
answer_columns = {
    'question': question_df["question"],
    'seq2seq': seq2seq_final,
    'concate': concate_final,
    'seq2seq_attention': seq2seq_attention_final,
}
res_df = pd.DataFrame(answer_columns)
res_df.to_csv("answer.csv")