In [16]:
from __future__ import division, print_function
from gensim.models import Word2Vec
import keras.backend as K
from keras.layers.merge import dot
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Merge, Dropout, Input, merge, Dot
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential, Model
from keras.layers.core import Reshape, Lambda
from keras.optimizers import Adadelta
from keras.preprocessing.sequence import pad_sequences
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import kaggle
import nltk
import gensim

In [2]:
cq = pd.read_csv('test-utterances.csv', header=0, encoding='latin1')
cq.drop('Pass/ Fail', axis=1, inplace=True)
cq.columns = ["TrainingUtterance", "TrainingAnswer", "TestUtterance"]

In [3]:
train_corpus = {"Question" : [], "Answer": []}
with open("All Questions.txt", 'r', encoding='latin1') as f:
    for line in f.readlines():
        if line.lower().startswith('q:'):
            ques = line.replace('Q:', '').replace('\n', '').lstrip()
            train_corpus["Question"].append(ques)
        if line.lower().startswith('a:'):
            ans = line.replace('A:', '').replace('\n', '').lstrip()
            train_corpus["Answer"].append(ans)

In [4]:
data = pd.DataFrame.from_dict(train_corpus)
data['qna'] = data[data.columns[[0,1]]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 661 entries, 0 to 660
Data columns (total 3 columns):
Answer      661 non-null object
Question    661 non-null object
qna         661 non-null object
dtypes: object(3)
memory usage: 15.6+ KB


In [5]:
# DATA_DIR = "../data/comp_data"
# MODEL_DIR = "../data/models"
WORD2VEC_BIN = "GoogleNews-vectors-negative300.bin.gz"
WORD2VEC_EMBED_SIZE = 300

QA_EMBED_SIZE = 64
BATCH_SIZE = 32
NBR_EPOCHS = 20

In [6]:
words = []

question_maxlen = data.Question.map(len).max()
answer_maxlen = data.Answer.map(len).max()
seq_maxlen = max([question_maxlen, answer_maxlen])

for ques in data.Question.values:
    words.extend(nltk.word_tokenize(ques))
for ans in data.Answer.values:
    words.extend(nltk.word_tokenize(ans))

words = set(words)

word2idx = {w: i+1 for i, w in enumerate(words)}
vocab_size = len(word2idx) + 1 

In [7]:
Xq, Xa, Y = [], [], []

for ques in data.Question.values:
    Xq.append([word2idx[qword] for qword in nltk.word_tokenize(ques)])
Xq = pad_sequences(Xq, maxlen=seq_maxlen)

for ans in data.Answer.values:
    Xa.append([word2idx[aword] for aword in nltk.word_tokenize(ans)])
Xa = pad_sequences(Xa, maxlen=seq_maxlen)

Y = np.ones(len(data.Question))


Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape, 
      Ytrain.shape, Ytest.shape)

(462, 1582) (199, 1582) (462, 1582) (199, 1582) (462,) (199,)


In [10]:
# word2vec = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_BIN, binary=True)
# embedding_weights = np.zeros((vocab_size, WORD2VEC_EMBED_SIZE))
# for word, index in word2idx.items():
#     try:
#         embedding_weights[index, :] = word2vec[word.lower()]
#     except KeyError:
#         pass

# del word2vec

In [11]:
# np.savetxt("embedding_seq2seq_new.csv", embedding_weights, delimiter=",")

In [8]:
embedding_weights = np.loadtxt("embedding_seq2seq_new.csv", delimiter=",")

In [17]:
gradient_clipping_norm = 1.25

def exp_l2_dist(ht_a, ht_b):
    return K.exp(-K.sum(K.abs(ht_a-ht_b), axis=1, keepdims=True))

input_a = Input(shape=(seq_maxlen,))
input_b = Input(shape=(seq_maxlen,))

embedding_layer = Embedding(len(embedding_weights), WORD2VEC_EMBED_SIZE, weights=[embedding_weights], input_length=seq_maxlen, trainable=False)

encoded_a = embedding_layer(input_a)
encoded_b = embedding_layer(input_b)

shared_lstm = LSTM(QA_EMBED_SIZE)

a_out = shared_lstm(encoded_a)
b_out = shared_lstm(encoded_b)

malstm_distance = Merge(mode=lambda x: exp_l2_dist(x[0], x[1]), 
                        output_shape=lambda x: (x[0][0], 1))([a_out, b_out])

malstm = Model([input_a, input_b], [malstm_distance])

optimizer = Adadelta(clipnorm=gradient_clipping_norm)
malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])



In [20]:

checkpoint = ModelCheckpoint( filepath="qa-lstm-best_cosine.hdf5", verbose=1, save_best_only=True)
malstm.fit([Xqtrain, Xatrain], Ytrain, batch_size=BATCH_SIZE,
          epochs=NBR_EPOCHS, validation_split=0.1,
          callbacks=[checkpoint])


Train on 415 samples, validate on 47 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a373e50978>

In [34]:
test_question= "How can I find my statement online"
expect_question = "find statement"

In [35]:
test_Xq = pad_sequences([[word2idx[qword] for qword in nltk.word_tokenize(test_question)]], maxlen=seq_maxlen)
test_Xa = pad_sequences([[word2idx[aword] for aword in nltk.word_tokenize(expect_question)]], maxlen=seq_maxlen)

In [36]:
# best_match = [model.predict([test_Xq, test_Xa], verbose=0) for test_Xa in Xa]
malstm.predict([test_Xq, test_Xa], verbose=0) 

array([[ 0.51078147]], dtype=float32)

In [39]:
test_question1= "How can I find my statement online"
test_q1 = "online statement"
test_Xq1 = pad_sequences([[word2idx[qword] for qword in nltk.word_tokenize(test_question1)]], maxlen=seq_maxlen)
test_Xa1 = pad_sequences([[word2idx[aword] for aword in nltk.word_tokenize(test_q1)]], maxlen=seq_maxlen)
malstm.predict([test_Xq1, test_Xa1], verbose=0) 

array([[ 0.51843333]], dtype=float32)

In [None]:
test_question1= "How can I find my statement online"
test_q1 = "online statement"
test_Xq1 = pad_sequences([[word2idx[qword] for qword in nltk.word_tokenize(test_question1)]], maxlen=seq_maxlen)
test_Xa1 = pad_sequences([[word2idx[aword] for aword in nltk.word_tokenize(test_q1)]], maxlen=seq_maxlen)
malstm.predict([test_Xq1, test_Xa1], verbose=0) 

In [41]:
malstm.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_7 (InputLayer)             (None, 1582)          0                                            
____________________________________________________________________________________________________
input_8 (InputLayer)             (None, 1582)          0                                            
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 1582, 300)     1018200     input_7[0][0]                    
                                                                   input_8[0][0]                    
____________________________________________________________________________________________________
lstm_4 (LSTM)                    (None, 64)            93440       embedding_4[0][0]       

In [54]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
synonyms = {}
with open("di_synonyms.csv", 'r', encoding='latin1') as sf:
    for row in sf.readlines():
        row = row.replace("\n", '').split(",")
        canonical, rest = row[0], row[1:]
        for term in rest:
            if term:
                synonyms[term.lower()] = canonical.lower()
                synonyms[canonical.lower()] = canonical.lower()

def syn_words(sentence):
    tokens = word_tokenize(sentence)
    for seq in ngrams(tokens, 3):
        rep_str = ' '.join(seq)
        if rep_str.lower() in synonyms:
            sentence = sentence.replace(rep_str, synonyms[rep_str.lower()])
    for seq in ngrams(tokens, 2):
        rep_str = ' '.join(seq)
        if rep_str.lower() in synonyms:
            sentence = sentence.replace(rep_str, synonyms[rep_str.lower()])
    for seq in ngrams(tokens, 1):
        rep_str = ' '.join(seq)
        if rep_str.lower() in synonyms:  
            sentence = sentence.replace(rep_str, synonyms[rep_str.lower()])    
    return sentence

cq['CleanedTestUtterance'] = cq['TestUtterance'].map(syn_words)
data['CleanedQuestion'] = data['Question'].map(syn_words)

In [61]:
Xtq = []
for testq in cq.CleanedTestUtterance:
    temp = []
    for aword in nltk.word_tokenize(testq):
        try:
            temp.append(word2idx[aword])
        except:
            pass # Can't really do anything!!
    Xtq.append(temp)
Xtq = pad_sequences(Xtq, maxlen=seq_maxlen)

In [66]:
best_match =  [malstm.predict([test_Xq1, np.reshape(testq, (1,1582))], verbose=0) for testq in Xtq]

In [70]:
np.array(best_match).argmax()

53

In [71]:
data.Question[53]

"I can't view my IRA online?"

In [73]:
test_question1= "where can i find free atm"
test_Xq1 = pad_sequences([[word2idx[qword] for qword in nltk.word_tokenize(test_question1)]], maxlen=seq_maxlen)
best_match =  [malstm.predict([test_Xq1, np.reshape(testq, (1,1582))], verbose=0) for testq in Xtq]

In [74]:
np.array(best_match).argmax()

0

In [75]:
data.Question[0]

'where would i not be charged atm fees?'