In [2]:
import numpy
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.models import load_model

import os
import yaml
import pickle
import keras

Using TensorFlow backend.


In [3]:
#function for preparing text data into sequences for training
def data_sequencing(data):
    """Build padded forward and reversed next-word training sequences from text.

    A Tokenizer is fitted on ``data``. Each text is integer-encoded, and every
    prefix of two or more tokens becomes one training sequence. The same is
    done on the token list read back-to-front to obtain the reversed
    sequences. All sequences are left-padded ('pre') to the longest length.

    Side effects: pickles the fitted tokenizer to 'tokenizer.pkl' and the
    maximum sequence length to 'max_length.pkl' (paths relative to the current
    working directory) and prints vocabulary size, sequence count and maximum
    length.

    Args:
        data: list of texts to fit the tokenizer on and to encode.

    Returns:
        Tuple ``(X, y, rev_X, rev_y, max_length, vocab_size)``. ``X`` / ``rev_X``
        hold every column but the last of the padded forward / reversed
        sequences, ``y`` / ``rev_y`` hold the last column, ``max_length`` is the
        padded width and ``vocab_size`` is ``len(tokenizer.word_index) + 1``.

    Raises:
        ValueError: if no text in ``data`` encodes to at least two tokens, so
            there is nothing to train on.
    """
    # integer encode sequences of words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data)
    with open('tokenizer.pkl', 'wb') as f: # Save the tokeniser by pickling it
        pickle.dump(tokenizer, f)

    # retrieve vocabulary size (+1 as in the original; index 0 is the padding value)
    vocab_size = len(tokenizer.word_index) + 1
    print('Vocabulary Size: %d' % vocab_size)

    # create line-based sequences: one growing prefix per token position
    sequences = list()
    rev_sequences = list()
    # One batch call encodes each text exactly as the per-text calls did.
    for encoded in tokenizer.texts_to_sequences(data):
        rev_encoded = encoded[::-1]
        for end in range(2, len(encoded) + 1):
            sequences.append(encoded[:end])
            rev_sequences.append(rev_encoded[:end])
    print('Total Sequences: %d' % len(sequences))

    if not sequences:
        # Fail with a clear message instead of an opaque error from max([]).
        raise ValueError(
            'data_sequencing needs at least one text with two or more tokens')

    #find max sequence length
    max_length = max(len(seq) for seq in sequences)
    with open('max_length.pkl', 'wb') as f: # Save max_length by pickling it
        pickle.dump(max_length, f)
    print('Max Sequence Length: %d' % max_length)

    # pad sequences and create the forward sequence; pad_sequences already
    # returns a numpy array, so no extra array() copy is needed
    sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
    # split into input and output elements
    X, y = sequences[:, :-1], sequences[:, -1]

    #pad sequences and create the reverse sequencing
    rev_sequences = pad_sequences(rev_sequences, maxlen=max_length, padding='pre')
    # split into input and output elements
    rev_X, rev_y = rev_sequences[:, :-1], rev_sequences[:, -1]

    return X, y, rev_X, rev_y, max_length, vocab_size

In [4]:
# Read every YAML conversation file under dir_path and collect question/answer pairs.
dir_path = 'Conversation'
# os.listdir returns entries in arbitrary order; sort so the files are read
# in the same order on every run.
files_list = sorted(os.listdir(dir_path + os.sep))

questions = list()
answers = list()
for filepath in files_list:
    # The context manager closes each file; a bare open() left one handle
    # open per conversation file.
    with open(os.path.join(dir_path, filepath), 'rb') as stream:
        docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len(con) > 2:
            # First entry is the question; all remaining entries are merged
            # into one answer, each prefixed with a single space.
            questions.append(con[0])
            replies = con[1:]
            ans = ''.join(' ' + rep for rep in replies)
            answers.append(ans)
        elif len(con) > 1:
            # Exactly one question and one answer.
            questions.append(con[0])
            answers.append(con[1])
        # Conversations with fewer than two entries are skipped.
 

In [6]:
#returning forward and reverse sequences along with max sequence
#length from the data
# Keep only string entries and drop duplicates. dict.fromkeys preserves
# first-seen order (guaranteed from Python 3.7), whereas list(set(...)) yields
# a different order on each interpreter run because of string hash
# randomisation, which changed the sample order fed to training and to the
# validation split.
data = list(dict.fromkeys(
    text for text in questions + answers if isinstance(text, str)
))

X, y, rev_X, rev_y, max_length, vocab_size = data_sequencing(data)

Vocabulary Size: 1894
Total Sequences: 7528
Max Sequence Length: 72


In [7]:
# define forward sequence model:
# embedding -> bidirectional LSTM -> softmax over the vocabulary
model = Sequential([
    # inputs are the padded sequences minus their last (target) token
    Embedding(vocab_size, 128, input_length=max_length-1),
    Bidirectional(LSTM(128)),
    Dense(vocab_size, activation='softmax'),
])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 71, 128)           242432    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_1 (Dense)              (None, 1894)              486758    
Total params: 992,358
Trainable params: 992,358
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# define reverse model (same architecture, trained on the reversed sequences):
# embedding -> bidirectional LSTM -> softmax over the vocabulary
rev_model = Sequential([
    # inputs are the padded sequences minus their last (target) token
    Embedding(vocab_size, 128, input_length=max_length-1),
    Bidirectional(LSTM(128)),
    Dense(vocab_size, activation='softmax'),
])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
rev_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 71, 128)           242432    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_2 (Dense)              (None, 1894)              486758    
Total params: 992,358
Trainable params: 992,358
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
# compile forward sequence network
# sparse categorical cross-entropy is used because the targets in y are
# integer word ids (many classes), not one-hot vectors
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.Adam(0.005),
    # spelled 'accuracy' rather than 'acc' so the logged metric names match
    # those of the reverse model's training run
    metrics=['accuracy'],
)
# fit network, holding out 20% of the samples for validation
model.fit(
    X, y,
    batch_size=512,
    epochs=200,
    verbose=2,
    shuffle=True,
    validation_split=0.2,
)
# save the model to file
model.save('model.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6022 samples, validate on 1506 samples
Epoch 1/2
 - 124s - loss: 7.0458 - acc: 0.0287 - val_loss: 6.4791 - val_acc: 0.0319
Epoch 2/2
 - 80s - loss: 6.2622 - acc: 0.0344 - val_loss: 6.6515 - val_acc: 0.0359


In [10]:
# Compile the reverse-sequence network; targets in rev_y are integer word ids,
# hence the sparse variant of categorical cross-entropy.
rev_model.compile(
    optimizer=keras.optimizers.Adam(0.005),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Train on the reversed sequences, holding out 20% of the samples for validation.
rev_model.fit(
    x=rev_X,
    y=rev_y,
    epochs=200,
    batch_size=512,
    validation_split=0.2,
    shuffle=True,
    verbose=2,
)
# Persist the trained reverse model.
rev_model.save('rev_model.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6022 samples, validate on 1506 samples
Epoch 1/2
 - 81s - loss: 6.8006 - accuracy: 0.0385 - val_loss: 6.2235 - val_accuracy: 0.0425
Epoch 2/2
 - 52s - loss: 5.9038 - accuracy: 0.0448 - val_loss: 6.3312 - val_accuracy: 0.0425
