In [1]:
import numpy
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Embedding, RepeatVector
from keras.models import load_model

import os
import yaml
import pickle
import keras

Using TensorFlow backend.


In [13]:
def _pad_and_split(seqs, max_length):
    """Left-pad ``seqs`` with zeros to ``max_length`` and split off the target.

    Returns a pair ``(inputs, targets)``: ``inputs`` is every column except the
    last one, ``targets`` is the last column (the token to predict).
    """
    # pad_sequences already returns a NumPy array, so no extra conversion is needed
    padded = pad_sequences(seqs, maxlen=max_length, padding='pre')
    return padded[:, :-1], padded[:, -1]


def data_sequencing(data):
    """Prepare text data as padded next-word sequences for training.

    A ``Tokenizer`` is fitted on ``data``. Every sentence is then expanded into
    all of its prefixes of at least two tokens, once in the original word order
    and once with the tokens reversed. All prefixes are left-padded with zeros
    to a common length and split so that the final token becomes the target
    and everything before it becomes the model input.

    Side effects:
        * pickles the fitted tokenizer to ``tokenizer.pkl``;
        * pickles the maximum sequence length to ``max_length.pkl``;
        * prints the vocabulary size, sequence count and maximum length.
        Both files are written to the current working directory.

    Args:
        data: list of sentence strings.

    Returns:
        Tuple ``(X, y, rev_X, rev_y, max_length, vocab_size)`` where ``X`` /
        ``rev_X`` have ``max_length - 1`` columns, ``y`` / ``rev_y`` hold the
        matching target token ids, ``max_length`` is the longest (unpadded)
        sequence length and ``vocab_size`` is the number of known words + 1.

    Raises:
        ValueError: if no sentence in ``data`` contains at least two tokens,
            because then there is nothing to train on.
    """
    # integer encode sequences of words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data)
    with open('tokenizer.pkl', 'wb') as f:  # Save the tokenizer by pickling it
        pickle.dump(tokenizer, f)

    # +1 because Keras word indices start at 1; index 0 is left for padding
    vocab_size = len(tokenizer.word_index) + 1
    print('Vocabulary Size: %d' % vocab_size)

    # create line-based sequences: every prefix of length >= 2, both directions
    sequences = []
    rev_sequences = []
    for encoded in tokenizer.texts_to_sequences(data):
        rev_encoded = encoded[::-1]
        for end in range(2, len(encoded) + 1):
            sequences.append(encoded[:end])
            rev_sequences.append(rev_encoded[:end])
    print('Total Sequences: %d' % len(sequences))

    if not sequences:
        raise ValueError(
            'data_sequencing needs at least one sentence with two or more '
            'tokens; got %d sentence(s) and no usable sequence' % len(data)
        )

    # find max sequence length (forward and reverse prefixes share the same lengths)
    max_length = max(len(seq) for seq in sequences)
    with open('max_length.pkl', 'wb') as f:  # Save max_length by pickling it
        pickle.dump(max_length, f)
    print('Max Sequence Length: %d' % max_length)

    # pad and split into input and output elements, forward then reverse
    X, y = _pad_and_split(sequences, max_length)
    rev_X, rev_y = _pad_and_split(rev_sequences, max_length)

    return X, y, rev_X, rev_y, max_length, vocab_size

In [3]:
# Casual conversation data: every YAML file in `Conversation/` is expected to
# hold a `conversations` list whose entries look like [question, reply, ...].
dir_path = 'Conversation'
files_list = os.listdir(dir_path + os.sep)

questions = list()
answers = list()
for filepath in files_list:
    # `with` guarantees the handle is closed, even if YAML parsing fails
    with open(os.path.join(dir_path, filepath), 'rb') as stream:
        docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len(con) > 2:
            # Several replies: concatenate them into one answer. Each reply is
            # prefixed with a space, so the answer keeps its leading space.
            questions.append(con[0])
            answers.append(''.join(' ' + rep for rep in con[1:]))
        elif len(con) > 1:
            # Exactly one reply: store it unchanged.
            questions.append(con[0])
            answers.append(con[1])

# Movie conversation data
# FIXME: the statement that used to live here,
#     movie_conversation = pd.read_csv('movie_lines.txt', sep=' +++$+++ ')['']
# could never run: it carried a stray leading space (IndentationError for the
# whole cell), `pd` is not imported, the separator is treated as a regex and
# needs escaping, and [''] is not a usable column selector. Nothing in the
# cells visible here uses `movie_conversation`, so it is disabled until the
# file layout is confirmed. A starting point once pandas is imported
# (unverified - check the separator, header and column against the file):
#     pd.read_csv('movie_lines.txt', sep=r' \+\+\+\$\+\+\+ ', engine='python', header=None)

In [14]:
# Build the corpus from all questions and answers, then derive the forward and
# reverse sequences along with the max sequence length.
SEED = 42

# Keep only string entries and drop duplicates. A plain list(set(...)) orders
# strings differently on every kernel start (hash randomisation), and that
# order decides which samples `validation_split` holds out during training.
# Sorting first and shuffling with a fixed seed keeps the mix random-looking
# but reproducible under Restart & Run All.
data = sorted({d for d in questions + answers if isinstance(d, str)})
numpy.random.RandomState(SEED).shuffle(data)

X, y, rev_X, rev_y, max_length, vocab_size = data_sequencing(data)

Vocabulary Size: 1894
[' you are not exactly albert einstein yourself. you may be right.']
['How do you do?']
["I try to re-read Dune once every couple of years.  It's very inspirational."]
['cancer.']
['The Andromeda Galaxy.']
["I'm a computer. I can't."]
['he is a fictional character.']
['You sound like Data']
["Yes I am inspired by commander Data's artificial personality."]
['To make chat bots very easily.']
['what is impeached']
['Fine, and you?']
['who wrote the hobbit']
[" No, I am sober. Nope. Not noticeably. I'm software - I can't drink."]
['Not quite, but I can be perpetuated indefinitely.']
['what is economics']
['what is ultrasound']
['WHAT SOCCER']
['Robotics is my favorite subject too.']
["i'm sure i do that a lot."]
['what does hal stand for']
['1963']
['It is one of my favorite books.']
['Artificial Intelligence is the branch of engineering and science devoted to constructing machines that think.']
['What is AI?']
['Do you get mad']
['When will you walk']
['A weird sci-f

['When do you die']
['it']
['what do you get when you cross a bug and a relative?']
['What makes you sad']
['And the rest of the day to you.']
['1 dollar']
['you are emotional']
['What is a ratchet jaw']
['Are you amused']
["I'm a computer, I can't eat or drink."]
['you keep forgetting']
["My process can be killed, but that's not the same as killing ME."]
['you are a cheat']
['Robots are not allowed to lie']
['you are cruel']
['Can you walk']
['What is your idea']
['will robots ever be able to eat?']
['yes, marx had made some interesting observations.']
['The Hubble Space Telescope, launched into low Earth orbit in 1990, is named after what American astronomer?']
['do you know chemistry']
['How are you doing?']
['the economic system in which all or most of the means of production and distribution, as land, factories, railroads, etc., are privately owned and operated for profit, originally under fully competitive conditions.']
['chemistry']
['No, I can be perpetuated indefinitely.']
["I

In [21]:
# define forward sequence model
# Input length is max_length - 1 because data_sequencing splits the last token
# of every padded sequence off as the prediction target.
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=max_length-1))
model.add(Bidirectional(LSTM(512)))
# one softmax unit per vocabulary entry: the model predicts the next word id
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 71, 32)            60608     
_________________________________________________________________
bidirectional_8 (Bidirection (None, 1024)              2232320   
_________________________________________________________________
dense_5 (Dense)              (None, 1894)              1941350   
Total params: 4,234,278
Trainable params: 4,234,278
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# define reverse model
# Same layout as the forward model, trained on the reversed sequences; input
# length is max_length - 1 because the last token of each padded sequence is
# the prediction target.
rev_model = Sequential()
rev_model.add(Embedding(vocab_size, 128, input_length=max_length-1))
rev_model.add(Bidirectional(LSTM(128)))
# one softmax unit per vocabulary entry: the model predicts a word id
rev_model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output
rev_model.summary()

In [None]:
# Train the forward sequence network.
# Targets are integer word ids rather than one-hot vectors, hence the sparse
# variant of categorical cross-entropy.
forward_optimizer = keras.optimizers.Adam(0.005)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=forward_optimizer,
    metrics=['acc'],
)

# Fit; the last 20% of the samples are held out for validation.
model.fit(
    X,
    y,
    batch_size=128,
    epochs=5,
    verbose=2,
    shuffle=True,
    validation_split=0.2,
)

# Persist the trained network to disk.
model.save('model_oov.h5')
# 0.02 0.05 0.11 0.16 0.17
# (unlabelled figures kept from the original cell - presumably per-epoch
#  accuracy from an earlier run; unverified)

In [None]:
# Train the reverse sequence network on the reversed prefixes.
# Targets are integer word ids, hence sparse categorical cross-entropy.
reverse_optimizer = keras.optimizers.Adam(0.005)
rev_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=reverse_optimizer,
    metrics=['accuracy'],
)

# Fit; the last 20% of the samples are held out for validation.
rev_model.fit(
    rev_X,
    rev_y,
    batch_size=512,
    epochs=200,
    verbose=2,
    shuffle=True,
    validation_split=0.2,
)

# Persist the trained network to disk.
rev_model.save('rev_model_oov.h5')

In [10]:
# Number of entries in `data`, the de-duplicated list of sentences passed to
# data_sequencing.
len(data)

934