In [1]:
# filepaths
from os.path import basename

# Regular expressions
import glob
import re

# main modelling
import multiprocessing
import gensim.models.word2vec as w2v

# Split Train/Test
from sklearn.model_selection import train_test_split

import numpy as np

# debug time output
import time



In [2]:
import os
os.environ['KERAS_BACKEND'] = "tensorflow"

import tensorflow as tf
import keras
from keras import backend
from keras.models import load_model
from keras.models import Sequential 
from keras.layers.recurrent import LSTM, SimpleRNN

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Report the library versions and the active Keras backend so that the
# environment this notebook ran in is recorded alongside its outputs.
print("TensorFlow Version: {}".format(tf.__version__))
print("Keras Version: {}".format(keras.__version__))
print("Keras Backend: {}".format(keras.backend.backend()))

TensorFlow Version: 1.8.0
Keras Version: 2.1.6
Keras Backend: tensorflow


In [4]:
# -------------------------------------------------------------------------------------------------
# Sentence Text Processing
# -------------------------------------------------------------------------------------------------

def legal_char_for_char(c):
    """Return the cleaned form of a single character.

    A newline is replaced by a single space; every other value (including
    the empty string returned by a read at end-of-file) is passed through
    unchanged.
    """
    return ' ' if c == '\n' else c

def sentence_to_wordlist(sent):
    """Split a sentence into a list of purely alphabetic word tokens.

    Every character that is not an ASCII letter (digits, punctuation,
    apostrophes, ...) is treated as a separator, so "It's" yields
    ['It', 's']. Case is preserved.
    """
    return re.sub("[^a-zA-Z]"," ", sent).split()

def sentence_end_char(c):
    """Return True when `c` is a sentence terminator ('.', '?' or '!')."""
    # Tuple membership (not a substring test) so that '' is not a match.
    return c in ('.', '?', '!')

def mid_sentence_abbr(str):
    """Return True when the text ends in a known abbreviation with a period.

    Used to tell an abbreviation's trailing '.' apart from a real sentence
    end. The match is a plain case-sensitive suffix test, so any text that
    merely ends with one of these character sequences also matches.
    """
    known_suffixes = (
        "Mr.",
        "Mrs.",
        "Dr.",
        "Hon.",
        "etc.",
        "ie.",
        "AM.",
        "PM.",
    )
    return str.endswith(known_suffixes)

def import_textfile_to_sentencelist(filepath):
    """Read a text file and split its contents into a list of sentences.

    The file is consumed one character at a time. Newlines are turned into
    spaces, and a sentence is emitted (with surrounding whitespace stripped)
    each time a '.', '?' or '!' is read, unless the text accumulated so far
    ends in a known abbreviation such as "Mr.".

    Text after the final sentence terminator is not returned.
    """
    sentences = []
    pending = ""
    with open (filepath, "r") as handle:
        # iter(callable, sentinel) yields one character per read until
        # read(1) returns '' at end-of-file.
        for ch in iter(lambda: handle.read(1), ''):
            pending += legal_char_for_char(ch)

            at_sentence_end = sentence_end_char(ch) and not mid_sentence_abbr(pending)
            if at_sentence_end:
                sentences.append(pending.strip())
                pending = ""

    return sentences

In [5]:
# Converts one line's worth of word tokens to a sized list, with UNK padding

def linelist_to_sizedunkwordlist(inputlinelist, sizelimit):
    """Return a copy of a token list that is exactly `sizelimit` items long.

    Lists shorter than `sizelimit` are padded at the end with the literal
    token 'unk'; longer lists are cropped to their first `sizelimit` tokens;
    lists that already have the right length are returned as a copy.

    Args:
        inputlinelist: sequence of word tokens for one line. Not modified.
        sizelimit: required length of the result. Values <= 0 give [].

    Returns:
        A new list containing exactly max(sizelimit, 0) tokens.
    """
    if sizelimit <= 0:
        return []

    # Slicing both crops over-long input and copies it, so the caller's
    # list is never mutated by the padding below.
    sized = list(inputlinelist[:sizelimit])

    # A non-positive shortfall multiplies to an empty list, i.e. no padding.
    sized.extend(['unk'] * (sizelimit - len(sized)))
    return sized
    

In [6]:
# Load the previously trained word2vec model from disk.
w2v_model = w2v.Word2Vec.load('fullw2v_d250_v85000.w2v')

# Length of one word vector; used to size the arrays fed to the network.
# NOTE(review): presumably matches the loaded model's vector size - confirm.
w2vDimension = 250

# Vector substituted for words missing from the vocabulary. It is read through
# the model's KeyedVectors (`.wv`) because indexing the Word2Vec object
# directly is deprecated and no longer supported in gensim 4.
unk_vect     = w2v_model.wv['unk']

# All-ones vector of the same length (not referenced by the visible cells).
unk2_vect    = np.ones(w2vDimension)

In [7]:
# I've found simple inputs, something like just the dialog_esl_conversations file, and around 10,000 epochs 
# of training, will start to give reasonable responses.

# Load the previously saved Keras model from its HDF5 file.
model = load_model('bot_E100.h5')
# Number of word tokens per input line: the interactive cell passes it as the
# size limit when padding/cropping the token list and uses it as the
# sequence-length axis of the array given to model.predict().
WordPerSentenceLimit = 12

In [8]:
# To interpret the processing going on here, dissect each output-line in turn.
# 1 - The entered text
# 2 - Cropped entered text
# 3 - The w2v lookup of input text. If things are completely turned to unk's then check capitalisation or the w2v contents.
# 4 - The output and confidence value.
# Enter a blank line to break the loop.

# Interactive loop: read a line, turn it into word vectors, run the network
# and print the nearest vocabulary word for every predicted vector.
# All word2vec access goes through `w2v_model.wv` (the KeyedVectors object);
# calling these methods on the Word2Vec model itself is deprecated and no
# longer supported in gensim 4.
running = True

while running:

    # Get a new line of text to process
    newinput_line = input("enter text:")

    # Exit if it's too short to be valid (this also covers a blank line)
    if len(newinput_line) < 2:
        print("Line too short")
        break

    # Split the new input into alphabetic word tokens
    newinput_words = sentence_to_wordlist(newinput_line)
    print(newinput_words)

    # Size the wordlist for the model
    sized_wordlist = linelist_to_sizedunkwordlist(newinput_words, WordPerSentenceLimit)
    print(sized_wordlist)

    # Create the specifically sized array to populate and pass into the model:
    # (one sample, WordPerSentenceLimit words, w2vDimension values per word).
    q_np_vect = np.zeros( (1, WordPerSentenceLimit, w2vDimension) )

    # Convert the words to vectors, falling back to the 'unk' vector for any
    # word missing from the vocabulary. The slice bounds the loop to the rows
    # that exist in q_np_vect even if the sized list came back too long.
    for currwordindex, currword in enumerate(sized_wordlist[:WordPerSentenceLimit]):
        if currword in w2v_model.wv:
            currvect = w2v_model.wv[currword]
        else:
            currvect = unk_vect
        # One slice assignment replaces the per-component Python copy loop.
        q_np_vect[0, currwordindex, :] = currvect

    # Undo the assignment to check inputs: map each input vector back to its
    # nearest vocabulary word and keep the best (word, similarity) pair.
    inputcheck = [
        w2v_model.wv.most_similar( [q_np_vect[0][currwordindex]] )[0]
        for currwordindex in range(WordPerSentenceLimit)
    ]
    print (inputcheck)

    # Call model to perform predictions
    predictions = model.predict(q_np_vect)

    # Convert the returned array of vectors back into the closest words
    for currwordindex in range(WordPerSentenceLimit):
        matchlist = w2v_model.wv.most_similar( [predictions[0][currwordindex]] )
        print (matchlist[0])

enter text:Hello
['Hello']
['Hello', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk']
[('unk', 0.9999999403953552), ('unk', 0.9999999403953552), ('unk', 0.9999999403953552), ('unk', 0.9999999403953552), ('unk', 0.9999999403953552), ('unk', 0.9999999403953552), ('unk', 0.9999999403953552), ('unk', 0.9999999403953552), ('unk', 0.9999999403953552), ('unk', 0.9999999403953552), ('unk', 0.9999999403953552), ('unk', 0.9999999403953552)]
('that', 0.5332918763160706)
('acuity', 0.5318058133125305)
('beltane', 0.547277569770813)
('beltane', 0.5536845922470093)
('beltane', 0.5425119996070862)
('unk', 0.47089889645576477)
('unk', 0.6404502391815186)
('unk', 0.6822835206985474)
('unk', 0.6904045343399048)
('unk', 0.6917603015899658)
('unk', 0.6920086741447449)
('unk', 0.6891658306121826)
enter text:hello
['hello']
['hello', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk']
[('hello', 0.9999999403953552), ('unk', 0.9999999403953552), ('unk', 0.