In [1]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
import pickle

Using TensorFlow backend.


In [2]:
# Location of the raw training corpus, relative to the notebook.
file_path = '../data/relationships_10000.txt'

# Read the entire corpus into memory as a single string.
with open(file_path, 'r') as file:
    data = file.read()
print("file imported")

file imported


In [3]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` with ``n_words`` predicted words.

    At every step the text generated so far is encoded with ``tokenizer``,
    pre-padded/truncated to ``max_length`` tokens, and fed to ``model``; the
    highest-probability class is mapped back to a word and appended.

    Args:
        model: object whose ``predict(x, verbose=0)`` returns one row of
            class probabilities per input row.
        tokenizer: object exposing ``texts_to_sequences`` and ``word_index``
            (word -> integer index).
        max_length: number of tokens the model takes as input.
        seed_text: text the generation starts from.
        n_words: how many words to append.

    Returns:
        ``seed_text`` followed by the generated words, space separated. A
        predicted index with no entry in ``word_index`` contributes an empty
        word (i.e. just the separating space).
    """
    # Invert the vocabulary once instead of scanning it for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = seed_text
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # argmax over predict() is equivalent to the legacy predict_classes(),
        # which newer Keras/TensorFlow releases no longer provide
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(probabilities.argmax(axis=-1)[0])
        # map predicted word index to word and append to the running text
        in_text += ' ' + index_to_word.get(predicted_index, '')
    return in_text

In [None]:
# define model params
number_of_embeddings = 50   # dimensionality of the learned word vectors
LSTM_units = 100            # size of the LSTM hidden state


# Tokenize
# The filter list omits . ! ? ( ) ' [ ] so those characters stay part of tokens.
tokenizer = Tokenizer(filters='"#$%&*+,-/:;<=>@\\^_`{|}~\t\n')
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
# retrieve vocabulary size (+1 because word indices start at 1; 0 is left for padding)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# encode 2 words -> 1 word: every window of three consecutive tokens
sequences = [encoded[i-2:i+1] for i in range(2, len(encoded))]
print('Total Sequences: %d' % len(sequences))

# pad sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)
# split into input and output elements: last column is the target word
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

# define model
model = Sequential()
model.add(Embedding(vocab_size, number_of_embeddings, input_length=max_length-1))
model.add(LSTM(LSTM_units))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so it must not be wrapped in print()
model.summary()

# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


Vocabulary Size: 9860
Total Sequences: 135581
Max Sequence Length: 3



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 50)             493000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dense_1 (Dense)              (None, 9860)              995860    
Total params: 1,549,260
Trainable params: 1,549,260
Non-trainable params: 0
_________________________________________________________________
None




In [None]:
def generate_arrays_from_file(path):
    """Endlessly yield ``(X, y)`` training batches, one per line of a text file.

    Each line is encoded with the notebook-level ``tokenizer`` and cut into
    overlapping windows of three tokens (2 input words -> 1 target word).
    The windows are padded to the notebook-level ``max_length``; the last
    column becomes the one-hot target (``vocab_size`` classes) and the
    remaining columns the input. When the end of the file is reached the
    file is reopened and read again, so the generator never terminates on
    its own.

    Args:
        path: path of the text file to stream from.

    Yields:
        Tuple ``(X, y)`` for every line that contains at least three tokens.
        Shorter lines are skipped because they cannot form a window.

    Raises:
        ValueError: if a complete pass over the file produced no batch
            (otherwise the generator would loop forever without yielding).
    """
    while True:
        produced_batch = False
        with open(path) as f:
            for line in f:
                # texts_to_sequences returns one id list per input text;
                # take the single list that belongs to this line
                encoded = tokenizer.texts_to_sequences([line])[0]
                # encode 2 words -> 1 word
                sequences = [encoded[i-2:i+1] for i in range(2, len(encoded))]
                if not sequences:
                    # fewer than three tokens: nothing to learn from
                    continue

                # pad sequences
                sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
                # split into input and output elements
                sequences = array(sequences)
                X, y = sequences[:,:-1],sequences[:,-1]
                y = to_categorical(y, num_classes=vocab_size)
                produced_batch = True
                yield (X, y)
        if not produced_batch:
            raise ValueError(
                'no line in %r contains at least three known tokens' % (path,))

In [None]:
# training configuration
num_epochs = 50
# NOTE: batch_size and num_steps_per_epoch are only referenced by the
# commented-out generator call below; model.fit does not receive them.
batch_size = 1000
num_steps_per_epoch = 2000  # alternative: int((len(data)) / batch_size)

# streaming alternative, kept for reference:
#model.fit_generator(generate_arrays_from_file(file_path),
#                    steps_per_epoch=num_steps_per_epoch, epochs=num_epochs, verbose=1)

# fit network on the in-memory arrays
model.fit(x=X, y=y, epochs=num_epochs, verbose=1)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/50
 - 77s - loss: 6.5682 - acc: 0.0697
Epoch 2/50
 - 78s - loss: 5.6326 - acc: 0.1429
Epoch 3/50
 - 76s - loss: 5.1494 - acc: 0.1780
Epoch 4/50
 - 79s - loss: 4.8148 - acc: 0.1987
Epoch 5/50
 - 77s - loss: 4.5468 - acc: 0.2158
Epoch 6/50
 - 79s - loss: 4.3175 - acc: 0.2341
Epoch 7/50
 - 78s - loss: 4.1177 - acc: 0.2523
Epoch 8/50
 - 77s - loss: 3.9401 - acc: 0.2686
Epoch 9/50
 - 80s - loss: 3.7780 - acc: 0.2872
Epoch 10/50
 - 78s - loss: 3.6288 - acc: 0.3059
Epoch 11/50
 - 79s - loss: 3.4932 - acc: 0.3231
Epoch 12/50
 - 78s - loss: 3.3701 - acc: 0.3412
Epoch 13/50
 - 79s - loss: 3.2548 - acc: 0.3562
Epoch 14/50
 - 79s - loss: 3.1533 - acc: 0.3706
Epoch 15/50
 - 78s - loss: 3.0610 - acc: 0.3827
Epoch 16/50


In [None]:
def generate_prediction(string, num_words):
    """Continue ``string`` by ``num_words`` words using the notebook's model.

    Thin convenience wrapper around ``generate_seq`` that supplies the
    notebook-level ``model`` and ``tokenizer``. The input width passed on is
    ``max_length - 1`` because the last position of each training window is
    the target word, not an input.
    """
    input_width = max_length - 1
    return generate_seq(model, tokenizer, input_width, string, num_words)

In [None]:
# evaluate model
# Seed phrases to continue; one generated sample is printed per seed.
seed_texts = [
    "My wife",
    "My husband",
    "My friend",
    "My fiance",
    "My (22M)",
    "My girlfriend",
    "My boyfriend",
    "My partner",
    "My (23F)",
    "My spouse",
]

# number of words to generate after each seed
length = 25

for seed_text in seed_texts:
    print(generate_prediction(seed_text, length),"\n")


In [None]:
# Persist the trained network; the epoch count is part of the file name.
model_path = "../saved_models/relationships_{}.h5".format(num_epochs)
model.save(model_path)

# Persist the objects needed to run generation with the saved network.
key_data = [tokenizer, max_length, num_epochs]
pickle_path = "../saved_models/relationships_{}.pickle".format(num_epochs)
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(key_data, pickle_file)

In [None]:
# Generate and show a 15-word continuation of a fixed seed.
sample_text = generate_prediction("Dan Welsh", 15)
print(sample_text)

In [None]:
# Unbind `model` from the notebook namespace. Cells that reference `model`
# will raise NameError if run after this one, until the model is rebuilt.
del model