In [1]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
import pickle

Using TensorFlow backend.


In [2]:
# Load the raw training corpus into memory as one string.
file_path = '../data/relationship_advice_1000.txt'

# Decode explicitly instead of relying on the platform's default encoding:
# the corpus contains non-ASCII punctuation (typographic apostrophes show up
# in the generated text), which would be mis-decoded under a non-UTF-8 locale.
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
with open(file_path, 'r', encoding='utf-8') as file:
    data = file.read()

In [3]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` with ``n_words`` predicted words.

    At every step the text generated so far is encoded with ``tokenizer``,
    pre-padded/truncated to ``max_length`` tokens, and fed to ``model``; the
    highest-scoring vocabulary index is mapped back to a word and appended.

    Args:
        model: Trained network whose ``predict`` returns one score per
            vocabulary index for each input row (e.g. a softmax output).
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: Number of context tokens the model expects as input.
        seed_text: Text used as the start of the generated sequence.
        n_words: Number of words to generate.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. If a predicted index has no word (e.g. index 0), an empty
        string is appended for that step.
    """
    # Invert the vocabulary once instead of scanning it for every generated
    # word; word indices are unique, so the mapping is unambiguous.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad (or truncate) to the fixed context length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # Sequential.predict_classes() no longer exists in recent Keras /
        # TensorFlow releases; taking the argmax of predict() is the
        # equivalent for a multi-class output and works on old versions too.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(probabilities.argmax(axis=-1)[0])
        # map predicted word index to word
        out_word = index_to_word.get(predicted_index, '')
        # append to input
        in_text += ' ' + out_word
    return in_text

In [4]:
# Tokenize. The custom filter list does not contain ( ) [ ] . ! ? so those
# characters stay attached to their tokens instead of being stripped.
tokenizer = Tokenizer(filters='"#$%&*+,-/:;<=>@\\^_`{|}~\t\n')
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
# retrieve vocabulary size (+1 leaves room for index 0, which Keras reserves
# and which pad_sequences uses as its padding value)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# encode 2 words -> 1 word: every window of three consecutive tokens becomes
# one training sample (two context words followed by the target word)
sequences = [encoded[i-2:i+1] for i in range(2, len(encoded))]
print('Total Sequences: %d' % len(sequences))

# pad sequences (all windows already have the same length here; padding keeps
# the array rectangular should that ever change)
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)
# split into input and output elements: last column is the target word
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

# define model: embedding -> LSTM -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=max_length-1))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


Vocabulary Size: 2932
Total Sequences: 16602
Max Sequence Length: 3



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 50)             146600    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dense_1 (Dense)              (None, 2932)              296132    
Total params: 503,132
Trainable params: 503,132
Non-trainable params: 0
_________________________________________________________________
None




In [5]:
# Train the network on the (context, next-word) pairs.
# NOTE(review): the output saved with this cell reports "Epoch n/20", i.e. it
# was produced with a different num_epochs than the value set here. Re-run the
# cell so the recorded log matches the code (num_epochs also names the files
# written by the save cell).
num_epochs = 100
model.fit(x=X, y=y, epochs=num_epochs, verbose=2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/20
 - 4s - loss: 6.7777 - acc: 0.0545
Epoch 2/20
 - 4s - loss: 6.2223 - acc: 0.0560
Epoch 3/20
 - 4s - loss: 6.0444 - acc: 0.0601
Epoch 4/20
 - 4s - loss: 5.8323 - acc: 0.0729
Epoch 5/20
 - 4s - loss: 5.6126 - acc: 0.0900
Epoch 6/20
 - 4s - loss: 5.3589 - acc: 0.1131
Epoch 7/20
 - 4s - loss: 5.0464 - acc: 0.1395
Epoch 8/20
 - 4s - loss: 4.7225 - acc: 0.1691
Epoch 9/20
 - 4s - loss: 4.4222 - acc: 0.1951
Epoch 10/20
 - 4s - loss: 4.1446 - acc: 0.2236
Epoch 11/20
 - 4s - loss: 3.8930 - acc: 0.2539
Epoch 12/20
 - 4s - loss: 3.6637 - acc: 0.2784
Epoch 13/20
 - 4s - loss: 3.4530 - acc: 0.3091
Epoch 14/20
 - 4s - loss: 3.2568 - acc: 0.3410
Epoch 15/20
 - 4s - loss: 3.0762 - acc: 0.3681
Epoch 16/20
 - 4s - loss: 2.9045 - acc: 0.3968
Epoch 17/20
 - 4s - loss: 2.7468 - acc: 0.4236
Epoch 18/20
 - 5s - loss: 2.6015 - acc: 0.4462
Epoch 19/20
 - 4s - loss: 2.4647 - acc: 0.4739
Epoch 20/20
 - 4s - l

<keras.callbacks.History at 0x10acce0f0>

In [6]:
def generate_prediction(string, num_words):
    """Generate ``num_words`` words after ``string`` with the trained model.

    Thin convenience wrapper around ``generate_seq`` that supplies the
    notebook-level ``model`` and ``tokenizer`` and uses ``max_length - 1``
    as the context length, so those globals must be defined before calling.

    Args:
        string: Seed text to continue.
        num_words: Number of words to append.

    Returns:
        The seed text followed by the generated words.
    """
    return generate_seq(model, tokenizer, max_length-1, string, num_words)

In [7]:
# evaluate model: generate a continuation for each seed phrase

seed_phrases = [
    "My wife",
    "My husband",
    "My friend",
    "My fiance",
    "My (22M)",
    "My girlfriend",
    "My boyfriend",
    "My partner",
    "My (23F)",
    "My spouse",
]

# number of words generated after each seed phrase
length = 25

# One loop instead of ten copy-pasted print calls; the trailing "\n" argument
# keeps a blank line between consecutive samples.
for seed_phrase in seed_phrases:
    print(generate_prediction(seed_phrase, length), "\n")


My wife is going to abandon her and now i don’t know what to do i do? my gf (24f) of 3 years just informed me and 

My husband (31m) has been cheating on me and he is going to abandon her and now i don’t know what to do i do? my gf 

My friend boyfriend (21m) is extremely addicted to gaming my wife is going to abandon her and now i don’t know what to do i do? my 

My fiance (27f) possibly gave a stripper a handjob at her bachelorette party planning on leaving [update] my wife is going to abandon her and now i 

My (22M) girlfriend (20f) of her and now i don’t know what to do i do? my gf (24f) of 3 years just informed me and he 

My girlfriend [25f] of two years now i don’t know what to do i do? my gf (24f) of 3 years just informed me and he is 

My boyfriend (27m) is cheating on me and he is going to abandon her and now i don’t know what to do i do? my gf (24f) 

My partner socks to wipe after going to abandon her and now i don’t know what to do i do? my gf (24f) of 3 years just 

My (

In [8]:
# Persist the trained network together with the objects needed to reuse it
# for generation; both files share the epoch count in their name.
model_path = "../saved_models/ra_top_{}.h5".format(num_epochs)
pickle_path = "../saved_models/ra_top_{}.pickle".format(num_epochs)

# Network architecture and weights.
model.save(model_path)

# Tokenizer, sequence length and epoch count, pickled as a single list.
key_data = [tokenizer, max_length, num_epochs]
with open(pickle_path, 'wb') as f:
    pickle.dump(key_data, f)

In [9]:
# Spot-check generation from a free-form seed phrase (15 generated words).
sample_text = generate_prediction("Dan Welsh", 15)
print(sample_text)

Dan Welsh a bad choice. my (f30) boyfriend just told me she was pregnant. i broke up


In [10]:
# Remove the `model` name from the notebook namespace; the network was already
# written to disk by model.save() in the save cell. generate_prediction() reads
# this global, so it cannot be called again until `model` is rebound.
del model