In [1]:

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

Using TensorFlow backend.


In [2]:
# Read the whole training corpus into memory as one string.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, and the corpus contains non-ASCII characters
# (curly quotes show up in the generated text below).
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open('data/relationship_advice_1000.txt', 'r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

In [3]:


# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` with ``n_words`` words predicted by ``model``.

    At every step the text generated so far is re-encoded, cut/padded to
    ``max_length`` tokens, and the highest-scoring word is appended.

    Args:
        model: fitted model whose ``predict`` returns one row of per-word
            scores for each input row.
        tokenizer: fitted ``Tokenizer``; used to encode the text and, through
            ``word_index``, to map the predicted index back to a word.
        max_length: number of context tokens handed to the model.
        seed_text: text to start from.
        n_words: number of words to generate.

    Returns:
        ``seed_text`` followed by the generated words, each preceded by a
        single space.
    """
    # Invert word -> index once up front instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad (or trim) to the fixed context length the model expects
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict() + argmax picks the same class as the deprecated
        # Sequential.predict_classes() did for a multi-class output, and
        # also works on Keras versions where predict_classes no longer exists.
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(scores.argmax(axis=-1)[0])
        # An index with no vocabulary entry maps to an empty word, as before.
        out_word = index_to_word.get(predicted_index, '')
        # append to input so the next step sees the new word as context
        in_text += ' ' + out_word
    return in_text


# Fit a word-level tokenizer on the corpus. The custom filter list omits
# ( ) [ ] . ! ? so those characters are not stripped from the text.
tokenizer = Tokenizer(filters='"#$%&*+,-/:;<=>@\\^_`{|}~\t\n')
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]

# vocabulary size: one slot more than the number of distinct words
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# slide a 3-token window over the corpus: 2 context words -> 1 target word
sequences = [encoded[stop - 3:stop] for stop in range(3, len(encoded) + 1)]
print('Total Sequences: %d' % len(sequences))

# pad every window to the length of the longest one
max_length = max(len(window) for window in sequences)
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# all columns but the last are the model input; the last column is the target
sequences = array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]
# one-hot encode the target word over the whole vocabulary
y = to_categorical(y, num_classes=vocab_size)
# define model: a 20-dimensional embedding over the max_length-1 context
# tokens, a 100-unit LSTM, and a softmax layer with one output per
# vocabulary entry
model = Sequential()
model.add(Embedding(vocab_size, 20, input_length=max_length-1))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output
model.summary()
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




Vocabulary Size: 2896
Total Sequences: 33018
Max Sequence Length: 3



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 20)             57920     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               48400     
_________________________________________________________________
dense_1 (Dense)              (None, 2896)              292496    
Total params: 398,816
Trainable params: 398,816
Non-trainable params: 0
_________________________________________________________________
None




In [4]:
# Train the network on the (context words, next word) pairs.
# verbose=2 asks Keras for per-epoch progress lines rather than a progress bar.
num_epochs = 50
model.fit(x=X, y=y, epochs=num_epochs, verbose=2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/50
 - 8s - loss: 6.5501 - acc: 0.0548
Epoch 2/50
 - 7s - loss: 6.1094 - acc: 0.0540
Epoch 3/50
 - 7s - loss: 5.8229 - acc: 0.0638
Epoch 4/50
 - 7s - loss: 5.5087 - acc: 0.0830
Epoch 5/50
 - 7s - loss: 5.0855 - acc: 0.1191
Epoch 6/50
 - 7s - loss: 4.6011 - acc: 0.1657
Epoch 7/50
 - 7s - loss: 4.1438 - acc: 0.2168
Epoch 8/50
 - 7s - loss: 3.7384 - acc: 0.2685
Epoch 9/50
 - 7s - loss: 3.3845 - acc: 0.3182
Epoch 10/50
 - 7s - loss: 3.0802 - acc: 0.3667
Epoch 11/50
 - 6s - loss: 2.8177 - acc: 0.4087
Epoch 12/50
 - 6s - loss: 2.5892 - acc: 0.4465
Epoch 13/50
 - 7s - loss: 2.3918 - acc: 0.4793
Epoch 14/50


KeyboardInterrupt: 

In [5]:
def generate_prediction(string, num_words):
    """Continue ``string`` by ``num_words`` words using the notebook's model.

    Convenience wrapper around ``generate_seq`` that supplies the
    notebook-level ``model`` and ``tokenizer`` and a context length of
    ``max_length - 1``.

    Args:
        string: seed text to continue.
        num_words: number of words to generate.

    Returns:
        Whatever ``generate_seq`` returns for these arguments.
    """
    return generate_seq(
        model=model,
        tokenizer=tokenizer,
        max_length=max_length - 1,
        seed_text=string,
        n_words=num_words,
    )

In [6]:
# evaluate model: generate a continuation for each seed phrase

# Seed phrases to continue, in the order they are printed.
seed_texts = [
    "My wife",
    "My husband",
    "My friend",
    "My fiance",
    "My (22M)",
    "My girlfriend",
    "My boyfriend",
    "My partner",
    "My (23F)",
    "My spouse",
]

# number of words to generate after each seed phrase
length = 25

# One loop instead of ten copy-pasted print calls; the trailing "\n" argument
# keeps a blank line between samples.
for seed_text in seed_texts:
    print(generate_prediction(seed_text, length), "\n")


My wife is acting obsessed with this random party the wedding. why i couldn’t wanted. my [30m] girlfriend [30f] until she told me she is going to 

My husband is a 19 year old man. my(m36) wife (f43) has anyone else dealt with this? and i (26m f) got into a huge fight over 

My friend boyfriend cheated on me while i was a picture of my rope i (28m) caught my wife is acting obsessed with this random party the 

My fiance [27m] about a month ago overheard my boyfriend is mainly with me for the novelty of being a “know it all” i have a beard 

My (22M) girlfriend (20f) tried to gaslight me into the bathroom with her ex. i (m26) just got married and be causing problems update2 my gf is 

My girlfriend is acting obsessed with this random party the wedding. why i couldn’t wanted. my [30m] girlfriend [30f] until she told me she is going to 

My boyfriend is mainly with me for the novelty of being a “know it all” i have a beard neither are you. my(39m) daughter (16f) at a 

My partner life on 