In [12]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
import pickle

In [13]:
file_path = '../data/relationships_10000.txt'

# Decode explicitly as UTF-8 rather than with the platform's default encoding:
# the corpus appears to contain non-ASCII characters (e.g. typographic
# apostrophes), which a non-UTF-8 default would corrupt or fail to decode.
with open(file_path, 'r', encoding='utf-8') as file:
    data = file.read()
print("file imported")

file imported


In [14]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` with ``n_words`` words predicted by ``model``.

    At every step the text generated so far is encoded with ``tokenizer``,
    padded with ``pad_sequences`` and fed to ``model``; the highest-scoring
    vocabulary index is mapped back to a word and appended to the text.

    Args:
        model: Object whose ``predict(x, verbose=0)`` returns an array with one
            row per input and one score per vocabulary index.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and a
            ``word_index`` mapping of word -> integer index.
        max_length: Value passed to ``pad_sequences`` as ``maxlen``.
        seed_text: Text the generation starts from.
        n_words: Number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. An index with no entry in ``word_index`` contributes an empty
        word (i.e. only the separating space is appended).
    """
    # Invert the vocabulary once instead of scanning it for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = seed_text
    for _ in range(n_words):
        # Encode the text generated so far as integer indices.
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # Pre-pad to the fixed length expected by the model.
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # `predict_classes` is no longer available in newer Keras/TensorFlow
        # releases; taking the argmax over the predicted scores is the
        # equivalent operation for a softmax output layer.
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(scores.argmax(axis=-1)[0])
        # Map the predicted index back to its word ('' when it has none).
        out_word = index_to_word.get(predicted_index, '')
        in_text += ' ' + out_word
    return in_text

In [15]:
# Model hyper-parameters.
number_of_embeddings = 10  # output dimension of the Embedding layer
LSTM_units = 100           # number of units in the LSTM layer


# Tokenize the whole corpus. The custom filter string does not strip
# ( ) [ ] . ! ? so tokens such as "(34f)" are kept intact.
tokenizer = Tokenizer(filters='"#$%&*+,-/:;<=>@\\^_`{|}~\t\n')
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
# One extra slot on top of the vocabulary: word indices start at 1 and
# index 0 is what pad_sequences fills with.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# Sliding window of three consecutive tokens: 2 context words -> 1 target word.
sequences = [encoded[i-2:i+1] for i in range(2, len(encoded))]
print('Total Sequences: %d' % len(sequences))

# Pad every window to the longest one (all windows have the same length here,
# so this mainly converts the nested list to a 2-D array).
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)
# Split into inputs (all but the last column) and targets (last column).
sequences = array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)

# Define the network: embedding -> LSTM -> dropout -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(vocab_size, number_of_embeddings, input_length=max_length-1))
model.add(LSTM(LSTM_units))
model.add(Dropout(0.1))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

# Compile the network.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


Vocabulary Size: 9860
Total Sequences: 135581
Max Sequence Length: 3
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 2, 10)             98600     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 9860)              995860    
Total params: 1,138,860
Trainable params: 1,138,860
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
def generate_arrays_from_file(path):
    """Endlessly yield ``(X, y)`` training batches, one per usable line of ``path``.

    Each line is tokenized and cut into sliding windows of three consecutive
    tokens; the first two tokens of every window form a row of ``X`` and the
    third one is one-hot encoded into the matching row of ``y``. After the last
    line the file is re-opened and read again, so the generator never ends on
    its own.

    Relies on the notebook-level ``tokenizer``, ``max_length`` and
    ``vocab_size``.

    Args:
        path: Path of the text file to read (decoded as UTF-8).

    Yields:
        Tuple ``(X, y)`` for every line that contains at least three tokens.

    Raises:
        ValueError: If a full pass over the file produced no batch at all,
            which would otherwise make the generator loop forever.
    """
    while True:
        yielded_any = False
        with open(path, encoding='utf-8') as f:
            for line in f:
                # texts_to_sequences returns one list per input text; without
                # the [0] the window loop below would iterate over a list of
                # length 1 and never produce a sequence.
                encoded = tokenizer.texts_to_sequences([line])[0]
                # encode 2 words -> 1 word
                sequences = [encoded[i-2:i+1] for i in range(2, len(encoded))]
                if not sequences:
                    # Fewer than three tokens: nothing to learn from this line.
                    continue

                # pad sequences
                sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
                # split into input and output elements
                sequences = array(sequences)
                X, y = sequences[:, :-1], sequences[:, -1]
                y = to_categorical(y, num_classes=vocab_size)
                yielded_any = True
                yield (X, y)
        if not yielded_any:
            raise ValueError(
                "no line in %r contains at least three tokens" % (path,))

In [None]:
# Training configuration.
num_epochs = 10
# The next two values are only referenced by the generator-based variant below;
# they are not passed to model.fit.
batch_size = 1000
num_steps_per_epoch = 2000  # alternative: int((len(data)) / batch_size)

# Generator-based variant, kept for reference:
#   model.fit_generator(generate_arrays_from_file(file_path),
#                       steps_per_epoch=num_steps_per_epoch, epochs=num_epochs, verbose=1)

# Train on the in-memory arrays.
model.fit(x=X, y=y, epochs=num_epochs, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 22208/135581 [===>..........................] - ETA: 1:17 - loss: 4.4076 - acc: 0.2290

In [7]:
def generate_prediction(string, num_words):
    """Continue ``string`` with ``num_words`` generated words.

    Thin wrapper around ``generate_seq`` that supplies the notebook-level
    ``model`` and ``tokenizer`` and passes ``max_length - 1`` as the input
    length.
    """
    input_length = max_length - 1
    return generate_seq(model, tokenizer, input_length, string, num_words)

In [8]:
# Qualitative evaluation: let the model continue a handful of seed phrases.

seed_texts = [
    "My wife",
    "My husband",
    "My friend",
    "My fiance",
    "My (22M)",
    "My girlfriend",
    "My boyfriend",
    "My partner",
    "My (23F)",
    "My spouse",
]
# Keep the individual names bound as well, in case other cells refer to them.
(test1, test2, test3, test4, test5,
 test6, test7, test8, test9, test10) = seed_texts

length = 25  # number of words generated per seed

for seed_text in seed_texts:
    print(generate_prediction(seed_text, length), "\n")


My wife (34f) and i don't know what to do about my ex gf (f20) and i don't know what to do about my ex gf (f20) 

My husband (35m) of 13 years and i don't know what to do about my ex gf (f20) and i don't know what to do about my 

My friend is trying to get over my ex gf (f20) and i don't know what to do about my ex gf (f20) and i don't know 

My fiance (25m) when i try to help her insecurities? me [30f] because i don't know what to do about my ex gf (f20) and i don't 

My (22M) boyfriend (23m) is suicidal but refuses to support her i am not sure if i should break up with me and i don't know what 

My girlfriend is it time to be in a relationship with my boyfriend (21m) broke up with me and i don't know what to do about my 

My boyfriend (21m) broke up with me and i don't know what to do about my ex gf (f20) and i don't know what to do about 

My partner (24m) who i've been seeing has ed that he's not actually dying he's just a friend (24m) who i've been seeing has ed that he's 

My (2

In [9]:
# Persist the trained network; the epoch count is embedded in the file name.
model_path = "../saved_models/relationships_{}.h5".format(num_epochs)
model.save(model_path)

# Store what is needed to reuse the model next to it: the fitted tokenizer,
# the sequence length and the number of training epochs.
key_data = [tokenizer, max_length, num_epochs]
pickle_path = "../saved_models/relationships_{}.pickle".format(num_epochs)

with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(key_data, pickle_file)

In [10]:
# One more spot check with a seed phrase and a shorter continuation (15 words).
custom_prediction = generate_prediction("Dan Welsh", 15)
print(custom_prediction)

Dan Welsh by my grandma (73f) and don’t know what to do about my ex gf (f20)


In [11]:
# Unbind the notebook-level name `model`; any later cell that uses it will
# raise NameError until the model is rebuilt or reloaded.
# NOTE(review): presumably done to release the network's memory — confirm.
del model