In [1]:
import os
from pickle import dump
from pickle import load

import keras
import numpy as np
from keras.layers import Dense, LSTM
from keras.models import Sequential
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The full file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()
 
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    """Write ``lines`` to ``filename``, separated by newlines.

    Args:
        lines: Iterable of strings; each becomes one line of the file.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    data = '\n'.join(lines)
    # The context manager guarantees the data is flushed and the handle
    # closed; the previous `file.close` (no parentheses) never ran.
    with open(filename, 'w') as file:
        file.write(data)


In [3]:
# Read the cleaned corpus from disk.
raw_text = load_doc("data/shakespeare_cleaned.txt")

# Collapse every run of whitespace (newlines included) into a single space.
tokens = raw_text.split()
raw_text = ' '.join(tokens)

# Slide a window over the text: `length` input characters plus the single
# character that follows them, so each sequence is length + 1 characters.
length = 40
sequences = [raw_text[end - length:end + 1] for end in range(length, len(raw_text))]
print('Total Sequences: %d' % len(sequences))

# Persist the windows, one per line.
out_filename = 'data/char_sequences.txt'
save_doc(sequences, out_filename)

Total Sequences: 93633


In [4]:
# Reload the character windows written by the previous step.
in_filename = 'data/char_sequences.txt'
raw_text = load_doc(in_filename)
lines = raw_text.split('\n')

# Give every distinct character in the file an integer id, in sorted order.
# Any '\n' separating the lines is part of raw_text, so it gets an id too.
chars = sorted(set(raw_text))
mapping = {char: index for index, char in enumerate(chars)}

# Integer-encode each line, character by character.
sequences = [[mapping[char] for char in line] for line in lines]

# Vocabulary size is the number of distinct characters seen.
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 61


In [5]:
# Split every encoded window into its input characters (all but the last
# column) and the target character (the last column).
encoded = np.array(sequences)
inputs, targets = encoded[:, :-1], encoded[:, -1]

# One-hot encode the inputs row by row, then the targets in one call.
sequences = [to_categorical(row, num_classes=vocab_size) for row in inputs]
X = np.array(sequences)
y = to_categorical(targets, num_classes=vocab_size)

In [6]:
# define model: one LSTM layer over the one-hot character windows, followed
# by a softmax over the vocabulary to predict the next character
model = Sequential()
model.add(LSTM(200, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, epochs=100, verbose=2)

# save the model to file
model.save('model.h5')
# save the mapping; the context manager closes the file once it is written
with open('mapping.pkl', 'wb') as mapping_file:
    dump(mapping, mapping_file)

W0313 00:34:58.183900 4652291520 deprecation_wrapper.py:119] From /Users/jma/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0313 00:34:58.209814 4652291520 deprecation_wrapper.py:119] From /Users/jma/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0313 00:34:58.220521 4652291520 deprecation_wrapper.py:119] From /Users/jma/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0313 00:34:58.408742 4652291520 deprecation_wrapper.py:119] From /Users/jma/anaconda3/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0313 00:34:58.426721 4652291520 dep

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 200)               209600    
_________________________________________________________________
dense_1 (Dense)              (None, 61)                12261     
Total params: 221,861
Trainable params: 221,861
Non-trainable params: 0
_________________________________________________________________
None


W0313 00:34:58.972331 4652291520 deprecation_wrapper.py:119] From /Users/jma/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Epoch 1/100
 - 86s - loss: 2.3535 - acc: 0.3388
Epoch 2/100
 - 83s - loss: 1.9566 - acc: 0.4261
Epoch 3/100
 - 83s - loss: 1.8227 - acc: 0.4578
Epoch 4/100
 - 85s - loss: 1.7295 - acc: 0.4801
Epoch 5/100
 - 89s - loss: 1.6633 - acc: 0.4970
Epoch 6/100


KeyboardInterrupt: 

In [None]:
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend ``seed_text`` with ``n_chars`` characters predicted by ``model``.

    At each step the text generated so far is integer-encoded, cut or
    left-padded to ``seq_length`` characters, one-hot encoded and fed to the
    model; the highest-probability class is mapped back to a character and
    appended.

    Args:
        model: Object exposing ``predict(x, verbose=0)`` that returns one row
            of class probabilities for a ``(1, seq_length, len(mapping))``
            input.
        mapping: Dict from character to integer class index.
        seq_length: Number of characters fed to the model per step.
        seed_text: Starting text; every character must be a key of
            ``mapping``.
        n_chars: Number of characters to generate.

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        KeyError: If the text contains a character missing from ``mapping``.
    """
    vocab_size = len(mapping)

    # Build the index -> character lookup once instead of scanning the
    # mapping on every step. setdefault keeps the first character seen for
    # an index, matching a first-match linear scan.
    index_to_char = {}
    for char, index in mapping.items():
        index_to_char.setdefault(index, char)

    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # keep only the last seq_length characters (shorter input is padded)
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=vocab_size)
        # Name the target shape explicitly: this is valid whether
        # to_categorical returned (seq_length, vocab) or (1, seq_length, vocab).
        encoded = encoded.reshape(1, seq_length, vocab_size)
        # predict character: argmax over the class probabilities
        probabilities = model.predict(encoded, verbose=0)
        yhat = int(np.argmax(probabilities, axis=-1)[0])
        # Append the predicted character; an index with no character
        # contributes nothing rather than an unrelated character.
        in_text += index_to_char.get(yhat, '')
    return in_text
 
# load the model
model = load_model('model.h5')
# load the mapping; only unpickle files you trust (this one is written by
# the training cell of this notebook)
with open('mapping.pkl', 'rb') as mapping_file:
    mapping = load(mapping_file)

# Feed windows of the length the model was built for, read from the model
# itself instead of a hard-coded value that can drift from the training
# setup. Shorter seeds are left-padded by generate_seq.
seq_length = model.input_shape[1]

# capitalised seed
print(generate_seq(model, mapping, seq_length, 'Sing a son', 20))
# lower-case seed that starts mid-phrase
print(generate_seq(model, mapping, seq_length, 'king was i', 20))
# seed unlikely to appear in the training text
print(generate_seq(model, mapping, seq_length, 'hello worl', 20))