In [2]:
#Importing dependencies
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [3]:
# Load the raw corpus text.
# The context manager closes the file handle as soon as the text has been
# read, instead of leaving it open until the object is garbage collected.
with open("../input/shakes/shakes.txt") as corpus_file:
    file = corpus_file.read()

In [4]:
# Tokenization
# Standardization
def tokanize_words(input):
    """Lower-case the text, split it into word tokens and drop English stop words.

    Args:
        input: Raw text to normalise (a string).

    Returns:
        One string holding the remaining tokens, separated by single spaces.
    """
    input = input.lower()
    tokenizer = RegexpTokenizer(r"\w+")
    tokens = tokenizer.tokenize(input)
    # Build the stop-word set once. Evaluating stopwords.words("english")
    # inside the per-token filter repeated that call for every token and
    # tested membership against a list instead of a set.
    stop_words = set(stopwords.words("english"))
    filtered = (token for token in tokens if token not in stop_words)
    # Join with a space so word boundaries survive; joining with "" fused
    # every word into one unbroken character stream.
    return " ".join(filtered)

# Normalise the whole corpus once; the following cells read `processed_input`.
processed_input = tokanize_words(file)

In [5]:
# Assign an integer id to every distinct character of the processed corpus
chars = sorted(set(processed_input))
chars_to_num = {char: idx for idx, char in enumerate(chars)}

In [6]:
# Sanity check: report the corpus length and the number of distinct characters
vocab_len = len(chars)
input_len = len(processed_input)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

In [7]:
# Number of characters in each input window
seq_length = 100
# Containers for the encoded input windows and the encoded next character
x_data, y_data = [], []

In [8]:
# Build the training pairs: each window of `seq_length` characters (x) and the
# character that immediately follows it (y), both encoded as integer ids.
# The lists are rebuilt from scratch here so that re-running this cell does
# not append a second copy of every pattern to already-filled lists.
# The corpus is encoded once up front instead of once per window.
encoded_input = [chars_to_num[char] for char in processed_input]
x_data = [encoded_input[i:i + seq_length] for i in range(input_len - seq_length)]
# Target for window i is the character at i + seq_length, i.e. everything
# from position seq_length onwards.
y_data = encoded_input[seq_length:]

n_pattern = len(x_data)
print("Total patterns:", n_pattern)

In [9]:
# Turn the windows into an array of shape (n_pattern, seq_length, 1) and
# scale the character ids by the vocabulary size
X = numpy.asarray(x_data).reshape((n_pattern, seq_length, 1)) / float(vocab_len)

In [10]:
# One-hot encode the targets. The width is pinned to the vocabulary size so it
# does not shrink when the highest-numbered character never occurs as a target
# (without num_classes the width is inferred from the largest id present).
y = np_utils.to_categorical(y_data, num_classes = vocab_len)

In [11]:
# Network definition: three stacked LSTM layers, each followed by dropout,
# ending in a softmax layer as wide as the one-hot targets
model = Sequential([
    LSTM(256, input_shape = (X.shape[1], X.shape[2]), return_sequences = True),
    Dropout(0.2),
    LSTM(256, return_sequences = True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation = "softmax"),
])

In [12]:
# Configure training: categorical cross-entropy loss with the Adam optimizer
model.compile(
    optimizer = "adam",
    loss = "categorical_crossentropy",
)

In [13]:
# Checkpointing: save to `filepath` only when the monitored training loss
# reaches a new minimum
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor = 'loss',
    mode = 'min',
    save_best_only = True,
    verbose = 1,
)
desired_callbacks = [checkpoint]

In [14]:
# Train the network for one epoch with the checkpoint callback attached
model.fit(
    X,
    y,
    batch_size = 256,
    epochs = 1,
    callbacks = desired_callbacks,
)

In [15]:
# Restore the checkpointed weights from disk, then compile again with the
# same loss and optimizer
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(optimizer = "adam", loss = "categorical_crossentropy")

In [16]:
# Inverse lookup (integer id -> character) for decoding the model's output
num_to_char = dict(enumerate(chars))

In [17]:
# Pick a random training window to seed the generation.
# numpy's randint excludes its upper bound, so passing len(x_data) makes every
# window eligible; the former "- 1" could never select the last window and
# raised an error when only one window existed.
start = numpy.random.randint(0, len(x_data))
# Copy the window so that code extending the seed in place cannot alter the
# training sample stored in x_data.
pattern = list(x_data[start])
print("Random Seed: ", end="")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

In [22]:
# Generate 1000 characters: predict the next character from the current
# window, emit it, then slide the window forward by one position.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)  # same scaling that was applied to the training input
    prediction = model.predict(x, verbose = 0)
    # Greedy decoding: take the id with the highest predicted probability
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new window instead of appending in place: `pattern` may still be
    # the very list stored in x_data, and appending to it would lengthen that
    # training sample.
    pattern = pattern[1:] + [index]
sys.stdout.flush()