In [355]:
#Importing relevant modules
import numpy as np
import re
import tensorflow as tf 
import random

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.preprocessing.text import one_hot

In [356]:
# Preprocess the Shakespeare text into one lowercase string of word characters and spaces.
# The context manager closes the file handle even if reading raises.
with open('data/shakespeare.txt', 'r') as f:
    lines = f.readlines()

# corpus is the raw data: keep only lines with more than one whitespace-separated
# token, dropping the last two characters of every kept line.
# NOTE(review): [:-2] presumably strips trailing punctuation plus the newline; a line
# without trailing punctuation would lose a real character -- confirm against the data.
corpus = [line[:-2] for line in lines if len(line.split()) > 1]

# Concatenate every kept line, each followed by a single space, in one join
# instead of growing the string with repeated +=.
corpus_concat = "".join(line + " " for line in corpus)
# Replace every non-word character with a space, then lowercase the whole corpus.
corpus_concat = re.sub(r'[^\w]',' ', corpus_concat).lower()

In [363]:
# Slice the corpus into fixed-length windows, each paired with the character that follows it.
subsequences = []
test_labels = []
# sequence_buffer is the stride between the start positions of successive windows
sequence_buffer = 5
# char_size is the number of characters in each window
char_size = 40

# Stopping at len(corpus_concat) - char_size guarantees the label character
# right after each window exists, so no emptiness check is needed.
for start in range(0, len(corpus_concat) - char_size, sequence_buffer):
    stop = start + char_size
    subsequences.append(corpus_concat[start:stop])
    test_labels.append(corpus_concat[stop:stop + 1])

# Sorted list of the distinct characters in the corpus
chars = sorted(set(corpus_concat))
# character -> index
forward_mapping = {c: i for i, c in enumerate(chars)}
# index -> character (the inverse of forward_mapping)
backward_mapping = {i: c for i, c in enumerate(chars)}

n_patterns = len(subsequences)
print ("Num of sequences:", n_patterns)

Num of sequences: 18419


In [365]:
# Integer-encode the label characters through forward_mapping
labels_encoded = []
for label_char in test_labels:
    labels_encoded.append(forward_mapping[label_char])

# Integer-encode every subsequence: one list of character indices per window
seq_list = [[forward_mapping[ch] for ch in window] for window in subsequences]

# Vocabulary size is the number of distinct characters in the corpus
vocab_size = len(chars)
print ("vocab size: ", vocab_size)

# Convert the nested Python lists into a numpy array, one row per window
normal_seq_list = np.array([np.array(indices) for indices in seq_list])

vocab size:  27


In [369]:
# One-hot encode each integer sequence with to_categorical so it can be fed to the network.
seq_input = []
for encoded in normal_seq_list:
    seq_input.append(keras.utils.to_categorical(encoded, num_classes=vocab_size))
# Stack the per-sequence matrices into a single array
X = np.array(seq_input)
# One-hot encode the integer labels the same way
y = keras.utils.to_categorical(labels_encoded, num_classes=vocab_size)

print ("Shape of X:", X.shape)

Shape of X: (18419, 40, 27)


In [362]:
# Model architecture: one LSTM layer followed by a softmax over the vocabulary.
# The LSTM input shape is taken from the last two axes of X.
timesteps, n_features = X.shape[1], X.shape[2]
model = Sequential([
    LSTM(200, input_shape=(timesteps, n_features)),
    Dense(vocab_size, activation='softmax'),
])

# Categorical cross-entropy loss with the Adam optimizer, tracking accuracy
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

#ADJUST NUMBER OF EPOCHS
model.fit(X, y, epochs=1, batch_size=50, verbose=1)

Epoch 1/1


<keras.callbacks.History at 0x134e42da0>

In [374]:
# Generate text greedily, starting from a randomly chosen one-hot window.
# NOTE(review): the original read X_new, which is not defined in any visible cell;
# X (built in the to_categorical cell) is used instead -- confirm this is the intended source.
# randrange excludes len(X); randint(0, len(X)) could return len(X) and raise IndexError.
rand = random.randrange(len(X))
pattern = X[rand]

# Decode the seed window: index of the largest element in each row, mapped back to characters
nums = [np.argmax(x) for x in pattern]
print (''.join([backward_mapping[value] for value in nums]), "|")

# Number of characters to generate after the seed
n_generate = 1000
for _ in range(n_generate):
    # Feed only the most recent char_size rows, reshaped to a batch of one
    to_predict = np.reshape(pattern[-char_size:], (1, char_size, vocab_size))
    prediction = model.predict(to_predict, verbose=0)
    # Greedy choice: index of the largest predicted value
    index = np.argmax(prediction)
    # One-hot encode the choice because that is how rows of 'pattern' are stored.
    # Stored under its own name so the training labels in y are not overwritten.
    next_char = keras.utils.to_categorical(index, num_classes=vocab_size)
    pattern = np.vstack((pattern, next_char))

s testy sick men when their deaths be ne |


In [375]:
# Decode the seed plus everything generated: take the index of the largest element
# in each row of pattern and map it back to its character.
nums = []
string = []
for row_vec in pattern:
    char_index = np.argmax(row_vec)
    nums.append(char_index)
    string.append(backward_mapping[char_index])

print(''.join(string), "|")

s testy sick men when their deaths be ne the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the