In [1]:
import keras
import numpy as np
import re
import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Read in the corpus and clean it

Choose which folder of text files (plays or trump) you want to read in and change the path. Also choose which functions to call to clean the corpus.  

In [2]:
# this is used to remove stage direction if we don't want them
def remove_stage_dir(text):
    text = re.sub("[\<].*?[\>]", "", text)
    text = re.sub("\\s+", " ", text)
    return text
# this is used to remove the word "SPEECH" adn the number following after that in the corpus
def remove_SPEECH(text):
    text = re.sub("SPEECH \d+", "", text)
    text = re.sub("\\s+", " ", text)
    return text

path = './plays' #change the path accordingly
in_sentences=[]
# read all files in the floder (need to be txt with UTF-8 encoding)
# chop it up in sentances (for Tokenizer)
for filename in os.listdir(path):
    text = ''.join(open(path+'/'+filename, encoding = "UTF-8", mode="r").readlines())
    split_text = re.split(r' *[\.\?!][\'"\)\]]* *', remove_stage_dir(text)) #change the function accordingly
    for chunk in split_text:
        in_sentences.append(chunk)

print(in_sentences[0:10])
print('Corpus length:', len(text))

["\ufeff Nay, but this dotage of our general's O'erflows the measure; those his goodly eyes, That o'er the files and musters of the war Have glow'd like plated Mars, now bend, now turn The office and devotion of their view Upon a tawny front; his captain's heart, Which in the scuffles of great fights hath burst The buckles on his breast, reneges all temper, And is become the bellows and the fan To cool a gipsy's lust", 'Look', 'where they come', "Take but good note, and you shall see in him The triple pillar of the world transform'd Into a strumpet's fool; behold and see", 'If it be love indeed, tell me how much', "There's beggary in the love that can be reckon'd", "I'll set a bourn how far to be belov'd", 'Then must thou needs find out new heaven, new earth', 'News, my good lord, from Rome', 'Grates me; the sum']
Corpus length: 135007


## Set the hyper-parameters and preparing the training sample
Here we choose the length of each sample sentances and the stride between each samples (setting the hyper-parameters). We then use the Tokenizer in Keras to tokenize the samples. We can also set out library size (i.e. set the maximum the number of words in the entire library)

The corpus is chopped up in natural sentances for the tokenization. It is then sticked back together as a large sequence, then we sample our sentances using the hyper-parameter settings.

After that, we normalize the sample before feeding into the neural network. We also have to one-hot encode the training label y.

In [3]:
# Length of extracted sample
maxlen = 20

# Stride of sampling
step = 1

# This holds our samples sequences
sentences = []

# This holds the next word (as training label)
next_word = []

#use Kears Tokenizer
from keras.preprocessing.text import Tokenizer

max_num_word = 10000 #max size of library
tokenizer = Tokenizer(num_words=max_num_word)
tokenizer.fit_on_texts(list(in_sentences))
list_tokenized_train = tokenizer.texts_to_sequences(list(in_sentences))

#if the library ends up smaller then the max size, update the info
if len(tokenizer.word_index) < max_num_word:
    max_num_word = len(tokenizer.word_index)
    
print('Number of words:', max_num_word)

Number of words: 10000


In [4]:
#stick the encoded words back together as a big sequence
token_word = []
for line in range (0,len(in_sentences)):
    that_sentences = list_tokenized_train[line]
    for i in range(0,len(that_sentences)):
        token_word.append(that_sentences[i])

#sample the sequence
for i in range(0, len(token_word) - maxlen, step):
    sentences.append(token_word[i: i + maxlen])
    next_word.append(token_word[i + maxlen])
print('Number of sentences:', len(sentences))

Number of sentences: 118378


In [5]:
#nomalized x
x = np.asarray(sentences).astype('float32')/max_num_word
#one-hot encode y
y = np.zeros((len(sentences), max_num_word), dtype=np.bool)
for i in range (0,len(sentences)):
    for j in range (0,maxlen):
        y[i, next_word[j]] = 1

## Building and training the neural network
The network consist of 3 layers: Embedding layers (for word embeddings), LSTM and the an output layer.

In [6]:
#build Keras model, using word embedding layer and LSTM then 
#output via softmax layer to give a prediction distribution
from keras import layers

model = keras.models.Sequential()
model.add(layers.Embedding(max_num_word, 256, input_length=maxlen))
model.add(layers.LSTM(256))
model.add(layers.Dense(max_num_word, activation='softmax'))

model.summary()

# Since our prediction are one-hot encoded, use `categorical_crossentropy` as the loss
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.fit(x, y, batch_size=256, epochs=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 256)           2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_1 (Dense)              (None, 10000)             2570000   
Total params: 5,655,312
Trainable params: 5,655,312
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1


<keras.callbacks.History at 0x19eafb70>

## Prepare to sample the prediction for the next word
The neural network will predict a distribution of the next work, here we hava a function to sample it with a custom "temperature". We also define a dictionary to map back the coe into word.

In [7]:
#this is for sampling the next work with a prediction distribution
def sample(preds, temperature=0.1):
    preds = np.asarray(preds).astype('float64')
    exp_preds = preds - np.exp(temperature)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

#to change back to word
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

## Start generating a paragraph
By sampling a random seed sentance in the corpus, we start senerate the distribution of the next word, using the function above to sample the next word, append it to the seed sentance (to keep the length of the seed sentance, the first word will be removed), repead and generate the next. we will then have a new "paragrah" generated by the neural network.

In [8]:
#randomize a seed
import random

start_index = random.randint(0, len(token_word) - maxlen - 1)
generated_seed = token_word[start_index: start_index + maxlen]

generated_text = ' '.join([reverse_word_map.get(i) for i in generated_seed])
print('--- Generating with seed ---')
print(generated_text)
print('--- --- --- --- --- ---')

for i in range(40): #generate 20 words

    array_seed = np.zeros((maxlen,1))
    array_seed[:,0] = np.asarray(generated_seed).astype('float32')/max_num_word
    
    preds = model.predict(array_seed.transpose(), verbose=0)[0]
    next_index = sample(preds)
    next_word = reverse_word_map.get(next_index)

    generated_seed.append(next_index)       
    generated_seed = generated_seed[1:]
    generated_text = generated_text + ' ' + next_word

print('--- Generated text ---')
print(generated_text)
print('--- --- --- --- --- ---')

--- Generating with seed ---
rejoice at thee as once europa did at lusty jove when he would play the noble beast in love bull
--- --- --- --- --- ---
--- Generated text ---
rejoice at thee as once europa did at lusty jove when he would play the noble beast in love bull royal sixth inclination unsound croaking extant crow'd enseamed beats giddy olivia propose scroll profaners surgere borrowing annual blasts readiness beard ornament rhapsody queubus time's reading dogberry crows spoke agrees substantial shapes levied rids gorg'd drawer disponge poem meg whores encumber'd
--- --- --- --- --- ---
