# Asg 11b - Text Generation 
From https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

In [20]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

# Configure the Data
Load in the text file, convert it from characters to integers, set it up in training sequences.

# Task 3: Run all cells with one input file
Select which input file you want to use.
1. Alice in Wonderland (wonderland.txt, 145K characters)
2. Harry Potter and the Sorcerer's Stone (harry_potter.txt, 427K characters)
3. King James Bible (bible.txt, 4,200K characters).
Change the epochs in the model.fit() call. With a shorter text like Alice in Wonderland you should be able to train for 20 epochs, but with a longer text like Harry Potter you will be lucky to train for 5 epochs.<br>
Report what text the network generated. <br>
Analyze the generated text in a sentence or two. Was it English-like? Did it sound like the training text?

In [21]:
# load ascii text and convert to lowercase
filename = "wonderland.txt"
#filename = "harry_potter.txt"
#filename = "bible.txt"
# read inside a context manager so the file handle is closed as soon as the
# text has been loaded (a bare open(...).read() leaves it open)
with open(filename) as text_file:
    raw_text = text_file.read()
# lowercase everything so upper- and lowercase letters share one symbol
raw_text = raw_text.lower()

In [22]:
# build the vocabulary: every distinct character in the text, in sorted order,
# and a lookup table from each character to its position in that order
chars = sorted(set(raw_text))
char_to_int = {symbol: position for position, symbol in enumerate(chars)}

In [23]:
# summarize the corpus: total length of the text and number of distinct characters
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  144346
Total Vocab:  42


In [24]:
# build the training pairs, encoded as integers: each input is a window of
# seq_length consecutive characters and its target is the character that
# immediately follows the window
seq_length = 100
dataX, dataY = [], []
# encode the whole text once, then slice windows out of the encoded list
encoded_text = [char_to_int[symbol] for symbol in raw_text]
for window_start in range(n_chars - seq_length):
    window_end = window_start + seq_length
    dataX.append(encoded_text[window_start:window_end])
    dataY.append(encoded_text[window_end])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  144246


In [25]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize the integer codes into the range [0, 1)
X = X / float(n_vocab)
# one hot encode the output variable; pass the vocabulary size explicitly so the
# number of output classes always equals n_vocab instead of being inferred from
# the largest label that happens to occur in dataY
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

# Set up the LSTM Neural Network

In [26]:
# define the LSTM model: one LSTM layer fed with (time steps, features) windows,
# dropout, then a softmax layer with one unit per column of y
model = Sequential()
model.add(LSTM(units=16, input_shape=X.shape[1:]))
#model.add(LSTM(64, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))   # LSTM layer input layer if next layer is also a LSTM
#model.add(LSTM(64))                                                 # Optional second LSTM layer for longer time relation
#model.add(Dense(32))                                                # Add another fully connected dense layer if needed
model.add(Dropout(rate=0.2))
model.add(Dense(units=y.shape[1], activation="softmax"))
model.compile(optimizer="adam", loss="categorical_crossentropy")

In [None]:
# checkpoint callback: monitors the training loss and, with save_best_only set,
# only writes a file when the loss improves; the epoch number and loss value
# are embedded in the file name so the weights can be loaded at another time
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor="loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# train the network; the callbacks in callbacks_list run during training
model.fit(
    X,
    y,
    batch_size=128,
    epochs=5,
    callbacks=callbacks_list,
)
#model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/5

Epoch 00001: loss improved from inf to 3.01988, saving model to weights-improvement-01-3.0199.hdf5
Epoch 2/5

# Part 2 - Generate text from the saved model above

In [None]:
# load the network weights if you are continuing from an earlier training session
# Verify the name of the weights file... your run will have different names.
#filename = "weights-improvement-19-1.9435.hdf5"
#model.load_weights(filename)
#model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# reverse lookup: integer code back to its character, used to decode predictions
int_to_char = {position: symbol for position, symbol in enumerate(chars)}

In [None]:
# Generate text: start from a randomly chosen training pattern, then repeatedly
# predict the next character and slide the window forward by one position.

# pick a random seed; randint's upper bound is exclusive, so passing len(dataX)
# makes every pattern (including the last one) eligible
start = numpy.random.randint(0, len(dataX))
# work on a copy: appending to dataX[start] itself would leave an over-long row
# in the training data and break a later re-run of the reshape cell
pattern = list(dataX[start])
print ("Seed:", start)
print ("Pattern : ", pattern)
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
print ("Generating Characters")
for i in range(1000):
    # shape the window as [samples, time steps, features] and scale it the same
    # way the training input was scaled
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # greedy choice: take the class with the highest predicted probability
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    print(result, end='', flush=True)
    # slide the window: add the predicted character and drop the oldest one
    pattern.append(index)
    pattern = pattern[1:]

print ("\nDone.")

# Task 4: Network changes
Make **one or more** of the following modifications to the network to see if it improves the performance:
1. Increase the size of the LSTM layer from 16 to something larger, maybe in the range of 32 to 128.
2. Add a second LSTM layer. Sometimes this helps, but requires adding *return_sequences=True* to the first layer
3. Add another dense hidden layer. The final, dense output layer must have a fixed size and be a softmax layer, but you can add earlier layers.
4. Remove the dropout layer, which adds random noise to the learning, or increase the amount of random noise.