# Text generation bible

This network is based on Andrej Karpathy's [post on RNNs](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) and [implementation in Torch](https://github.com/karpathy/char-rnn). 

It also draws on my Udacity course in artificial intelligence.

In [1]:
import numpy as np
import string
import random
import json
import codecs

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file

Using TensorFlow backend.


In [2]:
def encode_io_pairs(text,window_size,step_size):
    """One-hot encode sliding character windows and the character after each.

    The text is cut into windows of ``window_size`` characters, one window
    every ``step_size`` characters; each window is paired with the single
    character that directly follows it.

    Args:
        text: corpus to encode.
        window_size: number of characters in every input window.
        step_size: distance between the starts of consecutive windows.

    Returns:
        Tuple ``(X, y)`` of boolean arrays. ``X`` has shape
        ``(num_pairs, window_size, num_chars)`` and ``y`` has shape
        ``(num_pairs, num_chars)``, where ``num_chars`` is the number of
        distinct characters in ``text``. A character's index is its position
        in the sorted set of characters of ``text``.
    """
    # number of unique chars
    chars = sorted(set(text))
    num_chars = len(chars)

    # Derive the char -> index lookup from `text` itself instead of reading a
    # module-level table, so the encoding always agrees with num_chars.
    char_index = {c: i for i, c in enumerate(chars)}

    # cut up text into character input/output pairs
    target_positions = range(window_size, len(text), step_size)
    inputs = [text[i - window_size:i] for i in target_positions]
    outputs = [text[i] for i in target_positions]

    # create empty vessels for one-hot encoded input/output
    # (builtin `bool` instead of the `np.bool` alias, which NumPy 1.24 removed)
    X = np.zeros((len(inputs), window_size, num_chars), dtype=bool)
    y = np.zeros((len(inputs), num_chars), dtype=bool)

    # loop over inputs/outputs and transform and store in X/y
    for i, (sentence, target) in enumerate(zip(inputs, outputs)):
        for t, char in enumerate(sentence):
            X[i, t, char_index[char]] = True
        y[i, char_index[target]] = True

    return X, y


def random_primer(primer_dict):
    """Build a random ``"<key>:<number> "`` prompt string.

    A key is drawn at random from ``primer_dict``. The number is drawn
    uniformly from the 51 consecutive integers that start one above the
    integer value stored under that key.

    Args:
        primer_dict: mapping whose values can be converted with ``int``.

    Returns:
        The key and the drawn number joined by ':' and followed by a space.
    """
    candidates = list(primer_dict)
    picked = random.choice(candidates)

    lowest = 1 + int(primer_dict[picked])
    highest = lowest + 50
    number = random.randint(lowest, highest)

    return "{}:{} ".format(picked, number)


# function that uses trained model to predict a desired number of future characters
def predict_next_chars(model,num_chars,input_chars,num_to_predict):
    """Greedily generate ``num_to_predict`` characters after ``input_chars``.

    Reads the module-level names ``window_size``, ``chars_to_int`` and
    ``int_to_chars``.

    Args:
        model: object whose ``predict(x, verbose=0)`` accepts a one-hot array
            of shape ``(1, window_size, num_chars)``; element 0 of its result
            is treated as one score per character index.
        num_chars: size of the one-hot alphabet.
        input_chars: seed text. A seed shorter than ``window_size`` fills only
            the leading positions of the window; the rest stay zero.
        num_to_predict: number of characters to generate.

    Returns:
        The generated characters as one string (the seed is not included).
    """
    # A seed longer than the window would index past the end of x_test, so
    # keep only its most recent window_size characters.
    if len(input_chars) > window_size:
        input_chars = input_chars[len(input_chars) - window_size:]

    predicted = []
    for _ in range(num_to_predict):
        # convert this round's input characters to numerical input
        x_test = np.zeros((1, window_size, num_chars))
        for t, char in enumerate(input_chars):
            x_test[0, t, chars_to_int[char]] = 1.

        # make this round's prediction
        test_predict = model.predict(x_test, verbose=0)[0]

        # translate numerical prediction back to a character: take the
        # highest-scoring index. The table has int keys when built in memory
        # and str keys straight after a JSON round trip, so accept both.
        r = int(np.argmax(test_predict))
        d = int_to_chars[r] if r in int_to_chars else int_to_chars[str(r)]

        # record the character and slide the input forward by one position
        predicted.append(d)
        input_chars = (input_chars + d)[1:]
    return ''.join(predicted)

In [3]:
# Read the whole corpus into memory. The encoding is stated explicitly because
# the text contains non-ASCII characters (umlauts, typographic quotes) and the
# platform default is not UTF-8 everywhere.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open('/Users/steffen/Documents/RobotBible/luther_bibel_1912.txt', 'r', encoding='utf-8') as f:
    text=f.read()

sanity check

In [4]:
# Report the corpus size, then display its first 1000 characters
print('text has {} characters'.format(len(text)))
text[:1000]

text has 4338574 characters


'Gen 1:1 Am Anfang schuf Gott Himmel und Erde.\nGen 1:2 Und die Erde war wüst und leer, und es war finster auf der Tiefe; und der Geist Gottes schwebte auf dem Wasser.\nGen 1:3 Und Gott sprach: Es werde Licht! und es ward Licht.\nGen 1:4 Und Gott sah, dass das Licht gut war. Da schied Gott das Licht von der Finsternis\nGen 1:5 und nannte das Licht Tag und die Finsternis Nacht. Da ward aus Abend und Morgen der erste Tag.\nGen 1:6 Und Gott sprach: Es werde eine Feste zwischen den Wassern, und die sei ein Unterschied zwischen den Wassern.\nGen 1:7 Da machte Gott die Feste und schied das Wasser unter der Feste von dem Wasser über der Feste. Und es geschah also.\nGen 1:8 Und Gott nannte die Feste Himmel. Da ward aus Abend und Morgen der andere Tag.\nGen 1:9 Und Gott sprach: Es sammle sich das Wasser unter dem Himmel an besondere Örter, dass man das Trockene sehe. Und es geschah also.\nGen 1:10 Und Gott nannte das Trockene Erde, und die Sammlung der Wasser nannte er Meer. Und Gott sah, dass 

Text cleaning

In [5]:
# Show every distinct character of the raw text
print(set(text))

{'R', '<', 'u', 'H', 's', '„', 'f', '5', ':', '\n', 'O', '1', 'B', 'o', 'N', ']', 'C', ' ', '{', 'U', 'h', 'Q', 'ß', '2', 'd', '3', 'D', '–', 'Ä', 'g', '>', "'", 'b', 'i', 'L', '.', '0', 'a', 'v', 'M', '”', 'r', 'Z', 'J', 'ö', 'c', '?', '[', 'A', '8', '’', '-', 'K', 'l', 'w', '´', 'E', 'V', '}', '‚', '(', 'e', 'y', '6', 'ä', '4', '9', ';', ')', 'm', 't', '!', 'ü', 'P', 'z', 'k', 'j', 'x', 'p', ',', 'Ü', 'T', 'Ö', 'G', '7', 'F', 'W', 'S', 'q', 'I', 'n'}


In [6]:
# Lower-case the corpus, then map every quote-like character to a plain
# apostrophe in a single pass
text = text.lower()
text = text.translate(str.maketrans({
    '"': "'",
    "”": "'",
    "„": "'",
    "’": "'",
    "´": "'",
}))

In [7]:
# Characters allowed to stay: a-z, 0-9, the punctuation below and the German
# special letters. Everything else found in the text is blanked out.
punctuation = ['!', ',', '.', ':', ';', '?','"', '\n']
umlaute = ['ä', 'ö', 'ü', 'ß']

# one copy of every character that occurs in the text
text_chars = ''.join(set(text))

remain_chars = ''.join([string.ascii_lowercase, string.digits] + punctuation + umlaute)
remove_chars = [c for c in text_chars if c not in remain_chars]

# replace each unwanted character with a space
for char in remove_chars:
    text = text.replace(char, ' ')

In [8]:
# Collapse every run of spaces into a single space. Each pass halves a run, so
# a fixed three passes would leave double spaces in any run longer than eight;
# repeat until no double space is left.
while '  ' in text:
    text = text.replace('  ',' ')

In [9]:
# Distinct characters left after cleaning, in sorted order
chars = sorted(set(text))

print("this text has {} unique characters".format(len(chars)))

this text has 48 unique characters


The last character of the text is also a `\n` separator. Let's remove it so that it does not produce an empty line when the primer is calculated.

In [10]:
# Drop the trailing newline so that splitting on '\n' yields no empty last
# entry. Guarded so that running the cell again does not cut off real text.
if text.endswith('\n'):
    text = text[:-1]

In [11]:
# Map the part of each line before its first ':' to the first space-delimited
# token after it. Later lines overwrite earlier ones, so each key ends up with
# the token from its last line. Lines without a ':' are skipped instead of
# raising IndexError.
primer = {
    ref: rest.split(' ')[0]
    for ref, sep, rest in (line.partition(':') for line in text.split('\n'))
    if sep
}

## Preparing train and test data

In [12]:
# number of characters in each input window
window_size = 100
# number of characters between the starts of consecutive windows
step_size = 5

In [13]:
# Slide a window over the text: each input is window_size characters long and
# its output is the single character that follows it
inputs = [text[i - window_size:i] for i in range(window_size, len(text), step_size)]
outputs = [text[i:i + 1] for i in range(window_size, len(text), step_size)]

In [14]:
# Show two example input/output pairs, separated by a rule
print('input = {}'.format(inputs[2]))
print('output = {}'.format(outputs[2]))
print('--------------')
print('input = {}'.format(inputs[100]))
print('output = {}'.format(outputs[100]))

input =  anfang schuf gott himmel und erde.
gen 1:2 und die erde war wüst und leer, und es war finster auf d
output = e
--------------
input = n unterschied zwischen den wassern.
gen 1:7 da machte gott die feste und schied das wasser unter der
output =  


In [15]:
# Sorted list of the distinct characters in the cleaned corpus
chars = sorted(set(text))
print("this corpus has {} unique characters".format(len(chars)))
print('and these characters are ')
print(chars)

this corpus has 48 unique characters
and these characters are 
['\n', ' ', '!', ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ß', 'ä', 'ö', 'ü']


In [16]:
# Two-way lookup between characters and their position in `chars`
chars_to_int = dict(zip(chars, range(len(chars))))
int_to_chars = {index: c for index, c in enumerate(chars)}

In [17]:
# number of characters in each input window
window_size = 100
# number of characters between the starts of consecutive windows
step_size = 5

In [18]:
# size of the alphabet = number of entries in the index -> char table
num_chars = len(int_to_chars)

In [19]:
# One-hot encode the corpus into model inputs X and targets y
X,y = encode_io_pairs(text,window_size,step_size)

In [20]:
def encode_io_pairs(text,window_size,step_size):
    """One-hot encode sliding character windows and the character after each.

    The text is cut into windows of ``window_size`` characters, one window
    every ``step_size`` characters; each window is paired with the single
    character that directly follows it.

    Args:
        text: corpus to encode.
        window_size: number of characters in every input window.
        step_size: distance between the starts of consecutive windows.

    Returns:
        Tuple ``(X, y)`` of boolean arrays. ``X`` has shape
        ``(num_pairs, window_size, num_chars)`` and ``y`` has shape
        ``(num_pairs, num_chars)``, where ``num_chars`` is the number of
        distinct characters in ``text``. A character's index is its position
        in the sorted set of characters of ``text``.
    """
    # number of unique chars
    chars = sorted(set(text))
    num_chars = len(chars)

    # Derive the char -> index lookup from `text` itself instead of reading a
    # module-level table, so the encoding always agrees with num_chars.
    char_index = {c: i for i, c in enumerate(chars)}

    # cut up text into character input/output pairs
    target_positions = range(window_size, len(text), step_size)
    inputs = [text[i - window_size:i] for i in target_positions]
    outputs = [text[i] for i in target_positions]

    # create empty vessels for one-hot encoded input/output
    # (builtin `bool` instead of the `np.bool` alias, which NumPy 1.24 removed)
    X = np.zeros((len(inputs), window_size, num_chars), dtype=bool)
    y = np.zeros((len(inputs), num_chars), dtype=bool)

    # loop over inputs/outputs and transform and store in X/y
    for i, (sentence, target) in enumerate(zip(inputs, outputs)):
        for t, char in enumerate(sentence):
            X[i, t, char_index[char]] = True
        y[i, char_index[target]] = True

    return X, y

## Model setup

In [21]:
# One LSTM layer with 200 units reading (window_size, num_chars) inputs,
# followed by a dense layer with one output per character and a softmax
model = Sequential([
    LSTM(200, input_shape=(window_size, num_chars)),
    Dense(num_chars),
    Activation('softmax'),
])

# initialize optimizer
optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

# compile model with the optimizer initialized above
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

## Store model parameters

In [22]:
# Everything needed to rebuild the prediction setup later.
# Note the key names: 'text_encoder' stores the index -> char table and
# 'text_decoder' stores the char -> index table.
model_meta = {
    'text_encoder': int_to_chars,
    'text_decoder': chars_to_int,
    'num_classes': num_chars,
    'primer': primer,
    'model_arch': model.to_json(),
}

In [None]:
# Write the metadata as JSON. ensure_ascii=False emits non-ASCII characters
# as-is, so the file encoding is fixed to UTF-8 instead of being left to the
# platform default.
with open('../data/model_meta.json', 'w', encoding='utf-8') as output:
    json.dump(model_meta, output, ensure_ascii=False)

## Train model

In [None]:
# train the model on the one-hot encoded pairs: 50 epochs, 500 samples per
# batch, with progress output enabled
model.fit(X, y, batch_size=500, epochs=50,verbose = 1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
 89000/867421 [==>...........................] - ETA: 53:38 - loss: 0.9893

In [None]:
# save weights
# model.save_weights('/Users/steffen/Documents/TwitterBot/model/best_model_weights_2.hdf5')

In [None]:
# Save the model to disk; the prediction section loads it again from this path
model.save('/Users/steffen/Documents/TwitterBot/model/model.hdf5')

## Model prediction

In [None]:
# Load the stored metadata. The handle is not called `input`, because that
# would rebind the builtin of the same name for the rest of the notebook.
with open('../data/model_meta.json', 'r', encoding='utf-8') as meta_file:
    model_meta = json.load(meta_file)

In [None]:
# Load the model that was saved after training
model = load_model('/Users/steffen/Documents/TwitterBot/model/model.hdf5')

In [None]:
# Pull the individual settings back out of the loaded metadata
num_chars, primer = model_meta['num_classes'], model_meta['primer']
int_to_chars, chars_to_int = model_meta['text_encoder'], model_meta['text_decoder']

In [None]:
# JSON object keys are always strings, so turn the keys of the
# index -> char table back into integers after loading
int_to_chars = {int(k): v for k,v in int_to_chars.items()}

In [None]:
# random "<key>:<number> " prompt used as the seed text
input_chars = random_primer(primer)
# width of the model's input window, in characters
window_size = 100
# number of characters to generate
num_to_predict = 300

<a id='TODO_6'></a>

With your trained model try a few subsets of the complete text as input - note the length of each must be exactly equal to the window size.  For each subset use the function above to predict the next 100 characters that follow each input.

In [None]:
# model.load_weights('/Users/steffen/Documents/TwitterBot/model/best_model_weights_2.hdf5')

In [None]:
# display the seed text
input_chars

In [None]:
# Generate text from the seed.
# NOTE(review): this passes the literal 500, not the num_to_predict variable
# set earlier - confirm which length is intended.
predict_input = predict_next_chars(model,num_chars, input_chars,num_to_predict = 500)

In [None]:
# display the generated text
predict_input

In [None]:
# TODO: choose an input sequence and use the prediction function in the previous Python cell to predict 100 characters following it
# get an appropriately sized chunk of characters from the text
start_inds = [2001, 3421, 4353, 1]

# load weights
model.load_weights('model_weights/best_RNN_large_textdata_weights.hdf5')

# save output: the loop below writes to `f`, so the file has to be opened
# first; `with` closes it again even if a prediction fails.
# NOTE: the 'text_gen_output' directory must already exist.
with open('text_gen_output/RNN_large_textdata_output.txt', 'w', encoding='utf-8') as f:
    for s in start_inds:
        start_index = s
        input_chars = text[start_index: start_index + window_size]

        # use the prediction function; num_chars is a required positional
        # argument and must come before the input text
        predict_input = predict_next_chars(model, num_chars, input_chars, num_to_predict = 100)

        # print out input characters
        line = '-------------------' + '\n'
        print(line)
        f.write(line)

        input_line = 'input chars = ' + '\n' +  input_chars + '"' + '\n'
        print(input_line)
        f.write(input_line)

        # print out predicted characters and store them next to the input
        predict_line = 'predicted chars = ' + '\n' +  predict_input + '"' + '\n'
        print(predict_line)
        f.write(predict_line)
