# Text generation bible

This network is based on Andrej Karpathy's [post on RNNs](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) and his [implementation in Torch](https://github.com/karpathy/char-rnn).

It also draws on my Udacity course in artificial intelligence.

In [None]:
import numpy as np
import string
import random
import json
import codecs

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file

In [None]:
def encode_io_pairs(text,window_size,step_size):
    """One-hot encode sliding character windows and the character after each.

    A window of ``window_size`` characters is taken every ``step_size``
    characters; the character directly following a window is its target.

    Args:
        text: the corpus as a single string.
        window_size: number of characters in each input window.
        step_size: distance between the starts of consecutive windows.

    Returns:
        A tuple ``(X, y)`` of boolean arrays. ``X`` has shape
        ``(num_windows, window_size, num_chars)`` and ``y`` has shape
        ``(num_windows, num_chars)``, where ``num_chars`` is the number of
        distinct characters in ``text``. When ``text`` is not longer than
        ``window_size`` there are no windows and both arrays are empty.
    """
    # vocabulary in sorted order, so the index of each character is stable
    chars = sorted(set(text))
    num_chars = len(chars)
    # built here instead of being read from a notebook global, so the
    # indices always agree with num_chars and the function works on its own
    chars_to_int = {c: i for i, c in enumerate(chars)}

    # cut up text into character input/output pairs
    inputs = []
    outputs = []
    for i in range(window_size, len(text), step_size):
        inputs.append(text[(i-window_size):i])
        outputs.append(text[i:(i+1)])

    # create empty vessels for one-hot encoded input/output
    # (the builtin bool replaces np.bool, which NumPy no longer provides)
    X = np.zeros((len(inputs), window_size, num_chars), dtype=bool)
    y = np.zeros((len(inputs), num_chars), dtype=bool)

    # loop over inputs/outputs and transform and store in X/y
    for i, sentence in enumerate(inputs):
        for t, char in enumerate(sentence):
            X[i, t, chars_to_int[char]] = 1
        y[i, chars_to_int[outputs[i]]] = 1

    return X,y


def predict_next_chars(model, model_meta, input_chars, window_size,
                       num_to_predict):
    """Extend ``input_chars`` by ``num_to_predict`` greedily predicted characters.

    Args:
        model: object whose ``predict(x, verbose=0)`` returns one row of
            per-character scores for a ``(1, window_size, num_chars)`` input.
        model_meta: dict with ``'text_encoder'`` (index -> character) and
            ``'text_decoder'`` (character -> index). The index keys may be
            strings (after a JSON round trip) or ints (in-memory dict).
        input_chars: primer text; only its last ``window_size`` characters
            are fed to the model.
        window_size: number of characters the model sees per prediction.
        num_to_predict: how many characters to append.

    Returns:
        The full ``input_chars`` followed by the predicted characters.

    Raises:
        KeyError: if ``input_chars`` contains a character that is not in
            ``model_meta['text_decoder']``.
    """
    # the metadata keys read the other way round from the local names:
    # 'text_encoder' holds index -> char, 'text_decoder' holds char -> index
    int_to_chars = model_meta['text_encoder']
    chars_to_int = model_meta['text_decoder']

    num_chars = len(int_to_chars)

    predicted_chars = input_chars

    # a primer longer than the window would index past x_test's second axis,
    # so keep only the most recent window_size characters as model input
    if len(input_chars) > window_size:
        input_chars = input_chars[len(input_chars) - window_size:]

    for _ in range(num_to_predict):

        # shorter inputs are left-aligned; the remaining positions stay zero
        x_test = np.zeros((1, window_size, num_chars))
        for t, char in enumerate(input_chars):
            x_test[0, t, chars_to_int[char]] = 1.

        test_predict = model.predict(x_test,verbose = 0)[0]

        # greedy decoding: take the highest-scoring class
        r = int(np.argmax(test_predict))
        # JSON-loaded metadata has string keys, freshly built metadata int keys
        key = str(r)
        d = int_to_chars[key] if key in int_to_chars else int_to_chars[r]

        # update predicted_chars and input
        predicted_chars += d
        input_chars += d

        # slide the window forward by one character
        if len(input_chars) > window_size:
            input_chars = input_chars[1:]

    return predicted_chars


def extract_verse(chapter_dict, inputs):
    """Build a verse from the second line of ``inputs`` with a fresh reference.

    The second line is expected to look like ``"<chapter>:<verse> <text>"``.
    Its reference is replaced by one produced by ``random_chapter`` and the
    verse text is kept.

    Args:
        chapter_dict: mapping of chapter name to a verse number, as consumed
            by ``random_chapter``.
        inputs: multi-line text; only the second line is used.

    Returns:
        ``"<chapter>:<new verse> <text>"``.

    Raises:
        IndexError: if ``inputs`` has fewer than two lines or the second
            line contains no ``':'``.
        ValueError: if there is no space after the ``':'``.
    """
    message = inputs.split('\n')[1]
    # split at the first ':' only, so colons inside the verse text survive
    chapter = message.split(':', 1)

    try:
        begin = random_chapter(chapter_dict, chapter[0], False)
    except (KeyError, ValueError, TypeError):
        # the chapter is unknown or its stored verse number is not usable:
        # fall back to a randomly chosen known chapter
        begin = random_chapter(chapter_dict, chapter[0], True)

    # drop the original verse number (everything up to the first space)
    verse_text = chapter[1]
    message = begin + verse_text[verse_text.index(' ')+1:]

    return message


def random_chapter(primer_dict, chapter, flag):
    """Return a ``"<chapter>:<verse> "`` reference with a random verse number.

    Args:
        primer_dict: mapping of chapter name to a value convertible with
            ``int``.
        chapter: chapter name to use when ``flag`` is false.
        flag: when true, ignore ``chapter`` and pick a random key of
            ``primer_dict`` instead.

    Returns:
        The chapter name, a colon, a verse number drawn uniformly from the
        51 integers following the stored value, and a trailing space.

    Raises:
        KeyError: if the chapter used is not a key of ``primer_dict``.
        ValueError: if the stored value cannot be converted to ``int``.
    """
    chosen = random.choice(list(primer_dict.keys())) if flag else chapter

    # verse numbers start right after the stored one
    lowest = int(primer_dict[chosen]) + 1
    verse = random.randint(lowest, lowest + 50)

    return "{}:{} ".format(chosen, verse)

In [None]:
# Read the whole corpus into memory. The encoding is given explicitly because
# the text contains umlauts and the platform default is not UTF-8 everywhere.
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open('/Users/steffen/Documents/RobotBible/luther_bibel_1912.txt', 'r', encoding='utf-8') as f:
    text=f.read()

sanity check

In [None]:
# Report the corpus length and display its first 1000 characters
print('text has ' + str(len(text)) + ' characters')
text[:1000]

Text cleaning

In [None]:
# Show every distinct character that occurs in the text at this point
print(set(text))

In [None]:
# Lower-case the corpus, then fold the double quote and the typographic
# quote/accent variants into a plain apostrophe.
text = (
    text.lower()
    .replace('"', "'")
    .replace("”", "'")
    .replace("„", "'")
    .replace("’", "'")
    .replace("´", "'")
)

In [None]:
# Whitelist of characters that survive cleaning: ascii lower-case letters,
# digits, the punctuation below and the German umlauts / sharp s.
# NOTE(review): "'" is not in the whitelist, so apostrophes present in the
# text are blanked out below - confirm that this is intended.
punctuation = ['!', ',', '.', ':', ';', '?','"', '\n']
umlaute = ['ä', 'ö', 'ü', 'ß']

text_chars = ''.join(set(text))

remain_chars = ''.join([
    string.ascii_lowercase,
    string.digits,
    ''.join(set(punctuation)),
    ''.join(set(umlaute)),
])
remove_chars = list(filter(lambda c: c not in remain_chars, text_chars))

# every character outside the whitelist becomes a space
for char in remove_chars:
    text = text.replace(char, ' ')

In [None]:
# Collapse every run of spaces into a single space. Looping until no double
# space is left handles runs of any length; a fixed three passes only fully
# collapses runs of up to eight spaces.
while '  ' in text:
    text = text.replace('  ',' ')

In [None]:
# Sorted vocabulary of the cleaned text
chars = sorted(set(text))

print("".join(["this text has ", str(len(chars)), " unique characters"]))

The last character of the text is also a `\n` separator. Let's remove it before calculating the primer.

## Preparing train and test data

In [None]:
window_size = 100  # number of characters in each input window
step_size = 5  # distance between the starts of consecutive windows

In [None]:
# Slide a window over the text: each input is window_size characters long and
# its output is the single character that follows it.
inputs = [text[(i - window_size):i] for i in range(window_size, len(text), step_size)]
outputs = [text[i:(i + 1)] for i in range(window_size, len(text), step_size)]

In [None]:
# Print two sample windows (index 2 and index 100) with their target character
print('input = ' + inputs[2])
print('output = ' + outputs[2])
print('--------------')
print('input = ' + inputs[100])
print('output = ' + outputs[100])

In [None]:
# Sorted list of the distinct characters in the cleaned text; its order
# defines the character <-> index mappings built from it
chars = sorted(list(set(text)))
print ("this corpus has " +  str(len(chars)) + " unique characters")
print ('and these characters are ')
print (chars)

In [None]:
# Two-way lookup between characters and their position in the sorted vocabulary
chars_to_int = dict(zip(chars, range(len(chars))))
int_to_chars = {index: char for index, char in enumerate(chars)}

In [None]:
# Drop the trailing '\n' separator so that splitting on '\n' does not yield an
# empty last line. The check keeps the cell from eating a real character when
# it is run again or when the text does not end with a newline.
if text.endswith('\n'):
    text = text[:-1]

In [None]:
# Map the text before the first ':' of each line to the token that follows
# the ':' (up to the next space). Later lines overwrite earlier ones, so each
# key ends up with the value from its last line in the text.
chapter = [i.split(':') for i in text.split('\n')]
# lines without a ':' (e.g. blank lines) carry no reference and are skipped
chapter = {i[0]: i[1].split(' ')[0] for i in chapter if len(i) > 1}

Convert the primer texts to integer codes to escape unjustified claims.

In [None]:
# The first 100 characters of every line, stored as vocabulary indices
primer = [[chars_to_int[ch] for ch in line[:100]] for line in text.split('\n')]

In [None]:
# Same values as set earlier in this notebook, repeated before encoding
window_size = 100  # number of characters in each input window
step_size = 5  # distance between the starts of consecutive windows

In [None]:
# Vocabulary size; used as the width of the one-hot input and of the output layer
num_chars = len(int_to_chars)

In [None]:
# One-hot encode the cleaned text into input windows X and next-character targets y
X,y = encode_io_pairs(text,window_size,step_size)

## Model setup

In [None]:
# One LSTM layer with 200 units over the one-hot windows, followed by a
# softmax over the vocabulary
model = Sequential([
    LSTM(200, input_shape=(window_size, num_chars)),
    Dense(num_chars),
    Activation('softmax'),
])

optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

model.compile(optimizer=optimizer, loss='categorical_crossentropy')

## Store model parameters

In [None]:
# Everything besides the weights that is needed to generate text later.
# Note the naming: 'text_encoder' holds index -> char and 'text_decoder'
# holds char -> index.
model_meta = {
    'text_encoder': int_to_chars,
    'text_decoder': chars_to_int,
    'num_classes': num_chars,
    'chapter': chapter,
    'primer': primer,
}

In [None]:
# Write the metadata as JSON. ensure_ascii=False emits umlauts unescaped, so
# the file is written as UTF-8 explicitly to match the encoding it is read
# back with.
with open('../data/model_meta.json', 'w', encoding='utf-8') as output:
    json.dump(model_meta, output, ensure_ascii=False)

## Train model

In [None]:
# train the model on the encoded windows: 50 epochs, batches of 500 samples
model.fit(X, y, batch_size=500, epochs=50,verbose = 1)

In [None]:
# Persist the trained model to disk
model.save('/Users/steffen/Documents/TwitterBot/model/model.hdf5')

## Model prediction

In [None]:
# Load the stored metadata. The handle is not called `input`, so the builtin
# of that name stays usable in the rest of the notebook.
with open('../data/model_meta.json', 'r', encoding='utf-8') as meta_file:
    model_meta = json.load(meta_file)

In [None]:
# Reload the saved model from disk
model = load_model('/Users/steffen/Documents/TwitterBot/model/model.hdf5')

In [None]:
# Unpack the loaded metadata into the names used by the rest of the notebook.
# After the JSON round trip the keys of int_to_chars are strings, which is why
# lookups below go through str(...).
num_chars = model_meta['num_classes']
int_to_chars = model_meta['text_encoder']  # index -> character
chars_to_int = model_meta['text_decoder']  # character -> index
primer = model_meta['primer']  # primers as lists of character indices
chapter_dict = model_meta['chapter']

Convert the primer back from integer codes to text.

In [None]:
# Turn each stored primer (a list of character indices) back into a string;
# the index is looked up as a string key
primer = [
    ''.join(int_to_chars[str(code)] for code in encoded)
    for encoded in model_meta['primer']
]

In [None]:
input_chars = random.choice(primer)  # randomly chosen primer text
window_size = 100  # number of characters fed to the model per prediction
num_to_predict = 500  # number of characters to generate

testing model

In [None]:
# Display the randomly chosen primer
input_chars

In [None]:
# Generate a verse for 100 randomly chosen primers and print each result
for _ in range(100):
    input_chars = random.choice(primer)
    print(input_chars)
    try:
        predict_input = predict_next_chars(model,model_meta, input_chars,window_size,num_to_predict)
        print(extract_verse(chapter_dict, predict_input))
    except Exception as err:
        # best effort: generated text that cannot be turned into a verse is
        # skipped, but the reason is shown. Catching Exception rather than
        # everything keeps Ctrl-C able to stop the loop.
        print('skipped: ' + repr(err))
        continue
    print('\n')