# Dante

Following code is heavily inspired by these projects:
- https://github.com/mathematiguy/keras-char-rnn
- http://karpathy.github.io/2015/05/21/rnn-effectiveness/
- https://www.kaggle.com/mrisdal/intro-to-lstms-w-keras-gpu-for-text-generation/notebook
- https://github.com/michaelrzhang/Char-RNN/blob/master/train_model.py
- mineshmathew
- ekzhang
- michaelrzhang
- yxtay - stateful

In [1]:
import numpy as np
import os
import sys
import random
import slabikar
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, TimeDistributed
from keras.callbacks import ModelCheckpoint, LambdaCallback
from keras.utils import np_utils
from keras import backend as K
import keract

Using TensorFlow backend.


Hyperparameters are set here

In [2]:
maxlen = 120 #length of sequence
step = 13 #overlap
validation_split = 0.1
batch_size = 128
rnn_size = 1#128
num_layers = 3
drop_prob = 0.1
epochs = 6
temperature=1.0
sample_length = 1000

Function for concatenating all text files from directory. Text files are expected to be utf-8 encoded.

In [3]:
text_data = ''
for filename in filter(lambda s: s.endswith(".txt"), os.listdir('resources/')):
    # open file with default encoding
    print("loading file: %s" % filename)
    filepath = os.path.join('resources/', filename)
    with open(filepath,'r', encoding='utf-8') as f:
        text_data += f.read() + "\n"


loading file: Peklo.txt
loading file: Nebo.txt
loading file: Ocistec.txt


Since our dataset is pretty small, we convert everything to lowercase and remove diacritics. There are some characters that need to be tweaked beforehand.

In [4]:
text_data = text_data.replace("’","\'")
text_data = text_data.replace("„","\"")
text_data = text_data.replace("“","\"")
text_data = text_data.replace("‒","-")
import unidecode
text_data = unidecode.unidecode(text_data).lower()


Methods for processing texts. One uses syllables as text atoms, the other uses characters

Return values:
- atom_to_int: (dict) Maps characters in the character set to ints.
- int_to_atom: (dict) Maps ints to characters in the character set.
- n_atom: (int) The number of characters in the text.
- n_vocab: (int) The number of unique characters in the text.'''
- data: preprocessed input

In [5]:
def process_text_char(text_data):
    # create mapping of unique chars to integers, and a reverse mapping
    chars = sorted(set(text_data)) #sorted is necessary for checkpointing model 
    char_to_int = {c: i for i, c in enumerate(chars)}
    int_to_char = {i: c for i, c in enumerate(chars)}
    # summarize the loaded data
    n_chars = len(text_data)
    n_vocab = len(chars)    
    return char_to_int, int_to_char, n_chars, n_vocab, text_data

def process_text_syllable(text_data):
    syllable_data = slabikar.slabikar(text_data)
    syllables = sorted(set(syllable_data))
    syllable_to_int = {c: i for i, c in enumerate(syllables)}
    int_to_syllable = {i: c for i, c in enumerate(syllables)}
    # summarize the loaded data
    n_syllables = len(syllable_data)
    n_vocab = len(syllables)    
    return syllable_to_int, int_to_syllable, n_syllables, n_vocab, syllable_data

Processes data to overlapping sequences. Since we are doing many to many RNN, targets are sequences of atoms shifted to the right from the source. Syllable data cannot be preprocessed as whole. Use generator instead.

In [6]:
def createInput(text, maxlen, step, n_vocab, atom_to_int):
    dataX = []
    dataY = []
    sentences = []
    targets = []
    for i in range(0, len(text) - maxlen - 1, step):
        sentences.append(text[i: i + maxlen])
        targets.append(text[i + 1: i + maxlen + 1])
    X = np.zeros((len(sentences), maxlen, n_vocab), dtype=np.bool)
    y = np.zeros((len(sentences), maxlen, n_vocab), dtype=np.bool)
    for i in range(len(sentences)):
        sentence = sentences[i]
        target = targets[i]
        for j in range(maxlen):
            X[i][j][atom_to_int[sentence[j]]] = 1
            y[i][j][atom_to_int[target[j]]] = 1
    return X,y

def get_batch(batch, starts, text_data, seq_length, batch_size, 
              atom_to_int, n_vocab):
    dataX = []
    dataY = []
    for start in range(batch_size * batch, batch_size * (batch + 1)): 
        seq_in  = text_data[starts[start]:starts[start] + seq_length]
        seq_out = text_data[starts[start]+1:starts[start]+1 + seq_length]
        dataX.append([atom_to_int[atom] for atom in seq_in])
        dataY.append([atom_to_int[atom] for atom in seq_in])
        
    X = np_utils.to_categorical(dataX, num_classes=n_vocab)
    y = np_utils.to_categorical(dataY, num_classes=n_vocab)
    X = X.reshape(batch_size, seq_length, n_vocab) #necessary< Asi nie... shape sa nezmeni
    y = y.reshape(batch_size, seq_length, n_vocab)
    return X, y

#use this for syllables
def generate_batches(mode, text_data, seq_length, validation_split,
                     batch_size, atom_to_int, n_atom, n_vocab,
                     random_seed=42, shuffle=True):#rovnaky seed zabezpeci rovnakost pre oba gneratory
    random.seed(random_seed)
    starts = list(range(n_atom - n_atom % seq_length - seq_length))
    if shuffle:
        random.shuffle(starts)
    n_batches = n_atom // batch_size
    validation_size = round(n_batches * validation_split)
    while True:
        if mode == 'validation':
            for batch in range(validation_size):
                X, y = get_batch(batch, starts, text_data, seq_length, 
                                 batch_size, atom_to_int, n_vocab)
                yield X, y
            
        elif mode == 'train':
            for batch in range(validation_size, n_batches-1):
                X, y = get_batch(batch, starts, text_data, seq_length, 
                                 batch_size, atom_to_int, n_vocab)
                yield X, y
        else:
            raise ValueError("only 'validation' and 'train' modes accepted")


Model builder.

In [7]:
def build_model(batch_size, seq_length, n_vocab, rnn_size, num_layers, drop_prob):
    model = Sequential()
    for i in range(num_layers):
        if i == num_layers - 1:
            # add last hidden layer
            model.add(TimeDistributed(Dense(n_vocab, activation='softmax')))
        elif i == 0:
            # add first hidden layer
            model.add(LSTM(rnn_size, batch_input_shape=(None, seq_length, n_vocab), return_sequences=True))
            model.add(Dropout(drop_prob))
        else:
            # add middle hidden layer
            model.add(LSTM(rnn_size, return_sequences=True))
            model.add(Dropout(drop_prob))
    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  
    return model


Callback for printing while training. Does not work in many to many fashion. Do not use.

In [None]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def on_epoch_end(epoch, logs):
    # Function invoked for specified epochs. Prints generated text.
    # Using epoch+1 to be consistent with the training epochs printed by Keras
    if epoch % 5 == 0:
        print()
        print('----- Generating text after Epoch: %d' % epoch)
        #dirty global hack
        start_index = random.randint(0, n_atoms - maxlen - 1)
        for diversity in [0.2, 0.5, 1.0, 1.2]:
            print('----- diversity:', diversity)

            generated = ''
            sentence = data[start_index: start_index + maxlen]
            generated += sentence
            print('----- Generating with seed: "' + sentence + '"')
            sys.stdout.write(generated)

            for i in range(400):
                x_pred = np.zeros((1, maxlen, n_vocab))
                for t, char in enumerate(sentence):
                    x_pred[0, t, atom_to_int[char]] = 1.

                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = int_to_atom[next_index]

                generated += next_char
                sentence = sentence[1:] + next_char

                sys.stdout.write(next_char)
                sys.stdout.flush()
            print()
    else:
        print()
        print('----- Not generating text after Epoch: %d' % epoch)

generate_text = LambdaCallback(on_epoch_end=on_epoch_end)

Use this to prepare char data (contains some debug printing to ensure everything looks good)

In [8]:
atom_to_int, int_to_atom, n_atoms, n_vocab, data = process_text_char(text_data)
X,y = createInput(data, maxlen, step, n_vocab, atom_to_int)
print(atom_to_int)
print(int_to_atom)
print(n_atoms)
print(n_vocab)
print(len(y))
print(len(X))

{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, ':': 10, ';': 11, '?': 12, 'a': 13, 'b': 14, 'c': 15, 'd': 16, 'e': 17, 'f': 18, 'g': 19, 'h': 20, 'i': 21, 'j': 22, 'k': 23, 'l': 24, 'm': 25, 'n': 26, 'o': 27, 'p': 28, 'q': 29, 'r': 30, 's': 31, 't': 32, 'u': 33, 'v': 34, 'w': 35, 'x': 36, 'y': 37, 'z': 38}
{0: '\n', 1: ' ', 2: '!', 3: '"', 4: "'", 5: '(', 6: ')', 7: ',', 8: '-', 9: '.', 10: ':', 11: ';', 12: '?', 13: 'a', 14: 'b', 15: 'c', 16: 'd', 17: 'e', 18: 'f', 19: 'g', 20: 'h', 21: 'i', 22: 'j', 23: 'k', 24: 'l', 25: 'm', 26: 'n', 27: 'o', 28: 'p', 29: 'q', 30: 'r', 31: 's', 32: 't', 33: 'u', 34: 'v', 35: 'w', 36: 'x', 37: 'y', 38: 'z'}
525809
39
40438
40438


Train char model

In [9]:
callbacks = [ModelCheckpoint('checkpoints/weights-{epoch:02d}-{val_acc:.2f}-{val_loss:.2f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')]
model = build_model(batch_size, maxlen, n_vocab, rnn_size, num_layers, drop_prob)
model.fit(X,y,batch_size=batch_size,epochs=epochs,callbacks=callbacks, validation_split=validation_split)

Train on 36394 samples, validate on 4044 samples
Epoch 1/6

Epoch 00001: val_acc improved from -inf to 0.15990, saving model to checkpoints/weights-01-0.16-3.32.hdf5
Epoch 2/6

Epoch 00002: val_acc did not improve from 0.15990
Epoch 3/6

Epoch 00003: val_acc did not improve from 0.15990
Epoch 4/6

Epoch 00004: val_acc did not improve from 0.15990
Epoch 5/6

Epoch 00005: val_acc did not improve from 0.15990
Epoch 6/6

Epoch 00006: val_acc did not improve from 0.15990


<keras.callbacks.History at 0x7fc9d5a1be48>

Use this to prepare syllable model

In [None]:
atom_to_int, int_to_atom, n_atoms, n_vocab, data = process_text_syllable(text_data)
#print(atom_to_int)
#print(int_to_atom)
print(n_atoms)
print(n_vocab)

Train syllable model

In [None]:
callbacks = [ModelCheckpoint('checkpoints/weights-{epoch:02d}-{val_acc:.2f}-{val_loss:.2f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')]
model = build_model(batch_size, maxlen, n_vocab, rnn_size, num_layers, drop_prob)

n_batches = len(data) // batch_size
batch_params = (data, maxlen, validation_split, batch_size, atom_to_int, n_atoms, n_vocab)
model.fit_generator(
    generator = generate_batches('train', *batch_params),
    validation_data = generate_batches('validation', *batch_params),
    validation_steps = int(n_batches * validation_split),
    epochs = epochs,
    steps_per_epoch = n_batches-1,
    callbacks = callbacks)

Here we can load saved model to play around with it (make sure to use correct preprocessing for loaded model).

In [None]:
model = load_model('.h5')#specify the file you want to use

Training was using stateless model with fixed length sequence. This is not what we want to do since when generating, the sequence is theoretically unbounded length and we want to to feed it character after character. Therefore we convert the model to stateful accepting single element sequences here.

In [10]:
def generative_model(model):
    config = model.get_config()
    for layer in config:
        if 'stateful' in layer['config']:
            layer['config']['stateful'] = True
        if 'batch_input_shape' in layer['config']:
            layer['config']['batch_input_shape'][0] = 1
            layer['config']['batch_input_shape'][1] = None #seq length is undefined
    inference_model = Sequential.from_config(config)
    inference_model.trainable = False
    inference_model.set_weights(model.get_weights())
    inference_model.reset_states()
    return inference_model

In [11]:
model = generative_model(model)

TypeError: string indices must be integers

Following function is used to generate some sample outputs from trained model. It also saves activations of neurons.

In [None]:
def sample_model(model,sample_length, int_to_atom, n_vocab):
    outerlist = []
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print("generating with diversity: "+str(diversity))
        seed = "no, na upatie vrchu priduc zase,"
        generated = seed
        encoded = np_utils.to_categorical(seed, num_classes=n_vocab)
        model.reset_states()
        neuronsSnapshot = []
        for x in encoded[:-1]:
            # input shape: (1, 1)
            # saturate the net
            model.predict([[x]])

        next_index = encoded[-1]
        for i in range(sample_length):
            x = np.array([[next_index]])
            # input shape: (1, 1)
            #we need get activations
            adict = get_activations(x)
            keylist = adict.keys()
            keylist.sort() #to guarantee that layers are in correct all the time
            preds = []
            alist = []
            for k in keylist:
                if 'lstm' in k:#this is interesting
                    alist += adict[k]
                elif 'dense' in k: 
                    probs = adict[k]
                elif:
                    pass
            #probs = model.predict(x)
            # output shape: (1, 1, vocab_size)
            number = pick_atom_index(probs, diversity)
            # append to sequence
            generated += int_to_atom[number]
            neuronsSnapshot.append(alist)
            next_index = np_utils.to_categorical(next_index,num_classes=n_vocab)
        print(generated)
        outerlist.append({'poem': generated[len(seed)-1:], 'neurons':neuronsSnapshot})#-2 mozno?
    return {'data': outerlist, 'neurons': len(outerlist[0]['neurons'][0])}
    # data = {data:[{poem,neurons}...],neurons}

def pick_atom_index(predictions, temperature)
    preds = np.asarray(predictions).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
    
fout = sample_model(model, 250, int_to_atom, n_vocab)

In [None]:
import json
with open('out/data.json','w') as f
    json.dumps(fout, ensure_ascii=False) #hope this works with jquery