# Dante

The following code is heavily inspired by these projects:
- https://github.com/mathematiguy/keras-char-rnn
- http://karpathy.github.io/2015/05/21/rnn-effectiveness/
- https://www.kaggle.com/mrisdal/intro-to-lstms-w-keras-gpu-for-text-generation/notebook

In [1]:
import numpy as np
import os
import random
import slabikar
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.utils import np_utils
from keras import backend as K

Using TensorFlow backend.


Hyperparameters are set here

In [2]:
# Hyperparameters shared by the data preparation, model construction and
# training cells below.
maxlen = 120 # length of each input sequence, in atoms
step = 13 # stride between the starts of consecutive sequences (they overlap by maxlen - step atoms)
validation_split = 0.1 # passed to model.fit as validation_split
batch_size = 128 # passed to model.fit as batch_size
# number of units in each LSTM layer
rnn_size = 128
num_layers = 2 # number of stacked LSTM layers
drop_prob = 0.1 # rate of the Dropout layer added after each LSTM layer
epochs = 1000 # NOTE(review): not used by the training cell, which passes epochs=10 literally

Concatenate all text files from the `resources/` directory into a single string. The text files are expected to be UTF-8 encoded.

In [3]:
# Read every .txt file in resources/ and concatenate the contents into one
# corpus string, with a newline appended after each file.
# NOTE: os.listdir() returns names in arbitrary order, so the order in which
# the files are concatenated is not guaranteed to be the same on every system.
text_parts = []
for filename in os.listdir('resources/'):
    if not filename.endswith(".txt"):
        continue
    print("loading file: %s" % filename)
    filepath = os.path.join('resources/', filename)
    # the files are decoded explicitly as UTF-8, not with the platform default
    with open(filepath, 'r', encoding='utf-8') as f:
        text_parts.append(f.read() + "\n")
# join once at the end instead of growing the string with += in the loop
text_data = ''.join(text_parts)


loading file: Peklo.txt
loading file: Ocistec.txt
loading file: Nebo.txt


Functions for processing the text. One uses syllables as text atoms, the other uses characters.

Return values:
- atom_to_int: (dict) Maps each atom (character or syllable) in the vocabulary to an int.
- int_to_atom: (dict) Maps each int back to its atom.
- n_atoms: (int) The number of characters in the text.
- n_vocab: (int) The number of unique atoms in the text.
- data: The preprocessed input (the text itself for characters, the syllabified text for syllables).

In [4]:
def process_text_char(text_data):
    """Build character-level vocabulary mappings for ``text_data``.

    Returns a 5-tuple ``(char_to_int, int_to_char, n_chars, n_vocab,
    text_data)``: the character -> index dict, the index -> character dict,
    the length of the text in characters, the number of distinct characters,
    and the text itself, unchanged.
    """
    # Sorting gives every character a stable index, which is needed when a
    # checkpointed model is reloaded later.
    vocabulary = sorted(set(text_data))
    int_to_char = dict(enumerate(vocabulary))
    char_to_int = {char: index for index, char in int_to_char.items()}
    return char_to_int, int_to_char, len(text_data), len(vocabulary), text_data

def process_text_syllable(text_data):
    """Build syllable-level vocabulary mappings for ``text_data``.

    The text is split into syllables with ``slabikar.slabikar``.
    NOTE(review): assumes that call returns a sequence of syllables that
    supports ``len()`` and repeated iteration -- confirm against the
    ``slabikar`` module.

    Returns a 5-tuple ``(syllable_to_int, int_to_syllable, n_syllables,
    n_vocab, syllable_data)``: the syllable -> index dict, the index ->
    syllable dict, the number of syllables in the text, the number of
    distinct syllables, and the syllabified text.
    """
    syllable_data = slabikar.slabikar(text_data)
    # sorted so that every syllable keeps the same index between runs
    syllables = sorted(set(syllable_data))
    syllable_to_int = {c: i for i, c in enumerate(syllables)}
    int_to_syllable = {i: c for i, c in enumerate(syllables)}
    # Count atoms of the syllabified sequence; counting len(text_data) here
    # would report the number of characters of the raw text instead.
    n_syllables = len(syllable_data)
    n_vocab = len(syllables)
    return syllable_to_int, int_to_syllable, n_syllables, n_vocab, syllable_data

Splits the data into overlapping sequences. The target of each sequence is the single atom that follows it.

In [5]:
def createInput(text, maxlen, step, n_vocab, atom_to_int):
    """Cut ``text`` into overlapping windows and one-hot encode them.

    Window ``k`` starts at index ``k * step`` and holds ``maxlen`` consecutive
    atoms; its target is the single atom immediately after the window.

    Args:
        text: sequence of atoms (characters or syllables) to slice.
        maxlen: number of atoms in each input window.
        step: distance between the starts of consecutive windows.
        n_vocab: number of classes used for the one-hot encoding.
        atom_to_int: dict mapping every atom in ``text`` to its class index.

    Returns:
        Tuple ``(X, y)`` produced by ``np_utils.to_categorical``: ``X`` is the
        one-hot encoding of the index windows (one row of ``maxlen`` indices
        per sample) and ``y`` the one-hot encoding of the target indices.

    Raises:
        ValueError: if ``text`` is too short to yield a single window plus
            its target (``len(text) <= maxlen``).
        KeyError: if ``text`` contains an atom missing from ``atom_to_int``.
    """
    # A window starting at i needs text[i + maxlen] as its target, so the last
    # valid start is len(text) - maxlen - 1 inclusive, i.e. the exclusive
    # range bound is len(text) - maxlen.
    starts = range(0, len(text) - maxlen, step)
    if not starts:
        raise ValueError(
            "text has %d atoms; more than maxlen=%d are required to build "
            "at least one sequence" % (len(text), maxlen)
        )

    dataX = [[atom_to_int[atom] for atom in text[i: i + maxlen]] for i in starts]
    dataY = [atom_to_int[text[i + maxlen]] for i in starts]

    # progress output: number of samples, number of targets, window length
    print(len(dataX))
    print(len(dataY))
    print(len(dataX[0]))

    # one-hot encode the inputs and the targets
    X = np_utils.to_categorical(dataX, num_classes=n_vocab)
    y = np_utils.to_categorical(dataY, num_classes=n_vocab)
    return X, y

Model builder.

In [6]:
def build_model(batch_size, seq_length, n_vocab, rnn_size, num_layers, drop_prob):
    """Build and compile a stacked-LSTM model that predicts the next atom.

    The network is ``num_layers`` LSTM layers, each followed by a Dropout
    layer, and a final softmax Dense layer with ``n_vocab`` outputs.

    Args:
        batch_size: unused; kept so existing callers keep working. The batch
            dimension of the input shape is left as ``None``.
        seq_length: number of atoms in each input sequence.
        n_vocab: vocabulary size; width of the one-hot input and of the
            softmax output.
        rnn_size: number of units in every LSTM layer.
        num_layers: number of stacked LSTM layers.
        drop_prob: rate of the Dropout layer added after each LSTM layer.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    model = Sequential()
    for i in range(num_layers):
        # Every layer except the last returns the full sequence so that the
        # next LSTM layer receives one vector per time step.
        layer_kwargs = {'return_sequences': i < num_layers - 1}
        if i == 0:
            # The first layer always declares the input shape, even when it
            # is also the last one (num_layers == 1).
            layer_kwargs['batch_input_shape'] = (None, seq_length, n_vocab)
        model.add(LSTM(rnn_size, **layer_kwargs))
        model.add(Dropout(drop_prob))
    # add output layer
    model.add(Dense(n_vocab, activation='softmax'))
    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model


Here we prepare everything.

In [7]:
# Write a checkpoint whenever the monitored validation accuracy reaches a new
# maximum; the file name records the epoch, val_acc and val_loss.
callbacks = [
    ModelCheckpoint(
        'checkpoints/weights-{epoch:02d}-{val_acc:.2f}-{val_loss:.2f}.hdf5',
        monitor='val_acc',
        verbose=1,
        save_best_only=True,
        mode='max',
    ),
]

# Two interchangeable choices of text atom (syllables or characters); both
# functions return the same 5-tuple layout, so either line can be used.
#atom_to_int, int_to_atom, n_atoms, n_vocab, data = process_text_syllable(text_data)
atom_to_int, int_to_atom, n_atoms, n_vocab, data = process_text_char(text_data)


In [8]:
# Slice the corpus into one-hot encoded training windows and their targets.
X, y = createInput(
    text=data,
    maxlen=maxlen,
    step=step,
    n_vocab=n_vocab,
    atom_to_int=atom_to_int,
)

# Build the network for sequences of maxlen atoms over the same vocabulary.
model = build_model(
    batch_size=batch_size,
    seq_length=maxlen,
    n_vocab=n_vocab,
    rnn_size=rnn_size,
    num_layers=num_layers,
    drop_prob=drop_prob,
)

40438
40438
120


A few sanity checks to ensure everything looks good.

In [9]:
# Sanity check: show both vocabulary mappings, the atom and vocabulary counts,
# and the number of targets and inputs, one value per line.
print(
    atom_to_int,
    int_to_atom,
    n_atoms,
    n_vocab,
    len(y),
    len(X),
    sep="\n",
)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '(': 4, ')': 5, ',': 6, '-': 7, '.': 8, ':': 9, ';': 10, '?': 11, 'A': 12, 'B': 13, 'C': 14, 'D': 15, 'E': 16, 'F': 17, 'G': 18, 'H': 19, 'I': 20, 'J': 21, 'K': 22, 'L': 23, 'M': 24, 'N': 25, 'O': 26, 'P': 27, 'Q': 28, 'R': 29, 'S': 30, 'T': 31, 'U': 32, 'V': 33, 'W': 34, 'X': 35, 'Z': 36, 'a': 37, 'b': 38, 'c': 39, 'd': 40, 'e': 41, 'f': 42, 'g': 43, 'h': 44, 'i': 45, 'j': 46, 'k': 47, 'l': 48, 'm': 49, 'n': 50, 'o': 51, 'p': 52, 'q': 53, 'r': 54, 's': 55, 't': 56, 'u': 57, 'v': 58, 'x': 59, 'y': 60, 'z': 61, 'Á': 62, 'Í': 63, 'Ó': 64, 'Ú': 65, 'á': 66, 'ä': 67, 'é': 68, 'ì': 69, 'í': 70, 'ó': 71, 'ô': 72, 'ú': 73, 'ý': 74, 'Č': 75, 'č': 76, 'Ď': 77, 'ď': 78, 'ĺ': 79, 'Ľ': 80, 'ľ': 81, 'Ň': 82, 'ň': 83, 'ŕ': 84, 'Š': 85, 'š': 86, 'Ť': 87, 'ť': 88, 'Ž': 89, 'ž': 90, '‒': 91, '‘': 92, '’': 93, '‚': 94, '“': 95, '„': 96}
{0: '\n', 1: ' ', 2: '!', 3: '"', 4: '(', 5: ')', 6: ',', 7: '-', 8: '.', 9: ':', 10: ';', 11: '?', 12: 'A', 13: 'B', 14: 'C', 15: 'D',

Training

In [10]:
# Train the model, holding out validation_split of the samples for validation
# and checkpointing through the callbacks list.
# NOTE(review): epochs is passed as the literal 10 here; the `epochs`
# hyperparameter defined earlier is not used by this call.
model.fit(X,y,batch_size=batch_size,epochs=10,callbacks=callbacks, validation_split=validation_split)

Train on 36394 samples, validate on 4044 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.15554, saving model to checkpoints/weights-01-0.16-3.34.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.15554 to 0.21464, saving model to checkpoints/weights-02-0.21-2.91.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.21464 to 0.24209, saving model to checkpoints/weights-03-0.24-2.77.hdf5
Epoch 4/10

Epoch 00004: val_acc improved from 0.24209 to 0.26187, saving model to checkpoints/weights-04-0.26-2.67.hdf5
Epoch 5/10

Epoch 00005: val_acc improved from 0.26187 to 0.26978, saving model to checkpoints/weights-05-0.27-2.62.hdf5
Epoch 6/10

Epoch 00006: val_acc improved from 0.26978 to 0.28165, saving model to checkpoints/weights-06-0.28-2.58.hdf5
Epoch 7/10

Epoch 00007: val_acc improved from 0.28165 to 0.28487, saving model to checkpoints/weights-07-0.28-2.54.hdf5
Epoch 8/10

Epoch 00008: val_acc improved from 0.28487 to 0.28882, saving model to checkpoints/weights-08-0

<keras.callbacks.History at 0x7f5094beafd0>