# Dante

The following code is heavily inspired by these projects:
https://github.com/mathematiguy/keras-char-rnn
http://karpathy.github.io/2015/05/21/rnn-effectiveness/
https://www.kaggle.com/mrisdal/intro-to-lstms-w-keras-gpu-for-text-generation/notebook

In [10]:
import numpy as np
import os
import random
import slabikar
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.utils import np_utils
from keras import backend as K

Hyperparameters

In [None]:
# Training configuration shared by the cells below.
maxlen = 120 # length, in atoms, of each input sequence
step = 13 # NOTE(review): presumably the stride between the starts of consecutive sequences (they overlap when step < maxlen) - confirm against the createInput call
# NOTE(review): presumably the fraction of samples held out for validation - confirm where it is consumed
validation_split = 0.1
batch_size = 100 # batch size handed to model.fit
rnn_size = 128 # size of each LSTM layer
num_layers = 2 # number of stacked LSTM layers
drop_prob = 0.1 # rate of the Dropout layer added after each LSTM layer
epochs = 1000 # number of training epochs

Concatenates all `.txt` files from the `resources/` directory into a single string. Text files are expected to be UTF-8 encoded.

In [9]:
# Gather the contents of every .txt file found in resources/ into one string;
# the text of each file is followed by a newline.
_chunks = []
for filename in os.listdir('resources/'):
    if not filename.endswith(".txt"):
        continue
    print("loading file: %s" % filename)
    filepath = os.path.join('resources/', filename)
    # files are always decoded as UTF-8, independent of the platform default
    with open(filepath, 'r', encoding='utf-8') as f:
        _chunks.append(f.read() + "\n")
text_data = ''.join(_chunks)


loading file: Peklo.txt
loading file: Nebo.txt
loading file: Ocistec.txt


Functions for processing text. One uses syllables as text atoms, the other uses characters.
Args:
- text data
- seq_length: (int) length of character sequences to be considered

Return values:
- atom_to_int: (dict) Maps characters in the character set to ints.
- int_to_atom: (dict) Maps ints to characters in the character set.
- n_atom: (int) The number of characters in the text.
- n_vocab: (int) The number of unique characters in the text.
- data: preprocessed input

In [11]:
def process_text_char(text_data, seq_length):
    """Build character-level vocabulary lookups for ``text_data``.

    Args:
        text_data: text whose individual characters serve as atoms.
        seq_length: accepted but not used by this function.

    Returns:
        A tuple ``(char_to_int, int_to_char, n_chars, n_vocab, text_data)``:
        the character -> index dict, the index -> character dict, the length
        of the text, the number of distinct characters, and the text itself,
        unchanged.
    """
    # A sorted alphabet gives every character the same index on each run,
    # which is what makes previously saved checkpoints reusable.
    alphabet = sorted(set(text_data))
    int_to_char = dict(enumerate(alphabet))
    char_to_int = {symbol: index for index, symbol in int_to_char.items()}
    return char_to_int, int_to_char, len(text_data), len(alphabet), text_data

def process_text_syllable(text_data, seq_length):
    """Build syllable-level vocabulary lookups for ``text_data``.

    Args:
        text_data: text to be split into syllables.
        seq_length: accepted but not used by this function.

    Returns:
        A tuple ``(syllable_to_int, int_to_syllable, n_syllables, n_vocab,
        syllable_data)``: the syllable -> index dict, the index -> syllable
        dict, the number of syllables in the text, the number of distinct
        syllables, and the syllabified text.
    """
    # NOTE(review): `slabikar` is brought in with `import slabikar` at the top
    # of the file; if that name is a module rather than a callable, this call
    # raises TypeError - confirm the intended entry point of the package.
    # Assumes the result is a sized sequence of hashable syllables - TODO confirm.
    syllable_data = slabikar(text_data)
    # sorted so that every syllable keeps the same index between runs
    syllables = sorted(set(syllable_data))
    syllable_to_int = {c: i for i, c in enumerate(syllables)}
    # the original bound this dict to a misspelled name, so the return
    # statement below raised NameError
    int_to_syllable = {i: c for i, c in enumerate(syllables)}
    # count atoms (syllables), not the characters of the raw text
    n_syllables = len(syllable_data)
    n_vocab = len(syllables)
    return syllable_to_int, int_to_syllable, n_syllables, n_vocab, syllable_data

Splits the data into overlapping sequences. The target of each sequence is the single atom that follows it.

In [12]:
def createInput(text, maxlen, step, n_vocab, atom_to_int):
    """Cut ``text`` into one-hot encoded windows with next-atom targets.

    A window of ``maxlen`` atoms is taken every ``step`` atoms; the target of
    a window is the single atom that immediately follows it.

    Args:
        text: sequence of atoms (a string, or a list of syllables).
        maxlen: number of atoms in each window.
        step: distance between the starts of consecutive windows.
        n_vocab: size of the one-hot dimension.
        atom_to_int: dict mapping each atom to its integer index.

    Returns:
        ``(X, y)`` as float32 arrays; ``X`` has shape
        ``(n_windows, maxlen, n_vocab)`` and ``y`` has shape
        ``(n_windows, n_vocab)``. Both have zero rows when ``text`` is too
        short to hold a window plus its target.

    Raises:
        KeyError: if a windowed atom is missing from ``atom_to_int``.
    """
    # The last usable start is len(text) - maxlen - 1 (its target is the final
    # atom), so the exclusive stop is len(text) - maxlen. The previous stop of
    # len(text) - maxlen - 1 silently dropped that window.
    starts = range(0, len(text) - maxlen, step)
    dataX = [[atom_to_int[atom] for atom in text[i: i + maxlen]] for i in starts]
    dataY = [atom_to_int[text[i + maxlen]] for i in starts]

    # One-hot encode by picking rows of the identity matrix. The explicit
    # reshape keeps X three-dimensional even when there are no windows.
    identity = np.eye(n_vocab, dtype=np.float32)
    X = identity[np.array(dataX, dtype=np.int64).reshape(len(dataX), maxlen)]
    y = identity[np.array(dataY, dtype=np.int64)]
    return X, y

Model builder.

In [8]:
def build_model(batch_size, seq_length, n_vocab, rnn_size, num_layers, drop_prob):
    """Build and compile a stacked-LSTM next-atom classifier.

    Every LSTM layer is followed by a Dropout layer; a softmax Dense layer of
    width ``n_vocab`` produces the output.

    Args:
        batch_size: accepted but not used; the input shape leaves the batch
            dimension as ``None``.
        seq_length: number of time steps in each input sequence.
        n_vocab: width of the one-hot input and of the softmax output.
        rnn_size: size of each LSTM layer.
        num_layers: number of stacked LSTM layers.
        drop_prob: rate of the Dropout layer added after each LSTM layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    for i in range(num_layers):
        # every layer except the last one hands the full sequence to the next
        layer_kwargs = {'return_sequences': i < num_layers - 1}
        if i == 0:
            # The first layer always declares the input shape. Previously a
            # single-layer model took the "last layer" branch and never got it.
            layer_kwargs['batch_input_shape'] = (None, seq_length, n_vocab)
        model.add(LSTM(rnn_size, **layer_kwargs))
        model.add(Dropout(drop_prob))
    # add output layer
    model.add(Dense(n_vocab, activation='softmax'))
    # compile model; the keyword is `metrics` - the former `metric=` spelling
    # did not register accuracy as a metric
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model


In [11]:

# preprocess the text - construct character dictionaries etc
char_to_int, int_to_char, n_chars, n_vocab, data = \
                            process_text_char(text_data, maxlen)

# cut the corpus into one-hot encoded input windows and next-atom targets
x, y = createInput(data, maxlen, step, n_vocab, char_to_int)

# build and compile Keras model (built once, after n_vocab is known)
model = build_model(batch_size, maxlen, n_vocab,
                    rnn_size, num_layers, drop_prob)

# make sure the checkpoint directory exists before the first save
os.makedirs('checkpoints', exist_ok=True)
# NOTE(review): `val_acc` is only logged when the model is compiled with an
# accuracy metric, and newer Keras releases name it `val_accuracy` - confirm
# against the installed version.
callbacks = [ModelCheckpoint('checkpoints/weights-{epoch:02d}-{val_acc:.2f}-{val_loss:.2f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')]

# fit model; validation data is required because the checkpoint callback
# monitors and formats val_* values
model.fit(x, y,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=validation_split,
          callbacks=callbacks)