In [1]:
# Root directory that the 'corpora' and 'models' sub-paths are joined onto.
MAIN_DIR = './'

# Model / batching hyper-parameters shared by the generators and the network.
BATCH_SIZE = 64
LSTM_DIM = 128
SEQ_LEN = 128

# Names of the corpus files (without the '.txt' extension) to read.
SELECTED_BOOKS = [
    'genesis',
    'exodus',
    'leviticus',
    'deuteronomy',
    '1_samuel',
    '2_samuel',
    '1_kings',
    '2_kings',
    'esther',
    'daniel',
    '1_chronicles',
    '2_chronicles',
    'ecclesiastes',
]

In [3]:
# Read every selected book into memory, keyed by its book name.
texts = {}
for book in SELECTED_BOOKS:
    book_path = os.path.join(MAIN_DIR, 'corpora', 'word', f'{book}.txt')

    # Read raw bytes so no newline translation happens on the way in.
    with open(book_path, 'rb') as book_file:
        raw_bytes = book_file.read()

    # bytes.decode() with no argument uses UTF-8.
    texts[book] = raw_bytes.decode()

# Build a single character-level tokenizer from all of the loaded texts.
tokenizer = Tokenizer(texts.values(), character_level=True)

Length of tokenizer: 81


In [4]:
import tensorflow as tf

In [6]:
# The two generators are configured identically apart from the `train` flag
# (presumably this selects the train vs. held-out split — confirm in DataGenerator).
generator_options = dict(seq_len=SEQ_LEN, batch_size=BATCH_SIZE, with_embedding=True)
genesis_text = texts['genesis']

train_generator = DataGenerator(tokenizer, genesis_text, train=True, **generator_options)

test_generator = DataGenerator(tokenizer, genesis_text, train=False, **generator_options)

Setting up training generator...
Length of encoded texts: 213360
Number of sequences: 170585
Setting up testing generator...
Length of encoded texts: 213360
Number of sequences: 42646


In [7]:
# Sanity check: shape of element 1 of the first training batch
# (presumably the target array — confirm against DataGenerator.__getitem__).
train_generator[0][1].shape

(64, 128, 1)

In [8]:
def load_model(max_length, num_words, with_embedding=True, stateful=False, batch_size=512, lstm_dim=128, embedding_dim=300, return_state=False):
    """Build and compile a bidirectional-LSTM model that predicts a token per time step.

    Architecture: input -> (optional) Embedding -> Bidirectional(LSTM) ->
    TimeDistributed(Dense(num_words, softmax)).

    Args:
        max_length: Number of time steps in every input sequence.
        num_words: Vocabulary size. Used as the Embedding ``input_dim``, as the
            width of the softmax output, and (when ``with_embedding`` is False)
            as the size of the last input axis.
        with_embedding: If True the input has shape ``(max_length,)`` and is passed
            through an Embedding layer. If False the input has shape
            ``(max_length, num_words)`` and is fed to the LSTM directly.
        stateful: Passed to the LSTM layer. A stateful LSTM requires a fixed
            batch dimension, so ``batch_size`` is then pinned on the input.
        batch_size: Fixed batch dimension of the input. Always applied when
            ``with_embedding`` is False; applied only when ``stateful`` is True
            otherwise (the batch dimension is left unspecified in that case).
        lstm_dim: Number of units of each LSTM direction.
        embedding_dim: Output dimension of the Embedding layer.
        return_state: If True the model additionally outputs the four state
            tensors returned by the bidirectional LSTM.

    Returns:
        A compiled ``tf.keras.Model``. Its outputs are ``[probabilities]`` or,
        with ``return_state=True``, ``[probabilities, hf, cf, hb, cb]``.
    """
    if with_embedding:
        # Pin the batch dimension only when the LSTM is stateful; otherwise
        # keep it flexible so batches of any size can be fed.
        inputs = tf.keras.Input(name='seed', shape=(max_length,),
                                batch_size=batch_size if stateful else None)
        features = tf.keras.layers.Embedding(input_dim=num_words, output_dim=embedding_dim,
                                             input_length=max_length)(inputs)
    else:
        inputs = tf.keras.Input(name='seed', shape=(max_length, num_words), batch_size=batch_size)
        features = inputs

    recurrent = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
        units=lstm_dim, return_state=return_state, stateful=stateful, return_sequences=True))

    if return_state:
        # Bidirectional returns the sequence output followed by the states of
        # the forward layer and then those of the backward layer.
        sequence, hf, cf, hb, cb = recurrent(features)
        states = [hf, cf, hb, cb]
    else:
        sequence = recurrent(features)
        states = []

    probabilities = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(num_words, activation='softmax'))(sequence)

    model = tf.keras.Model([inputs], [probabilities] + states)

    # NOTE(review): with return_state=True the single loss/metric below is attached
    # to every output, including the state tensors. That variant looks intended for
    # inference only — confirm before training it.
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'])

    return model

In [9]:
# Build the character-level model using the notebook-wide hyper-parameters.
model = load_model(
    SEQ_LEN,
    tokenizer.num_words,
    with_embedding=True,
    lstm_dim=LSTM_DIM,
    batch_size=BATCH_SIZE,
)

In [10]:
# Print the layer-by-layer summary (output shapes and parameter counts) of `model`.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
seed (InputLayer)            (None, 128)               0         
_________________________________________________________________
embedding (Embedding)        (None, 128, 300)          24300     
_________________________________________________________________
bidirectional (Bidirectional (None, 128, 256)          439296    
_________________________________________________________________
time_distributed (TimeDistri (None, 128, 81)           20817     
Total params: 484,413
Trainable params: 484,413
Non-trainable params: 0
_________________________________________________________________


In [11]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

model_name = 'characters'
file_path = os.path.join(MAIN_DIR, 'models', f'{model_name}.hdf5')
# Make sure the target directory exists before the first checkpoint is written.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# The model is compiled with the 'sparse_categorical_accuracy' metric, so the
# validation value Keras logs is 'val_sparse_categorical_accuracy'. Monitoring
# 'val_acc' matches no logged key: the checkpoint would never be saved and early
# stopping would never trigger.
MONITORED_METRIC = 'val_sparse_categorical_accuracy'

# Upper bound on the number of epochs. The fit_generator default is a single
# epoch, which makes an early-stopping patience of 5 meaningless.
EPOCHS = 50

checkpoint = ModelCheckpoint(file_path, monitor=MONITORED_METRIC, mode='max', save_best_only=True)
early_stopping = EarlyStopping(monitor=MONITORED_METRIC, mode='max', patience=5)
callbacks_list = [checkpoint, early_stopping]


model.fit_generator(
    train_generator,
    epochs=EPOCHS,
    validation_data=test_generator,
    callbacks=callbacks_list
)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1
 406/2666 [===>..........................] - ETA: 56:34 - loss: 0.5530 - sparse_categorical_accuracy: 0.8844

KeyboardInterrupt: 