In [1]:
# source/inspiration: https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py
import numpy as np
import pandas as pd
from keras.callbacks import LambdaCallback, ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
import random

Using TensorFlow backend.


In [2]:
# np.load on an .npz archive returns a lazy NpzFile that keeps the file open;
# pull both arrays into memory inside a context manager so the handle is
# released as soon as they are read.
with np.load('../data/processed/compressed_data.npz') as data:
    x_data = data['x_data']
    y_data = data['y_data']

In [3]:
# Report the dimensions of the loaded input and target arrays.
for label, array in (('x_data', x_data), ('y_data', y_data)):
    print(f'{label}.shape: {array.shape}')

x_data.shape: (6428972, 40)
y_data.shape: (6428972,)


In [4]:
# The window length is the second axis of x_data, so read it from the array
# instead of hard-coding a value that has to be kept in sync by hand.
maxlen = x_data.shape[1]
number_of_chars = 104 # TODO: this should be loaded in somehow
n_samples = y_data.shape[0]

In [12]:
# A single 128-unit LSTM reads a (maxlen, number_of_chars) window and a
# softmax layer scores every character as the candidate next character.
model = Sequential([
    LSTM(128, input_shape=(maxlen, number_of_chars)),
    Dense(number_of_chars, activation='softmax'),
])

In [13]:
# Targets are one-hot vectors over the vocabulary, hence categorical
# cross-entropy; the model is optimised with RMSprop at a 0.01 learning rate.
optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [14]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 128)               119296    
_________________________________________________________________
dense_2 (Dense)              (None, 104)               13416     
Total params: 132,712
Trainable params: 132,712
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Raw corpus as a single string; on_epoch_end slices seed sentences out of it.
with open('../data/interim/cleaned_corpus/about.txt', 'r') as src:
    text = src.read()

# Lookup table between characters and their integer codes
# (columns used below: 'char' and 'indices').
# NOTE(review): this path starts with './data' while every other path in the
# notebook starts with '../data' -- confirm it resolves from this directory.
char_int_map = pd.read_csv('./data/features/char_int_map.csv')

Unnamed: 0,char,indices
0,,1
1,e,2
2,t,3
3,a,4
4,o,5


In [16]:
# Forward (char -> index) and reverse (index -> char) lookups, built from the
# 'char' and 'indices' columns of the mapping table.
char_indices = dict(zip(char_int_map['char'], char_int_map['indices']))
indices_char = dict(zip(char_int_map['indices'], char_int_map['char']))

In [17]:

def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    Args:
        preds: Sequence of probabilities (e.g. a softmax output).
        temperature: Divisor applied to the log-probabilities before they are
            re-normalised. Values below 1 sharpen the distribution, values
            above 1 flatten it.

    Returns:
        The index selected by a single multinomial draw.
    """
    scaled_logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    # One trial, one experiment: the result is a one-hot row whose argmax is
    # the sampled index.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

def on_epoch_end(epoch, _):
    """Print text generated by the current model at the end of an epoch.

    A random `maxlen`-character seed is cut from `text`. For each diversity
    (sampling temperature) the model is asked for 400 further characters, one
    at a time, and the result is streamed to stdout.

    Args:
        epoch: Index of the epoch that just finished (used in the header only).
        _: Logs argument passed by the callback; unused.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        sentence = text[start_index: start_index + maxlen]
        generated = sentence
        print('----- Generating with seed: "' + sentence + '"')
        print(generated, end='')

        for _step in range(400):
            # The width must match the model's input layer, which was built
            # with number_of_chars features per time step.
            x_pred = np.zeros((1, maxlen, number_of_chars))
            for t, char in enumerate(sentence):
                # batch_generator stores index k in one-hot slot k - 1; the
                # seed has to be encoded the same way as the training data.
                x_pred[0, t, char_indices[char] - 1] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            # Undo the k - 1 shift to get back to the key used in the map.
            next_char = indices_char[next_index + 1]

            generated += next_char
            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char
            print(next_char, end='', flush=True)

        print()
        
# Wrap on_epoch_end in a Keras callback so it can be passed to training.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [28]:
def batch_generator(batch_size, x=None, y=None, seq_len=None, vocab_size=None):
    """Endlessly yield one-hot encoded (inputs, targets) batches.

    Integer codes are shifted by -1 before being used as one-hot positions, so
    code k sets position k - 1.

    Args:
        batch_size: Number of samples per batch; the last batch of a pass may
            be smaller.
        x: 2-D integer array of input codes. Defaults to the notebook-level
            `x_data`.
        y: 1-D integer array of target codes. Defaults to the notebook-level
            `y_data` (with `n_samples` as the sample count).
        seq_len: Number of leading columns of `x` to encode. Defaults to
            `maxlen`.
        vocab_size: Width of the one-hot axis. Defaults to `number_of_chars`.

    Yields:
        Tuple `(x_onehot, y_onehot)` of float arrays with shapes
        `(batch, seq_len, vocab_size)` and `(batch, vocab_size)`.

    Raises:
        ValueError: On the first `next()` call, if `batch_size` is not
            positive or there are no samples.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    # Fall back to the notebook-level data and constants when not given.
    if x is None:
        x = x_data
    if y is None:
        y = y_data
        total = n_samples
    else:
        total = y.shape[0]
    if seq_len is None:
        seq_len = maxlen
    if vocab_size is None:
        vocab_size = number_of_chars
    if total < 1:
        raise ValueError('no samples to generate batches from')

    # Ceiling division: `1 + total // batch_size` would add an empty trailing
    # batch whenever total is an exact multiple of batch_size.
    nb_batches = -(-total // batch_size)
    steps = np.arange(seq_len)

    while True:
        for b in range(nb_batches):
            start = b * batch_size
            stop = min(start + batch_size, total)
            x_sample = x[start:stop, :seq_len]
            y_sample = y[start:stop]

            # onehot-encode x_sample with a single vectorised assignment
            rows = np.arange(x_sample.shape[0])
            x_onehot_sample = np.zeros((x_sample.shape[0], seq_len, vocab_size))
            x_onehot_sample[rows[:, None], steps[None, :], x_sample - 1] = 1

            # onehot-encode y_sample
            y_onehot_sample = np.zeros((y_sample.shape[0], vocab_size))
            y_onehot_sample[np.arange(y_sample.shape[0]), y_sample - 1] = 1

            yield (x_onehot_sample, y_onehot_sample)
                
    
# Smoke test: pull a single batch of 23 samples so its shapes can be inspected.
x_, y_ = next(batch_generator(23))

In [29]:
print(x_.shape)
print(y_.shape)

(23, 40, 104)
(23, 104)


In [32]:
# Checkpoint callback writing to ../models/checkpoint; it monitors the training
# loss and, with save_best_only=True, only keeps the best model seen so far.
model_checkpoint_saver = ModelCheckpoint("../models/checkpoint", monitor="loss", save_best_only=True)

In [33]:
batch_size = 64
# steps_per_epoch must be a whole number of batches. Round up so the final
# partial batch is consumed inside the epoch instead of the float
# n_samples / batch_size being truncated and that batch leaking into the next.
steps_per_epoch = -(-n_samples // batch_size)
model.fit_generator(batch_generator(batch_size),
                    steps_per_epoch=steps_per_epoch,
                    epochs=1, callbacks=[print_callback, model_checkpoint_saver])

Epoch 1/1
  4470/100452 [>.............................] - ETA: 1:32:32 - loss: 0.6106

KeyboardInterrupt: 

Training this model on CPU will be an unacceptably time-consuming task. Time to port this over to Google Colab.