In [1]:
import os
import numpy as np
import pandas as pd
import gzip, json
import random
from keras.models import Model, Sequential
from keras.layers import *
from keras.models import load_model
import keras.backend as K
import keras.callbacks
from keras.utils import Sequence
from keras.utils import to_categorical
from keras.preprocessing.sequence import TimeseriesGenerator

Using TensorFlow backend.


### build a toy dataset from Gutenberg project

In [2]:
#!curl -O http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz

In [3]:
all_lines = []
for line in gzip.open("gutenberg-poetry-v001.ndjson.gz"):
    all_lines.append(json.loads(line.strip()))

In [4]:
corpi = {}
for line in all_lines:
    if line["gid"] not in corpi:
        corpi[line["gid"]] = []
    corpi[line["gid"]].append(line['s'])

In [5]:
#corpi.keys()

In [6]:
corpi["37648"]

['_Printed by R. & R. CLARK, Edinburgh._',
 'Ven^{ble.} Lord A. Compton, R. Barnes, J. D. Cooper,',
 'Alciati, therefore, considering that the illustrations formed no',
 'reckoned among the fossils of literature, which may be dug out of',
 'Though the new be gold, some love the old.',
 '"They have wrecked the old farm with its chimneys so high,',
 "And white flashing gables--my childhood's delight,",
 'The old home is gone, and the sorrowing eye',
 'Shuns the blue-slated upstart that glares from its site;"',
 'So flowed my fresh feeling, when loud at my side',
 'Rose the voice of a stranger arresting the tide:',
 '"What an emblem is here of the glories of change,',
 'Which purges and pares the old world to its quick;',
 'Transforming that rat-hole and ricketty grange,',
 'With its plaster and laths to a mansion of brick."',
 'The prose chilled like ice,--I sank into my skin,',
 'And felt my poor sentiment almost a sin.',
 'comparatively few accompanying woodcuts.',
 ' This little book 

In [7]:
corpus = corpi["37648"].copy()
print(corpus)



### Concat corpus and build a dictionnary

In [13]:
big_corpus = ""
# ponctuation
for i, line in enumerate(corpus):
    corpus[i] += " \n "
    corpus[i] = list(corpus[i])
    for j, c in enumerate(corpus[i]):
        if c in ',;:.!?)"':
            corpus[i][j] = " " + c
        elif c in '("':
            corpus[i][j] = c + " "
        if c == "-" and corpus[i][j+1] == "-":
            corpus[i][j] = ""
            corpus[i][j+1] = ""
        
            
    corpus[i] = "".join(corpus[i]) 
    big_corpus += corpus[i]
print(big_corpus)

_Printed by R  . & R  . CLARK  , Edinburgh  ._ 
 
 Ven^{ble  .} Lord A  . Compton  , R  . Barnes  , J  . D  . Cooper  , 
 
 Alciati  , therefore  , considering that the illustrations formed no 
 
 reckoned among the fossils of literature  , which may be dug out of 
 
 Though the new be gold  , some love the old  . 
 
   "They have wrecked the old farm with its chimneys so high  , 
 
 And white flashing gables my childhood's delight  , 
 
 The old home is gone  , and the sorrowing eye 
 
 Shuns the blue-slated upstart that glares from its site  ;  " 
 
 So flowed my fresh feeling  , when loud at my side 
 
 Rose the voice of a stranger arresting the tide  : 
 
   "What an emblem is here of the glories of change  , 
 
 Which purges and pares the old world to its quick  ; 
 
 Transforming that rat-hole and ricketty grange  , 
 
 With its plaster and laths to a mansion of brick  .  " 
 
 The prose chilled like ice  , I sank into my skin  , 
 
 And felt my poor sentiment almost a sin  . 
 


In [14]:
big_corpus = big_corpus.split(" ")

In [15]:
vocab = []
for word in big_corpus:
    if word not in vocab:
        vocab.append(word)
print(len(vocab))
print(vocab)

3610


In [32]:
seq_len = 32
batch_size = 32

class dataGenerator(Sequence):
    def __init__(self, x, batch_size):
        self.batch_size = batch_size
        self.x = x
        
    def __len__(self):
        return (len(self.x) // self.batch_size) - 1
    
    def __getitem__(self, idx):
        X = []
        Y = []
        for i in range(self.batch_size):
            if len(self.x[idx * batch_size + i : idx * batch_size + i + seq_len]) == batch_size:
                seq = []
                for w in self.x[idx * batch_size + i : idx * batch_size + i + seq_len]:
                    cat_w = to_categorical(vocab.index(w), num_classes=len(vocab))
                    seq.append(cat_w)
                X.append(seq)
                Y.append(to_categorical(vocab.index(self.x[idx + i + seq_len]), num_classes=len(vocab)))
        return np.array(X), np.array(Y)

split = int(0.8 * len(big_corpus))    
data_gen = dataGenerator(big_corpus[:split], batch_size)
val_gen = dataGenerator(big_corpus[split:], batch_size)

In [33]:
print(data_gen[0][1].shape)

(32, 3610)


In [61]:
K.clear_session()

in_w = Input(shape=(seq_len, len(vocab)))

x = LSTM(64, return_sequences=True)(in_w)
x = Bidirectional(LSTM(32))(x)
x = Dropout(0.3)(x)

x = Dense(128 , activation='relu')(x)
out = Dense(len(vocab), activation='softmax')(x)

model = Model(in_w, out)
from keras.optimizers import RMSprop
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 32, 3610)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32, 64)            940800    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                24832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_2 (Dense)              (None, 3610)              465690    
Total params: 1,439,642
Trainable params: 1,439,642
Non-trainable params: 0
_________________________________________________

In [62]:
#save best model if model improved
model_name = "poetry.h5"
best_checkpoint = keras.callbacks.ModelCheckpoint(model_name, monitor='loss', verbose=1, save_best_only=True, mode='min')

In [63]:
h = model.fit_generator(data_gen, validation_data=val_gen, epochs = 30, callbacks=[best_checkpoint])

Epoch 1/30

Epoch 00001: loss improved from inf to 3.75641, saving model to poetry.h5
Epoch 2/30

Epoch 00002: loss improved from 3.75641 to 3.55990, saving model to poetry.h5
Epoch 3/30

Epoch 00003: loss improved from 3.55990 to 3.54414, saving model to poetry.h5
Epoch 4/30

Epoch 00004: loss improved from 3.54414 to 3.50493, saving model to poetry.h5
Epoch 5/30

Epoch 00005: loss improved from 3.50493 to 3.47113, saving model to poetry.h5
Epoch 6/30

Epoch 00006: loss improved from 3.47113 to 3.44703, saving model to poetry.h5
Epoch 7/30

Epoch 00007: loss improved from 3.44703 to 3.43157, saving model to poetry.h5
Epoch 8/30

Epoch 00008: loss improved from 3.43157 to 3.40895, saving model to poetry.h5
Epoch 9/30

Epoch 00009: loss improved from 3.40895 to 3.37702, saving model to poetry.h5
Epoch 10/30

Epoch 00010: loss improved from 3.37702 to 3.34498, saving model to poetry.h5
Epoch 11/30

Epoch 00011: loss improved from 3.34498 to 3.30854, saving model to poetry.h5
Epoch 12/30


In [64]:
model.save(model_name)

In [16]:
#model = load_model(model_name)

W1010 13:40:20.830971 15184 deprecation_wrapper.py:119] From c:\users\rock_\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



In [69]:
idx = 1
sent = val_gen[idx][0][0]

for _ in range(seq_len-1):
    pred = model.predict(sent.reshape((1, sent.shape[0], sent.shape[1])))
    sent = np.array(list(sent) + list(pred))
    sent = sent[1:]

print("start sentence :")
print(" ".join([vocab[np.argmax(w)] for w in val_gen[idx][0][0]]))

print("predicted sentence :")
res = [vocab[np.argmax(w)] for w in sent[1:]]
print(" ".join(res))    

print("real sentence :")
print(" ".join([vocab[np.argmax(w)] for w in val_gen[idx+seq_len][0][0]]))

start sentence :
force  , 
 
 Tells at a banquet how a fish's head 
 
 For great Theodoric with blood imbrued  , 
 
 Blood of the guiltless  , was
predicted sentence :

 
 
 
 
 
   
 
  
 
   
 
 
  
 
 
 
 
 
 
 
 
 
 
 

real sentence :
; 
 
 And some the pony pet  , though lame  , 
 
 A little mule of Pegasus  . 
 
 Then haste  , thou atom of
