In [15]:
import os
import numpy as np
import pandas as pd
import gzip, json
import random
from keras.models import Model, Sequential
from keras.layers import *
from keras.models import load_model
import keras.backend as K
import keras.callbacks
from keras.utils import Sequence
from keras.utils import to_categorical
from keras.preprocessing.sequence import TimeseriesGenerator

### build a toy dataset from Gutenberg project

In [16]:
#!curl -O http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz

In [18]:
all_lines = []
for line in gzip.open("gutenberg-poetry-v001.ndjson.gz"):
    all_lines.append(json.loads(line.strip()))

In [48]:
corpi = {}
for line in all_lines:
    if line["gid"] not in corpi:
        corpi[line["gid"]] = []
    corpi[line["gid"]].append(line['s'])

In [328]:
#corpi.keys()

In [84]:
corpi["37648"]

['_Printed by R. & R. CLARK, Edinburgh._',
 'Ven^{ble.} Lord A. Compton, R. Barnes, J. D. Cooper,',
 'Alciati, therefore, considering that the illustrations formed no',
 'reckoned among the fossils of literature, which may be dug out of',
 'Though the new be gold, some love the old.',
 '"They have wrecked the old farm with its chimneys so high,',
 "And white flashing gables--my childhood's delight,",
 'The old home is gone, and the sorrowing eye',
 'Shuns the blue-slated upstart that glares from its site;"',
 'So flowed my fresh feeling, when loud at my side',
 'Rose the voice of a stranger arresting the tide:',
 '"What an emblem is here of the glories of change,',
 'Which purges and pares the old world to its quick;',
 'Transforming that rat-hole and ricketty grange,',
 'With its plaster and laths to a mansion of brick."',
 'The prose chilled like ice,--I sank into my skin,',
 'And felt my poor sentiment almost a sin.',
 'comparatively few accompanying woodcuts.',
 ' This little book 

In [389]:
corpus = corpi["37648"].copy()
print(corpus)



### Concat corpus and build a dictionnary

In [390]:
big_corpus = ""
# ponctuation
for i, line in enumerate(corpus):
    corpus[i] += " \n"
    corpus[i] = list(corpus[i])
    for j, c in enumerate(corpus[i]):
        if c in ',;:.!?)"':
            corpus[i][j] = " " + c
        elif c in '("':
            corpus[i][j] = c + " "
        if c == "-" and corpus[i][j+1] == "-":
            corpus[i][j] = ""
            corpus[i][j+1] = ""
        
            
    corpus[i] = "".join(corpus[i]) 
    big_corpus += corpus[i]
print(big_corpus)

_Printed by R . & R . CLARK , Edinburgh ._ 
Ven^{ble .} Lord A . Compton , R . Barnes , J . D . Cooper , 
Alciati , therefore , considering that the illustrations formed no 
reckoned among the fossils of literature , which may be dug out of 
Though the new be gold , some love the old . 
 "They have wrecked the old farm with its chimneys so high , 
And white flashing gables my childhood's delight , 
The old home is gone , and the sorrowing eye 
Shuns the blue-slated upstart that glares from its site ; " 
So flowed my fresh feeling , when loud at my side 
Rose the voice of a stranger arresting the tide : 
 "What an emblem is here of the glories of change , 
Which purges and pares the old world to its quick ; 
Transforming that rat-hole and ricketty grange , 
With its plaster and laths to a mansion of brick . " 
The prose chilled like ice , I sank into my skin , 
And felt my poor sentiment almost a sin . 
comparatively few accompanying woodcuts . 
 This little book was followed by another

In [332]:
big_corpus = big_corpus.split(" ")

In [333]:
vocab = []
for word in big_corpus:
    if word not in vocab:
        vocab.append(word)
print(len(vocab))
print(vocab)

3641


In [358]:
seq_len = 32
batch_size = 32

class dataGenerator(Sequence):
    def __init__(self, x, batch_size):
        self.batch_size = batch_size
        self.x = x
        
    def __len__(self):
        return (len(self.x) // self.batch_size) - 1
    
    def __getitem__(self, idx):
        X = []
        Y = []
        for i in range(self.batch_size):
            if len(self.x[idx + i : idx + i + seq_len]) == batch_size:
                seq = []
                for w in self.x[idx + i : idx + i + seq_len]:
                    cat_w = to_categorical(vocab.index(w), num_classes=len(vocab))
                    seq.append(cat_w)
                X.append(seq)
                Y.append(to_categorical(vocab.index(self.x[idx + i + seq_len]), num_classes=len(vocab)))
        return np.array(X), np.array(Y)

split = int(0.8 * len(big_corpus))    
data_gen = dataGenerator(big_corpus[:split], batch_size)
val_gen = dataGenerator(big_corpus[split:], batch_size)

In [345]:
print(data_gen[0][1].shape)

(32, 3641)


In [352]:
K.clear_session()

in_w = Input(shape=(seq_len, len(vocab)))

x = LSTM(16, return_sequences=False)(in_w)
#x = Bidirectional(LSTM(32))(x)
x = Dropout(0.3)(x)

x = Dense(128 , activation='relu')(x)
out = Dense(len(vocab), activation='softmax')(x)

model = Model(in_w, out)
from keras.optimizers import RMSprop
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 32, 3641)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                234112    
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               2176      
_________________________________________________________________
dense_2 (Dense)              (None, 3641)              469689    
Total params: 705,977
Trainable params: 705,977
Non-trainable params: 0
_________________________________________________________________


In [353]:
#save best model if model improved
model_name = "poetry.h5"
best_checkpoint = keras.callbacks.ModelCheckpoint(model_name, monitor='loss', verbose=1, save_best_only=True, mode='min')

In [354]:
h = model.fit_generator(data_gen, epochs = 30, callbacks=[best_checkpoint])

Epoch 1/30

Epoch 00001: loss improved from inf to 3.01715, saving model to poetry.h5
Epoch 2/30

Epoch 00002: loss improved from 3.01715 to 0.90579, saving model to poetry.h5
Epoch 3/30

Epoch 00003: loss improved from 0.90579 to 0.46839, saving model to poetry.h5
Epoch 4/30

Epoch 00004: loss improved from 0.46839 to 0.36480, saving model to poetry.h5
Epoch 5/30

Epoch 00005: loss improved from 0.36480 to 0.30774, saving model to poetry.h5
Epoch 6/30

Epoch 00006: loss improved from 0.30774 to 0.25026, saving model to poetry.h5
Epoch 7/30

Epoch 00007: loss improved from 0.25026 to 0.22501, saving model to poetry.h5
Epoch 8/30

Epoch 00008: loss improved from 0.22501 to 0.21561, saving model to poetry.h5
Epoch 9/30

Epoch 00009: loss improved from 0.21561 to 0.19459, saving model to poetry.h5
Epoch 10/30

Epoch 00010: loss improved from 0.19459 to 0.16830, saving model to poetry.h5
Epoch 11/30

Epoch 00011: loss improved from 0.16830 to 0.16519, saving model to poetry.h5
Epoch 12/30


In [355]:
model.save(model_name)

In [385]:
idx = 468
sent = val_gen[idx][0][0]

for _ in range(seq_len-1):
    pred = model.predict(sent.reshape((1, sent.shape[0], sent.shape[1])))
    sent = np.array(list(sent) + list(pred))
    sent = sent[1:]

print("start sentence :\n")
print(" ".join([vocab[np.argmax(w)] for w in val_gen[idx][0][0]]))
print()
print("predicted sentence :\n")
res = [vocab[np.argmax(w)] for w in sent[1:]]
print(" ".join(res))    
print()
print("real sentence :\n")
print(" ".join([vocab[np.argmax(w)] for w in val_gen[idx+seq_len][0][0]]))

start sentence :

, " 
And millionaires Tokay . 
Some as if empty-handed come ; 
Yet with brave sound and show 
Add to the brilliance and the hum ; 
Life scarce might these forego .

predicted sentence :

quick ; 
Transforming that rat-hole and ricketty grange , 
With its plaster and laths to a mansion of brick . " 
The prose chilled like ice , I sank into my

real sentence :


And faithful guests will aye believe 
The poor who nought afford , 
Welcomed , bring more than they receive , 
In blessings from the Lord . 
And surely 'twere a godless roll
