<a href="https://colab.research.google.com/github/Aaronsom/poem-generation/blob/master/colab_poem_generator_word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/Aaronsom/poem-generation
%cd poem-generation
%mkdir models
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
!gunzip GoogleNews-vectors-negative300.bin

Cloning into 'poem-generation'...
remote: Enumerating objects: 167, done.[K
remote: Counting objects: 100% (167/167), done.[K
remote: Compressing objects: 100% (120/120), done.[K
remote: Total 167 (delta 106), reused 99 (delta 45), pack-reused 0
Receiving objects: 100% (167/167), 1.93 MiB | 13.01 MiB/s, done.
Resolving deltas: 100% (106/106), done.
/content/poem-generation
--2019-06-04 16:20:58--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.162.117
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.162.117|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2019-06-04 16:21:33 (45.1 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [0]:
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
import tensorflow.train as optimizer
from poem_generator.dataGenerator import TupleDataGenerator
import poem_generator.data_prepocessing as dp
import poem_generator.embedding as embedding_loader
from poem_generator.global_constants import TRAINING_DATA, EMBEDDING_DIMENSION, EMBEDDING_BINARY, MODELS_DICT
from poem_generator.transformer import transformer
#from poem_generator.PoemCallback import PoemCallback
from tensorflow.keras.layers import *
from tensorflow.keras import Sequential
from tensorflow.contrib.tpu import keras_to_tpu_model, TPUDistributionStrategy
from tensorflow.contrib.cluster_resolver import TPUClusterResolver
import os

In [0]:
from tensorflow.keras.callbacks import Callback
import numpy as np
from poem_generator.global_constants import START_OF_SEQUENCE_TOKEN, END_OF_LINE_TOKEN, END_OF_SEQUENCE_TOKEN, \
    PADDING_TOKEN, OUT_OF_VOCAB_TOKEN, MODELS_DICT

class PoemCallback(Callback):
    """Keras callback that samples poems from the model after every epoch and prints them.

    Args:
        poems: Number of poems to generate and print at the end of each epoch.
        seed_length: Number of token indices in the seed window fed to the model.
        dictionary: Mapping from token string to integer index.
        single: Forwarded to ``generate_poem``; when True the model prediction is
            used as one distribution, otherwise the last position along axis 1
            of the prediction is used.
    """

    # Upper bound on the number of counted tokens per poem.
    MAX_TOKENS = 60

    def __init__(self, poems, seed_length, dictionary, single=True):
        super(PoemCallback, self).__init__()
        self.poems = poems
        self.dictionary = dictionary
        # Inverse mapping (index -> token) used to decode sampled indices.
        self.reverse_dictionary = {index: token for token, index in dictionary.items()}
        self.seed_length = seed_length
        self.single = single

    def on_epoch_end(self, epoch, logs=None):
        """Print ``self.poems`` freshly sampled poems.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dictionary supplied by Keras (unused).
        """
        if self.poems <= 0:
            return
        # The weights do not change while poems are sampled, so fetch the CPU copy
        # once per epoch instead of once per poem. Models that do not expose
        # ``sync_to_cpu`` (i.e. not TPU-wrapped) are used directly.
        sync_to_cpu = getattr(self.model, "sync_to_cpu", None)
        model = sync_to_cpu() if callable(sync_to_cpu) else self.model
        for i in range(self.poems):
            print(f"Poem {i+1}/{self.poems}")
            self.generate_poem(model, self.reverse_dictionary, self.dictionary, self.seed_length, single=self.single)

    def generate_poem(self, model, reverse_dictionary, dictionary, seed_length, dynamic_seed=False, single=False):
        """Sample one poem token by token and print it.

        Sampling stops when END_OF_SEQUENCE_TOKEN is drawn or after
        ``MAX_TOKENS`` counted tokens. Out-of-vocabulary, padding and
        start-of-sequence tokens are neither emitted nor counted.

        Args:
            model: Object with a ``predict`` method returning a probability
                distribution over the vocabulary (assumed to be ordered by
                dictionary index - TODO confirm against the training generator).
            reverse_dictionary: Mapping from integer index to token string.
            dictionary: Mapping from token string to integer index.
            seed_length: Length of the initial seed of start-of-sequence indices.
            dynamic_seed: When False the seed is a sliding window of constant
                length; when True every accepted token is appended and the seed grows.
            single: When True the squeezed prediction is used as is; otherwise
                the last position along axis 1 of the prediction is used.
        """
        poem = ""
        last_output = ""
        iterations = 0
        seed = np.array([dictionary[START_OF_SEQUENCE_TOKEN]]*seed_length)
        already_eol = False  # Sometimes too many eols are generated, this breaks the format
        while iterations < self.MAX_TOKENS and last_output != END_OF_SEQUENCE_TOKEN:
            if single:
                prediction = model.predict(np.array([seed])).squeeze()
            else:
                prediction = model.predict(np.array([seed]))[:, -1].squeeze()
            # Softmax outputs are typically float32 and may not sum to 1 within the
            # tolerance np.random.choice enforces; renormalise in float64 so that
            # sampling cannot fail with "probabilities do not sum to 1".
            last_output_dist = np.asarray(prediction, dtype=np.float64)
            last_output_dist = last_output_dist / last_output_dist.sum()
            last_output_idx = np.random.choice(len(dictionary), 1, p=last_output_dist).item()
            last_output = reverse_dictionary[last_output_idx]

            iterations += 1

            if last_output == END_OF_SEQUENCE_TOKEN or iterations == self.MAX_TOKENS:
                # Terminate the poem; a token drawn on the final iteration is not emitted.
                if already_eol:
                    poem += "\n"
                else:
                    poem += "\n\n"
            elif last_output == OUT_OF_VOCAB_TOKEN or last_output == PADDING_TOKEN \
                    or last_output == START_OF_SEQUENCE_TOKEN:
                # Special tokens do not count towards the length limit.
                iterations -= 1
            elif last_output == END_OF_LINE_TOKEN:
                # Suppress a leading line break and consecutive line breaks.
                if iterations > 1 and not already_eol:
                    already_eol = True
                    poem += "\n"
            else:
                already_eol = False
                poem += last_output + " "
            # Out-of-vocabulary and padding tokens are never fed back into the seed
            # (a sampled start-of-sequence token still is).
            if last_output != OUT_OF_VOCAB_TOKEN and last_output != PADDING_TOKEN:
                if not dynamic_seed:
                    seed = np.append(seed[1:], last_output_idx)
                else:
                    seed = np.append(seed, last_output_idx)
        print(poem)

In [0]:
def bidirectional_lstm(n, embedding, vocab_len):
    """Assemble an uncompiled classifier with two stacked bidirectional LSTM layers.

    Args:
        n: Input sequence length passed to the embedding layer.
        embedding: Initial weight matrix for the embedding layer.
        vocab_len: Vocabulary size; used as embedding input dimension and as
            the width of the softmax output layer.

    Returns:
        The assembled ``Sequential`` model.
    """
    network = Sequential()
    network.add(Embedding(input_dim=vocab_len, output_dim=EMBEDDING_DIMENSION, input_length=n, weights=[embedding]))
    # First recurrent layer keeps the whole sequence, the second reduces it to one vector.
    network.add(Bidirectional(LSTM(512, return_sequences=True)))
    network.add(Bidirectional(LSTM(512, return_sequences=False)))
    network.add(Dropout(0.1))
    network.add(Dense(vocab_len, activation="softmax"))
    return network
  
def lstm_rnn(n, embedding, vocab_len):
    """Assemble an uncompiled classifier with two stacked unidirectional LSTM layers.

    Args:
        n: Input sequence length passed to the embedding layer.
        embedding: Initial weight matrix for the embedding layer.
        vocab_len: Vocabulary size; used as embedding input dimension and as
            the width of the softmax output layer.

    Returns:
        The assembled ``Sequential`` model.
    """
    network = Sequential()
    network.add(Embedding(input_dim=vocab_len, output_dim=EMBEDDING_DIMENSION, input_length=n, weights=[embedding]))
    # First recurrent layer keeps the whole sequence, the second reduces it to one vector.
    network.add(LSTM(512, return_sequences=True))
    network.add(LSTM(512, return_sequences=False))
    network.add(Dropout(0.1))
    network.add(Dense(vocab_len, activation="softmax"))
    return network
  
def mlp(n, embedding, vocab_len):
    """Assemble an uncompiled feed-forward classifier over the flattened embeddings.

    Args:
        n: Input sequence length; also scales the hidden layer width (n * 512).
        embedding: Initial weight matrix for the embedding layer.
        vocab_len: Vocabulary size; used as embedding input dimension and as
            the width of the softmax output layer.

    Returns:
        The assembled ``Sequential`` model.
    """
    hidden_units = n*512
    network = Sequential()
    network.add(Embedding(input_dim=vocab_len, output_dim=EMBEDDING_DIMENSION, input_length=n, weights=[embedding]))
    network.add(Flatten())
    network.add(Dropout(0.1))
    network.add(Dense(hidden_units, activation="relu"))
    network.add(Dropout(0.1))
    network.add(Dense(vocab_len, activation="softmax"))
    return network

In [0]:


ns = [5]
epochs = 30
batch_size = 512
max_limit = 20000
validation_split = 0.9

poems = dp.tokenize_poems(TRAINING_DATA)
words = sorted(list(set([token for poem in poems for token in poem])))

#Save embedding for generator
embedding, dictionary = embedding_loader.get_embedding(words, binary=EMBEDDING_BINARY, limit=max_limit, save=True, file="GoogleNews-vectors-negative300.bin")

#model = load_model(MODELS_DICT+"/5model.hdf5", custom_objects={"PositionalEncoding": PositionalEncoding, "Attention": Attention})
#model = transformer(ns[0], embedding, len(dictionary), single_out=False, train_embedding=True, input_sequence_length=ns[0], blocks=2, heads=5)
#model = bidirectional_lstm(ns[0], embedding, len(dictionary))
model = mlp(ns[0], embedding, len(dictionary))
#model.summary()
tpu_model = keras_to_tpu_model(
    model,
    strategy=TPUDistributionStrategy(
        TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
    )
)
tpu_model.compile(optimizer=optimizer.AdamOptimizer(),
            loss="categorical_crossentropy", metrics=["accuracy"])

generator = TupleDataGenerator(poems[:int(validation_split*len(poems))], ns, dictionary, 0.2, batch_size, single=True)
validation_generator = TupleDataGenerator(poems[int(validation_split*len(poems)):], ns, dictionary, 0, batch_size, single=True)
callbacks = [ModelCheckpoint(MODELS_DICT+"/best-model.hdf5", save_best_only=True),
             ModelCheckpoint(MODELS_DICT+"/model.hdf5", save_best_only=False),
           CSVLogger(MODELS_DICT+"/log.csv", append=False, separator=';'), PoemCallback(2, ns[0], dictionary, single=True)]
tpu_model.fit_generator(
  generator, epochs=epochs, callbacks=callbacks, validation_data=validation_generator, workers=4)

!mkdir generated
from poem_generator.word_generator import generate_poems
n = ns[0]
#generate_poems(1000, n, "generated/poems.zip", MODELS_DICT+"/model.hdf5", single=True)
generate_poems(1000, n, "generated/best-poems.zip", MODELS_DICT+"/best-model.hdf5", single=True)
from google.colab import files
files.download("generated/poems.zip")
files.download("generated/best-poems.zip")
files.download("models/log.csv")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
INFO:tensorflow:Querying Tensorflow master (grpc://10.77.191.138:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 9042971628776113856)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 16980959293380121517)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 11210451548987186205)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 7960413685278401711)
INFO:t

In [0]:
!mkdir generated

In [8]:
from poem_generator.word_generator import generate_poems
n = ns[0]
# Write one archive of 1000 poems per saved checkpoint: first the latest model,
# then the best one.
for archive_path, checkpoint_suffix in (("generated/poems.zip", "/model.hdf5"),
                                        ("generated/best-poems.zip", "/best-model.hdf5")):
    generate_poems(1000, n, archive_path, MODELS_DICT+checkpoint_suffix, single=True)

Using TensorFlow backend.


1/1000
the world did not our own 
as i have seen the day 
if any means are told 
the mother he loved her brother s desire 
some god my heart my song 
is only privilege o man 
i m fond as memory 
which is the world s same 
within this same bed 


2/1000
the world did not our own 
as i have seen the day 
if any means are told 
the mother he loved her brother s desire 
some god my heart my song 
is only privilege o man 
i m fond as memory 
which is the world s same 
within this same bed 


3/1000
the world did not our own 
as i have seen the day 
if any means are told 
the mother he loved her brother s desire 
some god my heart my song 
is only privilege o man 
i m fond as memory 
which is the world s same 
within this same bed 


4/1000
the world did not our own 
as i have seen the day 
if any means are told 
the mother he loved her brother s desire 
some god my heart my song 
is only privilege o man 
i m fond as memory 
which is the world s same 
within this same bed 


5/1000
the world

In [0]:
from google.colab import files
# Send the generated poem archives to the browser.
files.download("generated/poems.zip")
files.download("generated/best-poems.zip")
# The CSVLogger writes to MODELS_DICT+"/log.csv"; download from that same
# location instead of a hard-coded directory name.
files.download(MODELS_DICT+"/log.csv")