<a href="https://colab.research.google.com/github/Aaronsom/poem-generation/blob/master/colab_poem_generator_word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!git clone https://github.com/Aaronsom/poem-generation
%cd poem-generation
%mkdir models
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
!gunzip GoogleNews-vectors-negative300.bin

fatal: destination path 'poem-generation' already exists and is not an empty directory.
/content/poem-generation
mkdir: cannot create directory ‘models’: File exists
--2019-06-01 17:49:55--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.134.93
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.134.93|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2019-06-01 17:51:03 (23.3 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]

gzip: GoogleNews-vectors-negative300.bin: unknown suffix -- ignored


In [0]:
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
import tensorflow.train as optimizer
from poem_generator.dataGenerator import TupleDataGenerator
import poem_generator.data_prepocessing as dp
import poem_generator.embedding as embedding_loader
from poem_generator.global_constants import TRAINING_DATA, EMBEDDING_DIMENSION, EMBEDDING_BINARY, MODELS_DICT
from poem_generator.transformer import transformer
#from poem_generator.PoemCallback import PoemCallback
from tensorflow.keras.layers import *
from tensorflow.keras import Sequential
from tensorflow.contrib.tpu import keras_to_tpu_model, TPUDistributionStrategy
from tensorflow.contrib.cluster_resolver import TPUClusterResolver
import os

In [0]:
from tensorflow.keras.callbacks import Callback

class PoemCallback(Callback):

    def __init__(self, poems, seed_length, dictionary, single=True):
        super(PoemCallback, self).__init__()
        self.poems = poems
        self.dictionary = dictionary
        self.reverse_dictionary = {dictionary[key]: key for key in dictionary.keys()}
        self.seed_length = seed_length
        self.single = single

    def on_epoch_end(self, epoch, logs=None):
        for i in range(self.poems):
            print(f"Poem {i+1}/{self.poems}")
            model = self.model.sync_to_cpu()
            self.generate_poem(model, self.reverse_dictionary, self.dictionary, self.seed_length, single=self.single)
            
    def generate_poem(self, model, reverse_dictionary, dictionary, seed_length, dynamic_seed=False, single=False):
        poem = ""
        last_output = ""
        iterations = 0
        seed = np.array([dictionary[START_OF_SEQUENCE_TOKEN]]*seed_length)
        already_eol = False  # Sometimes too many eols are generated, this breaks the format
        while iterations < 60 and last_output != END_OF_SEQUENCE_TOKEN:
            if single:
                last_output_dist = model.predict(np.array([seed])).squeeze()
            else:
                last_output_dist = model.predict(np.array([seed]))[:, -1].squeeze()
            last_output_idx = np.random.choice(len(dictionary), 1, p=last_output_dist).item()
            last_output = reverse_dictionary[last_output_idx]


            iterations += 1

            if last_output == END_OF_SEQUENCE_TOKEN or iterations == 60:
                if already_eol:
                    poem += "\n"
                else:
                    poem += "\n\n"
            elif last_output == OUT_OF_VOCAB_TOKEN or last_output == PADDING_TOKEN \
                    or last_output == START_OF_SEQUENCE_TOKEN:
                iterations -= 1
            elif last_output == END_OF_LINE_TOKEN:
                if iterations > 1 and not already_eol:
                    already_eol = True
                    poem += "\n"

            else:
                already_eol = False
                poem += last_output + " "
            if last_output != OUT_OF_VOCAB_TOKEN and last_output != PADDING_TOKEN:
                if not dynamic_seed:
                    seed = np.append(seed[1:], last_output_idx)
                else:
                    seed = np.append(seed, last_output_idx)
        print(poem)

In [0]:
def bidirectional_lstm(n, embedding, vocab_len):
    model = Sequential([
        Embedding(input_dim=vocab_len, output_dim=EMBEDDING_DIMENSION, input_length=n, weights=[embedding]),
        Bidirectional(LSTM(512, return_sequences=True)),
        Bidirectional(LSTM(512, return_sequences=False)),
        Dropout(0.1),
        Dense(vocab_len, activation="softmax")
    ])
    return model
  
def lstm_rnn(n, embedding, vocab_len):
    model = Sequential([
        Embedding(input_dim=vocab_len, output_dim=EMBEDDING_DIMENSION, input_length=n, weights=[embedding]),
        LSTM(512, return_sequences=True),
        LSTM(512, return_sequences=False),
        Dropout(0.1),
        Dense(vocab_len, activation="softmax")
    ])
    return model
  
def mlp(n, embedding, vocab_len):
    model = Sequential([
        Embedding(input_dim=vocab_len, output_dim=EMBEDDING_DIMENSION, input_length=n, weights=[embedding]),
        Flatten(),
        Dropout(0.1),
        Dense(n*512, activation="relu"),
        Dropout(0.1),
        Dense(vocab_len,activation="softmax"),
    ])
    return model

In [0]:


ns = [5]
epochs = 30
batch_size = 512
max_limit = 25000
validation_split = 0.9

poems = dp.tokenize_poems(TRAINING_DATA)
words = sorted(list(set([token for poem in poems for token in poem])))

#Save embedding for generator
embedding, dictionary = embedding_loader.get_embedding(words, binary=EMBEDDING_BINARY, limit=max_limit, save=True, file="GoogleNews-vectors-negative300.bin")

#model = load_model(MODELS_DICT+"/5model.hdf5", custom_objects={"PositionalEncoding": PositionalEncoding, "Attention": Attention})
#model = transformer(ns[0], embedding, len(dictionary), single_out=True, train_embedding=True, input_sequence_length=ns[0], blocks=1, heads=5)
#model = bidirectional_lstm(ns[0], embedding, len(dictionary))
model = mlp(ns[0], embedding, len(dictionary))
#model.summary()
tpu_model = keras_to_tpu_model(
    model,
    strategy=TPUDistributionStrategy(
        TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
    )
)
tpu_model.compile(optimizer=optimizer.AdamOptimizer(),
            loss="categorical_crossentropy", metrics=["accuracy"])

generator = TupleDataGenerator(poems[:int(validation_split*len(poems))], ns, dictionary, 0, batch_size, single=True)
validation_generator = TupleDataGenerator(poems[int(validation_split*len(poems)):], ns, dictionary, 0, batch_size, single=True)
callbacks = [ModelCheckpoint(MODELS_DICT+"/model.hdf5", save_best_only=True),
           CSVLogger(MODELS_DICT+"/log.csv", append=True, separator=';'), PoemCallback(2, ns[0], dictionary)]
tpu_model.fit_generator(
  generator, epochs=epochs, callbacks=callbacks, validation_data=validation_generator, workers=4)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 5)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 5, 300)       2662800     input_2[0][0]                    
__________________________________________________________________________________________________
positional_encoding_1 (Position (None, 5, 300)       0           embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_8 (Dropout)             (None, 5, 300)       0           positional_encoding_1[0][0]      
__________________________________________________________________________________________________
attention_

In [0]:
!mkdir generated

In [0]:
from poem_generator.word_generator import generate_poems
n = ns[0]
generate_poems(1000, n, "generated/poems.zip", MODELS_DICT+"/model.hdf5", single=True)

In [0]:
from google.colab import files
files.download("generated/poems.zip")
files.download("model/log.csv")