In [1]:
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks')

In [2]:
# Importing Libraries
import numpy as np
import string
from pickle import dump
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.layers import Layer
from keras.layers import Input, Dense, LSTM, Embedding
import keras.backend as K
from keras import Model
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import gpt_2_simple as gpt2

In [3]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	text = file.read()
	file.close()
	return text

In [9]:
# load document
in_filename = '/content/drive/MyDrive/Colab Notebooks/Text Generation/republic_clean.txt'
doc = load_doc(in_filename)
print(doc[:200])

The Project Gutenberg EBook of The Republic, by Plato

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it u


## Cleaning Text

*   Replace ‘–‘ with a white space so we can split words better.
*   Split words based on white space.
*   Remove all punctuation from words to reduce the vocabulary size (e.g. ‘What?’ becomes ‘What’).
*   Remove all words that are not alphabetic to remove standalone punctuation tokens.
*   Normalize all words to lowercase to reduce the vocabulary size.









In [10]:
# turn a doc into clean tokens
def clean_doc(doc):
	# replace '--' with a space ' '
	doc = doc.replace('--', ' ')
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# make lower case
	tokens = [word.lower() for word in tokens]
	return tokens

In [11]:
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'republic', 'by', 'plato', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorg', 'title', 'the', 'republic', 'author', 'plato', 'translator', 'b', 'jowett', 'posting', 'date', 'august', 'ebook', 'release', 'date', 'october', 'last', 'updated', 'june', 'language', 'english', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'the', 'republic', 'produced', 'by', 'sue', 'asscher', 'the', 'republic', 'by', 'plato', 'translated', 'by', 'benjamin', 'jowett', 'note', 'the', 'republic', 'by', 'plato', 'jowett', 'etext', 'introduction', 'and', 'analysis', 'the', 'republic', 'of', 'plato', 'is', 'the', 'longest', 'of', 'hi

In [12]:
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
	# select sequence of tokens
	seq = tokens[i-length:i]
	# convert into a line
	line = ' '.join(seq)
	# store
	sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 216740


In [13]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

In [14]:
# save sequences to file
out_filename = '/content/drive/MyDrive/Colab Notebooks/Text Generation/republic_sequences.txt'
save_doc(sequences, out_filename)

In [4]:
in_filename = '/content/drive/MyDrive/Colab Notebooks/Text Generation/republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [16]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [17]:
# vocabulary size to handle zero offset in arrays while passing the sequence to Embedding Layer
vocab_size = len(tokenizer.word_index) 

In [18]:
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

In [19]:
# separate into input and output
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size,dtype='int8')
seq_length = X.shape[1]

In [20]:

# Add attention layer to the deep learning network
class attention(Layer):
    def __init__(self,**kwargs):
        super(attention,self).__init__(**kwargs)
 
    def build(self,input_shape):
        self.W=self.add_weight(name='attention_weight', shape=(input_shape[-1],1), 
                               initializer='random_normal', trainable=True)
        self.b=self.add_weight(name='attention_bias', shape=(input_shape[1],1), 
                               initializer='zeros', trainable=True)        
        super(attention, self).build(input_shape)
 
    def call(self,x):
        # Alignment scores. Pass them through tanh function
        e = K.tanh(K.dot(x,self.W)+self.b)
        # Remove dimension of size 1
        e = K.squeeze(e, axis=-1)   
        # Compute the weights
        alpha = K.softmax(e)
        # Reshape to tensorFlow format
        alpha = K.expand_dims(alpha, axis=-1)
        # Compute the context vector
        context = x * alpha
        context = K.sum(context, axis=1)
        return context

In [21]:
inputs = Input(length-1)
embedding_layer = Embedding(vocab_size + 1, 50)(inputs)
lstm_1 = LSTM(100, return_sequences=True,activation='tanh')(embedding_layer)
lstm_2 = LSTM(100, return_sequences=True,activation='tanh')(lstm_1)
attention_layer = attention()(lstm_2)
dense_1 = Dense(100,activation='relu')(attention_layer)
outputs = Dense(vocab_size,trainable = True,activation='softmax')(dense_1)
model = Model(inputs,outputs)
model.compile(loss = 'categorical_crossentropy',optimizer = 'adam', metrics = ['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding (Embedding)       (None, 50, 50)            522750    
                                                                 
 lstm (LSTM)                 (None, 50, 100)           60400     
                                                                 
 lstm_1 (LSTM)               (None, 50, 100)           80400     
                                                                 
 attention (attention)       (None, 100)               150       
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 10454)             105585

In [None]:
filepath = "/content/drive/MyDrive/Colab Notebooks/Text Generation/model_weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
history = model.fit(X,
         y,
         epochs = 50,
         batch_size = 128,
         callbacks = callbacks_list,
         verbose = 1)

Epoch 00048: loss improved from 2.51892 to 2.47372, saving model to /content/drive/MyDrive/Colab Notebooks/Text Generation/model_2_weights.hdf5
Epoch 49/50
Epoch 00049: loss improved from 2.47372 to 2.43728, saving model to /content/drive/MyDrive/Colab Notebooks/Text Generation/model_2_weights.hdf5
Epoch 50/50
Epoch 00050: loss improved from 2.43728 to 2.39953, saving model to /content/drive/MyDrive/Colab Notebooks/Text Generation/model_2_weights.hdf5


Let's train for 50 more epochs.

In [None]:
model.save('/content/drive/MyDrive/Colab Notebooks/Text Generation/model',save_format='tf')
loaded_model = load_model('/content/drive/MyDrive/Colab Notebooks/Text Generation/model')

In [None]:
# Removed the logS
history_loaded_model = loaded_model.fit(X,
         y,
         epochs = 50,
         batch_size = 128,
         callbacks = callbacks_list,
         verbose = 1)

In [None]:
loaded_model.save('/content/drive/MyDrive/Colab Notebooks/Text Generation/model.hdf5')

With training complete, we now have a model that can generate text. However, we need to give it a starting point. To do this, we write a function that takes a string input, tokenizes it, then pads it with zeroes so it fits into our 50 long prediction window.

In [22]:
def gen(model,seq,max_len = 51):
    ''' Generates a sequence given a string seq using specified model until the total sequence length
    reaches max_len'''
    # Tokenize the input string
    tokenized_sent = tokenizer.texts_to_sequences([seq])
    max_len = max_len+len(tokenized_sent[0])
    # If sentence is not as long as the desired sentence length, we need to 'pad sequence' so that
    # the array input shape is correct going into our LSTM. the `pad_sequences` function adds 
    # zeroes to the left side of our sequence until it becomes 50 long, the number of input features.
    while len(tokenized_sent[0]) < max_len:
        padded_sentence = pad_sequences(tokenized_sent[-50:],maxlen=50)
        op = model.predict(np.asarray(padded_sentence).reshape(1,-1))
        tokenized_sent[0].append(op.argmax()+1)
        
    return " ".join(map(lambda x : reverse_word_map[x],tokenized_sent[0]))

In [23]:
final_model = load_model('/content/drive/MyDrive/Colab Notebooks/Text Generation/model.hdf5',
                         custom_objects={'attention': attention})

In [32]:
seq = 'When we had finished our prayers and viewed the spectacle we turned in the direction of the city'

gen(final_model,seq)

'i love being with her i true true i with by speaking by for word to question be is is being play is is fractions is is or reason things i with by by form be saying as be and an differ and king is is try is rise said a rich in of replied by'

In [None]:
seq_1 = 'Certainly we will, said Glaucon; and in a few minutes Polemarchus appeared, and with him Adeimantus, Glaucon’s brother, Niceratus the son of Nicias, and several'

gen(final_model,seq_1)

'certainly we will said glaucon and in a few minutes polemarchus appeared and with him adeimantus brother niceratus the son of nicias and several rolling to children but i true true you yes or question by men by men be or is or is or began is sensible said a feast like differently to of he into from honoured not survive have interposing was of of state their supposed to of of reflect i years'

In [None]:
seq_2 = 'The servant took hold of me by the cloak behind'

gen(final_model,seq_2)

'the servant took hold of me by the cloak behind i with word true that only what but but i reason and lives call from who i true but i lives by for going is would is by was of was is is makes of friends is begins is else he injustice injustice this making not he pleasures cp of for'

In [27]:
seq_3 = 'The argument of the Republic is the search after Justice, the nature of which is first hinted at by Cephalus, the just and blameless old man'

gen(final_model,seq_3)

'the argument of the republic is the search after justice the nature of which is first hinted at by cephalus the just and blameless old man about from from he that that that company what certainly we so whom so what but what in just only making of of state of of for passed be is is public principles by by soul of for friend of he grow a sounds is value their step from of state'

While it doesn’t make the most sense, it actually seems readable. Given a larger set of training data, this approach and model can be used to generate much more understandable text (most likely with some parameter tweaking, of course).

Let's try to fine tune gpt2 small and see the results

In [6]:
gpt2.download_gpt2(model_name='124M')

Fetching checkpoint: 1.05Mit [00:00, 305Mit/s]                                                      
Fetching encoder.json: 1.05Mit [00:00, 7.40Mit/s]
Fetching hparams.json: 1.05Mit [00:00, 254Mit/s]                                                    
Fetching model.ckpt.data-00000-of-00001: 498Mit [00:21, 22.7Mit/s]                                  
Fetching model.ckpt.index: 1.05Mit [00:00, 484Mit/s]                                                
Fetching model.ckpt.meta: 1.05Mit [00:00, 9.97Mit/s]
Fetching vocab.bpe: 1.05Mit [00:00, 8.43Mit/s]


In [7]:
session = gpt2.start_tf_sess()
gpt2.finetune(sess=session,
              dataset = in_filename,
              model_name = '124M',
              steps = 100,
              restore_from = 'fresh',
              run_name = 'run1',
              print_every = 10,
              sample_every = 20,
              save_every = 20
            )

 be the best and the most reasonable.
the best at any rate is the least he thinks and most is the best
best at any rate is the least he thinks and most is the most sensible and most
at any rate is the least he thinks and most is the best and the most reasonable but
any rate is the least he thinks and most is the most sensible and most reasonable but the
rate is the least he thinks and most is the most sensible and most reasonable but the most
is the least he thinks and most is the most sensible and most reasonable but the most irrational
the least he thinks and most is the most sensible and most reasonable but the most irrational he
most seems to be the best and the most reasonable but the most irrational he is
says and most is the most sensible and most reasonable but the most irrational he is the
and most is the most sensible and most reasonable but the most irrational he is the most
most is the most sensible and most reasonable but the most irrational he is the most irrational
is th

In [17]:
gpt2.copy_checkpoint_to_gdrive(run_name='run1')

In [12]:
seq = 'When we had finished our prayers and viewed the spectacle we turned in the direction of the city'

gpt2.generate(session,
              length = 50,
              temperature = 0.7,
              prefix = seq,
              nsamples = 5,
              batch_size = 5,
              top_k = 40
              )

When we had finished our prayers and viewed the spectacle we turned in the direction of the city and went on the other way to the place where the sun was shining and the sun in his true colours of light and dark they met and found a messenger to the city of
in the city or place where the sun was shining and the sun in
When we had finished our prayers and viewed the spectacle we turned in the direction of the city the light of the sun and the moon were coming out the
the city were not the people who would have to live with them but cities were the means of life for them and they were not men but they were the rulers of the cities and they
When we had finished our prayers and viewed the spectacle we turned in the direction of the city and we went along and went along the paths of the citizens and we said to them as they went along we said you are very right and we should have done the same to the citizens at your time when we were speaking
we have been instructed that
When we had finishe

In [14]:
seq_1 = 'Certainly we will, said Glaucon; and in a few minutes Polemarchus appeared, and with him Adeimantus, Glaucon’s brother, Niceratus the son of Nicias, and several'

gpt2.generate(session,
              length = 50,
              temperature = 0.7,
              prefix = seq_1,
              nsamples = 5,
              batch_size = 5,
              top_k = 40
              )

Certainly we will, said Glaucon; and in a few minutes Polemarchus appeared, and with him Adeimantus, Glaucon’s brother, Niceratus the son of Nicias, and several of his sons and sisters were present at the feast of the king of
another and all of them were present at the feast of the king of the rich and
and all of them were present at the feast of the rich and the
all of
Certainly we will, said Glaucon; and in a few minutes Polemarchus appeared, and with him Adeimantus, Glaucon’s brother, Niceratus the son of Nicias, and several others were of opinion that the same was the case and when the state was divided into two parts they were to be
said Glaucon’s brother, Niceratus the son of Nicias, and several others were of opinion that the
Certainly we will, said Glaucon; and in a few minutes Polemarchus appeared, and with him Adeimantus, Glaucon’s brother, Niceratus the son of Nicias, and several others who were already eminent in the art of medicine and medicine and medicine and medicine an

In [13]:
seq_2 = 'The servant took hold of me by the cloak behind'

gpt2.generate(session,
              length = 50,
              temperature = 0.7,
              prefix = seq_2,
              nsamples = 5,
              batch_size = 5,
              top_k = 40
              )

The servant took hold of me by the cloak behind his back and said to me and my friend what
in the city and the servant took hold of me by the cloak behind his back and said to me and my friend what is
the city and the servant took hold of me by the cloak behind
The servant took hold of me by the cloak behind me and in a vision of the world falling down a great storm arose in the city
the servant took hold of me by the cloak behind me and in a vision of the world falling down a great storm arose in the city of
servant took
The servant took hold of me by the cloak behind my back and when he came out of the wood and said to me in a kind of voice and said to me
would have done a good and he said to me in a kind of voice and said to me in a kind of voice and said
The servant took hold of me by the cloak behind the doors and led me into the garden of the great palace of the
the house of the great poet of the eleventh century and the son of a nobleman and a young nobleman who had been a serva

In [15]:
seq_3 = 'The argument of the Republic is the search after Justice, the nature of which is first hinted at by Cephalus, the just and blameless old man'

gpt2.generate(session,
              length = 50,
              temperature = 0.7,
              prefix = seq_3,
              nsamples = 5,
              batch_size = 5,
              top_k = 40
              )

The argument of the Republic is the search after Justice, the nature of which is first hinted at by Cephalus, the just and blameless old man
of their great ambition has been to take over the state of the world and to become the great guardians of the state which is the true republic of the republic the law of their own constitution is the principle of the republic which is the true republic of
The argument of the Republic is the search after Justice, the nature of which is first hinted at by Cephalus, the just and blameless old man being the ideal state for his time the state becomes the instrument of the
to the argument of the Republic is the search after Justice, the nature of which is first hinted at by Cephalus the ideal state for his time the state becomes the
The argument of the Republic is the search after Justice, the nature of which is first hinted at by Cephalus, the just and blameless old man who is now dead is the great poet and the new man who is now dead is the philosophe