<a href="https://colab.research.google.com/github/Blion6868/bryanmccormack0-gmail.com/blob/master/Paradise_Lost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#basic imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

  import pandas.util.testing as tm


In [5]:
#"Paradise Lost," by John Milton, courtesy of https://www.gutenberg.org/

Milton_File = 'Milton.txt'

In [7]:
#read text file

Milton_text = open(Milton_File, 'r').read()

In [11]:
#to begin processing the text, we must create a vocabulary list

vocabulary = sorted(set(Milton_text))
print(vocabulary)
len(vocabulary)

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '’']


86

In [12]:
#once the vocabulary list is created, we also need to index it

vocabulary_index = {u:i for i, u in enumerate(vocabulary)}

In [13]:
#we need to create a numpy array of the vocabulary
index_to_vocab = np.array(vocabulary)

In [16]:
#we have both an index # and each individual character; we now need to encode the character as a unique #
encoded_Milton_text = np.array([vocabulary_index[c] for c in Milton_text])

In [17]:
encoded_Milton_text[:20]

array([ 0, 49, 66, 63,  1, 45, 76, 73, 68, 63, 61, 78,  1, 36, 79, 78, 63,
       72, 60, 63])

In [18]:
# create training sequences

vocabulary_dataset = tf.data.Dataset.from_tensor_slices(encoded_Milton_text)

In [20]:
#to create the model that will mimic the inputted text, we need to create batches to feed into the model. The # of sequences is rather arbitrary and can be played with accordingly

sequence_len = 120
sequences = vocabulary_dataset.batch(sequence_len+1, drop_remainder=True)

In [21]:
#for the model to work correctly, we need to create a function that extracts the last character and skips the first character: 
# F Mans First Disobedience, and the Fruit--OF Mans First Disobedience, and the Frui

def create_seq_targets(seq):
    input_txt = seq[:-1]
    target_txt = seq[1:]
    return input_txt, target_txt

In [22]:
dataset = sequences.map(create_seq_targets)

In [23]:
for input_txt, target_txt in  dataset.take(1):
    print(input_txt.numpy())
    print(''.join(index_to_vocab[input_txt.numpy()]))
    print('\n')
    print(target_txt.numpy())
    print(''.join(index_to_vocab[target_txt.numpy()]))

[ 0 49 66 63  1 45 76 73 68 63 61 78  1 36 79 78 63 72 60 63 76 65  1 34
 31 73 73 69  1 73 64  1 45 59 76 59 62 67 77 63  1 41 73 77 78 12  1 60
 83  1 39 73 66 72  1 42 67 70 78 73 72  0  0 49 66 67 77  1 63 31 73 73
 69  1 67 77  1 64 73 76  1 78 66 63  1 79 77 63  1 73 64  1 59 72 83 73
 72 63  1 59 72 83 81 66 63 76 63  1 67 72  1 78 66 63  1 50 72 67 78 63]

The Project Gutenberg EBook of Paradise Lost, by John Milton

This eBook is for the use of anyone anywhere in the Unite


[49 66 63  1 45 76 73 68 63 61 78  1 36 79 78 63 72 60 63 76 65  1 34 31
 73 73 69  1 73 64  1 45 59 76 59 62 67 77 63  1 41 73 77 78 12  1 60 83
  1 39 73 66 72  1 42 67 70 78 73 72  0  0 49 66 67 77  1 63 31 73 73 69
  1 67 77  1 64 73 76  1 78 66 63  1 79 77 63  1 73 64  1 59 72 83 73 72
 63  1 59 72 83 81 66 63 76 63  1 67 72  1 78 66 63  1 50 72 67 78 63 62]
The Project Gutenberg EBook of Paradise Lost, by John Milton

This eBook is for the use of anyone anywhere in the United


In [24]:
# Batch size for training the model
batch_size = 128

# Buffer so model doesn't learn the pattern of the text
buffer_size = 10000

dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

In [25]:
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int64, tf.int64)>

In [43]:
# Length of the vocabulary in characters
vocab_size = len(vocabulary)

# The embedding dimension
embed_dim = 64

# Number of RNN units
rnn_neurons = 1050

In [44]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding,Dropout,GRU
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [45]:
#to help build the model, we need to create own own loss function. logits refers to creating a map of probabilties values

def sparse_cat_loss(y_true,y_pred):
  return sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)

In [50]:
#building th TF model

def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    model = Sequential()
    model.add(Embedding(vocab_size, embed_dim,batch_input_shape=[batch_size, None]))
    model.add(GRU(rnn_neurons,return_sequences=True,stateful=True,recurrent_initializer='glorot_uniform'))
    # Final Dense Layer to Predict
    model.add(Dense(vocab_size))
    model.compile(optimizer='adam', loss=sparse_cat_loss) 
    return model

In [51]:
model = create_model(
  vocab_size = vocab_size,
  embed_dim=embed_dim,
  rnn_neurons=rnn_neurons,
  batch_size=batch_size)

In [52]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (128, None, 64)           5504      
_________________________________________________________________
gru_7 (GRU)                  (128, None, 1050)         3515400   
_________________________________________________________________
dense (Dense)                (128, None, 86)           90386     
Total params: 3,611,290
Trainable params: 3,611,290
Non-trainable params: 0
_________________________________________________________________


In [53]:
for input_example_batch, target_example_batch in dataset.take(1):

  # Predict off some random batch
  example_batch_predictions = model(input_example_batch)

  # Display the dimensions of the predictions
  print(example_batch_predictions.shape)

(128, 120, 86)


In [54]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)

In [55]:
# Reformat to not be a list of lists
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [60]:
epochs = 30

model.fit(dataset,epochs=epochs)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f550bd634e0>

In [64]:
model.save('Milton_gen.h5') 

In [65]:
from tensorflow.keras.models import load_model

In [68]:
model = create_model(vocab_size, embed_dim, rnn_neurons, batch_size=1)

model.load_weights('Milton_gen.h5')

model.build(tf.TensorShape([1, None]))

In [69]:
def generate_text(model, start_seed,gen_size=100,temp=1.0):
  # Number of characters to generate
  num_generate = gen_size

  # Vecotrizing starting seed text
  input_eval = [vocabulary_index[s] for s in start_seed]

  # Expand to match batch format shape
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty list to hold resulting generated text
  text_generated = []

  temperature = temp

  # Here batch size == 1
  model.reset_states()

  for i in range(num_generate):

      # Generate Predictions
      predictions = model(input_eval)

      # Remove the batch shape dimension
      predictions = tf.squeeze(predictions, 0)

      # Use a cateogircal disitribution to select the next character
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # Pass the predicted charracter for the next input
      input_eval = tf.expand_dims([predicted_id], 0)

      # Transform back to character letter
      text_generated.append(vocabulary_index[predicted_id])

  return (start_seed + ''.join(text_generated))

In [72]:
print(generate_text(model,"Satan",gen_size=1000))

KeyError: ignored