<a href="https://colab.research.google.com/github/ForestPearson/CS410-510-NLP-project/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
import os
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import StringLookup
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.models import Model

# --- Tunable settings -------------------------------------------------------
BATCH_SIZE = 64       # sequences per training batch
BUFFER_SIZE = 10000   # size of the shuffle buffer used when building the dataset
EPOCHS = 75           # passes over the dataset during fit
DIM = 256             # passed to the model as embedding_dim
RNN = 1024            # passed to the model as rnn_units

# Fetch the combined corpus; get_file hands back the local path of the file.
path = tf.keras.utils.get_file(
    fname='combined.txt',
    origin='https://raw.githubusercontent.com/ForestPearson/CS410-510-NLP-project/main/data/combined.txt',
)

In [None]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8.
# The context manager closes the file handle as soon as the read is done
# (the bare open(...).read() form left it open until garbage collection).
with open(path, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# Quick sanity check: total character count and the first 500 characters.
print("Length:", len(text))
print(text[:500])

# Every distinct character in the corpus, in sorted order.
vocab = sorted(set(text))

Length: 389861
ACT I

SCENE I. Rousillon. The COUNT's palace.

Enter BERTRAM, the COUNTESS of Rousillon, HELENA, and LAFEU, all in black
COUNTESS
In delivering my son from me, I bury a second husband.
BERTRAM
And I in going, madam, weep o'er my father's death
anew: but I must attend his majesty's command, to
whom I am now in ward, evermore in subjection.
LAFEU
You shall find of the king a husband, madam; you,
sir, a father: he that so generally is at all times
good must of necessity hold his virtue to you; who


In [None]:
# Small demo of splitting strings into individual characters.
example_texts = ['abcdefg', 'xyz']
chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')

# Forward lookup: character -> integer ID, built from the corpus vocabulary.
ids_from_chars = StringLookup(mask_token=None, vocabulary=list(vocab))

# Inverse lookup: integer ID -> character, sharing the forward layer's vocabulary
# so the two mappings stay aligned.
chars_from_ids = StringLookup(
    mask_token=None,
    invert=True,
    vocabulary=ids_from_chars.get_vocabulary(),
)

# Size of the lookup layer's own vocabulary; used as the model's vocab_size.
vocabSize = len(ids_from_chars.get_vocabulary())


In [None]:
# Number of input characters per training example.
seq_length = 100

# Split the whole corpus into characters and convert them to integer IDs.
corpus_chars = tf.strings.unicode_split(text, input_encoding='UTF-8')
all_ids = ids_from_chars(corpus_chars)

# Stream the IDs one at a time...
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# ...and group them into chunks of seq_length + 1 IDs (the extra ID lets each
# chunk be split into an input and a one-step-shifted target). A trailing
# chunk that is too short is dropped.
sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)

In [None]:
#Training data creation and target creation using sequences
def split_input_target(sequence):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i].
    """
    return sequence[:-1], sequence[1:]

# Build the training pipeline in a single chain so re-running the cell always
# starts from `sequences`:
#   chunk -> (input, target) pair -> shuffle -> fixed-size batches -> prefetch.
# tf.data.AUTOTUNE replaces the deprecated tf.data.experimental.AUTOTUNE alias.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [None]:
class MyModel(Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: number of distinct character IDs; used both as the embedding
      input size and as the width of the output layer.
    embedding_dim: size of each character embedding vector.
    rnn_units: number of units in the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # Call the base constructor with no arguments. The previous
    # `super().__init__(self)` passed the instance a second time as a stray
    # positional argument, which newer Keras releases reject with a TypeError.
    super().__init__()
    # Map each character ID to a dense vector
    self.embedding = Embedding(vocab_size, embedding_dim)
    # Recurrent layer; returns the per-step outputs and the final state
    self.gru = GRU(rnn_units, return_sequences=True, return_state=True)
    # Output layer: one value per vocabulary entry (no activation is applied here)
    self.dense = Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of character IDs.

    Args:
      inputs: batch of character-ID sequences.
      states: optional GRU state to start from; None starts from the layer's
        default initial state.
      return_state: when True, also return the GRU state after the last step.
      training: forwarded to every sub-layer.

    Returns:
      The Dense outputs for every time step, or a tuple (outputs, states)
      when return_state is True.
    """
    x = self.embedding(inputs, training=training)
    # Passing initial_state=None lets the GRU create its own zero-filled
    # initial state, which is what the explicit get_initial_state(x) call
    # produced; dropping that call avoids depending on its version-specific
    # signature.
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x

In [None]:
# Instantiate the network with the lookup vocabulary size and the
# embedding / recurrent widths chosen in the configuration cell.
model = MyModel(vocab_size=vocabSize, embedding_dim=DIM, rnn_units=RNN)

In [None]:
# The model outputs raw scores (no softmax), hence from_logits=True.
model.compile(
    optimizer='adam',
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
)

# Directory where the checkpoints will be saved.
# NOTE: `dir` shadows the Python builtin; the name is kept so any cell that
# already refers to it keeps working.
dir = './data/epochs'
# One checkpoint per epoch; Keras fills in {epoch}.
# NOTE(review): newer Keras releases appear to require weights-only checkpoint
# names to end in `.weights.h5` — confirm against the installed version.
fileName = os.path.join(dir, "ckpt_{epoch}")

# Shrink the learning rate by 5x when the training loss stops improving.
# min_lr must sit below the optimizer's starting rate: Adam starts at 0.001 by
# default, so the previous min_lr=0.001 clamped every reduction back to the
# starting value and the callback could never change anything.
reduce_alpha = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=1, min_lr=1e-5)

# Save only the weights (not the full model) at the end of every epoch.
results = tf.keras.callbacks.ModelCheckpoint(filepath=fileName, save_weights_only=True)

In [None]:
# Train with per-epoch checkpointing. `reduce_alpha` (the ReduceLROnPlateau
# callback) was constructed but never handed to fit, so it had no effect;
# it is now registered alongside the checkpoint callback.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[results, reduce_alpha])

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


In [None]:
class Generate(Model):
  """Samples text one character at a time from a trained model.

  Args:
    model: model called as model(inputs=..., states=..., return_state=True)
      and returning (logits, states).
    chars_from_ids: lookup layer mapping integer IDs back to characters.
    ids_from_chars: lookup layer mapping characters to integer IDs.
    temperature: positive divisor applied to the logits before sampling;
      values below 1 sharpen the distribution, values above 1 flatten it.

  Raises:
    ValueError: if temperature is not strictly positive.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    # A zero temperature would divide the logits by zero and a negative one
    # would invert the ranking, so fail early with a clear message.
    if temperature <= 0:
      raise ValueError(f"temperature must be positive, got {temperature!r}")
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Build an additive mask over the vocabulary: -inf at the ID of the
    # '[UNK]' token and 0 everywhere else, so '[UNK]' can never be sampled.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  def predict(self, inputs, states=None):
    """Sample the next character for each input string.

    Note: this replaces the inherited Keras `Model.predict` with a different
    signature; the name is kept because the generation loop calls it.

    Args:
      inputs: batch of strings to continue.
      states: recurrent state returned by the previous call, or None on the
        first call.

    Returns:
      (predicted_chars, states): one sampled character per input string and
      the state to pass into the next call.
    """
    # Convert from chars to IDs; to_tensor() makes the ragged split dense.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()
    # Run the model and keep only the logits of the last time step.
    preds, states = self.model(inputs=input_ids, states=states, return_state=True)
    preds = preds[:, -1, :]
    preds = preds/self.temperature
    # Rule out the '[UNK]' token.
    preds = preds + self.prediction_mask
    # Sample one ID per batch row from the logits and map it back to a char.
    preds_ids = tf.random.categorical(preds, num_samples=1)
    preds_ids = tf.squeeze(preds_ids, axis=-1)
    predicted_chars = self.chars_from_ids(preds_ids)

    return predicted_chars, states

In [None]:
# Wrap the trained model in the one-character-at-a-time sampler.
Generator = Generate(model, chars_from_ids, ids_from_chars)

# Start from a speaker heading with no recurrent state.
states = None
seed = tf.constant(['COUNTESS'])
result = [seed]

# Generate 1000 characters: each sampled character becomes the next input and
# the recurrent state is carried from one step to the next.
for _step in range(1000):
  seed, states = Generator.predict(seed, states=states)
  result.append(seed)

# Concatenate the seed and all sampled characters into a single string.
result = tf.strings.join(result)
generated_text = result[0].numpy().decode('utf-8')
print(generated_text, '\n\n' + '_'*80)

COUNTESS
To be young again, if we could: I will be a fool
Which he fains my thoughts Indeed.
LEONTES
How! daughter!
ORLANDO
And so am I for Rosalind.
ROSALIND
And I for no woman.
SILVIUS
It is to be all made of faith and service;
And therefore look you call me Ganymede.
But my free upon your virgin, she had with mess
pirchuse to his majesty. Even died to be too little a bastard.
POLIXENES
Pray now, come, sir, we see
Cold wish'd of her honesty: and, alas,
The spark so much in bring on the hedge of two daughter
To her allow of the square, under thou think,
That fools should be so deep-contemplative,
And I did laugh sans intermission
An hour by his dialage, but I do see't
Not becomes it: she's
entertainment: you are too young, to say I live:
In once or this that I have forgiven alo, To say it is
no thought of his charic, but so cut a ladys like to
our woman. Therefore, young lord
When you push away thy lands and all things that thou to
ckild her forth size understand but that I
may beg; f