<a href="https://colab.research.google.com/github/AbolajiOsobamiro/pythoncodes/blob/main/Natural_language_processing_with_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

In [2]:
# Download the Shakespeare corpus and keep the local path that get_file returns.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [None]:
# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# closes the file handle as soon as the read is done.
with open(path_to_file, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')
print('Length of text: {} characters'.format(len(text)))

In [4]:
# Vocabulary: every distinct character of the corpus, in sorted order.
vocab = sorted(set(text))
# Character -> id lookup; the id is the character's position in `vocab`.
char2idx = dict(zip(vocab, range(len(vocab))))
# Id -> character lookup, stored as an array so it can be indexed by id arrays.
idx2char = np.array(vocab)

def text_to_int(text):
  """Encode a string as a NumPy array of vocabulary ids.

  Each character is looked up in the module-level ``char2idx`` table.

  Args:
    text: String (or any iterable of single characters) to encode.

  Returns:
    1-D integer ``np.ndarray`` with one id per character. Empty input yields
    an empty integer array, so the result can always index ``idx2char``.

  Raises:
    KeyError: If a character is not a key of ``char2idx``.
  """
  # Without an explicit dtype NumPy types an empty list as float64, which is
  # not usable as an index array. `int` is NumPy's default integer type, so
  # the dtype for non-empty input is unchanged.
  return np.array([char2idx[c] for c in text], dtype=int)

text_as_int = text_to_int(text)

In [None]:
# Show a short prefix of the corpus next to its integer encoding.
preview = text[:13]
print('Original Text: ', preview)
print('Encoded Text: ', text_to_int(preview))

In [None]:
def int_to_text(ints):
  """Decode vocabulary ids back into a string.

  Args:
    ints: Integer ids as a NumPy array, or an object exposing a ``.numpy()``
      method that returns one (this notebook also passes elements yielded by
      a ``tf.data`` pipeline).

  Returns:
    The characters of ``idx2char`` at the given ids, joined into one string.
  """
  # Convert only when the object really offers .numpy(). An explicit check,
  # rather than a catch-all except, lets genuine conversion errors surface.
  to_numpy = getattr(ints, 'numpy', None)
  if callable(to_numpy):
    ints = to_numpy()
  return ''.join(idx2char[ints])

print(int_to_text(text_as_int[:13]))

In [7]:
# Number of characters in one training input.
seq_length = 100
# How many chunks of (seq_length + 1) characters fit into the corpus.
examples_per_epoch = len(text) // (seq_length + 1)

# Dataset with one element per encoded character.
char_dataset = tf.data.Dataset.from_tensor_slices(tensors=text_as_int)

In [8]:
# Group the character stream into chunks of seq_length + 1 ids and drop the
# final chunk if it comes up short.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

In [9]:
def split_input_target(chunk):
  """Turn one chunk into an (input, target) pair shifted by one position.

  Args:
    chunk: Sliceable sequence of ids.

  Returns:
    Tuple ``(input_text, target_text)``: the chunk without its last element
    and the chunk without its first element.
  """
  return chunk[:-1], chunk[1:]

# Convert every chunk into an (input, target) pair.
dataset = sequences.map(map_func=split_input_target)

In [None]:
# Decode and print the first two (input, target) pairs.
for x, y in dataset.take(2):
  print("\n\nEXAMPLE\n")
  for heading, ids in (('INPUT', x), ("\nOUTPUT", y)):
    print(heading)
    print(int_to_text(ids))

In [11]:
# Model and training hyperparameters.
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab)  # number of distinct characters
EMBEDDING_DIM = 256
RNN_UNITS = 1024

# Number of elements held in the shuffle buffer.
BUFFER_SIZE = 10000

# Shuffle the (input, target) pairs, then form full batches only.
data = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
def build_model(VOCAB_SIZE,EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE):
  """Build the Embedding -> stateful LSTM -> Dense character model.

  Args:
    VOCAB_SIZE: Number of distinct ids; used as the embedding input size and
      as the width of the final Dense layer.
    EMBEDDING_DIM: Size of each embedding vector.
    RNN_UNITS: Number of LSTM units.
    BATCH_SIZE: Fixed batch size baked into the input shape.

  Returns:
    An uncompiled ``tf.keras.Sequential`` model.
  """
  layers = tf.keras.layers

  # The batch size is fixed while the sequence length is left open (None).
  embedding = layers.Embedding(
      VOCAB_SIZE,
      EMBEDDING_DIM,
      batch_input_shape=[BATCH_SIZE, None],
  )
  # return_sequences=True: one output per input position, not just the last.
  recurrent = layers.LSTM(
      RNN_UNITS,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform',
  )
  # No activation: the model emits one raw score per vocabulary entry.
  projection = layers.Dense(VOCAB_SIZE)

  return tf.keras.Sequential([embedding, recurrent, projection])

# Training model, built for the training batch size.
model = build_model(
    VOCAB_SIZE=VOCAB_SIZE,
    EMBEDDING_DIM=EMBEDDING_DIM,
    RNN_UNITS=RNN_UNITS,
    BATCH_SIZE=BATCH_SIZE,
)
model.summary()

In [None]:
# Run the model on a single batch and report the shape of its output.
for input_example_batch, target_example_batch in data.take(1):
  example_batch_predictions = model(input_example_batch)
  shape_legend = "#(BATCH_SIZE, seq_length,VOCAB_SIZE)"
  print(example_batch_predictions.shape, shape_legend)

In [None]:
# Length of the leading (batch) dimension, followed by the full model output.
print(len(example_batch_predictions), example_batch_predictions, sep='\n')

In [None]:
# Narrow down to the first example of the batch.
pred = example_batch_predictions[0]
print(len(pred), pred, sep='\n')

In [None]:
# Narrow down further to the first position of `pred`.
time_pred = pred[0]
print(len(time_pred), time_pred, sep='\n')

In [None]:
# Draw one id per row of `pred`, treating each row as unnormalised log-probabilities.
sampled_indices = tf.random.categorical(pred, num_samples=1)
# Flatten the single-sample column into a 1-D array of ids.
sampled_indices = np.reshape(sampled_indices, -1)

predicted_chars = int_to_text(sampled_indices)
predicted_chars

In [18]:
def loss(labels,logits):
  """Sparse categorical cross-entropy computed on raw logits.

  Args:
    labels: Integer class ids (ground truth).
    logits: Unnormalised model outputs.

  Returns:
    The result of ``tf.keras.losses.sparse_categorical_crossentropy`` for
    these arguments with ``from_logits=True``.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels,
      y_pred=logits,
      from_logits=True,
  )

In [19]:
# Train with Adam against the logits-based cross-entropy defined above.
model.compile(loss=loss, optimizer='adam')

In [28]:
# Directory that receives the training checkpoints.
checkpoint_dir = './training_checkpoints'

# Path template handed to the callback; `{epoch}` is a placeholder inside it.
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

# Save weights only rather than the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [None]:
# Train for 40 epochs, writing a checkpoint through the callback.
history = model.fit(
    x=data,
    epochs=40,
    callbacks=[checkpoint_callback],
)

In [23]:
# Rebuild the same architecture with batch size 1.
model = build_model(
    VOCAB_SIZE=VOCAB_SIZE,
    EMBEDDING_DIM=EMBEDDING_DIM,
    RNN_UNITS=RNN_UNITS,
    BATCH_SIZE=1,
)

In [27]:
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint. Fail with a clear message instead of handing None to
# load_weights.
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint_path is None:
  raise FileNotFoundError(
      'No checkpoint found in {!r}; run the training cell first.'.format(checkpoint_dir))
model.load_weights(latest_checkpoint_path)
model.build(tf.TensorShape([1,None]))

In [32]:
# checkpoint_prefix still contains the unformatted '{epoch}' placeholder, so
# saving to it would create files literally named 'ckpt_{epoch}'. Write the
# batch-size-1 model's weights to a concrete path instead.
model.save_weights(os.path.join(checkpoint_dir, 'ckpt_inference'))

In [36]:
# Restore the weights saved after a specific epoch. The path is derived from
# checkpoint_prefix so it stays in sync with the location used for training
# checkpoints.
checkpoint_num = 10
checkpoint_path = checkpoint_prefix.format(epoch=checkpoint_num)
model.load_weights(checkpoint_path)
model.build(tf.TensorShape([1,None]))

In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Sample text from the model one character at a time.

  Args:
    model: Model called with a ``(1, n)`` id tensor; it must provide
      ``reset_states()`` and is expected to be the stateful batch-size-1
      model built by this notebook.
    start_string: Non-empty seed text; every character must be a key of
      ``char2idx``.
    num_generate: Number of characters to sample (default 1000).
    temperature: Positive value the logits are divided by before sampling.
      Values below 1 sharpen the distribution and values above 1 flatten it
      (default 1.0).

  Returns:
    ``start_string`` followed by the sampled characters.

  Raises:
    ValueError: If ``start_string`` is empty or ``temperature`` is not
      positive.
    KeyError: If ``start_string`` contains a character missing from
      ``char2idx``.
  """
  # An empty seed would produce a zero-length input, so reject it up front.
  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Encode the seed and add a leading batch dimension of size 1.
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  text_generated = []

  # Clear recurrent state left over from earlier calls.
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # Drop the batch dimension.
    predictions = tf.squeeze(predictions, 0)

    predictions = predictions / temperature
    # Sample one id per position and keep only the last position's sample.
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Only the newly sampled id is fed back in on the next step.
    input_eval = tf.expand_dims([predicted_id], 0)
    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [None]:
# Ask for a seed string and print the text generated from it.
inp = input('Type a starting string: ')
print(generate_text(model=model, start_string=inp))