<a href="https://colab.research.google.com/github/FranklineMisango/NLP_Sequential_classifier_Freecodecamp/blob/main/NLP_Sequential_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

RNN Play Generator --> Sequential data 

Now time for one of the coolest examples we've seen so far. We are going to use an RNN to generate a play. We will simply show the RNN an example of something we want it to recreate and it will learn how to write a version of it on its own. We'll do this using a character predictive model that will take as input a variable length sequence and predict the next character. We can use the model many times in a row, with the output from the last prediction as the input for the next call, to generate a sequence.

In [1]:
%tensorflow_version 2.x  # this line is not required unless you are in a notebook
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.x  # this line is not required unless you are in a notebook`. This will be interpreted as: `2.x`.


TensorFlow 2.x selected.


In [3]:
# Fetch the Shakespeare text file and keep the local path that get_file returns
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [4]:
# Read the downloaded file as bytes, then decode it to a str.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving an open handle behind for the rest of the session.
with open(path_to_file, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')
# The length of the text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [5]:
# Preview the opening 250 characters of the corpus
preview = text[:250]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [6]:
# Every distinct character in the corpus, in sorted order
vocab = sorted(set(text))
# Lookup tables: character -> index (dict) and index -> character (array)
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

def text_to_int(text):
  """Encode a string as a NumPy array holding the vocabulary index of each character."""
  indices = list(map(char2idx.__getitem__, text))
  return np.array(indices)

# Integer-encoded version of the whole corpus
text_as_int = text_to_int(text)

In [7]:
# Show a short prefix of the corpus next to its integer encoding
sample = text[:13]
print("Text:", sample)
print("Encoded:", text_to_int(sample))

Text: First Citizen
Encoded: [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [8]:
def int_to_text(ints):
  """Decode a sequence of vocabulary indices back into a string.

  Args:
    ints: integer indices into ``idx2char``. If the object exposes a
      ``.numpy()`` method it is converted with it first; otherwise it is
      used directly as the index.

  Returns:
    The characters ``idx2char[ints]`` joined into a single ``str``.
  """
  try:
    ints = ints.numpy()
  except AttributeError:
    # No ``.numpy()`` on this object (e.g. it is already a NumPy array):
    # use it as-is. Only AttributeError is caught so that unrelated errors
    # and KeyboardInterrupt are not silently swallowed.
    pass
  return ''.join(idx2char[ints])

# Round-trip check: decode the first 13 encoded characters back to text
print(int_to_text(text_as_int[:13]))

First Citizen


### Creating Training Examples
Remember our task is to feed the model a sequence and have it return to us the next character. This means we need to split our text data from above into many shorter sequences that we can pass to the model as training examples. 

The training examples we will prepare will use a *seq_length* sequence as input and a *seq_length* sequence as the output, where the output is the original sequence shifted one letter to the right. For example:

```input: Hell | output: ello```

Our first step will be to create a stream of characters from our text data.

In [9]:
# Training examples

# Stream of individual character indices, one dataset element per character
char_dataset = tf.data.Dataset.from_tensor_slices(tensors=text_as_int)

# Length of the input sequence in one training example
seq_length = 100
# Each example consumes seq_length + 1 characters of the text
examples_per_epoch = len(text) // (seq_length + 1)

In [10]:
# Group the character stream into chunks of seq_length + 1 (101) indices;
# a trailing chunk shorter than that is dropped.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

In [11]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, e.g. "hello" -> ("hell", "ello").
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair by mapping split_input_target over it
dataset = sequences.map(map_func=split_input_target)

In [None]:
# Print the first two (input, target) pairs decoded back to text
for x, y in dataset.take(2):
  print("\n\nEXAMPLE\n", "INPUT", int_to_text(x), "\nOUTPUT", int_to_text(y), sep="\n")

In [14]:
# Training hyperparameters
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab)  # number of unique characters in the corpus
EMBEDDING_DIM = 256
RNN_UNITS = 1024

# Size of the shuffle buffer.
# tf.data is built to handle possibly infinite sequences, so rather than
# shuffling the whole dataset in memory it shuffles elements within a
# buffer of this many entries.
BUFFER_SIZE = 10000

# Shuffle the examples, then group them into fixed-size batches
data = (
    dataset
    .shuffle(buffer_size=BUFFER_SIZE)
    .batch(batch_size=BATCH_SIZE, drop_remainder=True)
)

In [15]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Build the character-level model: Embedding -> stateful LSTM -> Dense.

  Args:
    vocab_size: size of the embedding input range and of the Dense output.
    embedding_dim: width of the embedding vectors.
    rnn_units: number of LSTM units.
    batch_size: fixed batch dimension of the input; the sequence dimension
      is left as None.

  Returns:
    A tf.keras.Sequential model. The final Dense layer has no activation,
    so it emits one raw score per vocabulary entry at every time step.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

# Instantiate the training model with the hyperparameters defined above
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Run the untrained model on the first training batch and report the output shape
for input_example_batch, target_example_batch in data.take(1):
  example_batch_predictions = model(input_example_batch)
  print(f"{example_batch_predictions.shape} # (batch_size, sequence_length, vocab_size)")

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [17]:
# The prediction holds 64 entries, one for each sequence in the batch
print(len(example_batch_predictions), example_batch_predictions, sep="\n")

64
tf.Tensor(
[[[-1.38484628e-03 -2.62958091e-03 -2.20049103e-03 ...  1.95705006e-03
    5.30055072e-03  1.19098974e-03]
  [ 2.91415444e-03 -8.85869609e-04  2.75406474e-03 ...  1.01359480e-03
   -1.52257876e-03 -5.32643124e-03]
  [ 9.63903777e-03 -1.40086829e-03  5.64751308e-03 ... -4.53862920e-03
   -6.14792167e-04 -1.84996775e-03]
  ...
  [ 1.82924848e-02  6.41267700e-03  6.44259341e-03 ... -9.70287807e-03
   -5.73623646e-03 -5.69551066e-03]
  [ 1.86425224e-02  4.94888565e-03  4.60319873e-03 ... -8.48404970e-03
   -7.71339610e-03 -1.08253201e-02]
  [ 1.88630521e-02  2.38239439e-03  1.79630122e-03 ... -2.02258863e-03
   -5.63251041e-03 -2.33585271e-03]]

 [[ 1.33940857e-03 -1.59762672e-03 -2.21553212e-03 ...  7.70559243e-04
   -5.17989858e-04 -1.16878841e-03]
  [-3.40611581e-03  3.71578103e-03 -1.53601146e-03 ...  2.76343198e-03
   -5.11797029e-04 -1.66128110e-03]
  [ 8.30206904e-04  4.42638155e-03 -4.53643827e-03 ... -7.90145015e-04
    6.83878956e-04 -7.44724553e-03]
  ...
  [ 6.837

In [18]:
# Let's examine the prediction for a single sequence of the batch
pred = example_batch_predictions[0]
print(len(pred), pred, sep="\n")
# This is a 2-D array of length 100: each row is the model's output for the
# next character at one time step

100
tf.Tensor(
[[-0.00138485 -0.00262958 -0.00220049 ...  0.00195705  0.00530055
   0.00119099]
 [ 0.00291415 -0.00088587  0.00275406 ...  0.00101359 -0.00152258
  -0.00532643]
 [ 0.00963904 -0.00140087  0.00564751 ... -0.00453863 -0.00061479
  -0.00184997]
 ...
 [ 0.01829248  0.00641268  0.00644259 ... -0.00970288 -0.00573624
  -0.00569551]
 [ 0.01864252  0.00494889  0.0046032  ... -0.00848405 -0.0077134
  -0.01082532]
 [ 0.01886305  0.00238239  0.0017963  ... -0.00202259 -0.00563251
  -0.00233585]], shape=(100, 65), dtype=float32)


In [20]:
# Finally, look at the prediction at the first time step
time_pred = pred[0]
print(len(time_pred), time_pred, sep="\n")
# 65 values: one raw score (logit) per vocabulary character for the next
# character. They are not probabilities, because the Dense layer has no activation.

65
tf.Tensor(
[-1.3848463e-03 -2.6295809e-03 -2.2004910e-03  2.9544574e-03
 -5.1897638e-03 -4.7564169e-04 -3.9052966e-03  2.5018067e-03
  4.6357163e-04 -3.1731743e-03  6.0384290e-04 -7.1505625e-03
  3.4406949e-03  6.6216444e-03  4.0221615e-03  5.1128417e-03
  2.6328303e-03  2.1358929e-03  1.4310228e-03 -1.3378740e-03
  1.5920610e-05  4.1813180e-03  2.7221760e-03 -5.0958456e-04
 -2.0707110e-03 -7.0740837e-03  9.3742553e-04  1.4835687e-03
 -4.5352052e-03 -2.5776094e-03 -4.9438584e-03  1.5492011e-03
  3.5637263e-03 -5.8521412e-04  6.9765449e-03 -3.5347024e-03
 -4.6314156e-04  2.1656547e-03  1.1349176e-03  1.7992344e-03
  5.3559095e-03 -1.6233213e-03 -1.7289056e-03 -2.1866183e-03
 -1.5952294e-03 -2.9887492e-03  1.7089176e-03 -1.5906236e-03
  3.8634590e-03 -1.9506941e-04 -4.1275034e-03 -2.7687869e-03
 -3.4905989e-03 -4.5727547e-03 -6.4161711e-04 -3.0181343e-03
 -2.9695691e-03 -3.2593170e-04  1.3507605e-03  5.7806727e-04
 -1.1127588e-05  1.2221730e-03  1.9570501e-03  5.3005507e-03
  1.190989

In [19]:
# To pick the predicted characters we sample from the categorical distribution
# defined by the model's logits (one draw per time step)
sampled_indices = tf.random.categorical(logits=pred, num_samples=1)

# Flatten the samples into a 1-D array of indices, then map the indices back
# to characters
sampled_indices = np.reshape(sampled_indices, -1)
predicted_chars = int_to_text(sampled_indices)

predicted_chars  # what the untrained model predicted for training sequence 1

"I. Psu3mqPGph:?lxd!Xg&C:Bki:'eUH:!'RbZfdluUMfb-,l.M\nvomI&sNVIFV'Yd GYiInqJZAQJtPpqKXHv-Ao;dryHeRngh'"

In [21]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  ``from_logits=True`` tells Keras that ``logits`` are unnormalized scores
  rather than probabilities.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels, y_pred=logits, from_logits=True)