<a href="https://colab.research.google.com/github/Dimisz/tensorflow_lectures/blob/main/NLP_lecture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Read in the Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf


### Download and prepare the text

In [2]:
!wget "https://raw.githubusercontent.com/Dimisz/tensorflow_lectures/main/shakespeare.txt"

--2021-10-07 06:23:42--  https://raw.githubusercontent.com/Dimisz/tensorflow_lectures/main/shakespeare.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5445609 (5.2M) [text/plain]
Saving to: ‘shakespeare.txt’


2021-10-07 06:23:43 (66.3 MB/s) - ‘shakespeare.txt’ saved [5445609/5445609]



In [3]:
# Relative path: wget saved the corpus into the current working directory
# (which is /content on Colab), so this resolves on Colab and locally alike.
path = "shakespeare.txt"

In [4]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes, and the explicit encoding keeps the
# decoded text independent of the platform's default locale.
with open(path, 'r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

In [5]:
#print(text[:500])

### Grab unique characters

In [6]:
vocab = sorted(set(text))

In [7]:
# important for the last Dense layer
len(vocab)

84

## Text Processing

* Vectorize the text

* Create encoding dictionary

In [8]:
# for pair in enumerate(vocab):
#   print(pair)

In [9]:
# Lookup table from each character to its integer id, i.e. its position in
# the sorted vocabulary list.
char_to_ind = dict(zip(vocab, range(len(vocab))))

In [10]:
char_to_ind['H']

33

In [11]:
ind_to_char = np.array(vocab)

In [12]:
ind_to_char[33]

'H'

In [13]:
# Encode the full corpus as one integer id per character.
encoded_text = np.array(list(map(char_to_ind.__getitem__, text)))

In [14]:
encoded_text.shape

(5445609,)

In [15]:
sample = text[:500]

In [16]:
sample

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bu"

In [17]:
encoded_text[:500]

array([ 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, 12,  0,  1,  1, 31, 73, 70, 68,  1, 61, 56, 64,
       73, 60, 74, 75,  1, 58, 73, 60, 56, 75, 76, 73, 60, 74,  1, 78, 60,
        1, 59, 60, 74, 64, 73, 60,  1, 64, 69, 58, 73, 60, 56, 74, 60,  8,
        0,  1,  1, 45, 63, 56, 75,  1, 75, 63, 60, 73, 60, 57, 80,  1, 57,
       60, 56, 76, 75, 80,  5, 74,  1, 73, 70, 74, 60,  1, 68, 64, 62, 63,
       75,  1, 69, 60, 77, 60, 73,  1, 59, 64, 60,  8,  0,  1,  1, 27, 76,
       75,  1, 56, 74,  1, 75, 63, 60,  1, 73, 64, 71, 60, 73,  1, 74, 63,
       70, 76, 67, 59,  1, 57, 80,  1, 75, 64, 68, 60,  1, 59, 60, 58, 60,
       56, 74, 60,  8,  0,  1,  1, 33, 64, 74,  1, 75, 60, 69, 59, 60, 73,
        1, 63, 60, 64, 73,  1, 68, 64, 62, 63, 75,  1, 57, 60, 56, 73,  1,
       63, 64, 74,  1, 68, 60, 68, 70, 73, 80, 21,  0,  1,  1, 27, 76, 75,
        1, 75, 63, 70, 76,  1, 58, 70, 69, 75, 73, 56, 58, 75, 60, 59,  1,
       75, 70,  1, 75, 63

## Creating Batches

Sequences should be long enough to let the model pick up the structure of the text

In [18]:
print(text[:500])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bu


In [19]:
line = "From fairest creatures we desire increase"
len(line)

41

In [20]:
lines = """
From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
"""
len(lines)

133

In [21]:
seq_len = 120

In [22]:
total_num_seq = len(text) // (seq_len + 1)

In [23]:
total_num_seq

45005

### Prepare dataset using sequences

In [24]:
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [25]:
type(char_dataset)

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

In [26]:
# Demonstrate what the character dataset looks like
# for item in char_dataset.take(500):
#   print(ind_to_char[item.numpy()])

In [27]:
sequences = char_dataset.batch(seq_len + 1, drop_remainder=True)

In [28]:
def create_seq_targets(seq):
  """Turn one chunk of character ids into an (input, target) training pair.

  The target is the input shifted one position to the left, so at every
  position the label is the character that follows, e.g.
  "Hello my name" -> ("Hello my nam", "ello my name").

  Args:
    seq: Any sliceable sequence (list, string, array, tensor).

  Returns:
    A tuple (seq without its last element, seq without its first element).
  """
  shifted_by_one = seq[1:]
  all_but_last = seq[:-1]
  return all_but_last, shifted_by_one

In [29]:
# map the function to all the sequences
dataset = sequences.map(create_seq_targets)

In [30]:
# Show the first (input, target) pair, each as raw ids followed by the
# decoded text, with a blank gap between the two.
for input_txt, target_txt in dataset.take(1):
  print(input_txt.numpy(), "".join(ind_to_char[input_txt.numpy()]), sep="\n")
  print("\n")
  print(target_txt.numpy(), "".join(ind_to_char[target_txt.numpy()]), sep="\n")

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But


[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]
                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But 


### Generate training batches

In [31]:
batch_size = 128

In [32]:
buffer_size = 10000
# Shuffle the (input, target) pairs, then group them into fixed-size batches;
# drop_remainder=True keeps every batch at exactly batch_size rows.
# The pipeline is rebuilt from `sequences` rather than re-assigning `dataset`
# on top of itself, so re-running this cell does not batch already-batched data.
dataset = (
    sequences
    .map(create_seq_targets)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

In [33]:
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int64, tf.int64)>

## Create a model

In [34]:
vocab_size = len(vocab)

In [35]:
vocab_size

84

In [36]:
embed_dim = 64

In [37]:
rnn_neurons = 1026

In [38]:
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [39]:
help(sparse_categorical_crossentropy)

Help on function sparse_categorical_crossentropy in module keras.losses:

sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1)
    Computes the sparse categorical crossentropy loss.
    
    Standalone usage:
    
    >>> y_true = [1, 2]
    >>> y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]
    >>> loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
    >>> assert loss.shape == (2,)
    >>> loss.numpy()
    array([0.0513, 2.303], dtype=float32)
    
    Args:
      y_true: Ground truth values.
      y_pred: The predicted values.
      from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
        we assume that `y_pred` encodes a probability distribution.
      axis: Defaults to -1. The dimension along which the entropy is
        computed.
    
    Returns:
      Sparse categorical crossentropy loss value.



In [40]:
def sparse_cat_loss(y_true, y_pred):
  """Sparse categorical cross-entropy computed on raw logits.

  Thin wrapper that fixes from_logits=True so the function can be handed to
  model.compile() directly.

  Args:
    y_true: Ground-truth class ids.
    y_pred: Predicted logits (not probabilities).

  Returns:
    The loss value(s) returned by sparse_categorical_crossentropy.
  """
  loss = sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
  return loss

In [41]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [42]:
def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
  """Build and compile the character-level GRU model.

  Args:
    vocab_size: Number of distinct characters; sizes both the embedding
      table and the output layer.
    embed_dim: Width of each character embedding vector.
    rnn_neurons: Number of units in the GRU layer.
    batch_size: Fixed batch size baked into the model's input shape.

  Returns:
    A compiled Sequential model (Embedding -> GRU -> Dense) that outputs one
    logit per vocabulary entry at every time step.
  """
  layers = [
      Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size, None]),
      GRU(rnn_neurons,
          return_sequences=True,
          stateful=True,
          recurrent_initializer='glorot_uniform'),
      # No activation: the loss is configured to work on raw logits.
      Dense(vocab_size),
  ]
  model = Sequential(layers)
  model.compile(loss=sparse_cat_loss, optimizer='adam')
  return model

In [43]:
model = create_model(vocab_size=vocab_size,
                     embed_dim=embed_dim,
                     rnn_neurons=rnn_neurons,
                     batch_size=batch_size)

In [44]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 64)           5376      
_________________________________________________________________
gru (GRU)                    (128, None, 1026)         3361176   
_________________________________________________________________
dense (Dense)                (128, None, 84)           86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Run a single batch through the freshly created model as a shape sanity check.
input_example_batch, target_example_batch = next(iter(dataset.take(1)))
example_batch_predictions = model(input_example_batch)

In [46]:
example_batch_predictions.shape

TensorShape([128, 120, 84])

In [47]:
example_batch_predictions[0]

<tf.Tensor: shape=(120, 84), dtype=float32, numpy=
array([[-4.4641254e-04, -5.6545073e-03, -3.2771968e-03, ...,
         1.0830046e-02, -2.3791230e-04,  4.3035531e-03],
       [ 7.2741094e-03, -4.5144404e-03, -9.8069310e-03, ...,
         4.0336875e-03, -1.1707855e-02,  5.6841723e-03],
       [-8.8659152e-05, -4.7053621e-04, -5.5421661e-03, ...,
        -3.2657834e-03,  5.2272761e-04,  4.0610339e-03],
       ...,
       [ 8.0655162e-05, -1.1545721e-03, -6.4481478e-03, ...,
        -3.0350538e-03,  7.0549821e-04,  5.3036502e-03],
       [ 1.0104814e-04, -6.4861821e-03, -5.9962366e-03, ...,
         9.9907927e-03,  1.2214131e-03,  7.6431124e-03],
       [-1.4029222e-04, -1.3858570e-03,  2.3633808e-04, ...,
         1.1033756e-02, -1.4753001e-03,  6.9138147e-03]], dtype=float32)>

In [49]:
samples_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)

In [51]:
samples_indices = tf.squeeze(samples_indices, axis=-1).numpy()

In [52]:
ind_to_char[samples_indices]

array(['8', "'", 'V', ' ', 'U', 'h', '>', ')', 'h', 'm', 'D', '(', 'f',
       'l', 'U', 'F', ':', '?', 'A', 'X', 'X', '!', 'c', 'A', '|', ':',
       'N', 'S', '5', 'i', 'I', '?', '`', 'B', 'w', 'x', 'N', 'S', 'p',
       'H', 'm', 'H', 'A', 'T', '6', 'J', 'D', 'j', '"', 'j', 'V', '6',
       'j', 'T', '\n', 'g', ' ', 'b', ':', 'x', ']', '?', 'J', 'C', 'd',
       'B', '-', 'V', 'k', 'S', 'J', 'p', 'v', 'A', 'H', 'y', 'T', '5',
       'n', '.', 'S', '`', '3', '`', 'm', 'U', 'V', 'c', 'B', ' ', 'P',
       'w', 'Y', 'L', '`', '4', '&', 'v', 'I', ')', '6', 'l', ']', 'n',
       '`', 'X', '_', '`', '<', '3', "'", '0', 't', 't', 'e', '.', 'p',
       '5', '>', 'v'], dtype='<U1')

In [53]:
epochs = 30

In [55]:
# Takes around 45 minutes to train the model.
# Uncomment the line below to train; the cells that follow expect a trained model.
# model.fit(dataset, epochs=epochs)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fbddd87c350>

In [56]:
model.save("shakespeare_model.h5")

In [57]:
from tensorflow.keras.models import load_model

In [58]:
# Rebuild the same architecture with a batch size of 1 so text can be
# generated from a single seed sequence, then restore the saved weights.
model = create_model(vocab_size, embed_dim, rnn_neurons, batch_size=1)

# Load from the same relative path the model was saved to, rather than a
# hard-coded Colab-only absolute path.
model.load_weights("shakespeare_model.h5")

model.build(tf.TensorShape([1, None]))

In [59]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 64)             5376      
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1026)           3361176   
_________________________________________________________________
dense_1 (Dense)              (1, None, 84)             86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


In [60]:
def generate_text(model, start_seed, gen_size=500, temp=1.0):
  """Generate text one character at a time, starting from a seed string.

  Args:
    model: Model called on a (1, length) tensor of character ids; its state is
      reset before generation starts (presumably the stateful batch-size-1
      model built by create_model).
    start_seed: Non-empty string used to prime the model. Every character
      must be a key of char_to_ind.
    gen_size: Number of characters to generate.
    temp: Sampling temperature. The logits are divided by this value before
      sampling, so values below 1 make the output more predictable and values
      above 1 make it more random.

  Returns:
    start_seed followed by the gen_size generated characters.

  Raises:
    ValueError: If start_seed is empty or temp is not positive.
    KeyError: If start_seed contains a character missing from char_to_ind.
  """
  if not start_seed:
    raise ValueError("start_seed must contain at least one character")
  if temp <= 0:
    raise ValueError("temp must be a positive number")

  # Encode the seed and add a leading batch dimension of size 1.
  input_eval = tf.expand_dims([char_to_ind[s] for s in start_seed], 0)
  text_generated = []
  model.reset_states()

  for _ in range(gen_size):
    # Drop the batch dimension: one row of logits per input position.
    predictions = tf.squeeze(model(input_eval), 0)
    # Scale the logits by the temperature. The original assigned the scaled
    # values to a misspelled name, so the temperature was silently ignored.
    predictions = predictions / temp
    # Sample for every position but keep only the draw for the last one.
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
    # Feed only the new character back in; the model's state carries the context.
    input_eval = tf.expand_dims([predicted_id], 0)
    text_generated.append(ind_to_char[predicted_id])

  return start_seed + "".join(text_generated)


In [61]:
print(generate_text(model, "JULIET", gen_size=2000))

JULIET. Then she approve it. Well, we have vering parts
    To th' dascling which he bids be past.
  COSTARD. 'Tis no hair up, but by ambitious tear;
    But when she was a wife?
  MARILEN. The smitest he were best.
  Rom. There lies your base, and whe'd much mov'd possession; which so oft?
    That he, he's desolate, all things of all,
    If 'o behootfor more than this is up.
  TRANIO. Let him adored with grass again.
    Now, when the artistractory day
    Good sign to dian
sweet with the
    inform but he was Walter Brutus, will be pacious I may have answer the
    constancy, Gonzia, begins.
  FIRST MUSICIAN. Come, sir, by this hand, and all ostractione world
    How many of the world can lia a voice
    To parley. He is with customise her musicians, cass with that
    thou and hain so good diseased with a truth.
  RODARIUS. O then; you are welcome offich one, it was my love;
    And yet, ig your swift trembling native rests
    That never had drown dralling his envy be pleas'd
   