# Building Recurrent Neural Networks/Long-Short Term Memory Models Using TensorFlow

In [1]:
!pip install tensorflow==2.9.1
#!pip install numpy==1.21.4


Collecting tensorflow==2.9.1
  Downloading tensorflow-2.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting flatbuffers<2,>=1.12 (from tensorflow==2.9.1)
  Downloading flatbuffers-1.12-py2.py3-none-any.whl.metadata (872 bytes)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.9.1)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting keras<2.10.0,>=2.9.0rc0 (from tensorflow==2.9.1)
  Downloading keras-2.9.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting keras-preprocessing>=1.1.1 (from tensorflow==2.9.1)
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting protobuf<3.20,>=3.9.2 (from tensorflow==2.9.1)
  Downloading protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (787 bytes)
Collecting tensorboard<2.10,>=2.9 (from tensorflow==2.9.1)
  Downloading tensorboard-2.9.1-py3-none-any.whl.metadata (1.9 kB)
Collecting tensorflow-estimator<2.10.0,>=2.9.0rc0 (from 

In [2]:
import time
import numpy as np
import tensorflow as tf
if not tf.__version__ == '2.9.1':
    print(tf.__version__)
    raise ValueError('please upgrade to TensorFlow 2.9.1, or restart your Kernel (Kernel->Restart & Clear Output)')

In [3]:
#!mkdir data
#!mkdir data/ptb
#!wget -q -O data/ptb/reader.py https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DL0120EN-SkillsNetwork/labs/Week3/data/ptb/reader.py
#!cp data/ptb/reader.py .



In [4]:
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================


"""Utilities for parsing PTB text files."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os

import numpy as np
import tensorflow as tf


def _read_words(filename):
  with tf.io.gfile.GFile(filename, "r") as f:
    return f.read().replace("\n", "<eos>").split()


def _build_vocab(filename):
  data = _read_words(filename)

  counter = collections.Counter(data)
  count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

  words, _ = list(zip(*count_pairs))
  word_to_id = dict(zip(words, range(len(words))))

  return word_to_id


def _file_to_word_ids(filename, word_to_id):
  data = _read_words(filename)
  return [word_to_id[word] for word in data if word in word_to_id]


def ptb_raw_data(data_path=None):
  """Load PTB raw data from data directory "data_path".

  Reads PTB text files, converts strings to integer ids,
  and performs mini-batching of the inputs.

  The PTB dataset comes from Tomas Mikolov's webpage:

  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

  Args:
    data_path: string path to the directory where simple-examples.tgz has
      been extracted.

  Returns:
    tuple (train_data, valid_data, test_data, vocabulary)
    where each of the data objects can be passed to PTBIterator.
  """

  train_path = os.path.join(data_path, "ptb.train.txt")
  valid_path = os.path.join(data_path, "ptb.valid.txt")
  test_path = os.path.join(data_path, "ptb.test.txt")

  word_to_id = _build_vocab(train_path)
  train_data = _file_to_word_ids(train_path, word_to_id)
  valid_data = _file_to_word_ids(valid_path, word_to_id)
  test_data = _file_to_word_ids(test_path, word_to_id)
  vocabulary = len(word_to_id)
  return train_data, valid_data, test_data, vocabulary, word_to_id


def ptb_iterator(raw_data, batch_size, num_steps):
  """Iterate on the raw PTB data.

  This generates batch_size pointers into the raw PTB data, and allows
  minibatch iteration along these pointers.

  Args:
    raw_data: one of the raw data outputs from ptb_raw_data.
    batch_size: int, the batch size.
    num_steps: int, the number of unrolls.

  Yields:
    Pairs of the batched data, each a matrix of shape [batch_size, num_steps].
    The second element of the tuple is the same data time-shifted to the
    right by one.

  Raises:
    ValueError: if batch_size or num_steps are too high.
  """
  raw_data = np.array(raw_data, dtype=np.int32)

  data_len = len(raw_data)
  batch_len = data_len // batch_size
  data = np.zeros([batch_size, batch_len], dtype=np.int32)
  for i in range(batch_size):
    data[i] = raw_data[batch_len * i:batch_len * (i + 1)]

  epoch_size = (batch_len - 1) // num_steps

  if epoch_size == 0:
    raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

  for i in range(epoch_size):
    x = data[:, i*num_steps:(i+1)*num_steps]
    y = data[:, i*num_steps+1:(i+1)*num_steps+1]
    yield (x, y)

### Building the LSTM model for Language Modeling


In [5]:
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/SPqCgT4JZp9royRjGgbqSA/data.zip
!unzip -o data.zip

--2024-10-15 14:47:37--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/SPqCgT4JZp9royRjGgbqSA/data.zip
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.45.118.108
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.45.118.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2020687 (1.9M) [application/zip]
Saving to: ‘data.zip’


2024-10-15 14:47:38 (3.62 MB/s) - ‘data.zip’ saved [2020687/2020687]

Archive:  data.zip
  inflating: data/ptb.test.txt       
  inflating: data/ptb.train.txt      
  inflating: data/ptb.valid.txt      


In [6]:
#Initial weight scale
init_scale = 0.1
#Initial learning rate
learning_rate = 1.0
#Maximum permissible norm for the gradient (For gradient clipping -- another measure against Exploding Gradients)
max_grad_norm = 5
#The number of layers in our model
num_layers = 2
#The total number of recurrence steps, also known as the number of layers when our RNN is "unfolded"
num_steps = 20
#The number of processing units (neurons) in the hidden layers
hidden_size_l1 = 256
hidden_size_l2 = 128
#The maximum number of epochs trained with the initial learning rate
max_epoch_decay_lr = 4
#The total number of epochs in training
max_epoch = 15
#The probability for keeping data in the Dropout Layer (This is an optimization, but is outside our scope for this notebook!)
#At 1, we ignore the Dropout Layer wrapping.
keep_prob = 0.5
#The decay for the learning rate
decay = 0.5
#The size for each batch of data
batch_size = 30
#The size of our vocabulary
vocab_size = 10000
embeding_vector_size= 200
#Training flag to separate training from testing
is_training = 1
#Data directory for our dataset
data_dir = "data"

In [7]:
# Reads the data and separates it into training data, validation data and testing data
raw_data = ptb_raw_data(data_dir)
train_data, valid_data, test_data, vocab, word_to_id = raw_data

In [8]:
len(train_data)

929589

In [9]:
def id_to_word(id_list):
    line = []
    for w in id_list:
        for word, wid in word_to_id.items():
            if wid == w:
                line.append(word)
    return line


print(id_to_word(train_data[0:100]))

['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim', 'snack-food', 'ssangyong', 'swapo', 'wachter', '<eos>', 'pierre', '<unk>', 'N', 'years', 'old', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', 'N', '<eos>', 'mr.', '<unk>', 'is', 'chairman', 'of', '<unk>', 'n.v.', 'the', 'dutch', 'publishing', 'group', '<eos>', 'rudolph', '<unk>', 'N', 'years', 'old', 'and', 'former', 'chairman', 'of', 'consolidated', 'gold', 'fields', 'plc', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'british', 'industrial', 'conglomerate', '<eos>', 'a', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of']


In [10]:
itera = ptb_iterator(train_data, batch_size, num_steps)
first_touple = itera.__next__()
_input_data = first_touple[0]
_targets = first_touple[1]

In [11]:
_input_data.shape

(30, 20)

In [12]:
_targets.shape

(30, 20)

In [13]:
_input_data[0:3]

array([[9970, 9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983, 9984,
        9986, 9987, 9988, 9989, 9991, 9992, 9993, 9994, 9995],
       [2654,    6,  334, 2886,    4,    1,  233,  711,  834,   11,  130,
         123,    7,  514,    2,   63,   10,  514,    8,  605],
       [   0, 1071,    4,    0,  185,   24,  368,   20,   31, 3109,  954,
          12,    3,   21,    2, 2915,    2,   12,    3,   21]],
      dtype=int32)

In [14]:
print(id_to_word(_input_data[0,:]))

['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim']


In [15]:
embedding_layer = tf.keras.layers.Embedding(vocab_size, embeding_vector_size,batch_input_shape=(batch_size, num_steps),trainable=True,name="embedding_vocab")

In [16]:
# Define where to get the data for our embeddings from
inputs = embedding_layer(_input_data)
inputs

<tf.Tensor: shape=(30, 20, 200), dtype=float32, numpy=
array([[[ 0.04628572, -0.03097992,  0.00240773, ..., -0.04494647,
         -0.00347878,  0.01393459],
        [-0.00488645, -0.03126304, -0.01484302, ..., -0.03757387,
         -0.01726284,  0.02120251],
        [ 0.01877541, -0.04161077,  0.02181014, ..., -0.00261043,
         -0.006056  ,  0.02337099],
        ...,
        [-0.02855209,  0.00399095,  0.03136278, ..., -0.02801545,
         -0.01248659,  0.0456008 ],
        [-0.0160727 , -0.02884952, -0.00086705, ..., -0.01025946,
         -0.04318186,  0.04106566],
        [-0.01729947,  0.01848404, -0.02169886, ...,  0.02589177,
         -0.03545898,  0.03210132]],

       [[-0.02545879,  0.00100525, -0.03875574, ..., -0.00841223,
         -0.0037452 ,  0.03212852],
        [-0.04887114,  0.03633438, -0.04794074, ..., -0.02668142,
          0.03375185, -0.02624012],
        [-0.0435494 ,  0.02437932,  0.03671456, ..., -0.04177038,
         -0.04392853, -0.0342554 ],
        ...,

<h3>Constructing Recurrent Neural Networks</h3>


In [17]:
lstm_cell_l1 = tf.keras.layers.LSTMCell(hidden_size_l1)
lstm_cell_l2 = tf.keras.layers.LSTMCell(hidden_size_l2)

In [18]:
stacked_lstm = tf.keras.layers.StackedRNNCells([lstm_cell_l1, lstm_cell_l2])

In [19]:
layer  =  tf.keras.layers.RNN(stacked_lstm,[batch_size, num_steps],return_state=False,stateful=True,trainable=True)

In [20]:
init_state = tf.Variable(tf.zeros([batch_size,embeding_vector_size]),trainable=False)

In [21]:
layer.inital_state = init_state

In [22]:
layer.inital_state

<tf.Variable 'Variable:0' shape=(30, 200) dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [23]:
outputs = layer(inputs)

In [24]:
outputs

<tf.Tensor: shape=(30, 20, 128), dtype=float32, numpy=
array([[[ 9.12942982e-04, -8.01309769e-04, -3.02348140e-04, ...,
          1.53320166e-03, -4.43523808e-04,  3.60070117e-05],
        [ 4.07505053e-04, -1.39851146e-03, -1.69620709e-03, ...,
          1.26759626e-03, -9.50086454e-04,  5.41495741e-04],
        [-6.71674847e-04, -1.00509613e-03, -5.60034590e-04, ...,
          3.58155259e-04,  1.65742007e-04,  1.17412338e-03],
        ...,
        [-1.47903233e-03,  2.03869655e-03,  2.13089329e-03, ...,
         -3.33704217e-03,  2.92561878e-03, -3.34688113e-04],
        [-1.99918728e-03,  1.04197999e-03,  1.87523046e-03, ...,
         -2.89060501e-03,  1.95932784e-03,  1.93514279e-04],
        [-1.79821381e-03,  9.36657772e-04,  2.50274246e-03, ...,
         -3.15028057e-03,  1.11550861e-03,  5.76283128e-06]],

       [[ 2.03793883e-04, -2.47581120e-05,  9.34942858e-04, ...,
         -1.02057203e-03,  7.24201498e-04, -1.43109908e-04],
        [-6.03578228e-04, -3.44451226e-04,  1.37

### Dense layer

In [25]:
dense = tf.keras.layers.Dense(vocab_size)

In [26]:
logits_outputs  = dense(outputs)

In [27]:
print("shape of the output from dense layer: ", logits_outputs.shape) #(batch_size, sequence_length, vocab_size)

shape of the output from dense layer:  (30, 20, 10000)


### Activation layer


In [28]:
activation = tf.keras.layers.Activation('softmax')

In [29]:
output_words_prob = activation(logits_outputs)

In [30]:
print("shape of the output from the activation layer: ", output_words_prob.shape) #(batch_size, sequence_length, vocab_size)

shape of the output from the activation layer:  (30, 20, 10000)


In [31]:
print("The probability of observing words in t=0 to t=20", output_words_prob[0,0:num_steps])

The probability of observing words in t=0 to t=20 tf.Tensor(
[[1.00013793e-04 1.00001926e-04 1.00013851e-04 ... 9.99997355e-05
  1.00004239e-04 9.99829936e-05]
 [1.00028294e-04 9.99857220e-05 9.99879630e-05 ... 1.00000325e-04
  9.99876138e-05 9.99746117e-05]
 [1.00023790e-04 9.99748299e-05 9.99730910e-05 ... 9.99939439e-05
  9.99923941e-05 9.99824770e-05]
 ...
 [9.99906406e-05 9.99673139e-05 1.00175261e-04 ... 9.99688054e-05
  1.00018944e-04 9.99508484e-05]
 [9.99947515e-05 9.99573676e-05 1.00171652e-04 ... 9.99627373e-05
  1.00036174e-04 9.99592303e-05]
 [9.99816257e-05 9.99598560e-05 1.00141900e-04 ... 9.99457261e-05
  1.00029501e-04 9.99705080e-05]], shape=(20, 10000), dtype=float32)


### Prediction


In [32]:
np.argmax(output_words_prob[0,0:num_steps], axis=1)

array([5356, 8774, 8774, 8566, 8566, 5861, 5861, 4266, 4266, 1523, 1523,
       3101, 6178, 6178, 6178, 1069, 2408, 2408, 3840, 6087])

In [33]:
_targets[0]

array([9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983, 9984, 9986,
       9987, 9988, 9989, 9991, 9992, 9993, 9994, 9995, 9996], dtype=int32)

In [34]:
def crossentropy(y_true, y_pred):
    return tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)

In [35]:
loss  = crossentropy(_targets, output_words_prob)

In [36]:
loss[0,:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([9.210342, 9.210597, 9.211056, 9.210915, 9.210085, 9.209992,
       9.211426, 9.210394, 9.209596, 9.210715], dtype=float32)>

In [37]:
cost = tf.reduce_sum(loss / batch_size)
cost

<tf.Tensor: shape=(), dtype=float32, numpy=184.20763>

### Training


In [38]:
# Create a variable for the learning rate
lr = tf.Variable(0.0, trainable=False)
optimizer = tf.keras.optimizers.SGD(lr=lr, clipnorm=max_grad_norm)

  super(SGD, self).__init__(name, **kwargs)


In [39]:
model = tf.keras.Sequential()
model.add(embedding_layer)
model.add(layer)
model.add(dense)
model.add(activation)
model.compile(loss=crossentropy, optimizer=optimizer)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_vocab (Embedding)  (30, 20, 200)            2000000   
                                                                 
 rnn (RNN)                   (30, 20, 128)             671088    
                                                                 
 dense (Dense)               (30, 20, 10000)           1290000   
                                                                 
 activation (Activation)     (30, 20, 10000)           0         
                                                                 
Total params: 3,961,088
Trainable params: 3,955,088
Non-trainable params: 6,000
_________________________________________________________________


In [40]:
# Get all TensorFlow variables marked as "trainable" (i.e. all of them except _lr, which we just created)
tvars = model.trainable_variables

In [41]:
[v.name for v in tvars]

['embedding_vocab/embeddings:0',
 'rnn/stacked_rnn_cells/lstm_cell/kernel:0',
 'rnn/stacked_rnn_cells/lstm_cell/recurrent_kernel:0',
 'rnn/stacked_rnn_cells/lstm_cell/bias:0',
 'rnn/stacked_rnn_cells/lstm_cell_1/kernel:0',
 'rnn/stacked_rnn_cells/lstm_cell_1/recurrent_kernel:0',
 'rnn/stacked_rnn_cells/lstm_cell_1/bias:0',
 'dense/kernel:0',
 'dense/bias:0']

In [42]:
x = tf.constant(1.0)
y =  tf.constant(2.0)
with tf.GradientTape(persistent=True) as g:
    g.watch(x)
    g.watch(y)
    func_test = 2 * x * x + 3 * x * y

In [43]:
var_grad = g.gradient(func_test, x) # Will compute to 10.0
print(var_grad)

tf.Tensor(10.0, shape=(), dtype=float32)


In [44]:
var_grad = g.gradient(func_test, y) # Will compute to 3.0
print(var_grad)

tf.Tensor(3.0, shape=(), dtype=float32)


In [45]:
with tf.GradientTape() as tape:
    # Forward pass.
    output_words_prob = model(_input_data)
    # Loss value for this batch.
    loss  = crossentropy(_targets, output_words_prob)
    cost = tf.reduce_sum(loss,axis=0) / batch_size

In [46]:
# Get gradients of loss wrt the trainable variables.
grad_t_list = tape.gradient(cost, tvars)

In [47]:
print(grad_t_list)

[<tensorflow.python.framework.indexed_slices.IndexedSlices object at 0x7d57bfffe050>, <tf.Tensor: shape=(200, 1024), dtype=float32, numpy=
array([[-1.3347782e-06, -3.4106438e-07,  4.5607811e-07, ...,
        -2.4933527e-07, -5.7886632e-07, -3.0212448e-08],
       [ 6.4784081e-07,  2.8341006e-08, -5.9881700e-08, ...,
         3.3739576e-07, -1.9811230e-07, -8.3863711e-08],
       [-6.3820505e-07,  5.0137305e-07,  3.8869780e-07, ...,
        -3.5307789e-07, -1.0785769e-07, -2.5388249e-07],
       ...,
       [ 4.1377348e-07, -6.8695550e-07,  8.7615717e-07, ...,
         1.2999340e-07, -2.4557602e-07, -3.5809654e-07],
       [-1.1652573e-06,  4.4974945e-07,  1.1784881e-06, ...,
         1.8469943e-07,  2.7766112e-07,  1.3589651e-07],
       [-1.9000025e-07,  1.6800466e-07, -4.1431690e-07, ...,
         1.9421735e-07,  5.0795364e-07, -1.6784658e-07]], dtype=float32)>, <tf.Tensor: shape=(256, 1024), dtype=float32, numpy=
array([[-2.16322050e-07, -2.02714602e-07, -4.52373683e-07, ...,
      

In [48]:
# Define the gradient clipping threshold
grads, _ = tf.clip_by_global_norm(grad_t_list, max_grad_norm)
grads

[<tensorflow.python.framework.indexed_slices.IndexedSlices at 0x7d57c0303820>,
 <tf.Tensor: shape=(200, 1024), dtype=float32, numpy=
 array([[-1.3347782e-06, -3.4106438e-07,  4.5607811e-07, ...,
         -2.4933527e-07, -5.7886632e-07, -3.0212448e-08],
        [ 6.4784081e-07,  2.8341006e-08, -5.9881700e-08, ...,
          3.3739576e-07, -1.9811230e-07, -8.3863711e-08],
        [-6.3820505e-07,  5.0137305e-07,  3.8869780e-07, ...,
         -3.5307789e-07, -1.0785769e-07, -2.5388249e-07],
        ...,
        [ 4.1377348e-07, -6.8695550e-07,  8.7615717e-07, ...,
          1.2999340e-07, -2.4557602e-07, -3.5809654e-07],
        [-1.1652573e-06,  4.4974945e-07,  1.1784881e-06, ...,
          1.8469943e-07,  2.7766112e-07,  1.3589651e-07],
        [-1.9000025e-07,  1.6800466e-07, -4.1431690e-07, ...,
          1.9421735e-07,  5.0795364e-07, -1.6784658e-07]], dtype=float32)>,
 <tf.Tensor: shape=(256, 1024), dtype=float32, numpy=
 array([[-2.16322050e-07, -2.02714602e-07, -4.52373683e-07, ..

<h4> 4.Apply the optimizer to the variables/gradients tuple. </h4>


In [49]:
# Create the training TensorFlow Operation through our optimizer
train_op = optimizer.apply_gradients(zip(grads, tvars))

<a id="ltsm"></a>
<h2>LSTM</h2>


In [50]:
class PTBModel(object):


    def __init__(self):
        ######################################
        # Setting parameters for ease of use #
        ######################################
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.hidden_size_l1 = hidden_size_l1
        self.hidden_size_l2 = hidden_size_l2
        self.vocab_size = vocab_size
        self.embeding_vector_size = embeding_vector_size
        # Create a variable for the learning rate
        self._lr = 1.0

        ###############################################################################
        # Initializing the model using keras Sequential API  #
        ###############################################################################

        self._model = tf.keras.models.Sequential()

        ####################################################################
        # Creating the word embeddings layer and adding it to the sequence #
        ####################################################################
        with tf.device("/cpu:0"):
            # Create the embeddings for our input data. Size is hidden size.
            self._embedding_layer = tf.keras.layers.Embedding(self.vocab_size, self.embeding_vector_size,batch_input_shape=(self.batch_size, self.num_steps),trainable=True,name="embedding_vocab")  #[10000x200]
            self._model.add(self._embedding_layer)


        ##########################################################################
        # Creating the LSTM cell structure and connect it with the RNN structure #
        ##########################################################################
        # Create the LSTM Cells.
        # This creates only the structure for the LSTM and has to be associated with a RNN unit still.
        # The argument  of LSTMCell is size of hidden layer, that is, the number of hidden units of the LSTM (inside A).
        # LSTM cell processes one word at a time and computes probabilities of the possible continuations of the sentence.
        lstm_cell_l1 = tf.keras.layers.LSTMCell(hidden_size_l1)
        lstm_cell_l2 = tf.keras.layers.LSTMCell(hidden_size_l2)



        # By taking in the LSTM cells as parameters, the StackedRNNCells function junctions the LSTM units to the RNN units.
        # RNN cell composed sequentially of stacked simple cells.
        stacked_lstm = tf.keras.layers.StackedRNNCells([lstm_cell_l1, lstm_cell_l2])




        ############################################
        # Creating the input structure for our RNN #
        ############################################
        # Input structure is 20x[30x200]
        # Considering each word is represended by a 200 dimentional vector, and we have 30 batchs, we create 30 word-vectors of size [30xx2000]
        # The input structure is fed from the embeddings, which are filled in by the input data
        # Feeding a batch of b sentences to a RNN:
        # In step 1,  first word of each of the b sentences (in a batch) is input in parallel.
        # In step 2,  second word of each of the b sentences is input in parallel.
        # The parallelism is only for efficiency.
        # Each sentence in a batch is handled in parallel, but the network sees one word of a sentence at a time and does the computations accordingly.
        # All the computations involving the words of all sentences in a batch at a given time step are done in parallel.

        ########################################################################################################
        # Instantiating our RNN model and setting stateful to True to feed forward the state to the next layer #
        ########################################################################################################

        self._RNNlayer  =  tf.keras.layers.RNN(stacked_lstm,[batch_size, num_steps],return_state=False,stateful=True,trainable=True)

        # Define the initial state, i.e., the model state for the very first data point
        # It initialize the state of the LSTM memory. The memory state of the network is initialized with a vector of zeros and gets updated after reading each word.
        self._initial_state = tf.Variable(tf.zeros([batch_size,embeding_vector_size]),trainable=False)
        self._RNNlayer.inital_state = self._initial_state

        ############################################
        # Adding RNN layer to keras sequential API #
        ############################################
        self._model.add(self._RNNlayer)

        #self._model.add(tf.keras.layers.LSTM(hidden_size_l1,return_sequences=True,stateful=True))
        #self._model.add(tf.keras.layers.LSTM(hidden_size_l2,return_sequences=True))


        ####################################################################################################
        # Instantiating a Dense layer that connects the output to the vocab_size  and adding layer to model#
        ####################################################################################################
        self._dense = tf.keras.layers.Dense(self.vocab_size)
        self._model.add(self._dense)


        ####################################################################################################
        # Adding softmax activation layer and deriving probability to each class and adding layer to model #
        ####################################################################################################
        self._activation = tf.keras.layers.Activation('softmax')
        self._model.add(self._activation)

        ##########################################################
        # Instantiating the stochastic gradient decent optimizer #
        ##########################################################
        self._optimizer = tf.keras.optimizers.SGD(lr=self._lr, clipnorm=max_grad_norm)


        ##############################################################################
        # Compiling and summarizing the model stacked using the keras sequential API #
        ##############################################################################
        self._model.compile(loss=self.crossentropy, optimizer=self._optimizer)
        self._model.summary()


    def crossentropy(self,y_true, y_pred):
        return tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)

    def train_batch(self,_input_data,_targets):
        #################################################
        # Creating the Training Operation for our Model #
        #################################################
        # Create a variable for the learning rate
        self._lr = tf.Variable(0.0, trainable=False)
        # Get all TensorFlow variables marked as "trainable" (i.e. all of them except _lr, which we just created)
        tvars = self._model.trainable_variables
        # Define the gradient clipping threshold
        with tf.GradientTape() as tape:
            # Forward pass.
            output_words_prob = self._model(_input_data)
            # Loss value for this batch.
            loss  = self.crossentropy(_targets, output_words_prob)
            # average across batch and reduce sum
            cost = tf.reduce_sum(loss/ self.batch_size)
        # Get gradients of loss wrt the trainable variables.
        grad_t_list = tape.gradient(cost, tvars)
        # Define the gradient clipping threshold
        grads, _ = tf.clip_by_global_norm(grad_t_list, max_grad_norm)
        # Create the training TensorFlow Operation through our optimizer
        train_op = self._optimizer.apply_gradients(zip(grads, tvars))
        return cost

    def test_batch(self,_input_data,_targets):
        #################################################
        # Creating the Testing Operation for our Model #
        #################################################
        output_words_prob = self._model(_input_data)
        loss  = self.crossentropy(_targets, output_words_prob)
        # average across batch and reduce sum
        cost = tf.reduce_sum(loss/ self.batch_size)

        return cost
    @classmethod
    def instance(cls) :
        return PTBModel()

In [51]:

########################################################################################################################
# run_one_epoch takes as parameters  the model instance, the data to be fed, training or testing mode and verbose info #
########################################################################################################################
def run_one_epoch(m, data,is_training=True,verbose=False):

    #Define the epoch size based on the length of the data, batch size and the number of steps
    epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps
    start_time = time.time()
    costs = 0.
    iters = 0

    m._model.reset_states()

    #For each step and data point
    for step, (x, y) in enumerate(ptb_iterator(data, m.batch_size, m.num_steps)):

        #Evaluate and return cost, state by running cost, final_state and the function passed as parameter
        #y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)
        if is_training :
            loss=  m.train_batch(x, y)
        else :
            loss = m.test_batch(x, y)


        #Add returned cost to costs (which keeps track of the total costs for this epoch)
        costs += loss

        #Add number of steps to iteration counter
        iters += m.num_steps

        if verbose and step % (epoch_size // 10) == 10:
            print("Itr %d of %d, perplexity: %.3f speed: %.0f wps" % (step , epoch_size, np.exp(costs / iters), iters * m.batch_size / (time.time() - start_time)))



    # Returns the Perplexity rating for us to keep track of how the model is evolving
    return np.exp(costs / iters)


In [52]:
# Reads the data and separates it into training data, validation data and testing data
raw_data = ptb_raw_data(data_dir)
train_data, valid_data, test_data, _, _ = raw_data

In [None]:
# Instantiates the PTBModel class
m=PTBModel.instance()
K = tf.keras.backend
for i in range(max_epoch):
    # Define the decay for this epoch
    lr_decay = decay ** max(i - max_epoch_decay_lr, 0.0)
    dcr = learning_rate * lr_decay
    m._lr = dcr
    K.set_value(m._model.optimizer.learning_rate,m._lr)
    print("Epoch %d : Learning rate: %.3f" % (i + 1, m._model.optimizer.learning_rate))
    # Run the loop for this epoch in the training mode
    train_perplexity = run_one_epoch(m, train_data,is_training=True,verbose=True)
    print("Epoch %d : Train Perplexity: %.3f" % (i + 1, train_perplexity))

    # Run the loop for this epoch in the validation mode
    valid_perplexity = run_one_epoch(m, valid_data,is_training=False,verbose=False)
    print("Epoch %d : Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

# Run the loop in the testing mode to see how effective was our training
test_perplexity = run_one_epoch(m, test_data,is_training=False,verbose=False)
print("Test Perplexity: %.3f" % test_perplexity)



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_vocab (Embedding)  (30, 20, 200)            2000000   
                                                                 
 rnn_1 (RNN)                 (30, 20, 128)             671088    
                                                                 
 dense_1 (Dense)             (30, 20, 10000)           1290000   
                                                                 
 activation_1 (Activation)   (30, 20, 10000)           0         
                                                                 
Total params: 3,961,088
Trainable params: 3,955,088
Non-trainable params: 6,000
_________________________________________________________________
Epoch 1 : Learning rate: 1.000
Itr 10 of 1549, perplexity: 4607.672 speed: 530 wps
Itr 164 of 1549, perplexity: 1095.470 speed: 983 wps
Itr 318 of 1549, perplexity: 843.252