<a href="https://colab.research.google.com/github/FII78/DNN/blob/main/Building_RNN_and_LSTM_101.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from rnn_utils import *

In [None]:
# Forward prop for basic RNN

def rnn_cell_forward(xt, a_prev, parameters):
    """
    Run one forward time step of a basic RNN cell.

    The new hidden state is tanh(Wax.xt + Waa.a_prev + ba); the prediction is
    the result of applying `softmax` to (Wya.a_next + by).

    Arguments:
    xt -- input data at timestep "t", numpy array of shape (n_x, m)
    a_prev -- hidden state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wax -- input-to-hidden weights, numpy array of shape (n_a, n_x)
                        Waa -- hidden-to-hidden weights, numpy array of shape (n_a, n_a)
                        Wya -- hidden-to-output weights, numpy array of shape (n_y, n_a)
                        ba -- hidden-state bias, numpy array of shape (n_a, 1)
                        by -- output bias, numpy array of shape (n_y, 1)

    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple (a_next, a_prev, xt, parameters) kept for the backward pass
    """
    # Unpack the weights and biases in a single pass over the dictionary.
    Wax, Waa, Wya, ba, by = (
        parameters[key] for key in ("Wax", "Waa", "Wya", "ba", "by")
    )

    # Hidden state: squash the combined input and recurrent contributions.
    pre_activation = np.dot(Wax, xt) + np.dot(Waa, a_prev) + ba
    a_next = np.tanh(pre_activation)

    # Prediction: project the new hidden state to output space, then softmax.
    output_scores = np.dot(Wya, a_next) + by
    yt_pred = softmax(output_scores)

    # Everything the backward pass will need, bundled with the results.
    return a_next, yt_pred, (a_next, a_prev, xt, parameters)



In [None]:
# Smoke test for rnn_cell_forward with n_x=3 input features, n_a=5 hidden units,
# n_y=2 outputs and a batch of m=10 examples.
# The seed makes the draws reproducible; the values depend on the order of the
# randn calls below, so do not reorder them.
np.random.seed(1)
xt = np.random.randn(3,10)
a_prev = np.random.randn(5,10)
Waa = np.random.randn(5,5)
Wax = np.random.randn(5,3)
Wya = np.random.randn(2,5)
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Waa": Waa, "Wax": Wax, "Wya": Wya, "ba": ba, "by": by}

# Run a single time step and print one row plus the shape of each result.
a_next, yt_pred, cache = rnn_cell_forward(xt, a_prev, parameters)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", a_next.shape)
print("yt_pred[1] =", yt_pred[1])
print("yt_pred.shape = ", yt_pred.shape)


a_next[4] =  [ 0.59584544  0.18141802  0.61311866  0.99808218  0.85016201  0.99980978
 -0.18887155  0.99815551  0.6531151   0.82872037]
a_next.shape =  (5, 10)
yt_pred[1] = [0.9888161  0.01682021 0.21140899 0.36817467 0.98988387 0.88945212
 0.36920224 0.9966312  0.9982559  0.17746526]
yt_pred.shape =  (2, 10)


In [None]:
# RNN Forward pass: Basic RNN. The input sequence  𝑥=(𝑥⟨1⟩,𝑥⟨2⟩,...,𝑥⟨𝑇𝑥⟩)  is carried over  𝑇𝑥  time steps. The network outputs  𝑦=(𝑦⟨1⟩,𝑦⟨2⟩,...,𝑦⟨𝑇𝑥⟩) .

In [None]:

def rnn_forward(x, a0, parameters):
    """
    Run the basic RNN cell over every time step of an input sequence.

    Starting from a0, each time step feeds the slice x[:, :, t] and the
    previous hidden state to rnn_cell_forward, and records the resulting
    hidden state and prediction.

    Arguments:
    x -- input data for every time step, numpy array of shape (n_x, m, T_x),
         where n_x is the number of input features, m the batch size and
         T_x the number of time steps
    a0 -- initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
                        Waa -- hidden-to-hidden weights, numpy array of shape (n_a, n_a)
                        Wax -- input-to-hidden weights, numpy array of shape (n_a, n_x)
                        Wya -- hidden-to-output weights, numpy array of shape (n_y, n_a)
                        ba -- hidden-state bias, numpy array of shape (n_a, 1)
                        by -- output bias, numpy array of shape (n_y, 1)

    Returns:
    a -- hidden states for every time step, numpy array of shape (n_a, m, T_x)
    y_pred -- predictions for every time step, numpy array of shape (n_y, m, T_x)
    caches -- tuple (list of per-step caches, x) kept for the backward pass
    """
    # Sizes come from the input tensor and the output weight matrix Wya.
    _, m, T_x = x.shape
    n_y, n_a = parameters["Wya"].shape

    # Pre-allocate storage for the hidden states and predictions of all steps.
    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))

    step_caches = []
    hidden = a0
    for t in range(T_x):
        # Advance one step: the hidden state from step t-1 feeds step t.
        hidden, step_pred, step_cache = rnn_cell_forward(x[:, :, t], hidden, parameters)
        a[:, :, t] = hidden
        y_pred[:, :, t] = step_pred
        step_caches.append(step_cache)

    # The backward pass needs the per-step caches together with the raw input.
    return a, y_pred, (step_caches, x)




In [None]:
# Smoke test for rnn_forward with n_x=3, m=10, T_x=4, n_a=5 and n_y=2.
# The seed makes the draws reproducible; the values depend on the order of the
# randn calls below, so do not reorder them.
np.random.seed(1)
x = np.random.randn(3,10,4)
a0 = np.random.randn(5,10)
Waa = np.random.randn(5,5)
Wax = np.random.randn(5,3)
Wya = np.random.randn(2,5)
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Waa": Waa, "Wax": Wax, "Wya": Wya, "ba": ba, "by": by}

a, y_pred, caches = rnn_forward(x, a0, parameters)
print("a[4][1] = ", a[4][1])
print("a.shape = ", a.shape)
print("y_pred[1][3] =", y_pred[1][3])
print("y_pred.shape = ", y_pred.shape)
# rnn_forward returns caches as (list of per-step caches, x), so caches[1] is
# the input x and caches[1][1][3] is the same slice as x[1][3].
print("caches[1][1][3] =", caches[1][1][3])
# Length of the outer tuple (always 2), not the number of time steps.
print("len(caches) = ", len(caches))

a[4][1] =  [-0.99999375  0.77911235 -0.99861469 -0.99833267]
a.shape =  (5, 10, 4)
y_pred[1][3] = [0.79560373 0.86224861 0.11118257 0.81515947]
y_pred.shape =  (2, 10, 4)
caches[1][1][3] = [-1.1425182  -0.34934272 -0.20889423  0.58662319]
len(caches) =  2


The basic RNN above works well enough for some applications, but it suffers from the vanishing-gradient problem. It therefore works best when each output  𝑦⟨𝑡⟩  can be estimated mainly from "local" context (that is, from inputs  𝑥⟨𝑡′⟩  where  𝑡′  is not too far from  𝑡 ).
In the next part, I will build a more complex LSTM model, which is better at addressing vanishing gradients: the LSTM is better able to remember a piece of information and keep it stored for many time steps.

In [18]:
# LSTM - one time step

def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """
    Run one forward time step of an LSTM cell.

    Steps:
    1. Stack a_prev on top of xt into a single matrix of shape (n_a + n_x, m).
    2. Compute the forget gate (ft), update gate (it), candidate value
       (cct, "c tilde"), new memory state (c_next), output gate (ot) and new
       hidden state (a_next).
    3. Compute the prediction yt_pred from a_next.

    Arguments:
    xt -- input data at timestep "t", numpy array of shape (n_x, m)
    a_prev -- hidden state at timestep "t-1", numpy array of shape (n_a, m)
    c_prev -- memory state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wf -- forget-gate weights, numpy array of shape (n_a, n_a + n_x)
                        bf -- forget-gate bias, numpy array of shape (n_a, 1)
                        Wi -- update-gate weights, numpy array of shape (n_a, n_a + n_x)
                        bi -- update-gate bias, numpy array of shape (n_a, 1)
                        Wc -- candidate ("first tanh") weights, numpy array of shape (n_a, n_a + n_x)
                        bc -- candidate ("first tanh") bias, numpy array of shape (n_a, 1)
                        Wo -- output-gate weights, numpy array of shape (n_a, n_a + n_x)
                        bo -- output-gate bias, numpy array of shape (n_a, 1)
                        Wy -- hidden-to-output weights, numpy array of shape (n_y, n_a)
                        by -- output bias, numpy array of shape (n_y, 1)

    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    c_next -- next memory state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple kept for the backward pass, containing
             (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
    """
    # Unpack every weight and bias in a single pass over the dictionary.
    Wf, bf, Wi, bi, Wc, bc, Wo, bo, Wy, by = (
        parameters[key]
        for key in ("Wf", "bf", "Wi", "bi", "Wc", "bc", "Wo", "bo", "Wy", "by")
    )

    # Sizes come from the input slice and the output weight matrix.
    n_x, m = xt.shape
    _, n_a = Wy.shape

    # Stack [a_prev; xt]: the first n_a rows hold a_prev, the rest hold xt.
    stacked = np.zeros((n_a + n_x, m))
    stacked[:n_a, :] = a_prev
    stacked[n_a:, :] = xt

    # Forget gate, update gate and candidate memory value.
    ft = sigmoid(np.dot(Wf, stacked) + bf)
    it = sigmoid(np.dot(Wi, stacked) + bi)
    cct = np.tanh(np.dot(Wc, stacked) + bc)

    # New memory: keep part of the old memory and add part of the candidate.
    c_next = ft * c_prev + it * cct

    # The output gate decides how much of the squashed memory becomes a_next.
    ot = sigmoid(np.dot(Wo, stacked) + bo)
    a_next = ot * np.tanh(c_next)

    # Prediction from the new hidden state.
    output_scores = np.dot(Wy, a_next) + by
    yt_pred = softmax(output_scores)

    # Bundle everything the backward pass will need.
    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
    return a_next, c_next, yt_pred, cache




In [20]:
# Smoke test for lstm_cell_forward with n_x=3, n_a=5, n_y=2 and m=10.
# The seed makes the draws reproducible; the values depend on the order of the
# randn calls below, so do not reorder them.
np.random.seed(1)
xt = np.random.randn(3,10)
a_prev = np.random.randn(5,10)
c_prev = np.random.randn(5,10)
# Gate weights have n_a + n_x = 5 + 3 columns because they multiply the
# stacked [a_prev; xt] matrix.
Wf = np.random.randn(5, 5+3)
bf = np.random.randn(5,1)
Wi = np.random.randn(5, 5+3)
bi = np.random.randn(5,1)
Wo = np.random.randn(5, 5+3)
bo = np.random.randn(5,1)
Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5,1)
Wy = np.random.randn(2,5)
by = np.random.randn(2,1)

parameters = {"Wf": Wf, "Wi": Wi, "Wo": Wo, "Wc": Wc, "Wy": Wy, "bf": bf, "bi": bi, "bo": bo, "bc": bc, "by": by}

a_next, c_next, yt, cache = lstm_cell_forward(xt, a_prev, c_prev, parameters)
print("a_next[4] = ", a_next[4])
# Fixed: this line used to print c_next.shape under the a_next label.
print("a_next.shape = ", a_next.shape)
print("c_next[2] = ", c_next[2])
print("c_next.shape = ", c_next.shape)
print("yt[1] =", yt[1])
print("yt.shape = ", yt.shape)
# cache[1] is c_next (the second entry of the cache tuple).
print("cache[1][3] =", cache[1][3])
print("len(cache) = ", len(cache))

a_next[4] =  [-0.66408471  0.0036921   0.02088357  0.22834167 -0.85575339  0.00138482
  0.76566531  0.34631421 -0.00215674  0.43827275]
a_next.shape =  (5, 10)
c_next[2] =  [ 0.63267805  1.00570849  0.35504474  0.20690913 -1.64566718  0.11832942
  0.76449811 -0.0981561  -0.74348425 -0.26810932]
c_next.shape =  (5, 10)
yt[1] = [0.79913913 0.15986619 0.22412122 0.15606108 0.97057211 0.31146381
 0.00943007 0.12666353 0.39380172 0.07828381]
yt.shape =  (2, 10)
cache[1][3] = [-0.16263996  1.03729328  0.72938082 -0.54101719  0.02752074 -0.30821874
  0.07651101 -1.03752894  1.41219977 -0.37647422]
len(cache) =  10


In [22]:
# Toy sentence used to derive sequence dimensions; the word "a" appears twice,
# so the token count and the vocabulary size differ.
sentence = "ChatGPT is a powerful a language model"

# Whitespace tokenization: each word becomes one time step.
words = sentence.split()

# With a one-hot encoding, every distinct word gets its own input feature.
vocab = set(words)

# T_x = number of time steps (words); n_x = one-hot width (vocabulary size).
T_x, n_x = len(words), len(vocab)
print(T_x, n_x)


7 6


In [23]:
# lstm_forward() to run an LSTM over  𝑇𝑥  time-steps.
# Note:  𝑐⟨0⟩  is initialized with zeros.

def lstm_forward(x, a0, parameters):
    """
    Run the LSTM cell over every time step of an input sequence.

    The hidden state starts at a0 and the memory state starts at zeros with
    the same shape as a0. Each time step feeds x[:, :, t] and the previous
    hidden/memory states to lstm_cell_forward and records the results.

    Arguments:
    x -- input data for every time step, numpy array of shape (n_x, m, T_x)
    a0 -- initial hidden state (lstm_cell_forward documents it as shape (n_a, m))
    parameters -- python dictionary passed unchanged to lstm_cell_forward; its
                  "Wy" entry, of shape (n_y, n_a), is also read here for sizing

    Returns:
    a -- hidden states for every time step, numpy array of shape (n_a, m, T_x)
    y -- predictions for every time step, numpy array of shape (n_y, m, T_x)
    c -- memory states for every time step, numpy array of shape (n_a, m, T_x)
    caches -- tuple (list of per-step caches, x) kept for the backward pass
    """
    # Sizes come from the input tensor and the output weight matrix Wy.
    _, m, T_x = x.shape
    n_y, n_a = parameters["Wy"].shape

    # Pre-allocate storage for every time step.
    a = np.zeros((n_a, m, T_x))
    c = np.zeros((n_a, m, T_x))
    y = np.zeros((n_y, m, T_x))

    # Hidden state starts at a0; memory state starts at zero.
    hidden = a0
    memory = np.zeros(hidden.shape)

    step_caches = []
    for t in range(T_x):
        # Advance one step, carrying the hidden and memory states forward.
        hidden, memory, step_pred, step_cache = lstm_cell_forward(
            x[:, :, t], hidden, memory, parameters
        )
        a[:, :, t] = hidden
        y[:, :, t] = step_pred
        c[:, :, t] = memory
        step_caches.append(step_cache)

    # The backward pass needs the per-step caches together with the raw input.
    return a, y, c, (step_caches, x)


In [24]:
# Smoke test for lstm_forward with n_x=3, m=10, T_x=7, n_a=5 and n_y=2.
# The seed makes the draws reproducible; the values depend on the order of the
# randn calls below, so do not reorder them.
np.random.seed(1)
x = np.random.randn(3,10,7)
a0 = np.random.randn(5,10)
# Gate weights have n_a + n_x = 5 + 3 columns because they multiply the
# stacked [a_prev; xt] matrix.
Wf = np.random.randn(5, 5+3)
bf = np.random.randn(5,1)
Wi = np.random.randn(5, 5+3)
bi = np.random.randn(5,1)
Wo = np.random.randn(5, 5+3)
bo = np.random.randn(5,1)
Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5,1)
Wy = np.random.randn(2,5)
by = np.random.randn(2,1)

parameters = {"Wf": Wf, "Wi": Wi, "Wo": Wo, "Wc": Wc, "Wy": Wy, "bf": bf, "bi": bi, "bo": bo, "bc": bc, "by": by}

a, y, c, caches = lstm_forward(x, a0, parameters)
print("a[4][3][6] = ", a[4][3][6])
print("a.shape = ", a.shape)
print("y[1][4][3] =", y[1][4][3])
print("y.shape = ", y.shape)
# lstm_forward returns caches as (list of per-step caches, x), so caches[1] is
# the input x and caches[1][1][1] is the same slice as x[1][1].
# Fixed: the label previously read "caches[1][1[1]]" (misplaced bracket).
print("caches[1][1][1] =", caches[1][1][1])
print("c[1][2][1]", c[1][2][1])
# Length of the outer tuple (always 2), not the number of time steps.
print("len(caches) = ", len(caches))

a[4][3][6] =  0.1721177675329167
a.shape =  (5, 10, 7)
y[1][4][3] = 0.9508734618501101
y.shape =  (2, 10, 7)
caches[1][1[1]] = [ 0.82797464  0.23009474  0.76201118 -0.22232814 -0.20075807  0.18656139
  0.41005165]
c[1][2][1] -0.8555449167181983
len(caches) =  2
