In [3]:
#RNNs are very effective for natural language processing and other sequence tasks because they have memory.
#In this notebook I will implement a full RNN in NumPy.

#In this RNN I assume the number of inputs and the number of outputs are the same.
import numpy as np

In [4]:
#utility functions
def softmax(x):
    """Softmax along axis 0 (each column is normalised independently).

    x -> array-like of scores; the first axis indexes the classes.
    Returns an array of the same shape whose entries sum to 1 along axis 0.
    """
    x = np.asarray(x)
    # Shift every column by its OWN maximum. Shifting by the global maximum
    # makes columns that lie far below it underflow to all zeros, which then
    # yields 0/0 = nan when normalising along axis 0.
    shifted = x - np.max(x, axis=0, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=0, keepdims=True)

In [5]:
#RNN forward propagation

def rnn_cell_forward(xt, a_prev, parameters):
    """Run one time step of the RNN.

    xt         -> input to the cell, (n_x, m)
    a_prev     -> activation of the previous cell, (n_a, m)
    parameters -> python dict holding the weights and biases:
                  "Wax" (n_a, n_x), "Waa" (n_a, n_a), "Wya" (n_y, n_a),
                  "ba" (n_a, 1), "by" (n_y, 1)

    Returns (a_next, yt_pred, cache) where cache is the tuple
    (a_next, a_prev, xt, parameters) kept for the backward pass.
    """
    # Pull out the weights and biases
    Wax, Waa, Wya = (parameters[key] for key in ("Wax", "Waa", "Wya"))
    ba, by = parameters["ba"], parameters["by"]

    # New hidden state: tanh of recurrent term + input term + bias
    recurrent_term = np.dot(Waa, a_prev)
    input_term = np.dot(Wax, xt)
    a_next = np.tanh(recurrent_term + input_term + ba)

    # Prediction of the current cell
    logits = np.dot(Wya, a_next) + by
    yt_pred = softmax(logits)

    return a_next, yt_pred, (a_next, a_prev, xt, parameters)
    
    

In [9]:
#RNN forward pass for sequence of data

def rnn_forward(x, a0, parameters):
    """Run the RNN cell over every time step of a sequence.

    x          -> input at every time step, (n_x, m, T_x)
    a0         -> initial hidden state, (n_a, m)
    parameters -> python dict of weights/biases; the shape of "Wya"
                  provides n_y and n_a

    Returns (a, y_pred, caches):
      a      -> hidden states for every step, (n_a, m, T_x)
      y_pred -> predictions for every step, (n_y, m, T_x)
      caches -> tuple (list of per-step caches, x) for the backward pass
    """
    _, batch_size, num_steps = x.shape
    out_dim, hidden_dim = parameters['Wya'].shape

    # Output buffers, filled one time slice per iteration
    hidden_states = np.zeros((hidden_dim, batch_size, num_steps))
    predictions = np.zeros((out_dim, batch_size, num_steps))

    step_caches = []
    state = a0  # the first step consumes the initial hidden state
    for step in range(num_steps):
        state, step_pred, step_cache = rnn_cell_forward(x[:, :, step], state, parameters)
        hidden_states[:, :, step] = state
        predictions[:, :, step] = step_pred
        step_caches.append(step_cache)

    # Bundle the per-step caches with the raw input for backward prop
    return hidden_states, predictions, (step_caches, x)

In [11]:
#Backward propagation of a single RNN cell

def rnn_cell_backward(da_next, cache):
    """Backward pass for a single RNN cell.

    da_next -> gradient of loss with respect to next hidden state
    cache   -> tuple (a_next, a_prev, xt, parameters) produced by the
               forward pass of the same cell

    Only "Wax" and "Waa" are read from parameters; the output-layer
    weights and the biases are not needed for these gradients.

    Returns a python dict with keys "dxt", "da_prev", "dWax", "dWaa", "dba".
    """
    a_next, a_prev, xt, parameters = cache

    Wax = parameters["Wax"]
    Waa = parameters["Waa"]

    # Back through tanh: d/dz tanh(z) = 1 - tanh(z)**2, and a_next = tanh(z)
    dtanh = (1 - a_next ** 2) * da_next

    # gradients with respect to the input and Wax
    dxt = np.dot(Wax.T, dtanh)
    dWax = np.dot(dtanh, xt.T)

    # gradients with respect to the previous hidden state and Waa
    da_prev = np.dot(Waa.T, dtanh)
    dWaa = np.dot(dtanh, a_prev.T)

    # gradient with respect to the bias: sum over axis 1, keeping a column vector
    dba = np.sum(dtanh, 1, keepdims=True)

    # gradients in a python dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}

    return gradients

In [13]:
#Backward propagation of the RNN
def rnn_backward(da, caches):
    """Backward pass of the RNN over a whole sequence.

    da     -> gradients with respect to the hidden states, (n_a, m, T_x)
    caches -> tuple (list of per-step caches, x) as returned by rnn_forward;
              each per-step cache is (a_next, a_prev, xt, parameters)

    Returns a python dict with keys "dx", "da0", "dWax", "dWaa", "dba".
    Gradients for the output layer (Wya, by) are not produced here.
    An empty sequence (T_x == 0) yields all-zero gradients.
    """
    step_caches, x = caches

    # Retrieve dimensions from da's shape and from the cached inputs
    n_a, m, T_x = da.shape
    if len(step_caches) > 0:
        n_x, m = step_caches[0][2].shape
    else:
        # No time step was run, so there is no per-step cache to inspect;
        # take the input size from the cached input tensor instead.
        n_x = x.shape[0]

    # initialize the gradients with the right sizes
    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    da_prevt = np.zeros((n_a, m))

    # Loop through all the time steps, last to first
    for t in reversed(range(T_x)):
        # The hidden state at step t receives gradient from the caller (da)
        # and from step t+1 through the recurrence (da_prevt).
        step_grads = rnn_cell_backward(da[:, :, t] + da_prevt, step_caches[t])
        da_prevt = step_grads["da_prev"]
        dx[:, :, t] = step_grads["dxt"]
        # Parameters are shared across time, so their gradients accumulate
        dWax += step_grads["dWax"]
        dWaa += step_grads["dWaa"]
        dba += step_grads["dba"]

    # da0 is the gradient that has been backpropagated through all time steps
    gradients = {"dx": dx, "da0": da_prevt, "dWax": dWax, "dWaa": dWaa, "dba": dba}

    return gradients