In [36]:
import numpy as np
import copy
from utils import *

### RNN
#### Forward Propagation

In [52]:
def softmax(x):
    """Softmax along axis 0 (each column is normalised independently).

    Arguments:
    x -- array-like of scores; normalisation runs over the first axis

    Returns:
    array of the same shape as x whose entries along axis 0 sum to 1

    The maximum is subtracted per column (not globally). Softmax is invariant
    to a per-column shift, and shifting by the column's own maximum guarantees
    that at least one exponent is exp(0) = 1, so the denominator can never
    underflow to zero and produce NaN when columns live on very different
    scales.
    """
    shifted = x - np.max(x, axis=0, keepdims=True)
    e_x = np.exp(shifted)

    # keepdims keeps the reduced axis so the division broadcasts for any rank.
    return e_x / np.sum(e_x, axis=0, keepdims=True)

In [59]:
def rnn_cell_forwards(xt, a_prev, parameters):
    """Compute one time step of a vanilla RNN cell.

    Arguments:
    xt -- input for the current time step
    a_prev -- hidden state coming from the previous time step
    parameters -- dict holding "Wax", "Waa", "Wya", "ba" and "by"

    Returns:
    a_next -- new hidden state, tanh(Wax.xt + Waa.a_prev + ba)
    yt_pred -- softmax(Wya.a_next + by)
    cache -- (a_next, a_prev, xt, parameters), kept for the backward pass
    """
    Wax, Waa, Wya = parameters['Wax'], parameters['Waa'], parameters['Wya']
    ba, by = parameters['ba'], parameters['by']

    # Hidden state: input and recurrent contributions, then the bias.
    pre_activation = np.dot(Wax, xt) + np.dot(Waa, a_prev) + ba
    a_next = np.tanh(pre_activation)

    # Output distribution for this step.
    logits = np.dot(Wya, a_next) + by
    yt_pred = softmax(logits)

    return a_next, yt_pred, (a_next, a_prev, xt, parameters)

In [60]:
def rnn_forwards(x, a0, parameters):
    """Run the vanilla RNN cell over every time step of x.

    Arguments:
    x -- 3-D input array; time step t is the slice x[:, :, t]
    a0 -- initial hidden state fed to the first cell
    parameters -- dict passed through to rnn_cell_forwards; "Wya" is also
                  used here to size the output arrays

    Returns:
    a -- hidden states for every step, stacked on the last axis
    y_pred -- predictions for every step, stacked on the last axis
    caches -- (list of per-step caches, x), kept for the backward pass
    """
    n_y = parameters["Wya"].shape[0]
    n_a = parameters["Wya"].shape[1]
    m, T_x = x.shape[1], x.shape[2]

    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))

    step_caches = []
    hidden = a0
    for t in range(T_x):
        hidden, yt_pred, step_cache = rnn_cell_forwards(x[:, :, t], hidden, parameters)

        # Record this step's results in the time-indexed slots.
        a[:, :, t] = hidden
        y_pred[:, :, t] = yt_pred
        step_caches.append(step_cache)

    return a, y_pred, (step_caches, x)

#### Backward Propagation

In [61]:
def rnn_cell_backwards(da_next, cache):
    """Backward pass for a single vanilla RNN cell.

    Arguments:
    da_next -- gradient of the loss with respect to this step's hidden state
    cache -- (a_next, a_prev, xt, parameters) as produced by the forward cell;
             only "Wax" and "Waa" are read from parameters

    Returns:
    gradients -- dict with:
        "dxt"     -- gradient with respect to the step input xt
        "da_prev" -- gradient with respect to the previous hidden state
        "dWax"    -- gradient with respect to Wax
        "dWaa"    -- gradient with respect to Waa
        "dba"     -- gradient with respect to ba (dz summed over the last axis)
    """
    a_next, a_prev, xt, parameters = cache

    # Only the two weight matrices take part in the hidden-state gradient;
    # the output-layer parameters are deliberately not looked up so the
    # function does not demand keys it never uses.
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]

    # d tanh(z)/dz = 1 - tanh(z)^2, and a_next is tanh(z).
    dz = da_next * (1 - a_next ** 2)

    gradients = {
        "dxt": np.dot(Wax.T, dz),
        "da_prev": np.dot(Waa.T, dz),
        "dWax": np.dot(dz, xt.T),
        "dWaa": np.dot(dz, a_prev.T),
        "dba": np.sum(dz, axis=-1, keepdims=True),
    }

    return gradients

In [75]:
def rnn_backwards(da, caches):
    """Backpropagate through time over a vanilla RNN.

    Arguments:
    da -- 3-D array of upstream hidden-state gradients; step t is da[:, :, t]
    caches -- (list of per-step caches, x) as returned by rnn_forwards

    Returns:
    gradients -- dict with "dx" (per-step input gradients stacked on the last
                 axis), "da0" (gradient reaching the initial hidden state) and
                 the accumulated "dWax", "dWaa", "dba"
    """
    step_caches, _ = caches
    _, a0, x1, _ = step_caches[0]

    n_a, T_x = da.shape[0], da.shape[2]
    n_x, m = x1.shape[0], x1.shape[1]

    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))

    # Gradient flowing back from the following time step; zero after the last.
    da_prevt = np.zeros((a0.shape[0], m))

    for t in range(T_x - 1, -1, -1):
        step = rnn_cell_backwards(da[:, :, t] + da_prevt, step_caches[t])

        dx[:, :, t] = step['dxt']
        da_prevt = step['da_prev']

        # Parameters are shared across time, so their gradients add up.
        dWax += step['dWax']
        dWaa += step['dWaa']
        dba += step['dba']

    return {
        "dx": dx,
        "da0": da_prevt,
        "dWax": dWax,
        "dWaa": dWaa,
        "dba": dba,
    }

### Long Short-Term Memory (LSTM)

In [14]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated element-wise by NumPy."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [35]:
def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """Compute one time step of an LSTM cell.

    With concat = [a_prev; xt] stacked along axis 0:

        ft  = sigmoid(Wf . concat + bf)    forget gate, masks c_prev
        it  = sigmoid(Wi . concat + bi)    update gate, masks the candidate
        cct = tanh(Wc . concat + bc)       candidate cell value
        c_next = ft * c_prev + it * cct
        ot  = sigmoid(Wo . concat + bo)    output gate
        a_next = ot * tanh(c_next)
        yt_pred = softmax(Wy . a_next + by)

    Arguments:
    xt -- input for the current time step
    a_prev -- hidden state from the previous time step
    c_prev -- cell state from the previous time step
    parameters -- dict holding "Wf", "Wc", "Wi", "Wo", "Wy" and the matching
                  biases "bf", "bc", "bi", "bo", "by"

    Returns:
    a_next -- new hidden state
    c_next -- new cell state
    yt_pred -- prediction for this time step
    cache -- (a_next, c_next, a_prev, c_prev, ft, it, ot, cct, xt, parameters),
             kept for the backward pass
    """
    Wf, Wc, Wi, Wo, Wy = (parameters[name] for name in ("Wf", "Wc", "Wi", "Wo", "Wy"))
    bf, bc, bi, bo, by = (parameters[name] for name in ("bf", "bc", "bi", "bo", "by"))

    # Every gate sees the same stacked [a_prev; xt] input.
    stacked = np.concatenate((a_prev, xt), axis=0)

    def affine(W, b):
        return W.dot(stacked) + b

    ft = sigmoid(affine(Wf, bf))
    it = sigmoid(affine(Wi, bi))
    ot = sigmoid(affine(Wo, bo))
    cct = np.tanh(affine(Wc, bc))

    # Keep part of the old memory, add part of the candidate.
    c_next = (ft * c_prev) + (it * cct)
    a_next = ot * np.tanh(c_next)

    yt_pred = softmax(Wy.dot(a_next) + by)

    cache = (a_next, c_next, a_prev, c_prev, ft, it, ot, cct, xt, parameters)
    return a_next, c_next, yt_pred, cache

In [39]:
def lstm_forward(x, a0, parameters):
    """Run the LSTM cell over every time step of x.

    Arguments:
    x -- 3-D input array; time step t is the slice x[:, :, t]
    a0 -- initial hidden state (deep-copied before use)
    parameters -- dict passed through to lstm_cell_forward; "Wy" is also used
                  here to size the output arrays

    Returns:
    a -- hidden states for every step, stacked on the last axis
    y -- predictions for every step, stacked on the last axis
    c -- cell states for every step, stacked on the last axis
    caches -- (list of per-step caches, x), kept for the backward pass
    """
    n_y = parameters["Wy"].shape[0]
    n_a = parameters["Wy"].shape[1]
    m, T_x = x.shape[1], x.shape[2]

    a = np.zeros((n_a, m, T_x))
    c = np.zeros((n_a, m, T_x))
    y = np.zeros((n_y, m, T_x))

    # The hidden state starts from a private copy of a0; the cell state at zero.
    hidden = copy.deepcopy(a0)
    cell = np.zeros((n_a, m))

    step_caches = []
    for t in range(T_x):
        hidden, cell, yt, step_cache = lstm_cell_forward(x[:, :, t], hidden, cell, parameters)

        a[:, :, t] = hidden
        y[:, :, t] = yt
        c[:, :, t] = cell
        step_caches.append(step_cache)

    return a, y, c, (step_caches, x)

#### LSTM Backwards Propagation

In [92]:
def lstm_cell_backward(da_next, dc_next, cache):
    """Backward pass for a single LSTM cell.

    Arguments:
    da_next -- gradient with respect to this step's hidden state
    dc_next -- gradient with respect to this step's cell state
    cache -- (a_next, c_next, a_prev, c_prev, ft, it, ot, cct, xt, parameters)
             as produced by the forward cell; "Wf", "Wi", "Wc", "Wo" are read
             from parameters

    Returns:
    gradients -- dict with "dxt", "da_prev", "dc_prev" and, for each of the
                 forget/update/candidate/output paths, the weight gradient
                 ("dWf", "dWi", "dWc", "dWo") and the bias gradient
                 ("dbf", "dbi", "dbc", "dbo", summed over the last axis)
    """
    (a_next, c_next, a_prev, c_prev, ft, it, ot, cct, xt, parameters) = cache

    tanh_c = np.tanh(c_next)
    dtanh_c = 1 - tanh_c**2

    # Total gradient arriving at c_next: directly, and through a_next.
    dc_total = dc_next + ot * dtanh_c * da_next

    # Pre-activation gradients of the four paths.
    dot = da_next * tanh_c * ot * (1-ot)
    dcct = dc_total * it * (1-cct**2)
    dit = dc_total * cct * it * (1-it)
    dft = dc_total * c_prev * ft * (1-ft)

    stacked_T = np.concatenate((a_prev, xt)).T

    def bias_grad(dgate):
        return np.sum(dgate, axis=-1, keepdims=True)

    # Gradient with respect to the stacked [a_prev; xt] input, then split it.
    dconcat = (
        np.dot(parameters["Wf"].T, dft)
        + np.dot(parameters["Wi"].T, dit)
        + np.dot(parameters["Wc"].T, dcct)
        + np.dot(parameters["Wo"].T, dot)
    )
    n_a = a_prev.shape[0]

    return {
        "dxt": dconcat[n_a:, :],
        "da_prev": dconcat[:n_a, :],
        "dc_prev": dc_next * ft + ot * dtanh_c * ft * da_next,
        "dWf": np.dot(dft, stacked_T),
        "dbf": bias_grad(dft),
        "dWi": np.dot(dit, stacked_T),
        "dbi": bias_grad(dit),
        "dWc": np.dot(dcct, stacked_T),
        "dbc": bias_grad(dcct),
        "dWo": np.dot(dot, stacked_T),
        "dbo": bias_grad(dot),
    }

In [103]:
def lstm_backward(da, caches):
    """Backpropagate through time over an LSTM.

    Arguments:
    da -- 3-D array of upstream hidden-state gradients; step t is da[:, :, t]
    caches -- (list of per-step caches, x) as returned by lstm_forward

    Returns:
    gradients -- dict with "dx" (per-step input gradients stacked on the last
                 axis), "da0" (gradient reaching the initial hidden state) and
                 the accumulated weight/bias gradients "dWf", "dbf", "dWi",
                 "dbi", "dWc", "dbc", "dWo", "dbo"
    """
    step_caches, _ = caches
    (_, _, a0, c0, _, _, _, _, x1, _) = step_caches[0]

    n_a, T_x = da.shape[0], da.shape[2]
    n_x, m = x1.shape[0], x1.shape[1]

    dx = np.zeros((n_x, m, T_x))

    # Gradients flowing back from the following time step; zero after the last.
    da_prevt = np.zeros((a0.shape[0], m))
    dc_prevt = np.zeros((c0.shape[0], m))

    # Running totals for the shared parameters.
    totals = {name: np.zeros((n_a, n_a + n_x)) for name in ("dWf", "dWi", "dWc", "dWo")}
    totals.update({name: np.zeros((n_a, 1)) for name in ("dbf", "dbi", "dbc", "dbo")})

    for t in range(T_x - 1, -1, -1):
        step = lstm_cell_backward(da[:, :, t] + da_prevt, dc_prevt, step_caches[t])

        da_prevt = step["da_prev"]
        dc_prevt = step["dc_prev"]
        dx[:, :, t] = step["dxt"]

        for name in totals:
            totals[name] += step[name]

    return {
        "dx": dx,
        "da0": da_prevt,
        "dWf": totals["dWf"],
        "dbf": totals["dbf"],
        "dWi": totals["dWi"],
        "dbi": totals["dbi"],
        "dWc": totals["dWc"],
        "dbc": totals["dbc"],
        "dWo": totals["dWo"],
        "dbo": totals["dbo"],
    }

In [104]:
# Smoke test for lstm_forward / lstm_backward on seeded random data.
# The printed values depend on the exact order of the np.random draws below,
# so do not reorder these statements.
np.random.seed(1)
# Input indexed as x_tmp[:, :, t]: 3 features, 10 examples, 7 time steps.
x_tmp = np.random.randn(3,10,7)
# Initial hidden state: 5 hidden units for each of the 10 examples.
a0_tmp = np.random.randn(5,10)

# Gate weights act on the stacked [a_prev; xt] vector, hence 5+3 columns.
parameters_tmp = {}
parameters_tmp['Wf'] = np.random.randn(5, 5+3)
parameters_tmp['bf'] = np.random.randn(5,1)
parameters_tmp['Wi'] = np.random.randn(5, 5+3)
parameters_tmp['bi'] = np.random.randn(5,1)
parameters_tmp['Wo'] = np.random.randn(5, 5+3)
parameters_tmp['bo'] = np.random.randn(5,1)
parameters_tmp['Wc'] = np.random.randn(5, 5+3)
parameters_tmp['bc'] = np.random.randn(5,1)
parameters_tmp['Wy'] = np.zeros((2,5))       # unused, but needed for lstm_forward
parameters_tmp['by'] = np.zeros((2,1))       # unused, but needed for lstm_forward

# Forward pass; only caches_tmp is consumed below.
a_tmp, y_tmp, c_tmp, caches_tmp = lstm_forward(x_tmp, a0_tmp, parameters_tmp)

# NOTE: da_tmp has 4 time steps while x_tmp has 7. lstm_backward iterates over
# da's last axis, so only the caches for steps 0-3 are used and dx has 4 steps.
da_tmp = np.random.randn(5, 10, 4)
gradients_tmp = lstm_backward(da_tmp, caches_tmp)

# Print one sample entry and the shape of every returned gradient.
print("gradients[\"dx\"][1][2] =", gradients_tmp["dx"][1][2])
print("gradients[\"dx\"].shape =", gradients_tmp["dx"].shape)
print("gradients[\"da0\"][2][3] =", gradients_tmp["da0"][2][3])
print("gradients[\"da0\"].shape =", gradients_tmp["da0"].shape)
print("gradients[\"dWf\"][3][1] =", gradients_tmp["dWf"][3][1])
print("gradients[\"dWf\"].shape =", gradients_tmp["dWf"].shape)
print("gradients[\"dWi\"][1][2] =", gradients_tmp["dWi"][1][2])
print("gradients[\"dWi\"].shape =", gradients_tmp["dWi"].shape)
print("gradients[\"dWc\"][3][1] =", gradients_tmp["dWc"][3][1])
print("gradients[\"dWc\"].shape =", gradients_tmp["dWc"].shape)
print("gradients[\"dWo\"][1][2] =", gradients_tmp["dWo"][1][2])
print("gradients[\"dWo\"].shape =", gradients_tmp["dWo"].shape)
print("gradients[\"dbf\"][4] =", gradients_tmp["dbf"][4])
print("gradients[\"dbf\"].shape =", gradients_tmp["dbf"].shape)
print("gradients[\"dbi\"][4] =", gradients_tmp["dbi"][4])
print("gradients[\"dbi\"].shape =", gradients_tmp["dbi"].shape)
print("gradients[\"dbc\"][4] =", gradients_tmp["dbc"][4])
print("gradients[\"dbc\"].shape =", gradients_tmp["dbc"].shape)
print("gradients[\"dbo\"][4] =", gradients_tmp["dbo"][4])
print("gradients[\"dbo\"].shape =", gradients_tmp["dbo"].shape)

gradients["dx"][1][2] = [ 0.00218254  0.28205375 -0.48292508 -0.43281115]
gradients["dx"].shape = (3, 10, 4)
gradients["da0"][2][3] = 0.3127703102572602
gradients["da0"].shape = (5, 10)
gradients["dWf"][3][1] = -0.08098023109383466
gradients["dWf"].shape = (5, 8)
gradients["dWi"][1][2] = 0.40512433092981814
gradients["dWi"].shape = (5, 8)
gradients["dWc"][3][1] = -0.07937467355121496
gradients["dWc"].shape = (5, 8)
gradients["dWo"][1][2] = 0.038948775762986956
gradients["dWo"].shape = (5, 8)
gradients["dbf"][4] = [-0.15745657]
gradients["dbf"].shape = (5, 1)
gradients["dbi"][4] = [-0.50848333]
gradients["dbi"].shape = (5, 1)
gradients["dbc"][4] = [-0.42510818]
gradients["dbc"].shape = (5, 1)
gradients["dbo"][4] = [-0.17958196]
gradients["dbo"].shape = (5, 1)
