In [2]:
import numpy as np

In [3]:
def rnn_step_forward(x, prev_h, Wx, Wh, b):
    """
    Run the forward pass for a single timestep of a vanilla (tanh) RNN.

    The computation is deliberately split into small steps so that
    rnn_step_backward can undo them one at a time.

    Inputs (shapes as used by rnn_forward):
    - x: Input data for this timestep, of shape (N, D).
    - prev_h: Hidden state from the previous timestep, of shape (N, H).
    - Wx: Weight matrix for input-to-hidden connections, of shape (D, H).
    - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H).
    - b: Biases of shape (H,).

    Returns a tuple of:
    - next_h: Next hidden state, tanh(x.Wx + prev_h.Wh + b).
    - cache: Tuple (x, copy of prev_h, Wx, Wh, next_h, affine) holding the
      values needed in the backward pass.
    """
    # step 1: input-to-hidden contribution
    xWx = np.dot(x, Wx)

    # step 2: hidden-to-hidden contribution
    phWh = np.dot(prev_h, Wh)

    # step 3: pre-activation (b.T is a no-op for a 1-D bias)
    affine = xWx + phWh + b.T

    # step 4: non-linearity, applied to the pre-activation computed above
    next_h = np.tanh(affine)

    # Cache inputs, state, and weights.
    # prev_h is copied because the caller may hand us a view into a buffer
    # that it overwrites afterwards (rnn_forward does exactly that).
    cache = (x, prev_h.copy(), Wx, Wh, next_h, affine)

    return next_h, cache

In [4]:
def rnn_step_backward(dnext_h, cache):
    """
    Run the backward pass for a single timestep of a vanilla (tanh) RNN.

    Inputs:
    - dnext_h: Gradient of the loss with respect to the next hidden state.
    - cache: Tuple (x, prev_h, Wx, Wh, next_h, affine) as built by
      rnn_step_forward.

    Returns a tuple of:
    - dx: Gradient with respect to the timestep input x.
    - dprev_h: Gradient with respect to the previous hidden state.
    - dWx: Gradient with respect to the input-to-hidden weights.
    - dWh: Gradient with respect to the hidden-to-hidden weights.
    - db: Gradient with respect to the bias.
    """
    x, prev_h, Wx, Wh, _, pre_act = cache

    # tanh'(z) = 1 - tanh(z)^2, chained with the upstream gradient.
    grad_pre = (1 - np.square(np.tanh(pre_act))) * dnext_h

    # The sum node hands the same gradient to each of its three operands;
    # the bias is shared by every row, so its gradient is summed over axis 0.
    db = np.sum(grad_pre, axis=0)

    # Hidden-to-hidden product: prev_h . Wh
    dWh = np.dot(prev_h.T, grad_pre)
    dprev_h = np.dot(Wh, grad_pre.T).T

    # Input-to-hidden product: x . Wx
    dWx = np.dot(x.T, grad_pre)
    dx = np.dot(grad_pre, Wx.T)

    return dx, dprev_h, dWx, dWh, db

In [6]:
def rnn_forward(x, h0, Wx, Wh, b):
    """
    Run a vanilla RNN forward on an entire sequence of data. We assume an input
    sequence composed of T vectors, each of dimension D. The RNN uses a hidden
    size of H, and we work over a minibatch containing N sequences. After running
    the RNN forward, we return the hidden states for all timesteps.

    Inputs:
    - x: Input data for the entire timeseries, of shape (N, T, D).
    - h0: Initial hidden state, of shape (N, H)
    - Wx: Weight matrix for input-to-hidden connections, of shape (D, H)
    - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
    - b: Biases of shape (H,)

    Returns a tuple of:
    - h: Hidden states for the entire timeseries, of shape (N, T, H).
    - cache: List with one rnn_step_forward cache per timestep, in time
      order; these are the values needed in the backward pass.
    """
    # Get shapes
    N, T, _ = x.shape
    H = h0.shape[1]

    # Initialization
    h = np.zeros((N, T, H))
    cache = []

    # The state fed into step t is h0 for the first step and h[:, t-1, :]
    # afterwards. It is tracked explicitly instead of being parked in the
    # last slot of h, so an empty sequence (T == 0) is handled as well.
    prev_h = np.asarray(h0, dtype=h.dtype)

    # For each time-step (range, not xrange, so this runs on Python 2 and 3)
    for t in range(T):
        next_h, cache_step = rnn_step_forward(x[:, t, :], prev_h, Wx, Wh, b)
        h[:, t, :] = next_h
        cache.append(cache_step)
        prev_h = h[:, t, :]

    # Return all hidden states and the per-step caches
    return h, cache

In [7]:

def rnn_backward(dh, cache):
    """
    Compute the backward pass for a vanilla RNN over an entire sequence of data.

    Inputs:
    - dh: Upstream gradients of all hidden states, of shape (N, T, H)
    - cache: List of per-timestep caches as returned by rnn_forward; must
      contain at least one entry, since D is read from the first one.

    Returns a tuple of:
    - dx: Gradient of inputs, of shape (N, T, D)
    - dh0: Gradient of initial hidden state, of shape (N, H)
    - dWx: Gradient of input-to-hidden weights, of shape (D, H)
    - dWh: Gradient of hidden-to-hidden weights, of shape (H, H)
    - db: Gradient of biases, of shape (H,)
    """
    # Get shapes
    N, T, H = dh.shape
    D = cache[0][0].shape[1]  # D taken from the step-0 input x in the cache

    # Gradients start at zero with the same shape as their inputs/weights
    dx = np.zeros((N, T, D))
    dWx = np.zeros((D, H))
    dWh = np.zeros((H, H))
    db = np.zeros((H,))
    dprev_h = np.zeros((N, H))

    # Walk the sequence backwards (range, not xrange, so this runs on
    # Python 2 and 3)
    for t in reversed(range(T)):
        # h_t receives gradient from the loss directly (dh) and from the
        # following timestep through the recurrence (dprev_h). Summing into
        # a temporary leaves the caller's dh untouched without copying it.
        dh_t = dh[:, t, :] + dprev_h
        dx_t, dprev_h, dWx_t, dWh_t, db_t = rnn_step_backward(dh_t, cache[t])

        dx[:, t, :] = dx_t
        # The weights are shared by every timestep, so their gradients add up
        dWx += dWx_t
        dWh += dWh_t
        db += db_t

    # What flows out of the first timestep is the gradient of h0
    dh0 = dprev_h

    return dx, dh0, dWx, dWh, db

In [8]:
def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
    """
    Run the forward pass for a single timestep of an LSTM.

    The activation x.Wx + prev_h.Wh + b is cut into four column blocks of
    width H (H is taken from prev_c), in the order: input gate, forget gate,
    output gate, block input.

    Inputs:
    - x: Input data for this timestep.
    - prev_h: Previous hidden state.
    - prev_c: Previous cell state, of shape (N, H).
    - Wx: Input-to-hidden weights.
    - Wh: Hidden-to-hidden weights.
    - b: Biases.

    Returns a tuple of:
    - next_h: Next hidden state.
    - next_c: Next cell state.
    - cache: Tuple (x, prev_h copy, prev_c copy, a, a_i, a_f, a_o, a_g,
      next_h, next_c, Wx, Wh) of values needed in the backward pass.
    """
    _, H = prev_c.shape

    # Activation vector shared by all four gates.
    act = np.dot(x, Wx) + np.dot(prev_h, Wh) + b.T

    # Input, forget and output gates are the first three H-wide blocks.
    gate_i, gate_f, gate_o = [
        sigmoid(act[:, k * H:(k + 1) * H]) for k in range(3)
    ]

    # The block input is the fourth H-wide block.
    gate_g = np.tanh(act[:, 3 * H:4 * H])

    # Keep part of the old cell state and add the gated block input.
    next_c = gate_f * prev_c + gate_i * gate_g

    # The hidden state is the squashed cell state seen through the output gate.
    next_h = gate_o * np.tanh(next_c)

    # The previous states are copied so that later in-place changes made by
    # the caller cannot alter what the backward pass reads.
    cache = (
        x, prev_h.copy(), prev_c.copy(),
        act, gate_i, gate_f, gate_o, gate_g,
        next_h, next_c, Wx, Wh,
    )

    return next_h, next_c, cache

In [9]:
def lstm_step_backward(dnext_h, dnext_c, cache):
    """
    Run the backward pass for a single timestep of an LSTM.

    Inputs:
    - dnext_h: Gradient of the next hidden state, of shape (N, H).
    - dnext_c: Gradient of the next cell state; it is not modified.
    - cache: Tuple (x, prev_h, prev_c, a, a_i, a_f, a_o, a_g, next_h,
      next_c, Wx, Wh) as built by lstm_step_forward.

    Returns a tuple of:
    - dx: Gradient with respect to the timestep input x.
    - dprev_h: Gradient with respect to the previous hidden state.
    - dprev_c: Gradient with respect to the previous cell state.
    - dWx: Gradient with respect to the input-to-hidden weights.
    - dWh: Gradient with respect to the hidden-to-hidden weights.
    - db: Gradient with respect to the biases.
    """
    x, prev_h, prev_c, act, gate_i, gate_f, gate_o, gate_g, _, next_c, Wx, Wh = cache
    _, H = dnext_h.shape
    tanh_c = np.tanh(next_c)

    # next_h = gate_o * tanh(next_c): the cell state also receives gradient
    # through the hidden state, on top of its own upstream gradient.
    dcell = dnext_c.copy()
    dcell += dnext_h * gate_o * (1 - tanh_c ** 2)
    dgate_o = tanh_c * dnext_h

    # next_c = gate_f * prev_c + gate_i * gate_g
    dgate_f = dcell * prev_c
    dprev_c = dcell * gate_f
    dgate_i = dcell * gate_g
    dgate_g = dcell * gate_i

    # Back through the gate non-linearities (sigmoid' = s * (1 - s),
    # tanh' = 1 - t^2), listed in the column order used by the activation.
    block_grads = (
        (1 - gate_i) * gate_i * dgate_i,
        (1 - gate_f) * gate_f * dgate_f,
        (1 - gate_o) * gate_o * dgate_o,
        (1 - np.square(gate_g)) * dgate_g,
    )
    dact = np.zeros(act.shape)
    for k, grad in enumerate(block_grads):
        dact[:, k * H:(k + 1) * H] = grad

    # Back through act = x.Wx + prev_h.Wh + b
    dx = dact.dot(Wx.T)
    dprev_h = dact.dot(Wh.T)
    dWx = x.T.dot(dact)
    dWh = prev_h.T.dot(dact)
    db = np.sum(dact, axis=0)

    return dx, dprev_h, dprev_c, dWx, dWh, db