# Vanilla RNN

In [121]:
import numpy as np

hyperparameters

In [122]:
hidden_size = 3
vocab_size = 4
inputs = [2]
targets = [3]

model parameters

In [123]:
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

$h(t)=tanh(WX + Vh(t-1) + b_h)$

$f(t) = Uh(t) + b_y$

$p(t) = softmax(f(t))$

代码中,

Wxh ==> $W$

Whh ==> $V$

Why ==> $U$

In [124]:
Wxh

array([[ -1.36458834e-03,   1.10118612e-02,  -4.54390582e-03,
          4.92045527e-03],
       [  1.13343035e-02,  -6.62299760e-03,  -7.77568912e-03,
          1.52714559e-02],
       [ -1.12811396e-02,   3.55922825e-05,  -3.11710168e-02,
          3.23161113e-02]])

In [125]:
Whh

array([[-0.00079653, -0.00196604, -0.01944714],
       [-0.00086006,  0.00289614, -0.0056044 ],
       [-0.01720294,  0.00805431, -0.00951426]])

In [126]:
Why

array([[-0.012362  , -0.00467771, -0.00561026],
       [ 0.00035526, -0.00330591, -0.0058389 ],
       [-0.01553049, -0.01051185, -0.00296599],
       [-0.00112019, -0.01283274, -0.01498699]])

In [127]:
xs, hs, ys, ps = {}, {}, {}, {}
hprev = np.zeros((hidden_size,1))
hs[-1] = np.copy(hprev)
loss = 0

In [128]:
xs[0] = np.zeros((vocab_size, 1))
xs[0][inputs[0]] = 1
xs[0]

array([[ 0.],
       [ 0.],
       [ 1.],
       [ 0.]])

In [129]:
hs[0] = np.tanh(np.dot(Wxh, xs[0]) + np.dot(Whh, hs[0-1]) + bh)
hs[0]

array([[-0.00454387],
       [-0.00777553],
       [-0.03116093]])

In [130]:
ys[0] = np.dot(Why, hs[0]) + by
ys[0]

array([[ 0.00026736],
       [ 0.00020604],
       [ 0.00024473],
       [ 0.00057188]])

In [131]:
ps[0] = np.exp(ys[0]) / np.sum(np.exp(ys[0]))
ps[0]

array([[ 0.24998621],
       [ 0.24997088],
       [ 0.24998055],
       [ 0.25006235]])

In [132]:
print(ps[0][targets[0], 0])
loss += -np.log(ps[0][targets[0], 0])
print(loss)

0.25006234967
1.38604499354


In [133]:
dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
dbh, dby = np.zeros_like(bh), np.zeros_like(by)
dhnext = np.zeros_like(hs[0])

In [134]:
dy = np.copy(ps[0])
dy[targets[0]] -= 1
dy

array([[ 0.24998621],
       [ 0.24997088],
       [ 0.24998055],
       [-0.74993765]])

In [135]:
dWhy += np.dot(dy, hs[0].T)
dWhy

array([[-0.00113591, -0.00194378, -0.0077898 ],
       [-0.00113584, -0.00194366, -0.00778932],
       [-0.00113588, -0.00194373, -0.00778963],
       [ 0.00340762,  0.00583116,  0.02336875]])

In [136]:
dby += dy
dby

array([[ 0.24998621],
       [ 0.24997088],
       [ 0.24998055],
       [-0.74993765]])

In [137]:
dh = np.dot(Why.T, dy)
dh

array([[-0.00604377],
       [ 0.00500025],
       [ 0.00763583]])

In [138]:
hs[0]

array([[-0.00454387],
       [-0.00777553],
       [-0.03116093]])

In [139]:
1 - hs[0] * hs[0]

array([[ 0.99997935],
       [ 0.99993954],
       [ 0.999029  ]])

In [158]:
dhraw = (1 - hs[0] * hs[0]) * dh
print(dhraw)
dbh += dhraw
print(dbh)
dhnext = np.dot(Whh.T, dhraw)
print(dhnext)

[[-0.00604365]
 [ 0.00499995]
 [ 0.00762842]]
[[-0.0120873 ]
 [ 0.0099999 ]
 [ 0.01525683]]
[[ -1.30717501e-04]
 [  8.78041806e-05]
 [  1.69311833e-05]]


In [153]:
dWxh += np.dot(dhraw, xs[0].T)
print(dWxh)
dWhh += np.dot(dhraw, hs[0-1].T)
print(dWhh)

[[ 0.          0.         -0.07856743  0.        ]
 [ 0.          0.          0.06499933  0.        ]
 [ 0.          0.          0.0991694   0.        ]]
[[ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
