In [24]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

In [25]:
# Things you should automatically know and have memorized
# N = number of samples
# T = sequence length
# D = number of input features
# M = number of hidden units
# K = number of output units

In [26]:
# Build a toy dataset: a single random sequence drawn from a standard normal.
# Sizes follow the notation above: samples, time steps, input features,
# hidden units, output units.
N, T, D, M, K = 1, 10, 3, 5, 2
X = np.random.standard_normal(size=(N, T, D))


In [27]:
### Define simple RNN
class SimpleRNN(nn.Module):
  """Single-layer tanh RNN followed by a dense layer applied at every time step.

  Args:
    n_inputs: number of input features (D).
    n_hidden: number of hidden units (M).
    n_outputs: number of output units (K).
  """

  # just 1 RNN layer
  def __init__(self, n_inputs, n_hidden, n_outputs):
    super(SimpleRNN, self).__init__()
    self.D = n_inputs
    self.M = n_hidden
    self.K = n_outputs
    # note: batch_first=True
    # applies the convention that our data will be of shape:
    # (num_samples, sequence_length, num_features)
    # rather than:
    # (sequence_length, num_samples, num_features)
    self.rnn = nn.RNN(
        input_size=self.D,
        hidden_size=self.M,
        num_layers=1,
        nonlinearity='tanh', # tanh for fun instead of relu
        batch_first=True)
    self.fc = nn.Linear(self.M, self.K)

  def forward(self, X):
    """Run the RNN over a batch of sequences and map every hidden state to K outputs.

    Args:
      X: tensor of shape (N, T, D).

    Returns:
      Tensor of shape (N, T, K): one output vector per time step.
    """
    # initial hidden states, shape (num_layers, N, M).
    # Created on the same device and with the same dtype as the input so the
    # model keeps working after .to(device) / .double() etc.
    h0 = torch.zeros(1, X.size(0), self.M, device=X.device, dtype=X.dtype)

    # get RNN unit output
    # out is of size (N, T, M): the hidden state at every time step
    # 2nd return value is the final hidden state of each layer;
    # we don't need it here
    out, _ = self.rnn(X, h0)

    # Instead of taking only the hidden state at the final time step
    # (which would be self.fc(out[:, -1, :]), giving N x K), we pass the
    # hidden states of ALL time steps through the final dense layer.
    # N x T x M -> N x T x K
    out = self.fc(out)
    return out

In [28]:
# Instantiate the model with D input features, M hidden units and K outputs
model = SimpleRNN(n_inputs=D, n_hidden=M, n_outputs=K)

In [29]:
# get the output
# X is float64 (NumPy's default) while the model weights are float32,
# so cast before wrapping the array in a tensor
inputs = torch.from_numpy(X.astype(np.float32))
out = model(inputs)
out

tensor([[[-0.4853,  0.5184],
         [-0.0956,  0.5052],
         [-0.5107,  0.5370],
         [ 0.0488,  0.3064],
         [-0.4796,  0.6349],
         [-0.0144,  0.3800],
         [-0.4191,  0.7495],
         [-0.1612,  0.4133],
         [-0.1674,  0.5918],
         [-0.5469,  0.5068]]], grad_fn=<ViewBackward0>)

In [30]:
# expected shape is (N, T, K): one K-dimensional output per time step
out.shape

torch.Size([1, 10, 2])

In [31]:
# save for later: out carries a grad_fn, so detach it from the autograd
# graph before converting to a NumPy array
Yhats_torch = out.detach().numpy()

In [32]:
# Grab the RNN layer's parameters by name (layer 0):
# input-to-hidden and hidden-to-hidden weights, then their biases
W_xh, W_hh = model.rnn.weight_ih_l0, model.rnn.weight_hh_l0
b_xh, b_hh = model.rnn.bias_ih_l0, model.rnn.bias_hh_l0

In [33]:
# input-to-hidden weights have shape (M, D)
W_xh.shape

torch.Size([5, 3])

In [34]:
# still a torch Parameter at this point (converted to NumPy in the next cell)
W_xh

Parameter containing:
tensor([[ 0.2803, -0.3816,  0.0264],
        [-0.1077, -0.3107,  0.4373],
        [ 0.1981,  0.0214,  0.1331],
        [-0.0895,  0.1018, -0.3377],
        [-0.0107, -0.0766, -0.0457]], requires_grad=True)

In [35]:
# Convert the input-to-hidden weights to a NumPy array.
# Read them straight from the module instead of re-converting the W_xh
# variable, so this cell can be re-run safely (an ndarray has no .numpy()),
# and use detach() rather than the legacy .data attribute.
W_xh = model.rnn.weight_ih_l0.detach().numpy()
W_xh

array([[ 0.28025892, -0.3815815 ,  0.02636802],
       [-0.10774262, -0.3107205 ,  0.4372985 ],
       [ 0.19814193,  0.021377  ,  0.13311407],
       [-0.08954499,  0.10181811, -0.33769292],
       [-0.01073272, -0.07662936, -0.04569447]], dtype=float32)

In [36]:
# Convert the remaining RNN parameters to NumPy arrays.
# Read them from the module by name so the cell is idempotent on re-run,
# and use detach() rather than the legacy .data attribute.
b_xh = model.rnn.bias_ih_l0.detach().numpy()
W_hh = model.rnn.weight_hh_l0.detach().numpy()
b_hh = model.rnn.bias_hh_l0.detach().numpy()

In [38]:
# Did we do it right?
# Expect (M, D), (M,), (M, M), (M,)
W_xh.shape,b_xh.shape,W_hh.shape,b_hh.shape

((5, 3), (5,), (5, 5), (5,))

In [42]:
# Now get the FC (dense output) layer's weight matrix and bias by name
Wo, bo = model.fc.weight, model.fc.bias

In [43]:
# Convert the FC layer parameters to NumPy arrays.
# Read them from the module so the cell is idempotent on re-run, and use
# detach() rather than the legacy .data attribute.
Wo = model.fc.weight.detach().numpy()
bo = model.fc.bias.detach().numpy()
# expect (K, M) and (K,)
Wo.shape, bo.shape

((2, 5), (2,))

In [47]:
# Manual RNN calculations: reproduce the model's forward pass in NumPy
x = X[0] # the one and only sample, shape (T, D)
Yhats = np.zeros((T, K)) # one row of outputs per time step
h_last = np.zeros(M) # hidden state before the first step is all zeros

for t in range(T):
  # recurrent update: combine the current input with the previous hidden state
  pre_activation = x[t].dot(W_xh.T) + b_xh + h_last.dot(W_hh.T) + b_hh
  h = np.tanh(pre_activation)

  # the dense layer is applied at every time step, so every y is kept
  y = h.dot(Wo.T) + bo
  Yhats[t] = y

  # important: carry this step's hidden state into the next step
  h_last = h

# print the outputs for all time steps
print(Yhats)


[[-0.48529267  0.51841695]
 [-0.09564181  0.50516516]
 [-0.51068418  0.53699132]
 [ 0.04883679  0.30641562]
 [-0.47963413  0.6348555 ]
 [-0.01444513  0.38001238]
 [-0.41907832  0.74950818]
 [-0.16117205  0.41331748]
 [-0.1674022   0.59178134]
 [-0.54685836  0.50679777]]


In [48]:
# Check that the manual NumPy outputs match PyTorch's outputs
# (within np.allclose's default floating-point tolerances)
np.allclose(Yhats, Yhats_torch)

True