Recap of the fine-grained operation of RNNs:

In [1]:
import torch
import torch.nn as nn

# Seed PyTorch's global RNG so that the randomly initialised weights of the
# layers created below are the same on every run.
torch.manual_seed(1)

<torch._C.Generator at 0x10ba89ff0>

In [16]:
import numpy as np

In [2]:
# Single-layer RNN mapping 5 input features to a hidden state of size 2.
# batch_first=True: inputs and outputs are laid out as (batch, seq, feature).
rnn_layer = nn.RNN(
    input_size=5,
    hidden_size=2,
    num_layers=1,
    batch_first=True,
)

Inspecting the rnn layer's variables - weights in this case:

In [3]:
# Pull the learnable tensors of layer zero out of the RNN layer so that the
# recurrence can be reproduced by hand further down.
# weights: input-hidden and hidden-hidden
w_xh, w_hh = rnn_layer.weight_ih_l0, rnn_layer.weight_hh_l0
# biases: input-hidden and hidden-hidden
b_xh, b_hh = rnn_layer.bias_ih_l0, rnn_layer.bias_hh_l0

In [4]:
# Report the shape of every parameter, one line per tensor.
shape_report = (
    f'w_xh shape (input to hidden weights): {w_xh.shape}',
    f'w_hh shape (hidden to hidden weights): {w_hh.shape}',
    f'b_xh shape (input to hidden bias): {b_xh.shape}',
    f'b_hh shape (hidden to hidden bias): {b_hh.shape}',
)
print(*shape_report, sep='\n')

w_xh shape (input to hidden weights): torch.Size([2, 5])
w_hh shape (hidden to hidden weights): torch.Size([2, 2])
b_xh shape (input to hidden bias): torch.Size([2])
b_hh shape (hidden to hidden bias): torch.Size([2])


The weight and bias shapes make sense, considering the defined layer shapes.

Inference:

In [5]:
# Toy sequence: 3 timesteps with 5 features each; every feature of
# timestep k holds the value k (1.0, 2.0, 3.0).
x_seq = torch.tensor(
    [[float(step)] * 5 for step in (1, 2, 3)],
    dtype=torch.float32,
)
x_seq.shape

torch.Size([3, 5])

In [7]:
# Prepend a batch axis of size 1, giving (batch, seq, feature).
x_seq = x_seq[None]
x_seq.shape

torch.Size([1, 3, 5])

In [8]:
# The layer returns two tensors:
#   output - the layer's output at each timestep
#   hn     - the final hidden state (which is the output at the final timestep)
output, hn = rnn_layer(x_seq)
print(*(t.shape for t in (output, hn)))

torch.Size([1, 3, 2]) torch.Size([1, 1, 2])


Now we reproduce the RNN's computation by hand, unrolling the recurrence over the timesteps ourselves:

In [9]:
# Hand-unrolled RNN: o_t = tanh(x_t @ W_xh^T + b_xh + o_{t-1} @ W_hh^T + b_hh),
# with the state before the first timestep taken to be all zeros.
out_man = []

# loop over the sequence length (dim 1 of the batch-first input) instead of
# hard-coding the number of timesteps
for t in range(x_seq.shape[1]):
    # at each timestep, get the input at that timestep
    xt = x_seq[:, t, :]
    print(f"Timestep: {t}=>")
    print("Input: ",xt.numpy())

    # project the input from input size to hidden size: x_t @ W_xh^T + b_xh
    ht = torch.matmul(xt, torch.transpose(w_xh, 0, 1)) + b_xh
    # NOTE: despite the "Hidden" label this is only the input contribution;
    # the recurrent term and the tanh are applied below.
    print('Hidden: ',ht.detach().numpy())
    if t>0:
        # if we aren't at the FIRST timestep, the previous hidden state is
        # the previous timestep's output
        prev_h = out_man[t-1]
    else:
        # no history yet: start from a zero hidden state
        prev_h = torch.zeros((ht.shape))
    ot = ht + torch.matmul(prev_h, torch.transpose(w_hh, 0, 1)) + b_hh
    ot = torch.tanh(ot)
    out_man.append(ot)

Timestep: 0=>
Input:  [[1. 1. 1. 1. 1.]]
Hidden:  [[-0.47019297  0.58639044]]
Timestep: 1=>
Input:  [[2. 2. 2. 2. 2.]]
Hidden:  [[-0.8888316  1.2364398]]
Timestep: 2=>
Input:  [[3. 3. 3. 3. 3.]]
Hidden:  [[-1.3074702  1.8864892]]


Comparing this with the output computed automatically by rnn_layer:

In [10]:
# Sanity check: the layer's final hidden state should equal the last
# manually computed output.
for caption, state in (
    ("rnn layer final hidden state: ", hn),
    ("manually computed final hidden state: ", out_man[-1]),
):
    print(caption, state)

rnn layer final hidden state:  tensor([[[-0.8649,  0.9047]]], grad_fn=<StackBackward0>)
manually computed final hidden state:  tensor([[-0.8649,  0.9047]], grad_fn=<TanhBackward0>)


In [34]:
# Detach every per-timestep output from the autograd graph.
out_man2 = tuple(step_out.detach() for step_out in out_man)

In [35]:
# Shape of a single per-timestep output (here the third one).
out_man2[2].shape

torch.Size([1, 2])

In [40]:
# Stack the per-timestep outputs along a new dim 1 so the result can be
# indexed per timestep in the same way as the layer's `output`.
out_man3 = torch.stack(out_man2, dim=1)

In [41]:
# Show the layer's output and the hand-computed one side by side,
# followed by a blank line, for each timestep.
for i in range(3):
    print(
        f"RNN Layer, timestep: {i+1}: {output[:,i,:]}",
        f"Manual output, timestep: {i+1}: {out_man3[:,i,:]}",
        "",
        sep="\n",
    )

RNN Layer, timestep: 1: tensor([[-0.3520,  0.5253]], grad_fn=<SelectBackward0>)
Manual output, timestep: 1: tensor([[-0.3520,  0.5253]])

RNN Layer, timestep: 2: tensor([[-0.6842,  0.7607]], grad_fn=<SelectBackward0>)
Manual output, timestep: 2: tensor([[-0.6842,  0.7607]])

RNN Layer, timestep: 3: tensor([[-0.8649,  0.9047]], grad_fn=<SelectBackward0>)
Manual output, timestep: 3: tensor([[-0.8649,  0.9047]])



Same outputs and same final hidden state.
Now, making the code cleaner: implementing the same RNN functionality, but this time using linear layers instead of doing the matrix multiplications manually.

In [42]:
# Two affine maps that stand in for the RNN's parameters:
# input -> hidden (5 -> 2) and hidden -> hidden (2 -> 2).
input_hidden = nn.Linear(in_features=5, out_features=2)
hidden_hidden = nn.Linear(in_features=2, out_features=2)

In [47]:
# Load the RNN layer's parameter values into the two linear layers.
# copy_ under no_grad writes the values into the linear layers' own
# parameters; re-pointing `.data` would instead make them share storage with
# rnn_layer, so a later in-place update of one would silently change the other.
with torch.no_grad():
    input_hidden.weight.copy_(w_xh)
    input_hidden.bias.copy_(b_xh)
    hidden_hidden.weight.copy_(w_hh)
    hidden_hidden.bias.copy_(b_hh)

In [48]:
# Same recurrence as before, with the two matrix products replaced by the
# linear layers.
out_manx = []

for t in range(3):
    xt = x_seq[:, t, :]
    ht = input_hidden(xt)
    # the first timestep has no history, so it starts from zeros;
    # afterwards the previous output is fed back in
    prev_out = out_manx[-1] if out_manx else torch.zeros((ht.shape))
    out_manx.append(torch.tanh(ht + hidden_hidden(prev_out)))

In [49]:
# Detach every per-timestep output from the autograd graph.
out_manx2 = tuple(step_out.detach() for step_out in out_manx)

In [50]:
# Stack the per-timestep outputs along a new dim 1; note that this rebinds
# out_manx2 from a tuple of tensors to a single tensor.
out_manx2 = torch.stack(out_manx2, dim=1)

In [52]:
# Layer output vs. linear-layer reimplementation, timestep by timestep.
for i in range(3):
    print(
        f"rnn layer output: {output[:,i,:]}",
        f"manual output: {out_manx2[:,i,:]}",
        sep="\n",
    )

rnn layer output: tensor([[-0.3520,  0.5253]], grad_fn=<SelectBackward0>)
manual output: tensor([[-0.3520,  0.5253]])
rnn layer output: tensor([[-0.6842,  0.7607]], grad_fn=<SelectBackward0>)
manual output: tensor([[-0.6842,  0.7607]])
rnn layer output: tensor([[-0.8649,  0.9047]], grad_fn=<SelectBackward0>)
manual output: tensor([[-0.8649,  0.9047]])


Based on this, we could build a custom RNN layer.

Now again making the code cleaner by using an RNN Cell:

In [53]:
# an rnn cell (one timestep of the recurrence) with an input size of 5 and a
# hidden size of 2
rnn_cell = nn.RNNCell(input_size=5, hidden_size=2)

In [56]:
# Load the RNN layer's parameter values into the cell.
# All four tensors are copied the same way: copy_ under no_grad writes into
# the cell's own parameters, so the cell neither shares storage with rnn_layer
# (as re-pointing `.data` does) nor ends up holding rnn_layer's Parameter
# objects themselves (as plain attribute assignment does).
with torch.no_grad():
    rnn_cell.weight_ih.copy_(w_xh)
    rnn_cell.weight_hh.copy_(w_hh)
    rnn_cell.bias_ih.copy_(b_xh)
    rnn_cell.bias_hh.copy_(b_hh)

In [57]:
# Unroll the sequence with the RNN cell, one timestep per call.
out_manx2 = []

for t in range(3):
    xt = x_seq[:,t,:] # the input at this timestep

    if out_manx2:
        # feed the previous timestep's hidden state back in
        prev_hidden = out_manx2[-1]
    else:
        # first timestep: zero hidden state of shape (batch, 2)
        prev_hidden = torch.zeros((xt.shape[0],2))
    out_manx2.append(rnn_cell(xt, prev_hidden))

In [59]:
# Cell-based output vs. the layer's output for the three timesteps.
for i, manual_step in zip(range(3), out_manx2):
    print("Manual output: ",manual_step)
    print("RNN layer output: ", output[:,i,:])
    print()

Manual output:  tensor([[-0.3520,  0.5253]], grad_fn=<TanhBackward0>)
RNN layer output:  tensor([[-0.3520,  0.5253]], grad_fn=<SelectBackward0>)

Manual output:  tensor([[-0.6842,  0.7607]], grad_fn=<TanhBackward0>)
RNN layer output:  tensor([[-0.6842,  0.7607]], grad_fn=<SelectBackward0>)

Manual output:  tensor([[-0.8649,  0.9047]], grad_fn=<TanhBackward0>)
RNN layer output:  tensor([[-0.8649,  0.9047]], grad_fn=<SelectBackward0>)



Layer normalization applied to the RNN: here we normalize the hidden state produced by the cell at every timestep (i.e. after the tanh) and feed the normalized state into the next timestep:

In [63]:
# Unroll the RNN cell, layer-normalizing the hidden state after every step;
# the normalized state is what gets stored and fed into the next step.
ln = nn.LayerNorm(2)
ln_out = []

for t in range(3):
    xt = x_seq[:, t, :] # the input at this timestep

    # previous normalized state, or zeros of shape (batch, 2) at the first step
    prev_hidden = ln_out[-1] if ln_out else torch.zeros((xt.shape[0], 2))
    ln_out.append(ln(rnn_cell(xt, prev_hidden)))

In [64]:
# Layer-normalized output vs. the plain layer's output for the three timesteps.
for i, normalized_step in zip(range(3), ln_out):
    print("Manual output: ",normalized_step)
    print("RNN layer output: ", output[:,i,:])
    print()

Manual output:  tensor([[-1.0000,  1.0000]], grad_fn=<NativeLayerNormBackward0>)
RNN layer output:  tensor([[-0.3520,  0.5253]], grad_fn=<SelectBackward0>)

Manual output:  tensor([[-1.0000,  1.0000]], grad_fn=<NativeLayerNormBackward0>)
RNN layer output:  tensor([[-0.6842,  0.7607]], grad_fn=<SelectBackward0>)

Manual output:  tensor([[-1.0000,  1.0000]], grad_fn=<NativeLayerNormBackward0>)
RNN layer output:  tensor([[-0.8649,  0.9047]], grad_fn=<SelectBackward0>)



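Hmm... every normalized output is (-1, 1). That is expected: with only two hidden units, LayerNorm subtracts the mean of the pair and divides by its standard deviation, which always maps the smaller value to -1 and the larger to +1, whatever their actual values. Now doing it as it is in the textbook: normalizing before calling the activation. (The same collapse applies there, so each output becomes tanh(±1) ≈ ±0.7616.)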

In [86]:
class CustomRNNCell(nn.Module):
    """RNN cell that applies layer normalization before the tanh activation.

    One step computes ``tanh(LayerNorm(input_hidden(x) + hidden_hidden(h)))``,
    where both projections are ``nn.Linear`` layers (each with its own bias).

    Args:
        input_size: Number of features in the input ``x``.
        hidden_size: Number of features in the hidden state.
    """

    def __init__(self, input_size: int, hidden_size: int) -> None:
        super().__init__()
        self.input_hidden = nn.Linear(input_size, hidden_size)
        self.hidden_hidden = nn.Linear(hidden_size, hidden_size)
        self.ln = nn.LayerNorm(hidden_size)

    def forward(self, x: torch.Tensor, hidden: "torch.Tensor | None" = None) -> torch.Tensor:
        """Advance the cell by one timestep.

        Args:
            x: Input for this timestep; its last dimension must be
                ``input_size``.
            hidden: Previous hidden state; its last dimension must be
                ``hidden_size``. If omitted, an all-zero state matching the
                leading dimensions, dtype and device of ``x`` is used.

        Returns:
            The new hidden state, with last dimension ``hidden_size``.
        """
        if hidden is None:
            # Start from a zero state, as at the first timestep of a sequence.
            hidden = x.new_zeros(*x.shape[:-1], self.hidden_hidden.in_features)
        out = self.input_hidden(x) + self.hidden_hidden(hidden)
        # Normalize the pre-activation, then squash it.
        out = self.ln(out)
        return torch.tanh(out)

In [87]:
# custom cell with an input size of 5 and a hidden size of 2
cust_rnn = CustomRNNCell(input_size=5, hidden_size=2)

In [75]:
# Load the RNN layer's parameter values into the custom cell.
# copy_ under no_grad writes into the cell's own parameters; re-pointing
# `.data` would make them share storage with rnn_layer.
# NOTE(review): this must run AFTER cust_rnn is instantiated. The execution
# counts in this notebook suggest it last ran before cust_rnn was re-created,
# in which case the printed results further down came from randomly
# initialised linear weights - re-run to confirm.
with torch.no_grad():
    cust_rnn.input_hidden.weight.copy_(w_xh)
    cust_rnn.input_hidden.bias.copy_(b_xh)
    cust_rnn.hidden_hidden.weight.copy_(w_hh)
    cust_rnn.hidden_hidden.bias.copy_(b_hh)

In [88]:
# Unroll the sequence with the custom (normalize-then-tanh) cell.
out_manx2 = []

for t in range(3):
    xt = x_seq[:,t,:] # the input at this timestep

    if out_manx2:
        # feed the previous timestep's hidden state back in
        prev_hidden = out_manx2[-1]
    else:
        # first timestep: zero hidden state of shape (batch, 2)
        prev_hidden = torch.zeros((xt.shape[0],2))
    out_manx2.append(cust_rnn(xt, prev_hidden))

In [89]:
# Custom-cell output vs. the plain layer's output for the three timesteps.
for i, manual_step in zip(range(3), out_manx2):
    print("Manual output: ",manual_step)
    print("RNN layer output: ", output[:,i,:])
    print()

Manual output:  tensor([[-0.7616,  0.7616]], grad_fn=<TanhBackward0>)
RNN layer output:  tensor([[-0.3520,  0.5253]], grad_fn=<SelectBackward0>)

Manual output:  tensor([[-0.7616,  0.7616]], grad_fn=<TanhBackward0>)
RNN layer output:  tensor([[-0.6842,  0.7607]], grad_fn=<SelectBackward0>)

Manual output:  tensor([[-0.7616,  0.7616]], grad_fn=<TanhBackward0>)
RNN layer output:  tensor([[-0.8649,  0.9047]], grad_fn=<SelectBackward0>)

