In [1]:
import torch
import torch.nn as nn

In [2]:
# Seed the global RNG first so the randomly initialised weights below are reproducible.
torch.manual_seed(1)

# Single-layer RNN mapping 5 input features to 2 hidden units.
# batch_first=True -> tensors are laid out as (batch, seq_len, features).
rnn_layer = nn.RNN(
    input_size=5,
    hidden_size=2,
    num_layers=1,
    batch_first=True,
)

In [3]:
# Short aliases for the layer-0 parameters:
#   w_xh / b_xh -> input-to-hidden weight and bias
#   w_hh / b_hh -> hidden-to-hidden weight and bias
w_xh, w_hh = rnn_layer.weight_ih_l0, rnn_layer.weight_hh_l0
b_xh, b_hh = rnn_layer.bias_ih_l0, rnn_layer.bias_hh_l0

In [4]:
w_xh.shape, w_hh.shape, b_xh.shape, b_hh.shape

(torch.Size([2, 5]), torch.Size([2, 2]), torch.Size([2]), torch.Size([2]))

In [5]:
# Toy sequence of 3 time steps; each step is a constant 5-feature vector (all 1s, 2s, 3s).
x_seq = torch.tensor([[value] * 5 for value in (1.0, 2.0, 3.0)], dtype=torch.float32)

In [6]:
x_seq.shape

torch.Size([3, 5])

In [7]:
# Add a leading batch dimension of size 1 -> (batch, seq_len, features),
# the layout the batch_first=True layers in this notebook expect.
x_seq = x_seq.reshape(1, 3, 5)

In [8]:
output, hn = rnn_layer(x_seq)

In [9]:
output.shape, hn.shape

(torch.Size([1, 3, 2]), torch.Size([1, 1, 2]))

Two-layer RNN:

In [10]:
# Same configuration as the single-layer RNN, but with two stacked layers.
rnn_layer2 = nn.RNN(
    input_size=5,
    hidden_size=2,
    num_layers=2,
    batch_first=True,
)

In [11]:
# First layer (suffix 1): parameters of layer 0, which consumes the 5-feature input.
w_xh1, w_hh1 = rnn_layer2.weight_ih_l0, rnn_layer2.weight_hh_l0
b_xh1, b_hh1 = rnn_layer2.bias_ih_l0, rnn_layer2.bias_hh_l0

# Second layer (suffix 2): parameters of layer 1, stacked on top of layer 0.
w_xh2, w_hh2 = rnn_layer2.weight_ih_l1, rnn_layer2.weight_hh_l1
b_xh2, b_hh2 = rnn_layer2.bias_ih_l1, rnn_layer2.bias_hh_l1

In [13]:
w_xh2.shape, w_hh2.shape, b_xh2.shape, b_hh2.shape

(torch.Size([2, 2]), torch.Size([2, 2]), torch.Size([2]), torch.Size([2]))

In [14]:
output2, hn2 = rnn_layer2(x_seq)

In [15]:
output2.shape, hn2.shape

(torch.Size([1, 3, 2]), torch.Size([2, 1, 2]))

In [16]:
x_seq.shape

torch.Size([1, 3, 5])

In [22]:
w_xh.shape[0]

2

In [24]:
torch.zeros((w_xh.shape[0])).shape

torch.Size([2])

In [None]:
# NOTE(review): bare reference with no call -- this only evaluates to the function
# object and has no effect. Looks like a leftover from the torch.zeros shape
# experiment in the preceding cells; consider deleting this cell.
torch.zeros

Under the hood: Operation of the RNN:

In [26]:
# Manually reproduce the forward pass of the single-layer RNN, one time step at a
# time, and compare each step with what `rnn_layer` returned:
#     h_t = tanh(x_t @ W_xh^T + b_xh + h_{t-1} @ W_hh^T + b_hh)
out_man = []

# x_seq is (batch, seq_len, features): take the number of steps from the data
# rather than hard-coding the sequence length.
for t in range(x_seq.shape[1]):
    xt = x_seq[:, t, :]  # (batch, features) slice for this time step
    print(f'Time step: {t} =>')
    print(' Input: ', xt.numpy())

    # Input-to-hidden contribution to the pre-activation.
    ht = torch.matmul(xt, torch.transpose(w_xh, 0, 1)) + b_xh

    # For a plain RNN the previous hidden state is simply the previous output.
    # The initial state is all zeros with one row per batch element, so it no
    # longer relies on broadcasting a 1-D vector against a batch of size 1.
    prev_h = out_man[-1] if out_man else torch.zeros(ht.shape)

    # Add the hidden-to-hidden contribution, then apply the tanh non-linearity.
    ot = torch.tanh(ht + torch.matmul(prev_h, torch.transpose(w_hh, 0, 1)) + b_hh)
    out_man.append(ot)

    print(' Output (manual): ', ot.detach().numpy())
    print(' RNN output: ', output[:,t].detach().numpy())

    # Fail loudly if the manual computation ever drifts from the library result.
    assert torch.allclose(ot, output[:, t], atol=1e-6), f'mismatch at time step {t}'

Time step: 0 =>
 Input:  [[1. 1. 1. 1. 1.]]
 Output (manual):  [[-0.35198015  0.52525216]]
 RNN output:  [[-0.3519801   0.52525216]]
Time step: 1 =>
 Input:  [[2. 2. 2. 2. 2.]]
 Output (manual):  [[-0.68424344  0.76074266]]
 RNN output:  [[-0.68424344  0.76074266]]
Time step: 2 =>
 Input:  [[3. 3. 3. 3. 3.]]
 Output (manual):  [[-0.8649416  0.9046636]]
 RNN output:  [[-0.8649416  0.9046636]]


In [27]:
hn

tensor([[[-0.8649,  0.9047]]], grad_fn=<StackBackward0>)

In [28]:
out_man[-1]

tensor([[-0.8649,  0.9047]], grad_fn=<TanhBackward0>)

Base RNN implementation in PyTorch:
- the output at each time step is the hidden state of the last layer at that time step (a plain RNN has no separate cell state: the value it emits at each step is exactly the state it carries forward to the next step)

In [31]:
# Element-wise comparison of the last time step of `output` (shape (1, 2)) with
# `hn` (shape (1, 1, 2)); broadcasting makes the boolean result (1, 1, 2).
# Note this is an exact floating-point equality check.
output[:,-1,:] == hn

tensor([[[True, True]]])

In [32]:
output2.shape

torch.Size([1, 3, 2])

In [33]:
hn2.shape

torch.Size([2, 1, 2])

In [34]:
output2[:, -1, :] == hn2[-1,:,:]

tensor([[True, True]])

So for the base (unidirectional) RNN in PyTorch, the final hidden state is always equal to the last time step of the output, i.e. for an RNN with 1 layer, output[:,-1,:] == hn. Here output has shape (batch_size, sequence_length, hidden_size) because batch_first=True, and hn has shape (num_layers, batch_size, hidden_size): the middle dimension is the batch size (1 in this example), not a time step, since hn only ever holds the state after the final time step. For 2 layers: output[:, -1, :] == hn[-1, :, :] (index -1 extracts the final hidden state of the last layer).

In [35]:
hn2.shape

torch.Size([2, 1, 2])

In [36]:
hn2[-1,:,:].shape

torch.Size([1, 2])

LSTMs:

In [37]:
lstm_layer = torch.nn.LSTM(input_size=5, hidden_size=2, num_layers=1, batch_first=True)

In [40]:
output_ls, (hn_ls, cn_ls) = lstm_layer(x_seq)

In [41]:
output_ls.shape

torch.Size([1, 3, 2])

In [43]:
hn_ls.shape, cn_ls.shape

(torch.Size([1, 1, 2]), torch.Size([1, 1, 2]))

In [44]:
hn_ls

tensor([[[0.0020, 0.2991]]], grad_fn=<StackBackward0>)

In [45]:
cn_ls

tensor([[[0.1312, 1.2058]]], grad_fn=<StackBackward0>)

In [46]:
# tanh of the final LSTM cell state. In an LSTM the hidden state is the output
# gate multiplied by this value, so compare it with `hn_ls` above.
# Uses torch.tanh, matching the manual RNN loop earlier in this notebook
# (older PyTorch releases warn that the nn.functional alias is deprecated).
torch.tanh(cn_ls)

tensor([[[0.1305, 0.8354]]], grad_fn=<TanhBackward0>)

Bidirectional:

In [47]:
bi_rnn_layer = nn.RNN(input_size=5, hidden_size=2, num_layers=1, batch_first=True, bidirectional=True)

In [50]:
bi_output, bi_hn = bi_rnn_layer(x_seq)

In [51]:
bi_output.shape

torch.Size([1, 3, 4])

In [52]:
bi_hn.shape

torch.Size([2, 1, 2])

In [92]:
test_lstm1 = torch.nn.LSTM(20, 64, batch_first=True, bidirectional=True, num_layers=1)

In [93]:
test_out, (test_h, test_c) = test_lstm1(torch.randn(128, 100, 20))

In [94]:
test_out.shape

torch.Size([128, 100, 128])

In [95]:
test_h.shape

torch.Size([2, 128, 64])

In [96]:
test_c.shape

torch.Size([2, 128, 64])

In [97]:
test_h[-1, :, :].shape

torch.Size([128, 64])

In [98]:
test_h[-2, :, :].shape

torch.Size([128, 64])

In [99]:
# Concatenate the two (128, 64) final hidden states along the feature axis -> (128, 128).
# test_lstm1 is a single bidirectional layer, so test_h holds exactly two entries;
# presumably index -2 is the forward direction and -1 the backward one
# (PyTorch's documented ordering -- confirm against the nn.LSTM docs).
torch.cat((test_h[-2,:,:],test_h[-1,:,:]),dim=1)

tensor([[-0.0855,  0.0288,  0.0403,  ...,  0.0224,  0.0648,  0.1566],
        [ 0.0479, -0.0804, -0.1054,  ...,  0.0128,  0.0597,  0.1576],
        [ 0.0314, -0.1836,  0.0388,  ...,  0.0612,  0.0999,  0.0890],
        ...,
        [-0.0408, -0.1587,  0.2578,  ...,  0.0978, -0.0690,  0.0608],
        [-0.0489, -0.0930,  0.1138,  ...,  0.0421,  0.0041,  0.2705],
        [-0.0029, -0.2243,  0.3296,  ...,  0.1195, -0.0143,  0.1271]],
       grad_fn=<CatBackward0>)

In [100]:
test_out.shape

torch.Size([128, 100, 128])

In [101]:
test_out[:, -1, :]

tensor([[-0.0855,  0.0288,  0.0403,  ...,  0.0605,  0.0634,  0.0350],
        [ 0.0479, -0.0804, -0.1054,  ...,  0.0453,  0.0025,  0.1446],
        [ 0.0314, -0.1836,  0.0388,  ...,  0.0832,  0.0313,  0.1720],
        ...,
        [-0.0408, -0.1587,  0.2578,  ...,  0.0873, -0.0663,  0.0876],
        [-0.0489, -0.0930,  0.1138,  ...,  0.0398, -0.0008, -0.1211],
        [-0.0029, -0.2243,  0.3296,  ...,  0.1717, -0.0188, -0.0069]],
       grad_fn=<SelectBackward0>)

In [None]:
# Replaces a stray, incomplete `test` expression that raised NameError on
# Restart & Run All. Check how the bidirectional LSTM output relates to the
# final hidden states: the forward direction finishes at the LAST time step,
# while the backward direction finishes at the FIRST one.
fwd_width = test_lstm1.hidden_size  # features per direction in test_out's last axis
assert torch.allclose(test_out[:, -1, :fwd_width], test_h[-2])
assert torch.allclose(test_out[:, 0, fwd_width:], test_h[-1])

In [77]:
# First entry of the final hidden state (same slice as test_h[-2] when there are two entries).
# NOTE(review): this cell's execution count (77) predates In [92]-[101]; the recorded
# output below does not match the test_h values shown at In [99], so it comes from an
# earlier run -- re-execute the notebook top to bottom to refresh it.
test_h[0]

tensor([[ 0.0303, -0.0678,  0.0290,  ...,  0.0394,  0.0689,  0.0853],
        [-0.0287, -0.0691, -0.0170,  ..., -0.1651,  0.0260, -0.0930],
        [-0.0457,  0.0109,  0.0713,  ...,  0.0062,  0.0617,  0.0547],
        ...,
        [-0.0347, -0.0247, -0.0095,  ..., -0.1614,  0.1317,  0.0686],
        [-0.1693, -0.0779, -0.0416,  ..., -0.0713,  0.1787, -0.1981],
        [-0.1115,  0.1029, -0.0537,  ...,  0.0860,  0.0355,  0.0183]],
       grad_fn=<SelectBackward0>)

In [78]:
# Second-to-last entry of the final hidden state; with two entries this is test_h[0].
# NOTE(review): this cell's execution count (78) predates In [92]-[101]; the recorded
# output below does not match the test_h values shown at In [99], so it comes from an
# earlier run -- re-execute the notebook top to bottom to refresh it.
test_h[-2,:,:]

tensor([[ 0.0303, -0.0678,  0.0290,  ...,  0.0394,  0.0689,  0.0853],
        [-0.0287, -0.0691, -0.0170,  ..., -0.1651,  0.0260, -0.0930],
        [-0.0457,  0.0109,  0.0713,  ...,  0.0062,  0.0617,  0.0547],
        ...,
        [-0.0347, -0.0247, -0.0095,  ..., -0.1614,  0.1317,  0.0686],
        [-0.1693, -0.0779, -0.0416,  ..., -0.0713,  0.1787, -0.1981],
        [-0.1115,  0.1029, -0.0537,  ...,  0.0860,  0.0355,  0.0183]],
       grad_fn=<SelectBackward0>)