**Implementing a GRU from scratch:**

In [1]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
torch.manual_seed(1)

<torch._C.Generator at 0x11da921b0>

In [2]:
# Same seed as in the import cell; re-running this cell right before the GRU layer is built
# keeps the layer's randomly initialised weights reproducible.
torch.manual_seed(1)

<torch._C.Generator at 0x11da921b0>

Defining a GRU layer (PyTorch's built-in `nn.GRU`) whose weights will be reused inside the manual implementation for comparison...

In [3]:
# Reference layer: a single-layer GRU mapping 5 input features to a 2-dimensional hidden state.
# batch_first=True means inputs/outputs are laid out as (batch, seq, feature).
gru_layer = nn.GRU(
    input_size=5,
    hidden_size=2,
    num_layers=1,
    batch_first=True,
)

Inspecting the GRU's variables - weights and biases:

In [4]:
# nn.GRU stacks the parameters of its three gates row-wise, in the order reset (r), update (z), new (n).
# Input-to-hidden parameters: (W_ir|W_iz|W_in) and (b_ir|b_iz|b_in)
wi, bi = gru_layer.weight_ih_l0, gru_layer.bias_ih_l0
# Hidden-to-hidden parameters: (W_hr|W_hz|W_hn) and (b_hr|b_hz|b_hn)
wh, bh = gru_layer.weight_hh_l0, gru_layer.bias_hh_l0

In [5]:
# Show the shapes of the input/hidden weights and biases, in that order.
tuple(param.shape for param in (wi, wh, bi, bh))

(torch.Size([6, 5]), torch.Size([6, 2]), torch.Size([6]), torch.Size([6]))

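Weight matrix shapes all check out - the leading dimension is 6 because the parameters of the 3 gates (r, z, n) are stacked together - hence the hidden size of 2 => multiplied by 3 to cater for all these gates. The trailing dimension is the input size (5) for the input weights and the hidden size (2) for the hidden weights.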

Creating an input sequence to feed through the GRU layer (and, later, through the manual implementation that uses the same weights):

In [6]:
# One batch of 3 timesteps with 5 features each; timestep k is filled with the constant k (1.0, 2.0, 3.0).
timestep_rows = [[value] * 5 for value in (1.0, 2.0, 3.0)]
x_seq = torch.tensor(timestep_rows)[None].float()  # [None] adds the leading batch dimension
x_seq.shape

torch.Size([1, 3, 5])

In [7]:
# Run the reference layer with no initial hidden state supplied.
# `output` holds the hidden state at every timestep, `h_n` the final hidden state
# (shapes are printed in the next cell).
output, h_n = gru_layer(x_seq)

In [9]:
# Shapes of the per-timestep outputs and of the final hidden state returned by the layer.
print(output.shape, h_n.shape)

torch.Size([1, 3, 2]) torch.Size([1, 1, 2])


Now, working out the GRU output manually......
- for all 3 gates, a linear (affine) transformation is applied to the current input and to the previous hidden state, using the input and hidden weights and biases respectively. Activations are then applied. Can do this....

In [11]:
# Manual GRU forward pass that reuses the parameters of the nn.GRU layer.
# All sizes are read from the parameters and the input instead of being hard-coded.
hidden_size = wh.shape[1]  # wh is (3*hidden_size, hidden_size)
input_size = wi.shape[1]   # wi is (3*hidden_size, input_size)

# Linear layer that mimics the affine map from the current input to the gate pre-activations.
# Its output has 3*hidden_size features: hidden_size for each of the gates r, z and n,
# stacked in that order as described in the PyTorch GRU documentation.
input_gates = nn.Linear(input_size, 3 * hidden_size)
input_gates.weight.data = wi
input_gates.bias.data = bi

# Linear layer that mimics the affine map from the previous hidden state to the gate pre-activations.
hidden_gates = nn.Linear(hidden_size, 3 * hidden_size)
hidden_gates.weight.data = wh
hidden_gates.bias.data = bh

# Remember that in sequence models, weights are shared across all timesteps.
out = []
for t in range(x_seq.shape[1]):
    xt = x_seq[:, t, :]  # input that corresponds to timestep t

    # The first step starts from an all-zero hidden state; later steps use the previous output.
    if out:
        prev_h = out[-1]
    else:
        prev_h = torch.zeros((xt.shape[0], hidden_size), dtype=xt.dtype, device=xt.device)

    # Split both projections into their reset / update / new-gate parts.
    i_r, i_z, i_n = input_gates(xt).chunk(3, dim=1)
    h_r, h_z, h_n_part = hidden_gates(prev_h).chunk(3, dim=1)

    rgate = torch.sigmoid(i_r + h_r)
    zgate = torch.sigmoid(i_z + h_z)
    # The reset gate scales only the hidden contribution of the candidate state.
    ngate = torch.tanh(i_n + rgate * h_n_part)

    # Blend the candidate state with the previous hidden state using the update gate.
    h = (1 - zgate) * ngate + zgate * prev_h
    out.append(h)

In [13]:
# Stack the per-timestep hidden states along dim 1 (the sequence axis of the batch-first layout).
manual_output = torch.stack(out, dim=1)

In [14]:
# Display the manually computed per-timestep hidden states.
manual_output

tensor([[[-0.6458,  0.1718],
         [-0.8509,  0.2851],
         [-0.9287,  0.3488]]], grad_fn=<StackBackward0>)

In [15]:
# Display the nn.GRU layer's output for comparison with manual_output.
output

tensor([[[-0.6458,  0.1718],
         [-0.8509,  0.2851],
         [-0.9287,  0.3488]]], grad_fn=<TransposeBackward1>)

As seen above, the manual output and the output computed using the actual gru layer are the same

In [16]:
# Compare the layer's final hidden state with the last manually computed hidden state.
print(f"Model's h_n = {h_n}")
print(f"Manual output's h_n = {out[-1]}")

Model's h_n = tensor([[[-0.9287,  0.3488]]], grad_fn=<StackBackward0>)
Manual output's h_n = tensor([[-0.9287,  0.3488]], grad_fn=<AddBackward0>)


Instead of the manually created linear layers, we could use a GRU cell, which is nice and clean:
- the downside is layer normalization - we can't add it inside the built-in GRU cell and would have to use our own custom cell
- nevertheless, let's explore the default GRU cell

In [17]:
# Built-in single-step GRU cell with the same sizes as the reference layer (5 inputs, 2 hidden units).
gru_cell = nn.GRUCell(
    input_size=5,
    hidden_size=2,
    bias=True,
)

Initialize the weights of the gru cell using the initial gru layer weights:

In [18]:
# Load the reference layer's parameters into the cell so both compute with identical weights.
gru_cell.weight_ih.data, gru_cell.weight_hh.data = wi, wh
gru_cell.bias_ih.data, gru_cell.bias_hh.data = bi, bh

Compute the sequence output using this gru cell

In [20]:
# Step the built-in GRU cell over the sequence, one timestep at a time.
# Sequence length and hidden size are read from the input and the cell rather than hard-coded.
cell_state = []

for t in range(x_seq.shape[1]):
    xt = x_seq[:, t, :]

    # The first step starts from an all-zero hidden state; later steps use the previous output.
    if cell_state:
        prev_h = cell_state[-1]
    else:
        prev_h = torch.zeros((xt.shape[0], gru_cell.hidden_size), dtype=xt.dtype, device=xt.device)
    h = gru_cell(xt, prev_h)
    cell_state.append(h)

In [21]:
# Stack the cell's per-timestep hidden states along dim 1 (the sequence axis of the batch-first layout).
cell_out = torch.stack(cell_state, dim=1)

In [22]:
# Display the hidden states produced by stepping the GRU cell.
cell_out

tensor([[[-0.6458,  0.1718],
         [-0.8509,  0.2851],
         [-0.9287,  0.3488]]], grad_fn=<StackBackward0>)

In [24]:
# nn.GRU layer output again, for comparison with cell_out.
output

tensor([[[-0.6458,  0.1718],
         [-0.8509,  0.2851],
         [-0.9287,  0.3488]]], grad_fn=<TransposeBackward1>)

Same output for both the gru cell and the gru layer.....

In [30]:
# Now creating a custom GRU cell... this baseline version has no layer normalization yet;
# it reproduces the built-in GRU cell so normalization can be added on top of it.

class CustomGRUCell(nn.Module):
    """Single-step GRU cell built from two ``nn.Linear`` projections.

    Both projections produce ``3 * hidden_size`` features, laid out as the
    reset (r), update (z) and new (n) gate pre-activations in that order,
    which matches the parameter layout of ``nn.GRU`` / ``nn.GRUCell``.

    Args:
        input_size: number of features in the input ``x``.
        hidden_size: number of features in the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input = nn.Linear(input_size, 3 * hidden_size)
        self.hidden = nn.Linear(hidden_size, 3 * hidden_size)
        self.hidden_size = hidden_size

    def forward(self, x, prev_h=None):
        """Compute the next hidden state.

        Args:
            x: input tensor whose last dimension is ``input_size``.
            prev_h: previous hidden state whose last dimension is
                ``hidden_size``. Defaults to zeros (same dtype/device as
                ``x``) when omitted, as ``nn.GRUCell`` does.

        Returns:
            The new hidden state, with the same shape as ``prev_h``.
        """
        if prev_h is None:
            prev_h = x.new_zeros((*x.shape[:-1], self.hidden_size))

        # Split along the feature axis so batched and unbatched inputs both work.
        i_r, i_z, i_n = self.input(x).chunk(3, dim=-1)
        h_r, h_z, h_n = self.hidden(prev_h).chunk(3, dim=-1)

        # gates.....
        rgate = torch.sigmoid(i_r + h_r)
        zgate = torch.sigmoid(i_z + h_z)
        # The reset gate scales only the hidden contribution of the candidate state.
        ngate = torch.tanh(i_n + rgate * h_n)

        # Blend the candidate state with the previous hidden state using the update gate.
        return (1 - zgate) * ngate + zgate * prev_h

In [31]:
# Build the custom cell and load the reference layer's parameters into its two projections.
cust_cell = CustomGRUCell(input_size=5, hidden_size=2)
cust_cell.input.weight.data, cust_cell.input.bias.data = wi, bi
cust_cell.hidden.weight.data, cust_cell.hidden.bias.data = wh, bh

In [32]:
# Step the custom GRU cell over the sequence, one timestep at a time.
# Sequence length and hidden size are read from the input and the cell rather than hard-coded.
cell_state2 = []

for t in range(x_seq.shape[1]):
    xt = x_seq[:, t, :]
    # The first step starts from an all-zero hidden state; later steps use the previous output.
    if cell_state2:
        prev_h = cell_state2[-1]
    else:
        prev_h = torch.zeros((xt.shape[0], cust_cell.hidden_size), dtype=xt.dtype, device=xt.device)
    h = cust_cell(xt, prev_h)
    cell_state2.append(h)

In [33]:
# Stack the custom cell's per-timestep hidden states along dim 1 and display them.
torch.stack(cell_state2, dim=1)

tensor([[[-0.6458,  0.1718],
         [-0.8509,  0.2851],
         [-0.9287,  0.3488]]], grad_fn=<StackBackward0>)

In [34]:
# nn.GRU layer output again, for comparison with the custom cell's result.
output

tensor([[[-0.6458,  0.1718],
         [-0.8509,  0.2851],
         [-0.9287,  0.3488]]], grad_fn=<TransposeBackward1>)

Bingo......

Now, custom GRU Cell with Layer Normalization....

In [36]:
class CustomGRUCell(nn.Module):
    """Single-step GRU cell with layer normalization on each gate pre-activation.

    Two ``nn.Linear`` projections (input and hidden) each produce
    ``3 * hidden_size`` features, laid out as the reset (r), update (z) and
    new (n) gate pre-activations in that order. Each gate's summed
    pre-activation goes through its own ``nn.LayerNorm`` before the
    sigmoid / tanh is applied.

    Args:
        input_size: number of features in the input ``x``.
        hidden_size: number of features in the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input = nn.Linear(input_size, 3 * hidden_size)
        self.hidden = nn.Linear(hidden_size, 3 * hidden_size)
        self.hidden_size = hidden_size

        # one layer norm per gate, so each gate gets its own normalization parameters
        self.ln_r = nn.LayerNorm(hidden_size)
        self.ln_z = nn.LayerNorm(hidden_size)
        self.ln_n = nn.LayerNorm(hidden_size)

    def forward(self, x, prev_h=None):
        """Compute the next hidden state.

        Args:
            x: input tensor whose last dimension is ``input_size``.
            prev_h: previous hidden state whose last dimension is
                ``hidden_size``. Defaults to zeros (same dtype/device as
                ``x``) when omitted.

        Returns:
            The new hidden state, with the same shape as ``prev_h``.
        """
        if prev_h is None:
            prev_h = x.new_zeros((*x.shape[:-1], self.hidden_size))

        # Split along the feature axis so batched and unbatched inputs both work.
        i_r, i_z, i_n = self.input(x).chunk(3, dim=-1)
        h_r, h_z, h_n = self.hidden(prev_h).chunk(3, dim=-1)

        # gates..... each pre-activation is normalized before its non-linearity
        rgate = torch.sigmoid(self.ln_r(i_r + h_r))
        zgate = torch.sigmoid(self.ln_z(i_z + h_z))

        # The reset gate scales only the hidden contribution of the candidate state.
        ngate = torch.tanh(self.ln_n(i_n + rgate * h_n))

        # Blend the candidate state with the previous hidden state using the update gate.
        return (1 - zgate) * ngate + zgate * prev_h