### RNN Background
To motivate Transformers, we first need to review a vanilla RNN.
$$
h_t = \tanh\left( W_h \cdot \begin{bmatrix} h_{t-1} \\ x_t \end{bmatrix} + b_h \right)
$$

$$
y_t = \text{LogSoftmax}\left( W_o \cdot \begin{bmatrix} h_{t} \\ x_t \end{bmatrix} + b_o \right)
$$

```python
class VanillaRNN(nn.Module):
    """Single-step vanilla (Elman-style) recurrent cell.

    Each call to ``forward`` consumes one time step of input together with
    the previous hidden state and produces log-probabilities over the
    output classes plus the next hidden state.

    Args:
        input_size: Number of features in the input at each time step.
        hidden_size: Number of features in the hidden state.
        output_size: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        # Initialise nn.Module so sub-modules/parameters are registered.
        super().__init__()

        self.hidden_size = hidden_size

        # Both layers act on the concatenation [x, hidden], hence the
        # summed input width.
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        # Normalise over dim 1 (the feature/class dimension).
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Advance the RNN by one time step.

        Args:
            x: Input for the current step; concatenated with ``hidden``
                along dim 1, so it is expected to be
                ``(batch, input_size)``.
            hidden: Previous hidden state, expected to be
                ``(batch, hidden_size)``.

        Returns:
            Tuple ``(output, hidden)`` where ``output`` holds the
            log-softmax scores over dim 1 and ``hidden`` is the updated
            hidden state.
        """
        # Join input and previous hidden state along the feature dimension.
        combined = torch.cat((x, hidden), 1)

        # Update the hidden state: hidden = tanh(i2h([x, hidden])).
        # tanh squashes every entry into [-1, 1].
        # NOTE: torch.nn has no `tanh` function (only the nn.Tanh module),
        # so the functional form torch.tanh is used here.
        hidden = torch.tanh(self.i2h(combined))

        # Compute the output scores from the input and the *updated* hidden
        # state: output = i2o([x, hidden]).
        combined = torch.cat((x, hidden), 1)
        output = self.i2o(combined)

        # Convert the scores to log-probabilities.
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros(1, self.hidden_size)
```