In [1]:
# Torch Imports
import torch
import torch.nn as nn
import torch.nn.functional as F

### RNN Formulation
$$
h_t = \tanh\left( W_h \cdot \begin{bmatrix} x_t \\ h_{t-1} \end{bmatrix} + b_h \right)
$$

$$
y_t = W_y \cdot \begin{bmatrix} x_t \\ h_{t} \end{bmatrix} + b_y
$$

In [2]:
class VanillaRNN(nn.Module):
    """Single-step vanilla (Elman-style) RNN cell built from two linear layers.

    Each call to ``forward`` consumes one time step: it concatenates the
    current input with the previous hidden state, produces the new hidden
    state through a tanh non-linearity, and then produces the output from
    the current input concatenated with the *new* hidden state.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear maps used by the cell.

        Args:
            input_size: number of features in each input vector.
            hidden_size: number of features in the hidden state.
            output_size: number of features in each output vector.
        """
        # super so we inherit methods from nn.Module
        super().__init__()

        self.hidden_size = hidden_size

        # Both layers read the concatenation [input, hidden], hence the
        # summed in_features.
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)

    def forward(self, input, hidden):
        """Advance the cell by one time step.

        Args:
            input: 2-D tensor whose second dimension is ``input_size``
                (concatenation happens along dim 1).
            hidden: 2-D tensor whose second dimension is ``hidden_size``
                and whose first dimension matches ``input``.

        Returns:
            Tuple ``(output, hidden)`` where ``output`` has ``output_size``
            features and ``hidden`` is the updated hidden state.
        """
        # stack input and previous hidden state side by side (feature axis)
        combined = torch.cat((input, hidden), 1)

        # update hidden
        #   a single Linear over [input, hidden] is equivalent to
        #   hidden = tanh(input @ Wxh + hidden @ Whh + bh)
        #
        # note:
        #   tanh keeps every hidden value between -1 and 1
        hidden = torch.tanh(self.i2h(combined))

        # update output
        #   output = [input, new_hidden] @ W + b
        #   (the output layer sees the current input AND the new hidden state)
        combined = torch.cat((input, hidden), 1)
        output = self.i2o(combined)

        return output, hidden

    def initHidden(self, batch_size=1):
        """Return an all-zero initial hidden state.

        Args:
            batch_size: number of rows in the returned state; defaults to 1
                so existing ``initHidden()`` callers are unaffected.

        Returns:
            Zero tensor of shape ``(batch_size, hidden_size)`` created on the
            same device and with the same dtype as the cell's weights.
        """
        weight = self.i2h.weight
        return torch.zeros(
            batch_size, self.hidden_size, device=weight.device, dtype=weight.dtype
        )

### Why Tanh?
The vanishing gradient problem is the main difficulty in training RNNs. To keep the gradient in the near-linear region of the activation function, we need a function whose second derivative stays non-zero over a long range before decaying to zero. Tanh satisfies these properties reasonably well.

### Vanishing Gradient Demo

### Exploding Gradient Demo