### RNN - `Recurent Neural Networks`.

```python
torch.nn.RNN(*args, **kwargs)
```

Parameters: 

* ``input_size`` – The number of expected features in the input x

* ``hidden_size`` – The number of features in the hidden state h

* ``num_layers`` – Number of recurrent layers. E.g., setting ``num_layers=2`` would mean stacking two RNNs together to form a stacked RNN, with the second RNN taking in outputs of the first RNN and computing the final results. Default: `1``

* `nonlinearity` – The non-linearity to use. Can be either `'tanh'` or `'relu'`. Default: `'tanh'`

* `bias` – If False, then the layer does not use bias weights `b_ih` and `b_hh`. Default: `True`

* `batch_first` – If True, then the input and output tensors are provided as (batch, seq, feature). Default: `False`

* `dropout` – If non-zero, introduces a Dropout layer on the outputs of each RNN layer except the last layer, with dropout probability equal to dropout. `Default: 0`

* `bidirectional` – If True, becomes a bidirectional RNN. Default: `False`

* [RNN](https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)

The `MNIST` dataset and `RNN`.

In [95]:
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from tqdm import tqdm

In [6]:
train = datasets.MNIST('content/drive/', train=True, transform=transforms.ToTensor(), download=True)
test = datasets.MNIST('content/drive/', train=False, transform=transforms.ToTensor(), download=True)

In [7]:
train_set = DataLoader(train, batch_size=32, shuffle=True)
test_set = DataLoader(test, batch_size=32, shuffle=False)

### Device

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Simple `RNN`

In [85]:
## Hyper Parameters
input_size = 28 # (20 features, the number of elements in each row)
sequence_length = 28 # (28, the number of rows we have)
hidden_size = 128
num_layers = 2

class SimpleRNN(nn.Module):
  def __init__(self, input_size, hidden_size, num_layers):
    super(SimpleRNN, self).__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size

    self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_size, 10)

  def forward(self, x):
    # initial hidden_state 
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) #(2, 28, 128)
    output, h_n = self.rnn(x, h0) # output: tensor of shape (batch_size, seq_length, hidden_size) (32, 28, 128)
    output = output[:, -1, :] # (n_batches, num_classes), (32, 10)
    return self.fc(output)

net = SimpleRNN(input_size, hidden_size, num_layers).to(device)
net

SimpleRNN(
  (rnn): RNN(28, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=10, bias=True)
)

> Trainning the `NN`.

In [86]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

### Custom `accuracy` function.

In [91]:
def accuracy(y_true, y_pred):
  total = 0
  correct = 0
  net.eval()
  with torch.no_grad():
    correct = list(y_true==y_pred).count(True)
    total = len(y_true)
  net.train()
  return correct/total

In [None]:
EPOCHS = 10
for epoch in range(EPOCHS):
  for X, y in tqdm(train_set):
    X = X.to(device)
    y = y.to(device)
    # forward pass
    output = net(X.reshape(-1,sequence_length, input_size)).to(device)
    y_pred = torch.argmax(output, dim=1)
    # loss
    loss = criterion(output, y)
    # backward pass
    optimizer.zero_grad()
    loss.backward()
    # update the weights
    optimizer.step()
  acc = accuracy(y, y_pred)
  print(f"\nEpochs: {epoch+1}/{EPOCHS} Loss: {loss.item():.3f}, Accuracy: {acc:.3f}\n")

### Model Evaluation.

In [113]:
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device).squeeze(1)
            y = y.to(device=device)
            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
    return num_correct / num_samples
check_accuracy(train_set, net)
check_accuracy(test_set, net)

tensor(0.9280, device='cuda:0')

### Gated Recurrent Unit `(GRU)` Net.

Parameters: 

* ``input_size`` – The number of expected features in the input x

* ``hidden_size`` – The number of features in the hidden state h

* ``num_layers`` – Number of recurrent layers. E.g., setting ``num_layers=2`` would mean stacking two RNNs together to form a stacked RNN, with the second RNN taking in outputs of the first RNN and computing the final results. Default: `1``

* `nonlinearity` – The non-linearity to use. Can be either `'tanh'` or `'relu'`. Default: `'tanh'`

* `bias` – If False, then the layer does not use bias weights `b_ih` and `b_hh`. Default: `True`

* `batch_first` – If True, then the input and output tensors are provided as (batch, seq, feature). Default: `False`

* `dropout` – If non-zero, introduces a Dropout layer on the outputs of each RNN layer except the last layer, with dropout probability equal to dropout. `Default: 0`

* `bidirectional` – If True, becomes a bidirectional RNN. Default: `False`

* [Docs](https://pytorch.org/docs/stable/generated/torch.nn.GRU.html)

In [128]:

# Hyper parameters
input_size = 28 
sequence_length = 28 
hidden_size = 128
num_layers = 2

class GRU(nn.Module):
  def __init__(self, input_size, hidden_size, num_layers):
    super(GRU, self).__init__()
    self.hidden_size = hidden_size
    self.num_layer = num_layers
    self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_size, 10)

  def forward(self, x):
    h0 = torch.zeros(self.num_layer, x.size(0), self.hidden_size).to(device)
    output, _ = self.gru(x, h0)
    output = output[:, -1, :]
    return self.fc(output)

net = GRU(input_size, hidden_size, num_layers).to(device)
net

GRU(
  (gru): GRU(28, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=10, bias=True)
)

In [129]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

### Trainning  the GRU

In [131]:
EPOCHS = 10
for epoch in range(EPOCHS):
  for X, y in tqdm(train_set):
    X = X.to(device).reshape(-1, sequence_length, input_size)
    y = y.to(device)
    # forward pass
    output = net(X).to(device)
    y_pred = torch.argmax(output, dim=1)
    # loss
    loss = criterion(output, y)
    #backward pass
    loss.backward()
    # update the weights
    optimizer.step()
    optimizer.zero_grad()
  acc = accuracy(y, y_pred)
  print(f"\nEpochs: {epoch+1}/{EPOCHS} Loss: {loss.item():.3f}, Accuracy: {acc:.3f}\n")

100%|██████████| 1875/1875 [00:10<00:00, 172.68it/s]
  1%|          | 17/1875 [00:00<00:10, 169.18it/s]


Epochs: 1/10 Loss: 0.060, Accuracy: 0.969



100%|██████████| 1875/1875 [00:10<00:00, 173.75it/s]
  1%|          | 18/1875 [00:00<00:10, 175.31it/s]


Epochs: 2/10 Loss: 0.141, Accuracy: 0.969



100%|██████████| 1875/1875 [00:10<00:00, 176.08it/s]
  1%|          | 18/1875 [00:00<00:10, 177.58it/s]


Epochs: 3/10 Loss: 0.040, Accuracy: 0.969



100%|██████████| 1875/1875 [00:10<00:00, 177.38it/s]
  1%|          | 18/1875 [00:00<00:10, 172.28it/s]


Epochs: 4/10 Loss: 0.007, Accuracy: 1.000



100%|██████████| 1875/1875 [00:10<00:00, 175.70it/s]
  1%|          | 18/1875 [00:00<00:10, 173.37it/s]


Epochs: 5/10 Loss: 0.013, Accuracy: 1.000



100%|██████████| 1875/1875 [00:10<00:00, 179.80it/s]
  1%|          | 18/1875 [00:00<00:10, 176.36it/s]


Epochs: 6/10 Loss: 0.020, Accuracy: 1.000



100%|██████████| 1875/1875 [00:10<00:00, 178.49it/s]
  1%|          | 18/1875 [00:00<00:10, 179.25it/s]


Epochs: 7/10 Loss: 0.014, Accuracy: 1.000



100%|██████████| 1875/1875 [00:10<00:00, 177.39it/s]
  1%|          | 15/1875 [00:00<00:12, 148.01it/s]


Epochs: 8/10 Loss: 0.207, Accuracy: 0.938



100%|██████████| 1875/1875 [00:10<00:00, 174.92it/s]
  1%|          | 18/1875 [00:00<00:10, 177.64it/s]


Epochs: 9/10 Loss: 0.000, Accuracy: 1.000



100%|██████████| 1875/1875 [00:10<00:00, 174.52it/s]


Epochs: 10/10 Loss: 0.030, Accuracy: 0.969






### Evaluating the `GRU`.

In [134]:
print("Train acc: ", check_accuracy(train_set, net).item())
print("Test acc: ", check_accuracy(test_set, net).item())

Train acc:  0.9958833456039429
Test acc:  0.9902999997138977


### Long Short Term Memory `LSTM` RNN

Parameters: 

* ``input_size`` – The number of expected features in the input x

* ``hidden_size`` – The number of features in the hidden state h

* ``num_layers`` – Number of recurrent layers. E.g., setting ``num_layers=2`` would mean stacking two RNNs together to form a stacked RNN, with the second RNN taking in outputs of the first RNN and computing the final results. Default: `1``

* `nonlinearity` – The non-linearity to use. Can be either `'tanh'` or `'relu'`. Default: `'tanh'`

* `bias` – If False, then the layer does not use bias weights `b_ih` and `b_hh`. Default: `True`

* `batch_first` – If True, then the input and output tensors are provided as (batch, seq, feature). Default: `False`

* `dropout` – If non-zero, introduces a Dropout layer on the outputs of each RNN layer except the last layer, with dropout probability equal to dropout. `Default: 0`

* `bidirectional` – If True, becomes a bidirectional RNN. Default: `False`

* ``proj_size`` – If > 0, will use LSTM with projections of corresponding size. Default: `0`

The differents between ``LSTM`` and other `RNN's` is that `LSTM` accept a hidden (h0) state and a cell (c0) state:


* `h_0` of shape ``(num_layers * num_directions, batch, hidden_size)``: tensor containing the initial hidden state for each element in the batch. If the LSTM is bidirectional, num_directions should be 2, else it should be 1. If `proj_size > `0 was specified, the shape has to be `(num_layers * num_directions, batch, proj_size)`.

* `c_0` of shape `(num_layers * num_directions, batch, hidden_size)`: tensor containing the initial cell state for each element in the batch.

> If `(h_0, c_0)` is not provided, both `h_0` and `c_0` default to zero.

[Docs](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)


In [143]:

# Hyper parameters
input_size = 28 
sequence_length = 28 
hidden_size = 128
num_layers = 2

class LSTM(nn.Module):
  def __init__(self, input_size, hidden_size, num_layers):
    super(LSTM, self).__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size

    self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_size, 10)

  def forward(self, x):
    h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
                    #(    number_of_layers, batch_size, hiden_size)
    c_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

    output, _ = self.lstm(x, (h_0, c_0))
    output = output[:, -1, :]

    return self.fc(output)

net = LSTM(input_size, hidden_size, num_layers).to(device)
net

LSTM(
  (lstm): LSTM(28, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=10, bias=True)
)

In [144]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

### Training a `LSTM`

In [145]:
EPOCHS = 10
for epoch in range(EPOCHS):
  for X, y in tqdm(train_set):
    X = X.to(device).reshape(-1, sequence_length, input_size)
    y = y.to(device)
    # forward pass
    output = net(X).to(device)
    y_pred = torch.argmax(output, dim=1)
    # loss
    loss = criterion(output, y)
    #backward pass
    loss.backward()
    # update the weights
    optimizer.step()
    optimizer.zero_grad()
  acc = accuracy(y, y_pred)
  print(f"\nEpochs: {epoch+1}/{EPOCHS} Loss: {loss.item():.3f}, Accuracy: {acc:.3f}\n")

100%|██████████| 1875/1875 [00:11<00:00, 160.97it/s]
  1%|          | 17/1875 [00:00<00:11, 167.29it/s]


Epochs: 1/10 Loss: 0.113, Accuracy: 0.938



100%|██████████| 1875/1875 [00:11<00:00, 163.87it/s]
  1%|          | 17/1875 [00:00<00:11, 166.21it/s]


Epochs: 2/10 Loss: 0.020, Accuracy: 1.000



100%|██████████| 1875/1875 [00:11<00:00, 162.44it/s]
  1%|          | 16/1875 [00:00<00:11, 156.93it/s]


Epochs: 3/10 Loss: 0.110, Accuracy: 0.969



100%|██████████| 1875/1875 [00:11<00:00, 163.70it/s]
  1%|          | 17/1875 [00:00<00:11, 162.85it/s]


Epochs: 4/10 Loss: 0.227, Accuracy: 0.938



100%|██████████| 1875/1875 [00:11<00:00, 162.86it/s]
  1%|          | 17/1875 [00:00<00:11, 162.21it/s]


Epochs: 5/10 Loss: 0.004, Accuracy: 1.000



100%|██████████| 1875/1875 [00:11<00:00, 162.04it/s]
  1%|          | 17/1875 [00:00<00:11, 162.29it/s]


Epochs: 6/10 Loss: 0.002, Accuracy: 1.000



100%|██████████| 1875/1875 [00:11<00:00, 163.54it/s]
  1%|          | 17/1875 [00:00<00:11, 164.74it/s]


Epochs: 7/10 Loss: 0.102, Accuracy: 0.969



100%|██████████| 1875/1875 [00:11<00:00, 163.36it/s]
  1%|          | 17/1875 [00:00<00:11, 164.93it/s]


Epochs: 8/10 Loss: 0.032, Accuracy: 0.969



100%|██████████| 1875/1875 [00:11<00:00, 162.06it/s]
  1%|          | 17/1875 [00:00<00:11, 167.87it/s]


Epochs: 9/10 Loss: 0.003, Accuracy: 1.000



100%|██████████| 1875/1875 [00:11<00:00, 164.67it/s]


Epochs: 10/10 Loss: 0.001, Accuracy: 1.000






### Evaluating the `LSTM` model.



In [146]:
print("Train acc: ", check_accuracy(train_set, net).item())
print("Test acc: ", check_accuracy(test_set, net).item())

Train acc:  0.9919833540916443
Test acc:  0.9869999885559082


### Bidirectional `RNN`
To make a recurrent `NN` bidirectional we only need to pass the keyword argument `bidirectional=True` and tweek a litle bit in our `forward` method the `h_0` and `c_0` if it is a `LSTM` RNN otherwise we only change `h_0` for other RNN and then finnaly change the output layer `in_featurers` to be `2 * hidden_size`.

```python
...

nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
self.fc = nn.Linear(hidden_size * 2, 10)
....
def forward(self, x):
  h_0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
                    #(    number_of_layers, batch_size, hiden_size)
    c_0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
```
> Where `2` are `num_directions` number of directions.

In [159]:
# Hyper parameters
input_size = 28 
sequence_length = 28 
hidden_size = 128
num_layers = 2
num_directions = 2

class Bidirectional_LSTM(nn.Module):
  def __init__(self, input_size, hidden_size, num_layers, num_directions):
    super(Bidirectional_LSTM, self).__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.num_directions = num_directions

    self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                        num_layers=num_layers, batch_first=True, bidirectional=True)
    self.fc = nn.Linear(hidden_size * num_directions, 10)

  def forward(self, x):
    h_0 = torch.zeros(self.num_layers * num_directions, x.size(0), self.hidden_size).to(device)
                    #(    number_of_layers * num_directions, batch_size, hiden_size)
    c_0 = torch.zeros(self.num_layers * num_directions, x.size(0), self.hidden_size).to(device)

    output, _ = self.lstm(x, (h_0, c_0))
    output = output[:, -1, :]

    return self.fc(output)

net = Bidirectional_LSTM(input_size, hidden_size, num_layers, num_directions).to(device)
net

Bidirectional_LSTM(
  (lstm): LSTM(28, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=10, bias=True)
)

In [160]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

### Trainning the `Bidirectional LSTM`

In [161]:
EPOCHS = 10
for epoch in range(EPOCHS):
  for X, y in tqdm(train_set):
    X = X.to(device).reshape(-1, sequence_length, input_size)
    y = y.to(device)
    # forward pass
    output = net(X).to(device)
    y_pred = torch.argmax(output, dim=1)
    # loss
    loss = criterion(output, y)
    #backward pass
    loss.backward()
    # update the weights
    optimizer.step()
    optimizer.zero_grad()
  acc = accuracy(y, y_pred)
  print(f"\nEpochs: {epoch+1}/{EPOCHS} Loss: {loss.item():.3f}, Accuracy: {acc:.3f}\n")

100%|██████████| 1875/1875 [00:16<00:00, 112.07it/s]
  1%|          | 12/1875 [00:00<00:16, 112.98it/s]


Epochs: 1/10 Loss: 0.342, Accuracy: 0.906



100%|██████████| 1875/1875 [00:16<00:00, 113.31it/s]
  1%|          | 12/1875 [00:00<00:16, 110.22it/s]


Epochs: 2/10 Loss: 0.039, Accuracy: 1.000



100%|██████████| 1875/1875 [00:16<00:00, 114.28it/s]
  1%|          | 11/1875 [00:00<00:17, 108.82it/s]


Epochs: 3/10 Loss: 0.009, Accuracy: 1.000



100%|██████████| 1875/1875 [00:16<00:00, 114.01it/s]
  1%|          | 11/1875 [00:00<00:17, 107.77it/s]


Epochs: 4/10 Loss: 0.003, Accuracy: 1.000



100%|██████████| 1875/1875 [00:16<00:00, 113.13it/s]
  1%|          | 11/1875 [00:00<00:17, 107.81it/s]


Epochs: 5/10 Loss: 0.039, Accuracy: 0.969



100%|██████████| 1875/1875 [00:16<00:00, 113.18it/s]
  1%|          | 12/1875 [00:00<00:16, 114.72it/s]


Epochs: 6/10 Loss: 0.005, Accuracy: 1.000



100%|██████████| 1875/1875 [00:16<00:00, 111.65it/s]
  1%|          | 12/1875 [00:00<00:16, 115.71it/s]


Epochs: 7/10 Loss: 0.038, Accuracy: 1.000



100%|██████████| 1875/1875 [00:16<00:00, 114.25it/s]
  1%|          | 12/1875 [00:00<00:16, 115.00it/s]


Epochs: 8/10 Loss: 0.003, Accuracy: 1.000



100%|██████████| 1875/1875 [00:16<00:00, 112.32it/s]
  1%|          | 11/1875 [00:00<00:18, 102.69it/s]


Epochs: 9/10 Loss: 0.039, Accuracy: 0.969



100%|██████████| 1875/1875 [00:16<00:00, 112.37it/s]


Epochs: 10/10 Loss: 0.064, Accuracy: 0.938






### Evaluating the `Bidirectional` NN.

In [162]:
print("Train acc: ", check_accuracy(train_set, net).item())
print("Test acc: ", check_accuracy(test_set, net).item())

Train acc:  0.9943833351135254
Test acc:  0.9868999719619751


### Implementing achitecture in `pytorch`.

`RNN`

```python
                                   
        [ input ]               [ hidden ] <-------|
            |                        |             |
            |______           _______|             |         
                   [ combined]                     |
            __________|    |__________             |
            |                         |            |
            |                         |            |
         [ i2o ]                    [ i2h ]        |
            |                          |           |
            |                          |----->-----|
        [ softmax ]
            |
            |
        [ output ]

key:
i2o = input to output
i2h = input to hidden
```


In [178]:
# Hyper parameters
input_size = 28 
sequence_length = 28 
hidden_size = 128
num_layers = 2
num_directions = 2

class RNN(nn.Module):
  def __init__(self, input_size, hidden_size):
    super().__init__()
    self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
    self.i2o = nn.Linear(input_size + hidden_size, 10)
    self.softmax = nn.LogSoftmax(dim=1)
    self.hidden_size = hidden_size

  def forward(self, input_tensor):
    combined = torch.cat((input_tensor, torch.zeros_like(input_tensor)), 1)

    hidden = self.i2h(combined)
    output = self.i2o(combined)
    output = self.softmax(output)
    return output, hidden
net = RNN(input_size, hidden_size)
net

RNN(
  (i2h): Linear(in_features=156, out_features=128, bias=True)
  (i2o): Linear(in_features=156, out_features=10, bias=True)
  (softmax): LogSoftmax(dim=1)
)