### An example of a bidirectional RNN (LSTM) on the MNIST dataset

In [None]:
import torch 
import torchvision
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Input geometry: number of time steps per sample and features per time step.
sequence_length, input_size = 28, 28

# Network dimensions: hidden units per direction, stacked layers, output classes.
hidden_size, num_layers, num_classes = 128, 2, 10

# Optimisation settings: mini-batch size, passes over the data, Adam step size.
batch_size, num_epochs, learning_rate = 100, 2, 0.003

# MNIST training split; download=True fetches the files into data/ when needed.
# ToTensor converts every image into a tensor.
train_dataset = torchvision.datasets.MNIST(
    root='data/',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)

# MNIST test split, read from the same root directory.
test_dataset = torchvision.datasets.MNIST(
    root='data/',
    train=False,
    transform=transforms.ToTensor(),
)

# Data loaders shuffle the samples and group them into mini-batches.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

'''define the model'''
class biLSTM(nn.Module):
    """Bidirectional, multi-layer LSTM classifier.

    The input is processed as a batch-first sequence; the LSTM output at the
    last time step is mapped to class scores by a linear layer.

    Args:
        input_size: number of features per time step.
        sequence_length: number of time steps per sample. Accepted for
            interface compatibility; the model itself does not use it.
        hidden_size: number of hidden units per direction.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output scores per sample.
    """

    def __init__(self, input_size, sequence_length, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        # Both directions are concatenated in the LSTM output, hence 2 * hidden_size.
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, input):
        """Compute class scores for a batch of sequences.

        Args:
            input: tensor of shape (batch_size, seq_length, input_size).

        Returns:
            Tensor of shape (batch_size, num_classes) with unnormalised scores.
        """
        # Zero initial hidden/cell states, one per layer and direction. They are
        # allocated from `input` so they share its device and dtype; a plain
        # torch.zeros(...) would always live on the CPU and break GPU runs.
        state_shape = (self.num_layers * 2, input.size(0), self.hidden_size)
        h0 = input.new_zeros(state_shape)
        c0 = input.new_zeros(state_shape)

        # output: (batch_size, seq_length, 2 * hidden_size)
        output, _ = self.lstm(input, (h0, c0))

        # Classify from the features of the last time step.
        # NOTE: at this position the backward direction has only processed the
        # final element of the sequence.
        return self.fc(output[:, -1, :])

# Build the network and move its parameters to the selected device.
model = biLSTM(input_size, sequence_length, hidden_size, num_layers, num_classes)
model = model.to(device)

# Cross-entropy between the predicted class scores and the integer labels.
criterion = nn.CrossEntropyLoss()

# Adam updates every model parameter using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training: one optimisation step per mini-batch, for num_epochs passes.
steps_per_epoch = len(train_loader)
for epoch in range(num_epochs):
    for i, (img, label) in enumerate(train_loader):
        # View the batch as (N, 28, 28): a 28-step sequence of 28 features each.
        img = img.reshape(-1, 28, 28).to(device)
        label = label.to(device)

        # Forward pass and loss.
        pred = model(img)
        loss = criterion(pred, label)

        # Clear stale gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report progress every 100 mini-batches.
        if i % 100 == 0:
            print('epoch [{}/{}] step [{}/{}], loss {:.4f}'.format(epoch+1, num_epochs, i+1, steps_per_epoch, loss))

# Measure the accuracy of the trained model on the unseen test set.
true = 0   # number of correctly classified samples
total = 0  # number of samples evaluated
with torch.no_grad():  # no gradients are needed for evaluation
    for img, label in test_loader:
        img = img.reshape(-1, 28, 28).to(device)
        # The labels must be on the same device as the predictions; comparing
        # a CUDA tensor with a CPU tensor raises a RuntimeError.
        label = label.to(device)
        pred = model(img)
        # Index of the highest score along the class dimension.
        _, pred_label = torch.max(pred, 1)
        # .item() keeps the running count a plain Python int.
        true += (pred_label == label).sum().item()
        total += label.size(0)
print('accuracy on the test set is {} percent'.format(100 * float(true)/total))

epoch [1/2] step [1/600], loss 2.2995
epoch [1/2] step [101/600], loss 0.6912
epoch [1/2] step [201/600], loss 0.2863
epoch [1/2] step [301/600], loss 0.2609
epoch [1/2] step [401/600], loss 0.1255
epoch [1/2] step [501/600], loss 0.0634
epoch [2/2] step [1/600], loss 0.0367
epoch [2/2] step [101/600], loss 0.1528
epoch [2/2] step [201/600], loss 0.0911
epoch [2/2] step [301/600], loss 0.1258
epoch [2/2] step [401/600], loss 0.0789
epoch [2/2] step [501/600], loss 0.0816
