## Text - Gated Neural Nets (LSTM, GRU)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter as P

### Reference
Same RNNCell as built previously in the Text Classification introduction notebook.

In [None]:
class RNNCell(nn.Module):
    """Single-step vanilla recurrent cell.

    One call to ``forward`` computes
    ``tanh(input @ Wxh + hidden @ Whh + bh)`` and returns it as the new
    hidden state.
    """

    def __init__(self, input_size, hidden_size):
        # initialise nn.Module first so the Parameters below get registered
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size

        def small_normal(rows):
            # standard-normal weights scaled down by 0.01
            return P(torch.randn(rows, hidden_size) * 0.01)

        # input -> hidden, then hidden -> hidden (drawn in this order)
        self.Wxh = small_normal(input_size)
        self.Whh = small_normal(hidden_size)

        # bias, one row of zeros
        self.bh = P(torch.zeros(1, hidden_size))

    def forward(self, input, hidden):
        """Return the next hidden state for one input step."""
        from_input = torch.matmul(input, self.Wxh)

        # Without this recurrent term the cell would just be one layer of a
        # multi-layer perceptron; adding it is what brings in the context
        # carried by the previous hidden state.
        from_hidden = torch.matmul(hidden, self.Whh)

        return torch.tanh(from_input + from_hidden + self.bh)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, hidden_size)."""
        return torch.zeros((1, self.hidden_size))

### About
RNNs work by feeding forward the hidden state. This is to account for variable length input. Information from the previous input is fed forward, thus allowing for some sense of context. 

The question, then, is how much can be remembered. The limitation of a classical RNN's memory is tied to its [gradient](http://ai.dinfo.unifi.it/paolo//ps/tnn-94-gradient.pdf): as the error signal is propagated back through many time steps it tends to vanish (or explode), which makes long-range dependencies hard to learn.

### LSTM
In an [LSTM](http://www.bioinf.jku.at/publications/older/2604.pdf) we first calculate 3 gates
$$i_t = \sigma(W_{ii}x_t + b_{ii} + W_{hi}h_{t-1} + b_{hi})$$
$$f_t = \sigma(W_{if}x_t + b_{if} + W_{hf}h_{t-1} + b_{hf})$$
$$o_t = \sigma(W_{io}x_t + b_{io} + W_{ho}h_{t-1} + b_{ho})$$
Then the candidate cell state
$$g_t = \tanh(W_{ig}x_t + b_{ig} + W_{hg}h_{t-1} + b_{hg})$$

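Then, from these 3 gates and the candidate $g_t$, we derive the 2 outputs: the new cell state $c_t$ and the new hidden state $h_t$ (the products below are elementwise)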
$$c_t = f_tc_{t-1} + i_tg_t$$
$$h_t = o_t\tanh(c_t)$$
[Code Reference](https://github.com/pytorch/benchmark/blob/master/rnns/benchmarks/lstm_variants/lstm.py)

In [None]:
class LSTMCell(nn.Module):
    """Single-step LSTM cell with one bias vector per gate.

    ``forward`` takes the current input and the previous ``(h, c)`` pair and
    returns ``h_t, (h_t, c_t)``.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size

        def small_normal(rows):
            # standard-normal weights scaled down by .01
            return P(torch.randn(rows, hidden_size) * .01)

        def zero_bias():
            return P(torch.zeros(1, hidden_size))

        # input to hidden: input gate, forget gate, output gate, candidate
        self.Wxi = small_normal(input_size)
        self.Wxf = small_normal(input_size)
        self.Wxo = small_normal(input_size)
        self.Wxc = small_normal(input_size)

        # hidden to hidden, same gate order
        self.Whi = small_normal(hidden_size)
        self.Whf = small_normal(hidden_size)
        self.Who = small_normal(hidden_size)
        self.Whc = small_normal(hidden_size)

        # bias, same gate order
        self.bi = zero_bias()
        self.bf = zero_bias()
        self.bo = zero_bias()
        self.bc = zero_bias()

    def forward(self, input, hidden):
        """Advance the cell by one step; `hidden` is the previous (h, c)."""
        h_prev, c_prev = hidden

        def affine(w_x, w_h, bias):
            # linear map of (input, previous hidden) into the hidden space
            return input @ w_x + h_prev @ w_h + bias

        # the three gates are squashed into (0, 1) by a sigmoid
        i_t = torch.sigmoid(affine(self.Wxi, self.Whi, self.bi))
        f_t = torch.sigmoid(affine(self.Wxf, self.Whf, self.bf))
        o_t = torch.sigmoid(affine(self.Wxo, self.Who, self.bo))

        # the candidate cell content goes through tanh instead
        g_t = torch.tanh(affine(self.Wxc, self.Whc, self.bc))

        # `*` is elementwise multiplication here, not matrix multiplication
        c_t = c_prev * f_t + i_t * g_t
        h_t = o_t * torch.tanh(c_t)

        return h_t, (h_t, c_t)

    def initHidden(self):
        """Return an all-zero (h, c) pair, each of shape (1, hidden_size)."""
        h0 = torch.zeros(1, self.hidden_size)
        c0 = torch.zeros(1, self.hidden_size)
        return h0, c0

### Comparison Point
A visual comparison of these 2 architectures can be found [here](https://web.eecs.umich.edu/~justincj/papers/understanding-rnns/KarpathyICLR2016.pdf)

### Dataset

In [None]:
from io import open
import glob
import os
import math

import unicodedata
import string

import numpy as np
import random

In [None]:
DATA_DIR = '../data/names/'

# reproducibility: give every random number generator used here
# (Python's `random`, PyTorch, NumPy) the same fixed seed
for seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    seed_rng(42)

#### Names Dataset
List of names from 18 different countries.

In [None]:
def findFiles(path):
    """Return a list of the paths matching the glob pattern `path`."""
    return list(glob.iglob(path))

# print(findFiles(os.path.join(DATA_DIR, 'names/*.txt')))

# Alphabet of the model: ASCII letters plus a few punctuation characters
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip accents from `s` and drop every character not in `all_letters`."""
    # NFD splits an accented character into its base character followed by
    # combining marks (category 'Mn'), which are then discarded
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

# print(unicodeToAscii('Ślusàrski'))

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    """Return the lines of the UTF-8 text file `filename`, folded to ASCII.

    Leading/trailing whitespace of the whole file is stripped before the
    split, and every line is passed through `unicodeToAscii`.
    """
    # the context manager closes the handle even if reading or decoding
    # fails (the bare open(...).read() left that to the garbage collector)
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# glob returns its matches in arbitrary, filesystem-dependent order; sorting
# keeps the category -> index mapping (and therefore the seeded sampling)
# stable between runs and machines.
# NOTE(review): DATA_DIR already ends in 'names/', so this pattern resolves to
# '<DATA_DIR>names/*.txt' — confirm that matches the on-disk layout.
for filename in sorted(findFiles(os.path.join(DATA_DIR, 'names/*.txt'))):
    # the category is the file name without directory or extension
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

#### One Hot Encoding

In [None]:
# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    """Return the position of `letter` in `all_letters` (-1 if absent)."""
    position = all_letters.find(letter)
    return position

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    """Return a (1, n_letters) one-hot tensor for a single letter."""
    one_hot = torch.zeros(1, n_letters)
    one_hot[0, letterToIndex(letter)] = 1
    return one_hot

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    """Return a (len(line), 1, n_letters) tensor with one one-hot row per letter."""
    one_hot = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        one_hot[position, 0, letterToIndex(letter)] = 1
    return one_hot
        
def categoryFromOutput(output):
    """Return ``(category_name, category_index)`` for the top-scoring class.

    Only the first entry of the top-1 indices is read, so `output` is
    presumably a single-example score tensor — TODO confirm with callers.
    """
    _scores, best = output.topk(1)
    category_i = best[0].item()
    name = all_categories[category_i]
    return name, category_i

#### Dataloader

In [None]:
def randomChoice(l):
    """Return one element of the sequence `l`, chosen uniformly at random."""
    last = len(l) - 1
    pick = random.randint(0, last)
    return l[pick]

def randomTrainingExample():
    """Draw a random (category, line) pair and its tensor encodings.

    Returns ``(category, line, category_tensor, line_tensor)``; the category
    is drawn first, then a line from that category.
    """
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])

    # class label as a 1-element long tensor
    label = all_categories.index(category)
    category_tensor = torch.tensor([label], dtype=torch.long)

    return category, line, category_tensor, lineToTensor(line)

### Classifier

In [None]:
class RNNClassifier(nn.Module):
    """`RNNCell` followed by a linear layer and a log-softmax over classes."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        # recurrent cell first, then the classification head
        self.rnn = RNNCell(input_size, hidden_size)
        self.classifier = nn.Linear(in_features=hidden_size, out_features=output_size)

        # normalises the class scores along dim 1 into log-probabilities
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step; return ``(log_probs, next_hidden)``."""
        next_hidden = self.rnn(input, hidden)
        log_probs = self.softmax(self.classifier(next_hidden))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, hidden_size)."""
        return torch.zeros((1, self.hidden_size))

In [None]:
class LSTMClassifier(nn.Module):
    """`LSTMCell` followed by a linear layer and a log-softmax over classes."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        # recurrent cell first, then the classification head
        self.lstm = LSTMCell(input_size, hidden_size)
        self.classifier = nn.Linear(in_features=hidden_size, out_features=output_size)

        # normalises the class scores along dim 1 into log-probabilities
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step; `hidden` is (h, c). Return ``(log_probs, (h, c))``."""
        features, next_state = self.lstm(input, hidden)
        log_probs = self.softmax(self.classifier(features))
        return log_probs, next_state

    def initHidden(self):
        """Return an all-zero (h, c) pair, each of shape (1, hidden_size)."""
        h0 = torch.zeros(1, self.hidden_size)
        c0 = torch.zeros(1, self.hidden_size)
        return h0, c0

### Training

In [None]:
import torch.optim as optim

#### RNN Classifier

In [None]:
# iterations per pass, number of epochs, hidden state width
n_iters, epochs, n_hidden = 10000, 2, 128

rnn = RNNClassifier(input_size=n_letters, hidden_size=n_hidden, output_size=n_categories)

# NLLLoss takes log-probabilities as input, which is what the classifier's
# final LogSoftmax layer produces
criterion = nn.NLLLoss()
optimizer = optim.SGD(rnn.parameters(), lr=0.001, momentum=0.9)

In [None]:
for ep in range(epochs):
    # ---- training pass: one SGD step per randomly drawn name ----
    for step in range(n_iters):
        category, line, category_tensor, line_tensor = randomTrainingExample()

        hidden = rnn.initHidden()
        optimizer.zero_grad()

        # feed the name one character at a time; only the output produced
        # after the last character enters the loss
        for char_tensor in line_tensor:
            output, hidden = rnn(char_tensor, hidden)

        loss = criterion(output, category_tensor)
        loss.backward()
        optimizer.step()

    # ---- validation pass: same sampler as training, no gradient tracking ----
    num_correct = 0
    for step in range(n_iters):
        category, line, category_tensor, line_tensor = randomTrainingExample()

        with torch.no_grad():
            hidden = rnn.initHidden()
            for char_tensor in line_tensor:
                output, hidden = rnn(char_tensor, hidden)

            guess, guess_i = categoryFromOutput(output)

        if guess == category:
            num_correct += 1

    # fraction of validation samples classified correctly in this epoch
    print("Percent Correct {} on Epoch {}".format(num_correct/n_iters, ep+1))

#### LSTM Classifier

In [None]:
# iterations per pass, number of epochs, hidden state width
n_iters, epochs, n_hidden = 10000, 2, 128

lstm = LSTMClassifier(input_size=n_letters, hidden_size=n_hidden, output_size=n_categories)

# NLLLoss takes log-probabilities as input, which is what the classifier's
# final LogSoftmax layer produces
criterion = nn.NLLLoss()
optimizer = optim.SGD(lstm.parameters(), lr=0.001, momentum=0.9)

In [None]:
for ep in range(epochs):
    # ---- training pass: one SGD step per randomly drawn name ----
    for step in range(n_iters):
        category, line, category_tensor, line_tensor = randomTrainingExample()

        hidden = lstm.initHidden()
        optimizer.zero_grad()

        # feed the name one character at a time; only the output produced
        # after the last character enters the loss
        for char_tensor in line_tensor:
            output, hidden = lstm(char_tensor, hidden)

        loss = criterion(output, category_tensor)
        loss.backward()
        optimizer.step()

    # ---- validation pass: same sampler as training, no gradient tracking ----
    num_correct = 0
    for step in range(n_iters):
        category, line, category_tensor, line_tensor = randomTrainingExample()

        with torch.no_grad():
            hidden = lstm.initHidden()
            for char_tensor in line_tensor:
                output, hidden = lstm(char_tensor, hidden)

            guess, guess_i = categoryFromOutput(output)

        if guess == category:
            num_correct += 1

    # fraction of validation samples classified correctly in this epoch
    print("Percent Correct {} on Epoch {}".format(num_correct/n_iters, ep+1))