## Text Classification Introduction - RNN

#### References
- [PyTorch RNN Introduction](https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html)
- [Karpathy Blog](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)

In [3]:
from io import open
import glob
import os
import math

import unicodedata
import string

import torch
import numpy as np
import random

In [4]:
# Base directory that the dataset glob pattern is joined onto.
DATA_DIR = 'data/names/'

# Reproducibility: seed Python's `random`, PyTorch and NumPy with the same value.
random.seed(42)
torch.manual_seed(42)
np.random.seed(42)

### Dataset Config

####  Data Categories

In [5]:
def findFiles(path):
    """Return the list of filesystem paths matching the glob pattern ``path``."""
    matches = glob.glob(path)
    return matches

# print(findFiles(os.path.join(DATA_DIR, 'names/*.txt')))

# Alphabet of characters kept in names: ASCII letters plus space . , ; '
all_letters = string.ascii_letters + " .,;'"
# Alphabet size; used as the width of the one-hot letter vectors.
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` reduced to the characters found in ``all_letters``.

    The string is NFD-normalized so accented letters split into a base
    character plus combining marks; combining marks (category ``Mn``) and
    any character not present in ``all_letters`` are dropped.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        # Skip combining marks left over from decomposition.
        if unicodedata.category(ch) == 'Mn':
            continue
        # Skip anything outside the supported alphabet.
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

# print(unicodeToAscii('Ślusàrski'))

# category_lines maps each category (language) name to its list of names;
# all_categories keeps the category names in the order they were loaded,
# so a category's position in this list is its class index.
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    """Read ``filename`` as UTF-8 and return its lines converted to ASCII.

    The file content is stripped of leading/trailing whitespace, split on
    newlines, and every line is passed through ``unicodeToAscii``.
    """
    # The context manager closes the handle even if reading fails; the
    # previous bare open(...).read() left the file open until garbage collection.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# glob returns matches in arbitrary, filesystem-dependent order. Sorting keeps
# the category -> index mapping stable across machines, which the fixed random
# seeds need in order to give reproducible runs.
for filename in sorted(findFiles(os.path.join(DATA_DIR, 'names/*.txt'))):
    # Category name is the file name without its directory or extension.
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

# Number of output classes for the models.
n_categories = len(all_categories)

#### One Hot Encoding

In [6]:
# Map a letter to its position in all_letters, e.g. "a" = 0
def letterToIndex(letter):
    """Return the index of ``letter`` within ``all_letters``.

    Uses ``str.find``, so a letter that is not present yields -1.
    """
    position = all_letters.find(letter)
    return position

# Demonstration helper: encode one letter as a <1 x n_letters> one-hot Tensor
def letterToTensor(letter):
    """Return a ``1 x n_letters`` tensor with a 1 at the letter's index.

    The column comes from ``letterToIndex``; an unknown letter gives -1 there,
    which addresses the last column.
    """
    one_hot = torch.zeros(1, n_letters)
    column = letterToIndex(letter)
    one_hot[0, column] = 1
    return one_hot

# Encode a line as a <line_length x 1 x n_letters> Tensor,
# i.e. a sequence of one-hot letter vectors
def lineToTensor(line):
    """Return a ``len(line) x 1 x n_letters`` tensor of one-hot letter rows.

    Row ``k`` has a single 1 at the index ``letterToIndex`` reports for the
    k-th character of ``line``. An empty line yields a tensor whose first
    dimension is 0.
    """
    encoded = torch.zeros(len(line), 1, n_letters)
    for position, ch in enumerate(line):
        column = letterToIndex(ch)
        encoded[position, 0, column] = 1
    return encoded

# Sanity check: print the shape of the encoding of 'Jones' (<5 x 1 x n_letters>).
print('5 letters, each vector has a single row, and 57 symbols', lineToTensor('Jones').size())

5 letters, each vector has a single row, and 57 symbols torch.Size([5, 1, 57])


In [7]:
def categoryFromOutput(output):
    """Translate a model output into ``(category_name, category_index)``.

    Takes the index of the single largest entry along the last dimension of
    ``output`` (first row) and looks the name up in ``all_categories``.
    """
    _values, indices = output.topk(1)
    best = indices[0].item()
    label = all_categories[best]
    return label, best

### Defining the Model

In [8]:
import torch.nn as nn

In [9]:
class TutRNN(nn.Module):
    """Single-step recurrent cell built from two ``nn.Linear`` layers.

    Each step concatenates the current input with the previous hidden state
    and feeds the result to two independent linear layers: ``i2h`` produces
    the next hidden state and ``i2o`` produces the class scores, which are
    turned into log-probabilities with ``LogSoftmax``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        # Initialise nn.Module so sub-layers get registered as parameters.
        super().__init__()

        self.hidden_size = hidden_size

        # Both layers read the concatenation [input, hidden].
        joined_size = input_size + hidden_size
        self.i2h = nn.Linear(joined_size, hidden_size)
        self.i2o = nn.Linear(joined_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one time step; return ``(log_probs, next_hidden)``."""
        # Join input and hidden side by side along the feature dimension.
        joined = torch.cat([input, hidden], dim=1)
        next_hidden = self.i2h(joined)
        log_probs = self.softmax(self.i2o(joined))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero ``1 x hidden_size`` initial hidden state."""
        return torch.zeros(1, self.hidden_size)


In [10]:
class ScratchRNN(nn.Module):
    """Recurrent cell written with explicit weight matrices and matmuls.

    Parameters: ``Wxh`` (input -> hidden), ``Whh`` (hidden -> hidden), ``bh``
    (hidden bias), ``Why`` (hidden -> output) and ``by`` (output bias).
    Note that the hidden update applies no nonlinearity (no ``tanh``).
    """

    def __init__(self, input_size, hidden_size, output_size):
        # Initialise nn.Module so the nn.Parameter attributes get registered.
        super().__init__()

        self.hidden_size = hidden_size

        # Weights start as small random values and biases as zeros; the
        # original author notes that otherwise runs produced a bunch of nan.
        scale = 0.01
        self.Wxh = nn.Parameter(torch.randn(input_size, hidden_size) * scale)

        self.Whh = nn.Parameter(torch.randn(hidden_size, hidden_size) * scale)
        self.bh = nn.Parameter(torch.zeros(1, hidden_size))

        self.Why = nn.Parameter(torch.randn(hidden_size, output_size) * scale)
        self.by = nn.Parameter(torch.zeros(1, output_size))

        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one time step; return ``(log_probs, next_hidden)``."""
        # With only the input term (input @ Wxh + bh) this would be a plain
        # multi-layer perceptron; the hidden @ Whh term is what carries
        # context over from the previous step.
        from_input = torch.matmul(input, self.Wxh)
        from_memory = torch.matmul(hidden, self.Whh)
        next_hidden = from_input + from_memory + self.bh

        scores = torch.matmul(next_hidden, self.Why) + self.by
        log_probs = self.softmax(scores)

        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero ``1 x hidden_size`` initial hidden state."""
        return torch.zeros(1, self.hidden_size)

### Training

#### Helper Methods

In [11]:
def randomChoice(l):
    """Return one element of the sequence ``l`` picked uniformly at random."""
    last = len(l) - 1
    pick = random.randint(0, last)
    return l[pick]

def randomTrainingExample():
    """Draw a random (category, name) pair together with its tensors.

    Returns ``(category, line, category_tensor, line_tensor)``: the category
    name, a name drawn from that category, a 1-element long tensor holding
    the category's index in ``all_categories``, and the one-hot encoding of
    the name from ``lineToTensor``.
    """
    # Order matters for the random stream: category first, then the line.
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])

    label_index = all_categories.index(category)
    category_tensor = torch.tensor([label_index], dtype=torch.long)
    line_tensor = lineToTensor(line)

    return category, line, category_tensor, line_tensor

In [12]:
criterion = nn.NLLLoss() # negative log-likelihood loss: expects log-probabilities, which is what the models' LogSoftmax layer outputs

#### Training Note
Note that we feed every letter of a name through the network to get a final hidden state (and output) before we run backpropagation. In other words, this is how we account for variable-length inputs.

Note too that the hidden state is reset to zeros at the start of each training example; it exists on a per-input basis.

In [13]:
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn

def train(rnn, category_tensor, line_tensor):
    """Run one training step on a single example with manual gradient descent.

    The letters in ``line_tensor`` are fed to ``rnn`` one at a time, starting
    from ``rnn.initHidden()``. The loss is computed on the output of the last
    step only, back-propagated, and every parameter is updated in place with
    a plain ``p -= learning_rate * p.grad`` step.

    Args:
        rnn: module exposing ``initHidden()``, ``zero_grad()``,
            ``parameters()`` and a ``(input, hidden)`` call.
        category_tensor: target class index tensor passed to ``criterion``.
        line_tensor: sequence tensor iterated along its first dimension.

    Returns:
        ``(output, loss_value)``: the output of the final step and the loss
        as a Python float.

    Raises:
        ValueError: if ``line_tensor`` has no steps (there would be no output
            to compute a loss on).
    """
    if line_tensor.size(0) == 0:
        raise ValueError("line_tensor must contain at least one letter")

    hidden = rnn.initHidden()
    rnn.zero_grad()

    # Only the output of the last letter is used for the loss.
    for letter_tensor in line_tensor:
        output, hidden = rnn(letter_tensor, hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # Manual SGD step (this is what an optimizer would do). no_grad keeps the
    # in-place update out of the autograd graph without going through `.data`.
    with torch.no_grad():
        for p in rnn.parameters():
            # Parameters that did not take part in the forward pass have no grad.
            if p.grad is None:
                continue
            p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()

In [14]:
# Just return an output given a line
def evaluate(rnn, line_tensor):
    """Run ``rnn`` over every step of ``line_tensor`` and return the last output.

    The hidden state starts from ``rnn.initHidden()``. Inference runs under
    ``torch.no_grad()`` so no autograd graph is built; the returned tensor
    therefore does not require grad.

    Raises:
        ValueError: if ``line_tensor`` has no steps (there would be no output).
    """
    if line_tensor.size(0) == 0:
        raise ValueError("line_tensor must contain at least one letter")

    hidden = rnn.initHidden()

    with torch.no_grad():
        for letter_tensor in line_tensor:
            output, hidden = rnn(letter_tensor, hidden)

    return output

#### Training Note 2
- GPU acceleration for an RNN is much smaller than for a CNN, likely because operations like conv2d are much more parallelisable than a simple feedforward linear net.

In [15]:
1/n_categories # accuracy of guessing uniformly at random among the categories; the models should score way higher than this

0.05555555555555555

#### Rnn1 (Rnn Built in Tutorial Using Torch Linear)

In [None]:
n_iters = 10000  # examples drawn per phase
epochs = 2
n_hidden = 128   # size of the hidden state
rnn = TutRNN(n_letters, n_hidden, n_categories)

# rnn.to(device)
for ep in range(epochs):
    # Each epoch runs a training phase followed by a validation phase.
    # Note that "val" draws from randomTrainingExample(), i.e. the same pool
    # of names as training; it is not a held-out set.
    for mode in ["train", "val"]:
        # Reset per phase, so the value printed below counts "val" hits only.
        num_correct = 0
        # `_` instead of `iter`: the index is unused, and binding `iter` at
        # module level would shadow the builtin for the rest of the session.
        for _ in range(n_iters):
            category, line, category_tensor, line_tensor = randomTrainingExample()
            if mode == "train":
                output, loss = train(rnn, category_tensor, line_tensor)
            elif mode == "val":
                output = evaluate(rnn, line_tensor)
                guess, guess_i = categoryFromOutput(output)
                if guess == category:
                    num_correct += 1
    # The reported number is a fraction in [0, 1], not a percentage.
    print("Percent Correct {} on Epoch {}".format(num_correct/n_iters, ep+1))

#### Rnn2 (Rnn built using matrix multiplication)

In [16]:
n_iters = 10000  # examples drawn per phase
epochs = 2
n_hidden = 128   # size of the hidden state
rnn = ScratchRNN(n_letters, n_hidden, n_categories)

# rnn.to(device)

# NOTE (original author's observation, not verified here): moving the model
# to CUDA changed the computational graph in such a way that the manual
# parameter update in train() did not work, so rnn.to(cuda) is not used.
for ep in range(epochs):
    # Each epoch runs a training phase followed by a validation phase.
    # Note that "val" draws from randomTrainingExample(), i.e. the same pool
    # of names as training; it is not a held-out set.
    for mode in ["train", "val"]:
        # Reset per phase, so the value printed below counts "val" hits only.
        num_correct = 0
        # `_` instead of `iter`: the index is unused, and binding `iter` at
        # module level would shadow the builtin for the rest of the session.
        for _ in range(n_iters):
            category, line, category_tensor, line_tensor = randomTrainingExample()
            if mode == "train":
                output, loss = train(rnn, category_tensor, line_tensor)
            elif mode == "val":
                output = evaluate(rnn, line_tensor)
                guess, guess_i = categoryFromOutput(output)
                if guess == category:
                    num_correct += 1
    # The reported number is a fraction in [0, 1], not a percentage.
    print("Percent Correct {} on Epoch {}".format(num_correct/n_iters, ep+1))

Percent Correct 0.21 on Epoch 1
Percent Correct 0.2396 on Epoch 2
