# RNN classification from scratch Pytorch tutorial

We’ll train on a few thousand surnames from 18 languages of origin, and predict which language a name is from based on the spelling

In [1]:
from pathlib import Path
import unicodedata
import string
import torch

In [2]:
def find_files(path): return Path().glob(path)

def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def read_lines(filename):
# Read a file and split into lines
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]

## Previous steps

In [3]:
for file in find_files("rnn_data/names/*.txt"):
    print(file)
    
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
print(n_letters)
print(unicode_to_ascii('Ślusàrski'))

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

for file in find_files("rnn_data/names/*.txt"):
    category = file.name.split(".")[0]
    all_categories.append(category)
    lines = read_lines(file)
    category_lines[category] = lines
    
n_categories = len(all_categories)
print(n_categories)

rnn_data/names/Portuguese.txt
rnn_data/names/Scottish.txt
rnn_data/names/Vietnamese.txt
rnn_data/names/Chinese.txt
rnn_data/names/Czech.txt
rnn_data/names/Italian.txt
rnn_data/names/Korean.txt
rnn_data/names/Spanish.txt
rnn_data/names/Russian.txt
rnn_data/names/Japanese.txt
rnn_data/names/German.txt
rnn_data/names/Arabic.txt
rnn_data/names/French.txt
rnn_data/names/Dutch.txt
rnn_data/names/English.txt
rnn_data/names/Irish.txt
rnn_data/names/Greek.txt
rnn_data/names/Polish.txt
57
Slusarski
18


## Turning names into Tensors

We use a “one-hot vector” of size <1 x n_letters>. A one-hot vector is filled with 0s except for a 1 at index of the current letter, e.g. "b" = <0 1 0 0 0 ...>.

To make a word we join a bunch of those into a 2D matrix <line_length x 1 x n_letters>.
Extra dimension for batch size (using equal to 1)

In [4]:
# Find letter index from all_letters, e.g. "a" = 0
def element_to_index(element, all_elements):
    return all_elements.find(element)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def element_to_tensor(element, num_elements, all_elements):
    tensor = torch.zeros(1, num_elements)
    tensor[0][element_to_index(element, all_elements)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def sequence_to_tensor(sequence, all_elements, num_elements):
    tensor = torch.zeros(len(sequence), 1, num_elements)
    for i, element in enumerate(sequence):
        tensor[i][0][element_to_index(element, all_elements)] = 1
    return tensor

In [5]:
print(all_letters)
print(element_to_index("J",all_letters))
print(element_to_tensor('J', num_elements=n_letters, all_elements=all_letters))
print(sequence_to_tensor('Jones', num_elements=n_letters, all_elements=all_letters))
print(sequence_to_tensor('Jones', num_elements=n_letters, all_elements=all_letters).size())
print("Dimensions of the tensor: <sequence_size, batch_size, num_letters")

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'
35
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

## Creating the RNN model

In [47]:
import torch.nn as nn
import math
import ipdb


class scratch_RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.hidden_size = hidden_size
        self.hidden = self.initHidden()
        # Init the weights and biases
        self.weights_input = nn.Parameter(torch.randn(input_size + hidden_size, hidden_size) / math.sqrt(input_size + hidden_size))
        
        self.bias = nn.Parameter(torch.zeros(hidden_size))

        self.tanh = nn.Tanh()
        self.softmax = nn.LogSoftmax(dim=1) # Output
        
    def forward(self, x):
        outputs = []
        hiddens = []
        for s in range(x.shape[-2]):
            combined = torch.cat((x[s], self.hidden), 0) # combine input and hidden state
            self.hidden = tanh(combined @ self.weights + self.bias)
            hiddens.append(self.hidden)
            output = self.softmax(self.hidden)
            outputs.append(output)
            # returns the output and the hidden state
        return outputs, hiddens

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)


In [48]:
n_hidden = 128
rnn = scratch_RNN(n_letters, n_hidden, n_categories)

In [49]:
t = torch.randn((5,20,10))
rnn_test = scratch_RNN(20,n_hidden,10)

In [50]:
rnn_test
rnn_test(t)

RuntimeError: Sizes of tensors must match except in dimension 0. Got 10 and 128 in dimension 1 (The offending index is 1)