# Recurrent Neural Networks 

Use RNNs for NLP. 


ASCII = American Standard Code for Information Interchange 
Ascii is seven bit character encoding standard

In [4]:
from io import open 
import glob 
import os 

def findFiles(path): return glob.glob(path)

file_path = 'data/names/*txt'
print(findFiles(file_path))

import unicodedata 
import string 

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

print(all_letters)
print(n_letters)

['data/names/Czech.txt', 'data/names/German.txt', 'data/names/Arabic.txt', 'data/names/Japanese.txt', 'data/names/Chinese.txt', 'data/names/Vietnamese.txt', 'data/names/Russian.txt', 'data/names/French.txt', 'data/names/Irish.txt', 'data/names/English.txt', 'data/names/Spanish.txt', 'data/names/Greek.txt', 'data/names/Italian.txt', 'data/names/Portuguese.txt', 'data/names/Scottish.txt', 'data/names/Dutch.txt', 'data/names/Korean.txt', 'data/names/Polish.txt']
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'
57


In [3]:
def unicodeToAscii(s: str=None) -> None:
    return ''.join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn" and c in all_letters
    )

print(unicodeToAscii('Ślusàrski'))

Slusarski


build list of names per language 

In [10]:
category_lines = {} # mapping category to list of names from that category/country
all_categories = []

def readLines(filename: str=None) -> list:
    lines = open(filename, encoding="utf-8").read().strip().split("\n")
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles(file_path):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)



['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni', 'Abatescianni', 'Abba', 'Abbadelli', 'Abbascia', 'Abbatangelo']


Turn names into tensors using one-hot-vector 
For example: 
- a = <1 0 0 ... >
- b = <0 1 0 ... >
and so on.

A word is then a 2D matrix:
 <line_length x n_bunch x n_letters>
 where n_bunch is one here.

In [32]:
import torch 

 
def letterToIndex(letter: str=None) -> int:
    """
    find letter index for letter 
    all_letters is the list of all ascii letters
    """
    return all_letters.find(letter)

assert letterToIndex("a") == 0
assert letterToIndex("c") == 2
 
def letterToTensor(letter: str=None) -> torch.Tensor:
    """ 
    turn letter into tensor
    """
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1 # set element to one at correct position
    return tensor 

print(letterToTensor("a"))
print(type(letterToTensor("a")))
assert type(letterToTensor("a")) == torch.Tensor


def lineToTensor(line: str=None) -> torch.Tensor:
    """ 
    transform line into tensor
    a letter is a row (not a column)
    a word has then multiple rows 
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1 
    return tensor 

testword = "abz"
print(lineToTensor(testword))
print(lineToTensor(testword).size()) # expect [3,1,57] for len of word 3, ascii chars 57


tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
<class 'torch.Tensor'>
tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0.,

# RNN 

Let us now create the neural network.

In [39]:
import torch.nn as nn 

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size 
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input,hidden),1)
        hidden = self.i2h(combined)
        output = self.h2o(hidden)
        output = self.softmax(output)
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, self.hidden_size)
    
n_hidden = 128 
rnn = RNN(n_letters, n_hidden, n_categories)


In [44]:
from torchsummary import summary
input = lineToTensor('Albert')
hidden = torch.zeros(1, n_hidden)
summary(rnn(input[0], hidden), letterToTensor("A")[0])


AttributeError: 'RNN' object has no attribute 'h2'