This notebook classifies names by language by building a character-level RNN.

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

## Data Preprocessing 

In [2]:
def get_Files(path):
    """Return the paths matching the glob pattern ``path``, sorted by name.

    ``glob.glob`` returns matches in arbitrary, filesystem-dependent order.
    The order of these files determines the order of the categories built
    from them, so the result is sorted to keep category indices identical
    across machines and runs.

    Args:
        path: A glob pattern such as ``'data/data/names/*.txt'``.

    Returns:
        list of str: matching paths in ascending order (empty if none match).
    """
    return sorted(glob.glob(path))

print(get_Files('data/data/names/*.txt'))

['data/data/names\\Arabic.txt', 'data/data/names\\Chinese.txt', 'data/data/names\\Czech.txt', 'data/data/names\\Dutch.txt', 'data/data/names\\English.txt', 'data/data/names\\French.txt', 'data/data/names\\German.txt', 'data/data/names\\Greek.txt', 'data/data/names\\Irish.txt', 'data/data/names\\Italian.txt', 'data/data/names\\Japanese.txt', 'data/data/names\\Korean.txt', 'data/data/names\\Polish.txt', 'data/data/names\\Portuguese.txt', 'data/data/names\\Russian.txt', 'data/data/names\\Scottish.txt', 'data/data/names\\Spanish.txt', 'data/data/names\\Vietnamese.txt']


The names are stored as Unicode text, so we first convert them to plain ASCII. This removes the diacritics from the words. For example, the French name Béringer will be converted to Beringer.

In [3]:
import unicodedata
import string

# Check what kind of object string.ascii_letters is before building on it
print(type(string.ascii_letters))

<class 'str'>


As you can see, string.ascii_letters is already a plain string (str) containing all the ASCII letters, so we can extend it with a few punctuation characters and use it directly in the function that converts text from Unicode to ASCII.

In [9]:
# Alphabet used throughout the notebook: ASCII letters plus a few punctuation marks.
all_letters = ''.join([string.ascii_letters, ".,;'"])
n_letters = len(all_letters)
print(n_letters)

def unicodeToASCII(s):
    """Strip diacritics from ``s`` and drop every character not in ``all_letters``.

    The string is decomposed with NFD normalisation, which splits an accented
    letter into its base letter followed by combining marks. The combining
    marks (Unicode category ``Mn``) are discarded, and so is anything else
    that is not part of ``all_letters``.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining mark left behind by the decomposition
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

print(unicodeToASCII('Béringer'))

56
Beringer


Now we will create a dictionary that maps each language to the list of all the names associated with that language.


In [5]:
# final_dict maps a category name to its list of names;
# all_categories keeps the category names in the order they were loaded.
final_dict, all_categories = dict(), list()

def readFiles(filename):
    """Read one names file and return its names converted to ASCII.

    Args:
        filename: Path to a UTF-8 encoded text file, split into names on
            newlines.

    Returns:
        list of str: the ASCII form of each line, in file order. Lines that
        are blank, or that become empty once ``unicodeToASCII`` has filtered
        them, are dropped so that no zero-length name reaches the tensor
        conversion later on.
    """
    # The context manager closes the handle promptly instead of leaking it
    with open(filename, encoding='utf-8') as names_file:
        raw_lines = names_file.read().strip().split('\n')
    ascii_names = (unicodeToASCII(line) for line in raw_lines)
    return [name for name in ascii_names if name]

# Load every names file: one category per file, named after the file.
for filename in get_Files('data/data/names/*.txt'):
    # e.g. ".../Arabic.txt" -> "Arabic"
    category, _extension = os.path.splitext(os.path.basename(filename))
    all_categories.append(category)
    lines = readFiles(filename)
    final_dict[category] = lines

n_categories = len(all_categories)

In [6]:
# Spot-check the loaded data: first five names stored under 'Arabic'
print(final_dict['Arabic'][:5])

['Khoury', 'Nahas', 'Daher', 'Gerges', 'Nazari']


To train the neural network we need to convert the characters into tensors. We will use one-hot encoding: each character becomes a vector with a single 1 at the position of that character. A line (a name) is then represented by stacking the one-hot vectors of its characters, that is, by joining multiple vectors into a 2D matrix.


In [8]:
import torch

def letterToIndex(letter):
    """Return the position of ``letter`` in ``all_letters``, or -1 if absent."""
    try:
        return all_letters.index(letter)
    except ValueError:
        # Same "not found" sentinel that str.find uses
        return -1

def letterToTensor(letter):
    """One-hot encode a single letter as a ``(1, n_letters)`` tensor.

    Args:
        letter: The character to encode.

    Returns:
        A float tensor of zeros with a single 1 at the letter's index.
        A letter that is not in the alphabet yields an all-zero tensor.
    """
    tensor = torch.zeros(1, n_letters)
    index = letterToIndex(letter)
    # letterToIndex returns -1 for an unknown letter. Used as an index, -1
    # would wrap around and wrongly mark the last letter of the alphabet.
    if index >= 0:
        tensor[0][index] = 1
    return tensor

def lineToTensor(line):
    """One-hot encode a whole line as a ``(len(line), 1, n_letters)`` tensor.

    Args:
        line: The string to encode, one one-hot slice per character.

    Returns:
        A float tensor whose slice ``li`` has a single 1 at the index of the
        ``li``-th character. Characters that are not in the alphabet are
        left as all-zero slices.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        index = letterToIndex(letter)
        # -1 means "unknown letter"; indexing with it would wrap around and
        # silently encode the character as the last letter of the alphabet.
        if index >= 0:
            tensor[li][0][index] = 1
    return tensor

# Quick check of both encoders: one letter, then a five-letter name
sample_letter = letterToTensor('J')
print(sample_letter)

sample_line = lineToTensor('Jones')
print(sample_line.shape)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])
torch.Size([5, 1, 56])


In PyTorch, tensors are mutable, but in TensorFlow they are not. So, to create a tensor like the one above in TensorFlow, we can first build a NumPy array in this format and then convert that array to a tensor.

In [16]:
import torch.nn as nn

class RNN(nn.Module):
    """Minimal recurrent cell that classifies a sequence one step at a time.

    At every step the current input and the previous hidden state are
    concatenated and fed to two linear layers: ``i2h`` produces the next
    hidden state and ``i2o`` produces the class scores, which are turned
    into log-probabilities by a log-softmax over dim 1.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Number of features in one input step.
            hidden_size: Number of features in the hidden state.
            output_size: Number of output classes.
        """
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        # Both layers consume the concatenated [input, hidden] vector
        combined_size = hidden_size + input_size
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, hidden):
        """Run one step and return ``(log-probabilities, next hidden state)``."""
        combined = torch.cat([inputs, hidden], dim=1)
        next_hidden = self.i2h(combined)
        log_probs = self.softmax(self.i2o(combined))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros(1, self.hidden_size)
    

# Size of the hidden state carried from one character to the next
n_hidden = 128
rnn = RNN(input_size=n_letters, hidden_size=n_hidden, output_size=n_categories)

In [17]:
# Feed a single letter through the network, starting from a zero hidden state
inputs = letterToTensor('A')
outputs, hidden = rnn(inputs, rnn.initHidden())

In [20]:
# Log-probabilities produced by the network's LogSoftmax layer, one per category
print(outputs)

tensor([[-2.8891, -2.8873, -2.9206, -2.8567, -2.8909, -2.9423, -2.9277, -2.8792,
         -2.9101, -2.8580, -2.8541, -2.9434, -2.8125, -2.9714, -2.9975, -2.8699,
         -2.8844, -2.7586]], grad_fn=<LogSoftmaxBackward>)


In [19]:
# Number of categories, i.e. how many names files were loaded
print(n_categories)

18


In [21]:
def categoryFromOutput(output):
    """Return ``(category name, category index)`` for the best-scoring class.

    The index of the single largest entry in the first row of ``output`` is
    looked up in ``all_categories``.
    """
    # topk returns (values, indices); only the index is needed here
    best_index = output.topk(1)[1][0].item()
    return all_categories[best_index], best_index

print(categoryFromOutput(outputs))

('Vietnamese', 17)


In [22]:
# Index of the highest-scoring category along dim 1
print(outputs.argmax(1))

tensor([17])


####  Getting Training Data

In [32]:
import random

def randomNumber(l):
    """Return one element of the sequence ``l`` chosen uniformly at random.

    Despite the name, the result is an element of ``l``, not a number.

    Args:
        l: A non-empty sequence.

    Returns:
        A randomly selected element of ``l``.

    Raises:
        IndexError: If ``l`` is empty.
    """
    return random.choice(l)

def randomExample():
    """Draw one random (category, name) example together with its tensors.

    A category is drawn from ``all_categories`` first and a name is then
    drawn from that category's list in ``final_dict``.

    Returns:
        tuple: ``(category, line, category_tensor, line_tensor)`` where
        ``category_tensor`` holds the position of ``category`` in
        ``all_categories`` and ``line_tensor`` is ``lineToTensor(line)``.
    """
    category = randomNumber(all_categories)
    line = randomNumber(final_dict[category])

    # The network works on tensors, so the label is passed as its index
    category_index = all_categories.index(category)
    return (
        category,
        line,
        torch.tensor([category_index]),
        lineToTensor(line),
    )

In [None]:
# Define the loss for training the neural network.
# NLLLoss takes log-probabilities as input, which is what the RNN's
# LogSoftmax output layer produces.
criterion = nn.NLLLoss()