In [3]:
import io
import os
import unicodedata
import string
import glob
import random

In [12]:
import torch

### Utils

In [4]:
# alphabet small + capital letters + " .,;'"
ALL_LETTERS = string.ascii_letters + " .,;'"
N_LETTERS = len(ALL_LETTERS)

In [5]:
# trun a unicode string to plain ASCII
def unicode_to_ascii(s):
    return ''.join (
        c for c in unicodedata.normalize('NFC', s)
        if unicodedata.category(c) != 'Mn'
        and c in ALL_LETTERS
    )


In [10]:
def load_data():
    # bulid the category_lines dictionary and a list of names per language
    category_lines = {}
    all_categories = []

    def find_files(path):
        return glob.glob(path)

    # read a file and split into lines
    def read_line(filename):
        lines = io.open(filename, encoding='utf-8').read().strip().split('\n')
        return [unicode_to_ascii(line) for line in lines]

    file_path = "data/names/*.txt"
    for filename in find_files(file_path):
        category = os.path.splitext(os.path.basename(filename))[0]
        all_categories.append(category)

        lines = read_line(filename)
        category_lines[category] = lines

    return category_lines, all_categories


In [7]:
print(ALL_LETTERS)
print(unicode_to_ascii('Ślusàrski'))

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'
lusrski


In [11]:
category_lines, all_categories = load_data()
print(category_lines['Italian'][:5])

['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']


In [14]:
# find letter index from all_lettes, (a = 0, b = 1...)
def letter_to_index(letter):
    return ALL_LETTERS.find(letter)

print(letter_to_index('j'))

9


In [17]:
# turn a letter into a (1 x n_letters) Tensor
def letter_to_tensor(letter):
    tensor = torch.zeros(1, N_LETTERS)
    tensor[0][letter_to_index(letter)] = 1
    return tensor

print(letter_to_tensor('b'))

tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])


In [33]:
# trun a line into a (line-length x 1 x n_letters)
# or an array of one-hot-encoding

def line_to_tensor(line):
    tensor = torch.zeros(len(line), 1, N_LETTERS)
    for i, letter in enumerate(line):
        tensor[i][0][letter_to_index(letter)] = 1

    return tensor


print(line_to_tensor('python').size())

torch.Size([6, 1, 57])


In [34]:
def random_training_example(category_lines, all_categories):
    
    def random_choice(a):
        random_idx = random.randint(0, len(a) - 1)
        return a[random_idx]
    
    category = random_choice(all_categories)
    line = random_choice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = line_to_tensor(line)
    return category, line, category_tensor, line_tensor


# RNN