In [2]:
import numpy as np
import torch
from torch import nn
import requests
import zipfile
from io import BytesIO
import unicodedata
from sklearn.preprocessing import LabelEncoder
import string

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device:', device)

Device: cuda


## Collecting data

In [3]:
# Extract data
# Download the PyTorch tutorial archive and collect every name found under
# data/names*.txt together with its language (taken from the file name).
response = requests.get(
    'https://download.pytorch.org/tutorial/data.zip',
    timeout=30,  # fail instead of hanging forever on a stalled connection
)
# Stop on an HTTP error rather than handing an error page to ZipFile.
response.raise_for_status()

data, labels = [], []
with zipfile.ZipFile(BytesIO(response.content)) as zfile:
    for filename in zfile.namelist():
        if not (filename.endswith('.txt') and filename.startswith('data/names')):
            continue

        with zfile.open(filename) as file:
            # splitlines() copes with '\r\n' endings and yields nothing for an
            # empty file, where split('\n') would produce a single '' entry.
            lines = file.read().decode('utf-8').strip().splitlines()

        # Fold accented characters to plain ASCII: NFKD separates base letters
        # from their combining marks, and the ASCII encode drops the marks.
        names = [
            unicodedata.normalize('NFKD', line)
            .encode('ascii', 'ignore')
            .decode('utf-8')
            for line in lines
            if line.strip()  # skip blank lines so no empty names are collected
        ]
        # e.g. 'data/names/Arabic.txt' -> 'Arabic'
        category = filename.split('/')[-1].split('.')[0]

        data.extend(names)
        # One label per name, as plain Python strings (np.repeat would put
        # NumPy string scalars into the list).
        labels.extend([category] * len(names))
            

## Label Encoding

In [4]:
# Fit the encoder on every collected label so each distinct category gets an
# integer id.
label_encoder = LabelEncoder()
# encoded_labels[i] is the integer id assigned to labels[i].
encoded_labels = label_encoder.fit_transform(labels)

def label2tensor(label):
    """Encode one category name as a 1-element int64 tensor on `device`.

    Args:
        label: A category the module-level `label_encoder` was fitted on,
            e.g. 'Arabic'.

    Returns:
        A tensor of shape (1,) and dtype torch.long holding the label's id.

    Raises:
        ValueError: Propagated from `label_encoder.transform` when the label
            was not seen during fitting.
    """
    # torch.tensor with an explicit dtype/device replaces the legacy
    # torch.LongTensor(...) constructor followed by a separate .to(device).
    return torch.tensor(
        label_encoder.transform([label]),
        dtype=torch.long,
        device=device,
    )

# Show each category next to its integer id. The ids appear as strings because
# NumPy promotes the mixed columns to a common string dtype.
np.column_stack((
    label_encoder.classes_,
    label_encoder.transform(label_encoder.classes_),
))

array([['Arabic', '0'],
       ['Chinese', '1'],
       ['Czech', '2'],
       ['Dutch', '3'],
       ['English', '4'],
       ['French', '5'],
       ['German', '6'],
       ['Greek', '7'],
       ['Irish', '8'],
       ['Italian', '9'],
       ['Japanese', '10'],
       ['Korean', '11'],
       ['Polish', '12'],
       ['Portuguese', '13'],
       ['Russian', '14'],
       ['Scottish', '15'],
       ['Spanish', '16'],
       ['Vietnamese', '17']], dtype='<U21')

## Name Encoding

In [6]:
# Alphabet for the one-hot name encoding: ASCII letters (lowercase first, then
# uppercase) followed by the punctuation allowed in names.
dictionary = string.ascii_lowercase + string.ascii_uppercase + " .,;'-"


def name2tensor(name):
    """One-hot encode a name, one character per row.

    Args:
        name: String whose characters all appear in the module-level
            `dictionary`.

    Returns:
        A float tensor of shape (len(name), 1, len(dictionary)) on `device`;
        row i holds a single 1 at the `dictionary` position of name[i].

    Raises:
        ValueError: If `name` contains a character missing from `dictionary`.
    """
    indices = []
    for char in name:
        index = dictionary.find(char)
        if index < 0:
            # Same exception type str.index raised, but naming the culprit
            # instead of the generic "substring not found".
            raise ValueError(
                f'character {char!r} in name {name!r} is not in the dictionary'
            )
        indices.append(index)

    # Fill the one-hot rows on the CPU with a single indexed assignment and
    # move the finished tensor once, instead of writing element by element
    # into a tensor that may already live on the GPU.
    tensor = torch.zeros(len(name), 1, len(dictionary))
    rows = torch.arange(len(indices))
    tensor[rows, 0, torch.tensor(indices, dtype=torch.long)] = 1
    return tensor.to(device)

# Quick visual check of the encoding on a short name.
name2tensor('Edy')

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]]], device='cuda:0')