In [17]:
import torch

# Check if CUDA is available
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

torch.set_default_device(device)
print(f"Using device = {torch.get_default_device()}")

Using device = cuda:0


In [18]:
import string
import unicodedata

# We can use "_" to represent an out-of-vocabulary character, that is, any character we are not handling in our model
allowed_characters = string.ascii_letters + " .,;'" + "_"
n_letters = len(allowed_characters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in allowed_characters
    )

In [19]:
print (f"converting 'Ślusàrski' to {unicodeToAscii('Ślusàrski')}")
print (f"converting 'Ánh Như' to {unicodeToAscii('Ánh Như')}")

converting 'Ślusàrski' to Slusarski
converting 'Ánh Như' to Anh Nhu


In [None]:
# Turning names into tensors
def letterToIndex(letter):
    if letter not in allowed_characters:
        return allowed_characters.find('_')
    else:
        return allowed_characters.find(letter)

# Turn a line into a <line_length, 1, n_letters>
# an array of one-hot letter vectors
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [21]:
print (f"The letter 'a' becomes {lineToTensor('a')}") #notice that the first position in the tensor = 1
print (f"The name 'Ahn' becomes {lineToTensor('Ahn')}") #notice 'A' sets the 27th index to 1

The letter 'a' becomes tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]]], device='cuda:0')
The name 'Ahn' becomes tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 

In [22]:
from io import open
import glob
import os
import time

import torch
from torch.utils.data import Dataset

class NamesDataset(Dataset):
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.load_time = time.localtime
        labels_set = set()

        self.data = []
        self.data_tensors = []
        self.labels = []
        self.labels_tensors = []

        #read all the ``.txt`` files in the specified directory
        text_files = glob.glob(os.path.join(data_dir, '*.txt'))
        for filename in text_files:
            label = os.path.splitext(os.path.basename(filename))[0]
            labels_set.add(label)
            lines = open(filename, encoding='utf-8').read().strip().split('\n')
            for name in lines:
                self.data.append(name)
                self.data_tensors.append(lineToTensor(name))
                self.labels.append(label)

        #Cache the tensor representation of the labels
        self.labels_uniq = list(labels_set)
        for idx in range(len(self.labels)):
            temp_tensor = torch.tensor([self.labels_uniq.index(self.labels[idx])], dtype=torch.long)
            self.labels_tensors.append(temp_tensor)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        data_item = self.data[index]
        data_label = self.labels[index]
        data_tensor = self.data_tensors[index]
        label_tensor = self.labels_tensors[index]

        return label_tensor, data_tensor, data_label, data_item
# data: Person's names
# labels: Country names
# data_tensor: One-hot encoded for every single letter of a person's name
# label_tensor: Country index name

In [23]:
alldata = NamesDataset("data/names")
print(f"loaded {len(alldata)} items of data")
print(f"example = {alldata[0]}")

loaded 20074 items of data
example = (tensor([10], device='cuda:0'), tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 1., 0.

In [24]:
train_set, test_set = torch.utils.data.random_split(alldata, [.85, .15], generator = torch.Generator(device = device).manual_seed(1234))
print(
    f"train examples = {len(train_set)}, \
        validation examples = {len(test_set)}"
)

train examples = 17063,         validation examples = 3011


In [25]:
# Creating the Network
import torch.nn as nn
import torch.nn.functional as F

class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.rnn = nn.RNN(input_size, hidden_size)
        self.hidden_to_output = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim = 1)

    def forward(self, line_tensor):
        # hidden_shape: (1, batch_size, hidden_size)
        # rnn_out_shape: (seq_length, batch_size, hidden_size)
        # rnn_out chứa tất cả các hidden_states tại mỗi bước thời gian trong chuỗi đầu vào
        rnn_out, hidden = self.rnn(line_tensor)
        output = self.hidden_to_output(hidden[-1])
        output = self.softmax(output)
        return output

In [26]:
n_hidden = 128
rnn = CharRNN(n_letters, n_hidden, len(alldata.labels_uniq))
print(rnn)

CharRNN(
  (rnn): RNN(58, 128)
  (hidden_to_output): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=1)
)


In [27]:
def label_from_output(output : torch.Tensor, output_labels):
    label_i = output.argmax(dim = 1).item()
    return output_labels[label_i], label_i

input = lineToTensor('Albert')
output = rnn(input) #this is equivalent to ``output = rnn.forward(input)``
print(output)
print(label_from_output(output, alldata.labels_uniq))

tensor([[-2.8695, -2.8808, -3.1704, -2.9225, -2.7958, -2.8329, -2.9179, -2.8691,
         -2.8430, -2.9901, -2.8501, -2.9606, -2.8588, -2.8739, -2.8363, -2.9604,
         -2.8043, -2.8520]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
('Korean', 4)


In [None]:
import random
import numpy as np

def train(rnn, training_data, testing_data, n_epoch = 10, batch_size = 64
, lr = 0.2, 
          loss_fn = nn.NLLLoss()):
    current_loss = 0
    all_training_losses = []
    all_valid_losses = []
    optimizer = torch.optim.SGD(rnn.parameters(), lr = lr)

    print(f"Training on dataset with n = len({len(training_data)})")

    for epoch in range(n_epoch):
        rnn.train()
        rnn.zero_grad()

        batches = list(range(len(training_data)))
        random.shuffle(batches)
        batches = np.array_split(batches, len(batches) // batch_size)

        for idx, batch in enumerate(batches):
            batch_loss = 0
            for i in batch:
                (label_tensor, text_tensor, label, text) = training_data[i]
                label_tensor = label_tensor.to(device)
                text_tensor = text_tensor.to(device)
                output = rnn.forward(text_tensor)
                loss = loss_fn(output, label_tensor)
                batch_loss += loss
            
            batch_loss.backward()
            nn.utils.clip_grad_norm_(rnn.parameters(), 3)
            optimizer.step()
            optimizer.zero_grad()

            current_loss += batch_loss.item() / len(batch)
        all_training_losses.append(current_loss / len(batches))

        rnn.eval()
        for example in testing_data:
            with torch.no_grad():
                (label_tensor, text_tensor, label, text) = example
                label_tensor = label_tensor.to(device)
                text_tensor = text_tensor.to(device)
                output = rnn.forward(text_tensor)
                loss = loss_fn(output, label_tensor)
                valid_loss += loss.item()
        all_valid_losses.append(valid_loss / len(testing_data))


        print(f"{epoch} ({epoch / n_epoch:.0%}): \t average batch loss = {all_training_losses[-1]} \t average valid loss = {all_valid_losses[-1]}")
        current_loss = 0
    return all_training_losses

In [31]:
start = time.time()
rnn.to(device)
all_losses = train(rnn, train_set, n_epoch = 50, lr = 0.01)
end = time.time()
print(f"Training_time = {start - end}")

Training on dataset with n = len(17063)
0 (0%): 	 average batch loss = 1.7867365752706093
1 (2%): 	 average batch loss = 1.52870933800298
2 (4%): 	 average batch loss = 1.4293270036448902
3 (6%): 	 average batch loss = 1.3543094997326028
4 (8%): 	 average batch loss = 1.2932851547309605
5 (10%): 	 average batch loss = 1.2264714142881838
6 (12%): 	 average batch loss = 1.179814004508521
7 (14%): 	 average batch loss = 1.144275048838798
8 (16%): 	 average batch loss = 1.1188107586720557
9 (18%): 	 average batch loss = 1.0984487546876098
10 (20%): 	 average batch loss = 1.078674018510578
11 (22%): 	 average batch loss = 1.0593519200025232
12 (24%): 	 average batch loss = 1.0432291456596778
13 (26%): 	 average batch loss = 1.0295820745655122
14 (28%): 	 average batch loss = 1.016826717559832
15 (30%): 	 average batch loss = 1.0025124087610846
16 (32%): 	 average batch loss = 0.9914922208197205
17 (34%): 	 average batch loss = 0.9792706969399337
18 (36%): 	 average batch loss = 0.9673910226

In [36]:
torch.save(rnn.state_dict(), "../c3_sequence_model/name_classifier.pth")

In [33]:
input = lineToTensor('Tuma')
output = rnn(input) #this is equivalent to ``output = rnn.forward(input)``
# print(output)
print(label_from_output(output, alldata.labels_uniq))

('Arabic', 10)


In [37]:
all_losses

[1.7867365752706093,
 1.52870933800298,
 1.4293270036448902,
 1.3543094997326028,
 1.2932851547309605,
 1.2264714142881838,
 1.179814004508521,
 1.144275048838798,
 1.1188107586720557,
 1.0984487546876098,
 1.078674018510578,
 1.0593519200025232,
 1.0432291456596778,
 1.0295820745655122,
 1.016826717559832,
 1.0025124087610846,
 0.9914922208197205,
 0.9792706969399337,
 0.9673910226933592,
 0.9590340491800105,
 0.9505876045753806,
 0.9386968944788393,
 0.9304506244895222,
 0.9218006798268191,
 0.9125869054026242,
 0.9044410378757829,
 0.896979125701602,
 0.8886951241622845,
 0.8820039487440094,
 0.873315618457375,
 0.8657245288786687,
 0.8578125127500161,
 0.8520784031731191,
 0.8441993116815356,
 0.8391922274599054,
 0.8323209146928622,
 0.8271249689553913,
 0.8177993731908263,
 0.8125662536083996,
 0.8066195272534069,
 0.8012076281825541,
 0.7976162021666197,
 0.7893862818830477,
 0.7868301867904381,
 0.7818633210407371,
 0.7755438076839617,
 0.7712387986149383,
 0.7669722827529548,


In [54]:
n_hidden = 128
rnn_test = CharRNN(n_letters, n_hidden, len(alldata.labels_uniq))
# rnn_test.load_state_dict("../c3_sequence_model/name_classifier.pth")
rnn_test.load_state_dict(torch.load("../c3_sequence_model/name_classifier.pth", weights_only=True))

<All keys matched successfully>

In [55]:
def accuracy(rnn):
    right_choice = 0
    for example in test_set:
        (label_tensor, text_tensor, label, text) = example
        output = rnn(text_tensor)
        # print(output.argmax(dim = 1))
        # print(label_tensor)
        if (output.argmax(dim = 1) == label_tensor): 
            right_choice += 1

    return right_choice * 1.0 / len(test_set)

acc = accuracy(rnn_test)
print(f"Model's accuracy: {acc} in total {len(test_set)} examples!")

Model's accuracy: 0.7705081368316175 in total 3011 examples!


In [66]:
def get_choice(rnn, name):
    input = lineToTensor(name)
    output = rnn(input) 
    return label_from_output(output, alldata.labels_uniq)

get_choice(rnn_test, "David")

('English', 6)