In [1]:
!curl -O https://download.pytorch.org/tutorial/data.zip; unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
100   243    0   243    0     0     74      0 --:--:--  0:00:03 --:--:--    74
100   243    0   243    0     0     74      0 --:--:--  0:00:03 --:--:--    74
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:-

In [2]:
import os
import random
from string import ascii_letters

import torch
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode

_ = torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
data_dir = "./data/names"
# Load data
lang2label = {
    file_name.split(".")[0]: torch.tensor([i], dtype=torch.long)
    for i, file_name in enumerate(os.listdir(data_dir))
}

In [4]:
lang2label

{'Arabic': tensor([0]),
 'Chinese': tensor([1]),
 'Czech': tensor([2]),
 'Dutch': tensor([3]),
 'English': tensor([4]),
 'French': tensor([5]),
 'German': tensor([6]),
 'Greek': tensor([7]),
 'Irish': tensor([8]),
 'Italian': tensor([9]),
 'Japanese': tensor([10]),
 'Korean': tensor([11]),
 'Polish': tensor([12]),
 'Portuguese': tensor([13]),
 'Russian': tensor([14]),
 'Scottish': tensor([15]),
 'Spanish': tensor([16]),
 'Vietnamese': tensor([17])}

In [None]:
char2idx = {letter: i for i, letter in enumerate(ascii_letters + " .,:;-'")}
num_letters = len(char2idx); num_letters

59

In [6]:
# function to convert a name into a tensor of character indices
def name2tensor(name):
    tensor = torch.zeros(len(name), 1, num_letters)
    for i, char in enumerate(name):
        tensor[i][0][char2idx[char]] = 1
    return tensor

In [None]:
#  RNN layers expect the input tensor to be of size (seq_len, batch_size, input_size)
name2tensor("Solomon")

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       

In [9]:
# Creating the Dataset

tensor_names = []
target_langs = []

for file in os.listdir(data_dir):
    with open(os.path.join(data_dir, file), encoding="utf-8") as f:
        lang = file.split(".")[0] # get language from file name
        names = [unidecode(line.rstrip()) for line in f] # read names and remove accents (standardize)
        for name in names:
            try:
                tensor_names.append(name2tensor(name)) # convert name to tensor and add to list
                target_langs.append(lang2label[lang]) # get target language label and add to list
            except KeyError:
                pass

In [10]:
# Split data into training and test sets

from sklearn.model_selection import train_test_split

train_idx, test_idx = train_test_split(
    range(len(target_langs)), 
    test_size=0.1, 
    shuffle=True, 
    stratify=target_langs
)

train_dataset = [
    (tensor_names[i], target_langs[i])
    for i in train_idx
]

test_dataset = [
    (tensor_names[i], target_langs[i])
    for i in test_idx
]

In [11]:
print(f"Train: {len(train_dataset)}")
print(f"Test: {len(test_dataset)}")

Train: 18063
Test: 2007


In [12]:
# Simple Elman RNN Model

class simpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(simpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.inpt2hidden = nn.Linear(input_size + hidden_size, hidden_size) # for input to hidden
        self.inpt2output = nn.Linear(input_size + hidden_size, output_size) # for input to output
        
    def forward(self, inpt, hidden_state):
        combined = torch.cat((inpt, hidden_state), 1) # concatenate input and hidden state
        hidden = torch.sigmoid(self.inpt2hidden(combined)) # compute new hidden state h(t) = sigmoid(Wh[x(t), h(t-1)]+ b(h))
        output = self.inpt2output(combined) # compute output y(t) = Wo[x(t), h(t-1)] + b(o)
        return output, hidden
    
    def init_hidden(self):
        return nn.init.kaiming_uniform_(torch.empty(1, self.hidden_size))

In [13]:
hidden_size = 128
learning_rate = 0.001

# Initialize model, optimizer, and loss function
model = simpleRNN(num_letters, hidden_size, len(lang2label)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [14]:
## Training Loop

num_epochs = 3
print_every = 2000
losses = []
for epoch in range(1, num_epochs + 1):
    random.shuffle(train_dataset) # shuffle training data each epoch
    total_loss = 0

    for i, (name_tensor, target_lang) in enumerate(train_dataset, 1):
        name_tensor = name_tensor.to(device)
        target_lang = target_lang.to(device) # move tensors to device
        hidden = model.init_hidden().to(device) # initialize hidden state

        for char_tensor in name_tensor:
            output, hidden = model(char_tensor, hidden)
        
        loss = criterion(output, target_lang)
        
        optimizer.zero_grad()
        loss.backward() # backpropagation
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) # gradient clipping
        optimizer.step() # update parameters
        losses.append(loss.item()) # record losses
        total_loss += loss.item() # accumulate loss

        if i % print_every == 0:
            avg_loss = total_loss / print_every
            print(f"Epoch [{epoch}/{num_epochs}], Step [{i}/{len(train_dataset)}], Loss: {avg_loss:.4f}")
            total_loss = 0

Epoch [1/3], Step [2000/18063], Loss: 2.0341
Epoch [1/3], Step [4000/18063], Loss: 1.5823
Epoch [1/3], Step [6000/18063], Loss: 1.3921
Epoch [1/3], Step [8000/18063], Loss: 1.2947
Epoch [1/3], Step [10000/18063], Loss: 1.2710
Epoch [1/3], Step [12000/18063], Loss: 1.2367
Epoch [1/3], Step [14000/18063], Loss: 1.1243
Epoch [1/3], Step [16000/18063], Loss: 1.1547
Epoch [1/3], Step [18000/18063], Loss: 1.1397
Epoch [2/3], Step [2000/18063], Loss: 1.0999
Epoch [2/3], Step [4000/18063], Loss: 1.1152
Epoch [2/3], Step [6000/18063], Loss: 1.1063
Epoch [2/3], Step [8000/18063], Loss: 1.0204
Epoch [2/3], Step [10000/18063], Loss: 1.0555
Epoch [2/3], Step [12000/18063], Loss: 1.0851
Epoch [2/3], Step [14000/18063], Loss: 0.9858
Epoch [2/3], Step [16000/18063], Loss: 1.0752
Epoch [2/3], Step [18000/18063], Loss: 0.9847
Epoch [3/3], Step [2000/18063], Loss: 1.0045
Epoch [3/3], Step [4000/18063], Loss: 1.0424
Epoch [3/3], Step [6000/18063], Loss: 1.0342
Epoch [3/3], Step [8000/18063], Loss: 0.9424


In [17]:
num_correct = 0
num_samples = len(test_dataset)

model.eval()

with torch.no_grad():
    for name, label in test_dataset:
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        _, pred = torch.max(output, dim=1)
        num_correct += bool(pred == label)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

Accuracy: 72.6956%


In [20]:
label2lang = {label.item(): lang for lang, label in lang2label.items()} # reverse mapping from label to language

def predict_language(name):
    model.eval() # set model to evaluation mode
    tensor_name = name2tensor(name) # convert name to tensor
    with torch.no_grad(): # disable gradient calculation
        hidden_state = model.init_hidden()
        for char in tensor_name:
            output, hidden_state = model(char, hidden_state)
        _, pred = torch.max(output, dim=1)
    model.train()    
    return label2lang[pred.item()]

In [26]:
predict_language("Qing")

'Chinese'

In [27]:
predict_language("Solomon")

'English'