In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the three JSON-lines splits (same order as before: train, valid, test).
train, valid, test = (
    pd.read_json(split_file, lines=True)
    for split_file in ('train.jsonl', 'valid.jsonl', 'test.jsonl')
)

# Alphabet used for one-hot encoding; a letter's position in this string is its index.
all_letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'"
n_letters = len(all_letters)

# The label vocabulary and the name-length statistic are computed over all splits combined.
dataset = pd.concat((train, valid, test), ignore_index=True)
all_categories = dataset['country'].unique()
n_categories = dataset['country'].nunique()

# Two-way lookup between a country label and its integer class index
# (indices follow first-appearance order in `dataset`).
categ_to_idx = {label: position for position, label in enumerate(all_categories)}
idx_to_categ = {position: label for label, position in categ_to_idx.items()}

longest_name_len = dataset['name'].str.len().max()

In [3]:
# Show the country -> class-index mapping built from the combined splits.
print(categ_to_idx)

{'English': 0, 'Russian': 1, 'Italian': 2, 'German': 3, 'Japanese': 4, 'Dutch': 5, 'Arabic': 6, 'Chinese': 7, 'Greek': 8, 'Czech': 9, 'Irish': 10, 'Spanish': 11, 'French': 12, 'Polish': 13, 'Vietnamese': 14, 'Korean': 15, 'Portuguese': 16, 'Scottish': 17}


In [4]:
import torch

# Position of a letter inside all_letters, e.g. "a" = 0
def letterToIndex(letter):
    """Return the index of `letter` in `all_letters`, or -1 when it is not present."""
    try:
        return all_letters.index(letter)
    except ValueError:
        # Same "not found" sentinel that str.find uses.
        return -1

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    """One-hot encode a single letter as a <1 x n_letters> float tensor.

    A letter that is not in `all_letters` yields an all-zero row.
    """
    tensor = torch.zeros(1, n_letters)
    index = letterToIndex(letter)
    # letterToIndex returns -1 for unknown letters; writing to index -1 would
    # silently mark the LAST alphabet entry instead, so skip the write.
    if index >= 0:
        tensor[0][index] = 1
    return tensor

# Turn a line into a <len(line) x 1 x n_letters> tensor of one-hot letter vectors
def lineToTensor(line):
    """One-hot encode every character of `line`.

    Returns a float tensor of shape (len(line), 1, n_letters): one time step per
    character, with a singleton middle dimension. Characters that are not in
    `all_letters` are encoded as an all-zero row.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        index = letterToIndex(letter)
        # letterToIndex returns -1 for unknown characters; indexing with -1 would
        # set the last alphabet slot and misencode the character, so leave the row zero.
        if index >= 0:
            tensor[li][0][index] = 1
    return tensor

## Creating the Network

Before autograd, creating a recurrent neural network in Torch involved
cloning the parameters of a layer over several timesteps. The layers
held hidden state and gradients, which are now entirely handled by the
graph itself. This means you can implement an RNN in a very "pure" way,
as regular feed-forward layers.

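This RNN module (adapted from [the PyTorch for Torch users
tutorial](https://pytorch.org/tutorials/beginner/former_torchies/nn_tutorial.html#example-2-recurrent-net))
combines the current input and the previous hidden state through two weight
matrices (`U` and `W`) and a ReLU to produce the next hidden state, then maps
that hidden state through a third matrix (`V`) to one raw score per category.
No LogSoftmax layer is applied to the output; `nn.CrossEntropyLoss` applies
the log-softmax itself during training.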

<img src="../images/rnn.png" width="80%">


In [5]:
import torch.nn as nn

class RNN(nn.Module):
    """Minimal recurrent cell written with explicit weight matrices.

    One call to ``forward`` consumes a single time step:

        hidden_t = ReLU(x_t @ U + hidden_{t-1} @ W + b1)
        output_t = hidden_t @ V + b2

    ``output_t`` holds raw, unnormalized scores; no softmax is applied here.

    Args:
        input_size: width of one input row (columns of ``inputs``).
        hidden_size: width of the hidden state.
        output_size: number of scores produced per row.
        sigma: scale applied to the standard-normal initial weights.
    """

    def __init__(self, input_size, hidden_size, output_size, sigma=0.01):
        super().__init__()
        self.hidden_size = hidden_size
        # Input-to-hidden weights.
        self.U = nn.Parameter(
            torch.randn(input_size, hidden_size) * sigma)
        # Hidden-to-hidden (recurrent) weights.
        self.W = nn.Parameter(
            torch.randn(hidden_size, hidden_size) * sigma)
        self.b1 = nn.Parameter(torch.zeros(hidden_size))
        self.relu = nn.ReLU()
        # Hidden-to-output weights.
        self.V = nn.Parameter(
            torch.randn(hidden_size, output_size) * sigma)
        self.b2 = nn.Parameter(torch.zeros(output_size))

    def forward(self, inputs, hidden):
        """Advance the cell by one time step.

        Args:
            inputs: 2-D tensor with ``input_size`` columns (torch.mm requires 2-D).
            hidden: 2-D tensor with ``hidden_size`` columns, the previous state.

        Returns:
            ``(output, hidden)``: the raw scores for this step and the new hidden state.
        """
        z1 = torch.mm(inputs, self.U)
        z2 = torch.mm(hidden, self.W)
        hidden = self.relu(z1 + z2 + self.b1)
        z3 = torch.mm(hidden, self.V)
        output = z3 + self.b2
        return output, hidden

    def initHidden(self):
        """Return an all-zero ``<1 x hidden_size>`` initial hidden state.

        The tensor is created with the dtype and device of the recurrent weights,
        so it stays compatible with ``forward`` after ``.double()`` / ``.to(device)``.
        """
        return torch.zeros(
            1, self.hidden_size, dtype=self.W.dtype, device=self.W.device)

# Width of the recurrent hidden state.
n_hidden = 128
# One input per alphabet symbol, one output score per country.
rnn = RNN(n_letters, n_hidden, n_categories)
# The printed repr lists only the ReLU: U, W, V, b1 and b2 are bare nn.Parameter
# attributes rather than submodules, so they do not appear in the module tree.
print(rnn)

RNN(
  (relu): ReLU()
)


To run a step of this network we need to pass an input (in our case, the
Tensor for the current letter) and a previous hidden state (which we
initialize as zeros at first). We'll get back the output (a raw score for
each language; higher means more likely) and a next hidden state (which we
keep for the next step).

In [6]:
# Smoke-test one step of the network on the first letter of 'Albert'.
# The tensor is named `sample_tensor` rather than `input` so the builtin
# input() is not shadowed for the rest of the notebook.
sample_tensor = lineToTensor('Albert')
# Start from the all-zero hidden state the model itself provides.
hidden = rnn.initHidden()
output, next_hidden = rnn(sample_tensor[0], hidden)
print(output.shape)

torch.Size([1, 18])


As you can see the output is a ``<1 x n_categories>`` Tensor, where
every item is the unnormalized score (logit) of that category (higher is
more likely).




## Training

In [7]:
# CrossEntropyLoss takes the raw scores produced by `rnn` together with integer class indices.
criterion = nn.CrossEntropyLoss()
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn
# The optimizer updates the parameters of `rnn`, the network used by train_tensor/evaluate_tensor.
optimizer = optim.SGD(rnn.parameters(), lr=learning_rate)
# `model` used to be a second, untrained RNN that nothing trained or evaluated.
# Keep the name, but point it at the network that is actually optimized.
model = rnn
n_epochs = 20

def train_tensor(category_tensor, line_tensor):
    """Run one optimizer step on a single (name, label) example.

    Uses the module-level `rnn`, `criterion` and `optimizer`.

    Args:
        category_tensor: target tensor passed to `criterion` for this name.
        line_tensor: encoded name; its first dimension is iterated as time steps.

    Returns:
        ``(output, loss)``: the scores from the final time step and the loss as a Python float.

    Raises:
        ValueError: if `line_tensor` has no time steps (empty name).
    """
    n_steps = line_tensor.size(0)
    if n_steps == 0:
        # Without this guard the loop never binds `output` and the failure
        # surfaces as a confusing UnboundLocalError further down.
        raise ValueError("line_tensor is empty: cannot train on a zero-length name")

    hidden = rnn.initHidden()

    # Feed the name one character at a time; only the last step's scores are used.
    for i in range(n_steps):
        output, hidden = rnn(line_tensor[i], hidden)

    optimizer.zero_grad()
    loss = criterion(output, category_tensor)
    loss.backward()
    optimizer.step()

    return output, loss.item()

def evaluate_tensor(line_tensor):
    """Run the module-level `rnn` over an encoded name without updating it.

    Args:
        line_tensor: encoded name; its first dimension is iterated as time steps.

    Returns:
        The scores produced at the final time step.

    Raises:
        ValueError: if `line_tensor` has no time steps (empty name).
    """
    n_steps = line_tensor.size(0)
    if n_steps == 0:
        # Without this guard the loop never binds `output` and the failure
        # surfaces as a confusing UnboundLocalError at the return statement.
        raise ValueError("line_tensor is empty: cannot evaluate a zero-length name")

    hidden = rnn.initHidden()

    # Inference only: skip autograd bookkeeping so no graph is built per character.
    with torch.no_grad():
        for i in range(n_steps):
            output, hidden = rnn(line_tensor[i], hidden)

    return output

In [8]:
def _split_accuracy(frame):
    """Return the fraction of rows in `frame` whose 'country' the current `rnn` predicts correctly."""
    n_correct = 0
    # Evaluation only: no gradients are needed while scoring a split.
    with torch.no_grad():
        for name, country in zip(frame['name'], frame['country']):
            scores = evaluate_tensor(lineToTensor(name))
            # The predicted class is the index of the highest score.
            if scores.argmax(dim=1).item() == categ_to_idx[country]:
                n_correct += 1
    return n_correct / len(frame)


valid_acc_list = []
train_acc_list = []
loss_list = []
for epoch in range(n_epochs):
    print(f'Epoch: {epoch+1} / {n_epochs}')

    # Accuracy is measured BEFORE this epoch's weight updates, so the first
    # entry of each list is the untrained baseline.
    train_acc = _split_accuracy(train)
    print(f'train_acc: {train_acc}')
    train_acc_list.append(train_acc)

    valid_acc = _split_accuracy(valid)
    print(f'valid_acc: {valid_acc}')
    valid_acc_list.append(valid_acc)

    # One optimizer step per training name.
    epoch_loss = 0.0
    for name, country in zip(train['name'], train['country']):
        category_tensor = torch.tensor([categ_to_idx[country]], dtype=torch.long)
        output, loss = train_tensor(category_tensor, lineToTensor(name))
        epoch_loss += loss

    # Record the mean loss over the epoch; the loss of the last sample alone
    # is too noisy to be a meaningful per-epoch figure.
    loss_list.append(epoch_loss / len(train))

Epoch: 1 / 20
train_acc: 0.012819286329283459
valid_acc: 0.011194029850746268


In [None]:
# Persist the per-epoch accuracy and loss histories as a single JSON object
import json

history = {
    "train_acc": train_acc_list,
    "val_acc": valid_acc_list,
    "loss": loss_list,
}
with open("pytorch_results.json", "w") as f:
    f.write(json.dumps(history))

In [None]:
# infer in the test set
test_acc_list = []
correct_test = 0
# Evaluation only: no gradients are needed while scoring the test split.
with torch.no_grad():
    for line, category in zip(test['name'], test['country']):
        output = evaluate_tensor(lineToTensor(line))
        # The predicted class is the index of the highest score.
        if output.argmax(dim=1).item() == categ_to_idx[category]:
            correct_test += 1

# calculate the accuracy on test set
test_acc = correct_test / len(test)
print(f'test_acc: {test_acc}')
test_acc_list.append(test_acc)

test_acc: 0.5423242467718795


In [None]:
# Classify one hand-written name and report the three highest-scoring countries
line = 'Junyan Li'
line_tensor = lineToTensor(line)
output = evaluate_tensor(line_tensor)
# topk yields (values, indices); only the class indices are used below
_, pred = torch.topk(output, 3, dim=1)
print(f'Predicted countries: {[idx_to_categ[k] for k in pred[0].tolist()]}')

Predicted countries: ['Russian', 'English', 'Japanese']
