In [1]:
import glob

import unicodedata
import string

import codecs

import torch
import torch.nn as nn
from torch.autograd import Variable

import random

import time
import math

In [2]:
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [3]:
# Read a file and split into lines
def readLines(filename):
    lines = codecs.open(filename, "r",encoding='utf-8', errors='ignore').read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]


In [4]:
# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letter_to_tensor(letter,all_letters,n_letters):
    tensor = torch.zeros(1, n_letters)
    letter_index = all_letters.find(letter)
    tensor[0][letter_index] = 1
    return tensor

In [5]:
# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def line_to_tensor(line,all_letters,n_letters):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        letter_index = all_letters.find(letter)
        tensor[li][0][letter_index] = 1
    return tensor

In [6]:
def category_from_output(output,all_categories):
    _, top_i = output.data.topk(1) # Tensor out of Variable with .data
    category_i = top_i[0][0]
    return all_categories[category_i], category_i

In [7]:
def random_training_pair(all_categories,category_lines,all_letters,n_letters):                                                                                                               
    category = random.choice(all_categories)
    line = random.choice(category_lines[category])
    category_tensor = Variable(torch.LongTensor([all_categories.index(category)]))
    line_tensor = Variable(line_to_tensor(line,all_letters,n_letters))
    return category, line, category_tensor, line_tensor

In [8]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax()
    
    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        return Variable(torch.zeros(1, self.hidden_size))

In [9]:
train_filenames = glob.glob('./train/*.txt')
print(train_filenames)

val_filenames = glob.glob('./val/*.txt')
print (val_filenames)

['./train/af.txt', './train/de.txt', './train/fi.txt', './train/fr.txt', './train/in.txt', './train/ir.txt', './train/cn.txt', './train/za.txt', './train/pk.txt']
['./val/af.txt', './val/cn.txt', './val/de.txt', './val/fi.txt', './val/fr.txt', './val/ir.txt', './val/za.txt', './val/pk.txt', './val/in.txt']


In [10]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
print(unicode_to_ascii('Ślusàrski'))

Slusarski


In [11]:
train_category_lines={}
train_categories=[]
for filename in train_filenames:
    category = filename.split('/')[-1].split('.')[0]
    train_categories.append(category)
    lines = readLines(filename)
    train_category_lines[category] = lines

train_n_categories = len(train_categories)
print('n_categories =', train_n_categories)
print(train_category_lines['in'][:5])

n_categories = 9
['tagegin', 'kheda', 'trichy', 'palaina', "sainthilairede l'aude"]


In [12]:
val_category_lines={}
val_categories=[]
for filename in val_filenames:
    category = filename.split('/')[-1].split('.')[0]
    val_categories.append(category)
    lines = readLines(filename)
    val_category_lines[category] = lines

val_n_categories = len(val_categories)
print('n_categories =', val_n_categories)
print(val_category_lines['in'][:5])

n_categories = 9
['sinceira grande', 'rinangkatan', 'quinta das mares', 'saraguaina', 'dazhangjingqiao']


In [13]:
assert train_n_categories==val_n_categories

In [14]:
# # Build the category_lines dictionary, a list of names per language
# category_lines = {}
# all_categories = []



# for filename in train_filenames:
#     category = filename.split('/')[-1].split('.')[0]
#     all_categories.append(category)
#     lines = readLines(filename)
#     category_lines[category] = lines

# n_categories = len(all_categories)
# print('n_categories =', n_categories)

In [15]:
print(letter_to_tensor('J',all_letters,n_letters))



Columns 0 to 12 
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 13 to 25 
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 26 to 38 
    0     0     0     0     0     0     0     0     0     1     0     0     0

Columns 39 to 51 
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 52 to 56 
    0     0     0     0     0
[torch.FloatTensor of size 1x57]



In [16]:
print(line_to_tensor('Jones',all_letters,n_letters).size())

torch.Size([5, 1, 57])


# Preparing for Training

Before going into training we should make a few helper functions. The first is to interpret the output of the network, which we know to be a likelihood of each category. We can use `Tensor.topk` to get the index of the greatest value:

In [17]:
for i in range(10):
    category, line, category_tensor, line_tensor = random_training_pair(train_categories,train_category_lines,all_letters,n_letters)
    print('category =', category, '/ line =', line)

category = cn / line = liuhuaping
category = fi / line = jylhankyla
category = in / line = linushka
category = fi / line = fida ali bak
category = pk / line = purano wah misri
category = de / line = ziellechen
category = ir / line = kotalah kamar
category = pk / line = mufti kili
category = za / line = zadra
category = za / line = traktkurzan


In [18]:
for i in range(10):
    category, line, category_tensor, line_tensor = random_training_pair(val_categories,val_category_lines,all_letters,n_letters)
    print('category =', category, '/ line =', line)

category = fi / line = finsterbergen
category = af / line = asolmah
category = ir / line = kirchovon
category = fr / line = xaffevillers
category = de / line = dalena
category = in / line = jingjiangao
category = de / line = niederhost
category = ir / line = towali bala
category = ir / line = qaleh agha
category = de / line = bohnsackerweide


For the [loss function `nn.NLLLoss`](http://pytorch.org/docs/nn.html#nllloss) RNN is `nn.LogSoftmax`.

Each loop of training will:

* Create input and target tensors
* Create a zeroed initial hidden state
* Read each letter in and
    * Keep hidden state for next letter
* Compare final output to target
* Back-propagate
* Return the output and loss

In [19]:
def train(rnn, category_tensor, line_tensor):
    rnn.zero_grad()
    hidden = rnn.init_hidden()
    
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    
    loss.backward() ##this computes the gradient i guess

    optimizer.step() ##this updates the parameters after the gradient is computed

    return output, loss.data[0]

In [20]:
def time_since(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [21]:
n_hidden = 128
rnn = RNN(n_letters, n_hidden, train_n_categories)
hidden = rnn.init_hidden()
criterion = nn.NLLLoss()
learning_rate = 2e-4 # If you set this too high, it might explode. If too low, it might not learn
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

In [22]:
input = Variable(letter_to_tensor('A',all_letters,n_letters))

output, next_hidden = rnn(input, hidden)
print('output.size =', output.size())
print(category_from_output(output,train_categories))

input = Variable(line_to_tensor('Albert',all_letters,n_letters))
hidden = Variable(torch.zeros(1, n_hidden))

output, next_hidden = rnn(input[0], hidden)
print(output)

output.size = torch.Size([1, 9])
('za', 7)
Variable containing:
-2.2429 -2.1975 -2.1983 -2.2038 -2.1716 -2.1914 -2.1805 -2.1411 -2.2526
[torch.FloatTensor of size 1x9]





We will also create an "optimizer" which updates the parameters of our model according to its gradients. We will use the vanilla SGD algorithm with a low learning rate.

In [23]:
n_epochs = 100000
print_every = 5000
plot_every = 1000

# Keep track of losses for plotting
current_loss = 0
all_losses = []
accuracy=0

In [25]:
start = time.time()

for epoch in range(1, n_epochs + 1):
    # Get a random training input and target
    category, line, category_tensor, line_tensor = random_training_pair(train_categories,train_category_lines,all_letters,n_letters)
    output, loss = train(rnn, category_tensor, line_tensor)
    current_loss += loss
    guess, guess_i = category_from_output(output, train_categories)
    
    if guess==category:
        accuracy+=1.
    
    # Print epoch number, loss, name and guess
    if epoch % print_every == 0:
        guess, guess_i = category_from_output(output, train_categories)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (epoch, epoch / n_epochs * 100, time_since(start), loss, line, guess, correct))

    # Add current loss avg to list of losses
    if epoch % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0



5000 5% (1m 7s) 2.0803 spilmai / fi ✗ (pk)
10000 10% (2m 17s) 3.6756 popkeros / fr ✗ (pk)
15000 15% (3m 25s) 1.9619 mbongweni / in ✗ (za)
20000 20% (4m 25s) 0.1080 villeneuvetroloc / fr ✓
25000 25% (5m 8s) 1.0132 arshabad / ir ✓
30000 30% (6m 22s) 0.4874 qaryehye chashmehye aqa / af ✓
35000 35% (7m 28s) 0.9143 miniera / in ✓
40000 40% (8m 36s) 1.4253 aldeanueva de figueroa / fr ✗ (fi)
45000 45% (9m 35s) 3.9009 kirianwali / pk ✗ (ir)
50000 50% (10m 41s) 0.8852 xiala / cn ✓
55000 55% (11m 40s) 1.1541 hejiawa / cn ✓
60000 60% (12m 42s) 1.9910 pirzay caharbagh / ir ✗ (af)
65000 65% (13m 50s) 3.0657 esperaza / za ✗ (fr)
70000 70% (14m 46s) 1.1703 surkhkhula / fi ✗ (af)


KeyboardInterrupt: 

In [None]:
accuracy/=n_epochs
accuracy*=100
print(accuracy)

# Plotting the Results

Plotting the historical loss from `all_losses` shows the network learning:

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

%matplotlib inline

In [None]:
plt.figure()
plt.plot(all_losses)

# Evaluating the Results

To see how well the network performs on different categories, we will create a confusion matrix, indicating for every actual language (rows) which language the network guesses (columns). To calculate the confusion matrix a bunch of samples are run through the network with `evaluate()`, which is the same as `train()` minus the backprop.

In [None]:
# Keep track of correct guesses in a confusion matrix
confusion = torch.zeros(val_n_categories, val_n_categories)
n_confusion = 10000
accuracy=0

# Just return an output given a line
def evaluate(rnn,line_tensor):
    hidden = rnn.init_hidden()
    
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    
    return output

# Go through a bunch of examples and record which are correctly guessed
for i in range(n_confusion):
    category, line, category_tensor, line_tensor = random_training_pair(val_categories,val_category_lines,all_letters,n_letters)
    output = evaluate(rnn,line_tensor)
    guess, guess_i = category_from_output(output, val_categories)
    if guess==category:
        accuracy+=1
    category_i = val_categories.index(category)
    confusion[category_i][guess_i] += 1
accuracy/=10000
accuracy*=100
accuracy

In [None]:
# Normalize by dividing every row by its sum
for i in range(val_n_categories):
    confusion[i] = confusion[i] / confusion[i].sum()

# Set up plot
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(confusion.numpy())
fig.colorbar(cax)

# Set up axes
ax.set_xticklabels([''] + val_categories, rotation=90)
ax.set_yticklabels([''] + val_categories)

# Force label at every tick
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

plt.show()

You can pick out bright spots off the main axis that show which languages it guesses incorrectly, e.g. Chinese for Korean, and Spanish for Italian. It seems to do very well with Greek, and very poorly with English (perhaps because of overlap with other languages).

# Running on User Input

In [None]:
val_categories

In [None]:
train_categories

In [None]:
lines = readLines('./cities_test.txt')
out_lines=[]
for line in lines:
    line_tensor = Variable(line_to_tensor(line,all_letters,n_letters))
    output=evaluate(rnn,line_tensor)
    guess, guess_i = category_from_output(output, val_categories)
    out_lines.append(guess)

with open ('labels.txt','w') as f:
    for line in out_lines:
        f.write(line+'\n')