In [1]:

from io import open
import glob

def findFiles(path): return glob.glob(path)

print(findFiles('data/names/*.txt'))

import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

print(unicodeToAscii('Ślusàrski'))

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    category = filename.split('/')[-1].split('.')[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

['data/names/Korean.txt', 'data/names/Spanish.txt', 'data/names/Scottish.txt', 'data/names/Russian.txt', 'data/names/Japanese.txt', 'data/names/Polish.txt', 'data/names/Czech.txt', 'data/names/French.txt', 'data/names/Dutch.txt', 'data/names/Irish.txt', 'data/names/German.txt', 'data/names/Portuguese.txt', 'data/names/Vietnamese.txt', 'data/names/English.txt', 'data/names/Italian.txt', 'data/names/Greek.txt', 'data/names/Arabic.txt', 'data/names/Chinese.txt']
Slusarski


In [2]:
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

print(letterToTensor('J'))

print(lineToTensor('Jones').size()) # <line_length x 1 x n_letters>



Columns 0 to 12 
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 13 to 25 
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 26 to 38 
    0     0     0     0     0     0     0     0     0     1     0     0     0

Columns 39 to 51 
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 52 to 56 
    0     0     0     0     0
[torch.FloatTensor of size 1x57]

torch.Size([5, 1, 57])


In [36]:
import numpy as np
import torch.nn as nn
from torch.autograd import Variable


class SRU(nn.Module):
    def __init__(self, x_dim, phi_dim, r_dim, o_dim, A):
        """ 
        args: x_dim:   入力xの次元
              phi_dim: phiの次元。\mu^{\alpha}の次元とも等しい
              r_dim:   rの次元
              o_dim:   出力oの次元
              A:       {\alpha_1, \alpha_2, ..., \alpha_m}
        """

        super(SRU, self).__init__()

        n_alpha      = len(A)
        self.n_alpha = n_alpha
        self.A       = A
        self.phi_dim = phi_dim
        # muの次元 = phiの次元*alphaの個数
        mu_dim = phi_dim * n_alpha 
        self.mu_dim = mu_dim
        
        # 各結合の定義
        self.mu2r    = nn.Linear(mu_dim, r_dim)
        self.xr2phi  = nn.Linear(x_dim + r_dim, phi_dim)
        self.mu2o    = nn.Linear(mu_dim, o_dim)
        self.log_softmax = nn.LogSoftmax()

    def forward(self, x, mu):
        r = F.relu(self.mu2r(mu))
        phi = F.relu(self.xr2phi(torch.cat((x, r), 1)))
        mu = self.muphi2mu(mu, phi)
        o = F.relu(self.mu2o(mu))
        o = self.log_softmax(o)
        return o, mu
    
    def muphi2mu(self, mu, phi):

        ''' すべてのalphaについて、\mu_t^{\alpha} = \alpha \mu_{t-1}^{\alpha} + (1-\alpha) \phi_t を同時に行う
        mask:     Kronecker product of (A, ones(phi_dim))
        phi_tile: Kronecker product of (ones(n_alpha), phi)
        '''
        
        A_expanded = self.A.expand(phi_dim, self.n_alpha)
        mask = torch.t(A_expanded).contiguous().view(-1)
        # 要素積をとるためにmaskをVariableに変換するが、あくまでmaskは定数項なのでrequires_grad=Falseをつける
        mask = Variable(mask, requires_grad=False)
        phi_expanded = phi.view(-1).expand(self.n_alpha, self.phi_dim)
        phi_repeated = phi_expanded.contiguous().view(-1)
        mu = torch.add(torch.mul(mask, mu.view(-1)), torch.mul((1-mask), phi_repeated)).view(1, -1)
        return mu

    def initMu(self):
        return Variable(torch.zeros(1, self.mu_dim))
    

phi_dim = 128
r_dim = 60
A = torch.Tensor([0.0, 0.5, 0.9, 0.99])

sru = SRU(n_letters, phi_dim, r_dim, n_categories, A)


In [37]:
''' forwardの動作確認 '''
input = Variable(letterToTensor('A'))
mu_0 = Variable(torch.zeros(1, phi_dim*len(A)))
output, mu_t = sru(input, mu_0)

In [38]:
def categoryFromOutput(output):
    top_n, top_i = output.data.topk(1) # Tensor out of Variable with .data
    category_i = top_i[0][0]
    return all_categories[category_i], category_i

print(categoryFromOutput(output))

import random

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    category_tensor = Variable(torch.LongTensor([all_categories.index(category)]))
    line_tensor = Variable(lineToTensor(line))
    return category, line, category_tensor, line_tensor

for i in range(10):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    print('category =', category, '/ line =', line)

('Chinese', 17)
category = Dutch / line = Rompay
category = Korean / line = Chin
category = Russian / line = Baulin
category = German / line = Siemon
category = Arabic / line = Wasem
category = Chinese / line = Fan
category = Arabic / line = Handal
category = Vietnamese / line = Bui
category = Spanish / line = Tomas
category = English / line = Froy


In [39]:
criterion = nn.NLLLoss()

In [40]:
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn

def train(category_tensor, line_tensor):
    mu = sru.initMu()

    sru.zero_grad()

    for i in range(line_tensor.size()[0]):
        o, mu = sru(line_tensor[i], mu)

    loss = criterion(o, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in sru.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return o, loss.data[0]

In [41]:
import time
import math

n_iters = 100000
print_every = 5000
plot_every = 1000



# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), current_loss, line, guess, correct))
        current_loss = 0


5000 5% (1m 3s) 14404.7369 Althuis / Spanish ✗ (Dutch)
10000 10% (2m 6s) 14170.7242 Zdunowski / Japanese ✗ (Polish)
15000 15% (3m 9s) 13636.4113 Seok / Korean ✓
20000 20% (4m 13s) 12866.3510 Youn / Chinese ✗ (Korean)
25000 25% (5m 16s) 12577.0877 Sook / Korean ✓
30000 30% (6m 19s) 12224.3120 Shan / Chinese ✓
35000 35% (7m 21s) 11937.6581 Tomioka / Japanese ✓
40000 40% (8m 24s) 11519.2817 Leveque / Scottish ✗ (French)
45000 45% (9m 28s) 11311.5770 Ennos / Czech ✗ (English)
50000 50% (10m 30s) 11005.1095 Hunter / German ✗ (Scottish)
55000 55% (11m 33s) 10736.2238 Fournier / German ✗ (French)
60000 60% (12m 35s) 10459.0420 Wasem / Dutch ✗ (Arabic)
65000 65% (13m 37s) 10254.7816 Luu / Vietnamese ✓
70000 70% (14m 39s) 10182.7682 Kreskas / Polish ✗ (Greek)
75000 75% (15m 42s) 9929.1720 Toma / Japanese ✗ (Arabic)
80000 80% (16m 45s) 9738.3586 Dejmal / Czech ✓
85000 85% (17m 47s) 9672.6181 Nicchi / Japanese ✗ (Italian)
90000 90% (18m 50s) 9372.1073 Thao / Vietnamese ✓
95000 95% (19m 53s) 9327.

## mini-batch対応版

In [42]:
''' SRUモデルの定義 '''

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F


class SRU(nn.Module):
    def __init__(self, x_dim, phi_dim, r_dim, o_dim, A, GPU=True):
        """ 
        x_dim:   入力xの次元（特徴量数）
        phi_dim: phiの次元。\mu^{\alpha}の次元とも等しい
        r_dim:   rの次元
        o_dim:   出力oの次元
        A:       [\alpha_1, \alpha_2, ..., \alpha_m], shape: (1, m)
        """

        super(SRU, self).__init__()

        self.gpu     = GPU
        n_alpha      = A.size()[1]
        self.n_alpha = n_alpha
        self.A       = A
        self.phi_dim = phi_dim
        # muの次元 = phiの次元*alphaの個数
        mu_dim = phi_dim * n_alpha 
        self.mu_dim = mu_dim
        
        # 各結合の定義
        self.mu2r    = nn.Linear(mu_dim, r_dim)
        self.xr2phi  = nn.Linear(x_dim + r_dim, phi_dim)
        self.mu2o    = nn.Linear(mu_dim, o_dim)
        self.log_softmax = nn.LogSoftmax()

    def forward(self, x, mu):
        '''
        x.size()  => (sample_size, x_dim)
        mu.size() => (sample_size, mu_dim)
        '''

        r = F.relu(self.mu2r(mu))
        phi = F.relu(self.xr2phi(torch.cat((x, r), 1)))
        mu = self.muphi2mu(mu, phi)
        o = F.relu(self.mu2o(mu))
        o = self.log_softmax(o)
        return o, mu
    
    def muphi2mu(self, mu, phi):
        '''
        すべての\alphaについて、\mu_t^{(\alpha)} = \alpha \mu_{t-1}^{(\alpha)} + (1-\alpha) \phi_t を同時に行う
            A_mask:   Kronecker product of (A, ones(1, phi_dim)),   shape => (1, mu_dim)
            phi_tile: Kronecker product of (ones(1, n_alpha), phi), shape => (sample_size, mu_dim)
        '''
        if self.gpu:
            A_mask = kronecker_product(self.A, torch.ones(1, self.phi_dim).cuda())
            phi_tile = kronecker_product(Variable(torch.ones(1, self.n_alpha).cuda()), phi)
        else:
            A_mask = kronecker_product(self.A, torch.ones(1, self.phi_dim))
            phi_tile = kronecker_product(Variable(torch.ones(1, self.n_alpha)), phi)

        # 要素積をとるためにA_maskをVariableに変換するが、A_maskは定数項なのでrequires_grad=Falseをつける
        A_mask = Variable(A_mask, requires_grad=False)
        mu = torch.mul(A_mask, mu) + torch.mul((1-A_mask), phi_tile)
        return mu


def kronecker_product(t1, t2):
    t1_height, t1_width = t1.size()
    t2_height, t2_width = t2.size()
    out_height = t1_height * t2_height
    out_width = t1_width * t2_width

    tiled_t2 = t2.repeat(t1_height, t1_width)
    expanded_t1 = (
        t1.unsqueeze(2)
          .unsqueeze(3)
          .repeat(1, t2_height, t2_width, 1)
          .view(out_height, out_width)
    )

    return expanded_t1 * tiled_t2




In [43]:
''' forwardの動作確認 '''

phi_dim = 128
r_dim = 60
A = torch.Tensor([0.0, 0.5, 0.9, 0.99]).view(1, -1)
sru = SRU(n_letters, phi_dim, r_dim, n_categories, A, GPU=False)


input = Variable(letterToTensor('A'))
mu_0 = Variable(torch.zeros(1, phi_dim*A.size(1)))
output, mu_t = sru(input, mu_0)

In [44]:
def train(category_tensor, line_tensor):
    
    # 暫定
    mu = Variable(torch.zeros(1, phi_dim*A.size(1)))

    sru.zero_grad()

    for i in range(line_tensor.size()[0]):
        o, mu = sru(line_tensor[i], mu)

    loss = criterion(o, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in sru.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return o, loss.data[0]

In [45]:
import time
import math

n_iters = 100000
print_every = 5000
plot_every = 1000



# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), current_loss, line, guess, correct))
        current_loss = 0


5000 5% (1m 16s) 14333.0896 Tuma / Japanese ✗ (Arabic)
10000 10% (2m 33s) 13817.2160 Stegon / Irish ✗ (Czech)
15000 15% (3m 49s) 12923.1399 Andruhov / Russian ✓
20000 20% (5m 6s) 11674.0150 Do / Korean ✗ (Vietnamese)
25000 25% (6m 23s) 10482.1865 Sepulveda / Spanish ✓
30000 30% (7m 38s) 9538.9026 Macha / Japanese ✗ (Czech)
35000 35% (8m 54s) 9240.6220 Lopez / Spanish ✓
40000 40% (10m 10s) 8674.9418 Bata / Spanish ✗ (Arabic)
45000 45% (11m 27s) 8208.5405 Bahchivandzhi / Japanese ✗ (Russian)
50000 50% (12m 43s) 7833.9862 Sakamoto / Japanese ✓
55000 55% (14m 1s) 7437.5199 Hideyoshi / Japanese ✓
60000 60% (15m 18s) 7266.3119 Marquerink / Czech ✗ (German)
65000 65% (16m 37s) 6878.8767 To The First Page / French ✗ (Russian)
70000 70% (17m 57s) 6661.4646 Flynn / Irish ✓
75000 75% (19m 15s) 6525.1322 Paloumbas / Portuguese ✗ (Greek)
80000 80% (20m 37s) 6262.9591 Niftrik / Russian ✗ (Dutch)
85000 85% (21m 56s) 6026.9547 Miazga / Czech ✗ (Polish)
90000 90% (23m 13s) 6061.5333 Ngo / Korean ✗ (Vie

In [46]:
dir(sru)

['A',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_all_buffers',
 '_apply',
 '_backend',
 '_backward_hooks',
 '_buffers',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_modules',
 '_parameters',
 'add_module',
 'apply',
 'children',
 'cpu',
 'cuda',
 'double',
 'dump_patches',
 'eval',
 'float',
 'forward',
 'gpu',
 'half',
 'load_state_dict',
 'modules',
 'mu2o',
 'mu2r',
 'mu_dim',
 'muphi2mu',
 'n_alpha',
 'named_children',
 'named_modules',
 'named_parameters',
 'parameters',
 'phi_dim',
 'register_backward_hook',
 'register_buffer',
 'register_forward_hook',
 'register_forward_pre_hook',
 'register_parameter',
 'share_memory',
 'softmax'

In [53]:
# W^{(r)}の勾配の確認
list(sru.parameters())[0].grad

Variable containing:
-0.1798  0.0000  0.0000  ...   0.2996  0.0000 -0.0040
 0.0419  0.0000  0.0000  ...  -0.1308  0.0000 -0.0619
 0.0000  0.0000  0.0000  ...   0.2183  0.0000  0.0000
          ...             ⋱             ...          
-0.0890  0.0000  0.0000  ...   0.0169  0.0000 -0.0020
 0.0000  0.0000  0.0000  ...  -0.3198  0.0000 -0.0266
 0.0000  0.0000  0.0000  ...   0.0151  0.0000  0.0000
[torch.FloatTensor of size 128x117]