In [1]:
import numpy as np
from keras.utils import np_utils
from keras.datasets import mnist

Using TensorFlow backend.


In [108]:
''' データセットの準備 '''

(X_train, y_train), (X_test, y_test) = mnist.load_data()

# yをone-hot表現に
# y_train = np_utils.to_categorical(y_train)
# y_test = np_utils.to_categorical(y_test)

# x, yの型変換
X_train, X_test = X_train.astype('float32'), X_test.astype('float32')
y_train, y_test = y_train.astype('int64'), y_test.astype('int64')

# xの範囲を[0, 1]に変換
X_train /= 255
X_test /= 255

# 2次元の画像を、各行を互い違いにして1次元に変換
def flatten_img(images):
    '''
    images: shape => (n, rows, columns)
    output: shape => (n, rows*columns)
    '''
    n_rows    = images.shape[1]
    n_columns = images.shape[2]
    for num in range(n_rows):
        if num % 2 != 0:
            images[:, num, :] = images[:, num, :][:, ::-1]
    output = images.reshape(-1, n_rows*n_columns)
    return output

X_train, X_test = flatten_img(X_train), flatten_img(X_test)

# X.shape => (n_sample, seq_size, n_features) に変換
X_train, X_test = X_train[:, :, np.newaxis], X_test[:, :, np.newaxis]



In [113]:
''' SRUモデルの定義 '''

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F


class SRU(nn.Module):
    def __init__(self, x_dim, phi_dim, r_dim, o_dim, A, GPU=True):
        """ 
        x_dim:   入力xの次元（特徴量数）
        phi_dim: phiの次元。\mu^{\alpha}の次元とも等しい
        r_dim:   rの次元
        o_dim:   出力oの次元
        A:       [\alpha_1, \alpha_2, ..., \alpha_m], shape: (1, m)
        """

        super(SRU, self).__init__()

        self.gpu     = GPU
        n_alpha      = A.size()[1]
        self.n_alpha = n_alpha
        self.A       = A
        self.phi_dim = phi_dim
        # muの次元 = phiの次元*alphaの個数
        mu_dim = phi_dim * n_alpha 
        self.mu_dim = mu_dim
        
        # 各結合の定義
        self.mu2r    = nn.Linear(mu_dim, r_dim)
        self.xr2phi  = nn.Linear(x_dim + r_dim, phi_dim)
        self.mu2o    = nn.Linear(mu_dim, o_dim)
        self.log_softmax = nn.LogSoftmax()

    def forward(self, x, mu):
        '''
        x.size()  => (sample_size, x_dim)
        mu.size() => (sample_size, mu_dim)
        '''

        r = F.relu(self.mu2r(mu))
        phi = F.relu(self.xr2phi(torch.cat((x, r), 1)))
        mu = self.muphi2mu(mu, phi)
        o = F.relu(self.mu2o(mu))
        o = self.log_softmax(o)
        return o, mu
    
    def muphi2mu(self, mu, phi):
        '''
        すべての\alphaについて、\mu_t^{(\alpha)} = \alpha \mu_{t-1}^{(\alpha)} + (1-\alpha) \phi_t を同時に行う
            A_mask:   Kronecker product of (A, ones(1, phi_dim)),   shape => (1, mu_dim)
            phi_tile: Kronecker product of (ones(1, n_alpha), phi), shape => (sample_size, mu_dim)
        '''
        if self.gpu:
            A_mask = kronecker_product(self.A, torch.ones(1, self.phi_dim).cuda())
            phi_tile = kronecker_product(Variable(torch.ones(1, self.n_alpha).cuda()), phi)
        else:
            A_mask = kronecker_product(self.A, torch.ones(1, self.phi_dim))
            phi_tile = kronecker_product(Variable(torch.ones(1, self.n_alpha)), phi)

        # 要素積をとるためにA_maskをVariableに変換するが、A_maskは定数項なのでrequires_grad=Falseをつける
        A_mask = Variable(A_mask, requires_grad=False)
        mu = torch.mul(A_mask, mu) + torch.mul((1-A_mask), phi_tile)
        return mu


def kronecker_product(t1, t2):
    t1_height, t1_width = t1.size()
    t2_height, t2_width = t2.size()
    out_height = t1_height * t2_height
    out_width = t1_width * t2_width

    tiled_t2 = t2.repeat(t1_height, t1_width)
    expanded_t1 = (
        t1.unsqueeze(2)
          .unsqueeze(3)
          .repeat(1, t2_height, t2_width, 1)
          .view(out_height, out_width)
    )

    return expanded_t1 * tiled_t2



In [129]:
''' パラメータの設定 '''

x_dim = X_train.shape[2]
phi_dim = 200
r_dim = 60
o_dim = np.unique(y_train).size
A = torch.Tensor([0.0, 0.5, 0.9, 0.99]).view(1, -1).cuda()
sru = SRU(x_dim, phi_dim, r_dim, o_dim, A)
sru.cuda()

batch_size = 4
mu_dim = phi_dim * A.size()[1]

In [130]:
''' データセットの準備 その2 '''

import torch.utils.data

train = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
trainloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
test = torch.utils.data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
testloader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=True)

In [131]:
''' feedforwardの確認 '''

dataiter = iter(trainloader)
inputs, labels = dataiter.next()
inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())

# inputs.size => (seq_size, batch_size, n_features) に変形
inputs = torch.transpose(inputs, 0, 1)

# 隠れ変数の初期化
mu = Variable(torch.rand(batch_size, mu_dim).cuda())

# 勾配の初期化
optimizer.zero_grad()

# 確認
x = inputs[0]
outputs, mu = sru(x, mu)


In [135]:
''' 訓練 '''

import time
import math
import torch.optim as optim


# 計算時間の表示
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

# NLLLossとlog_softmaxのoutputで計算が速くなる
criterion = nn.NLLLoss()
optimizer = optim.SGD(sru.parameters(), lr=0.001)

start = time.time()

for epoch in range(1):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 1):
        inputs, labels = data
        inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())
        # inputs.size => (seq_size, batch_size, n_features) に変形
        inputs = torch.transpose(inputs, 0, 1)
        # 隠れ変数の初期化
        mu = Variable(torch.rand(batch_size, mu_dim).cuda())
        # 勾配の初期化
        optimizer.zero_grad()
        for x in inputs:
            outputs, mu = sru(x, mu)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.data[0]

        # 1000サンプルごとにlossを表示
#         if i * batch_size % 5000 == 0:
#             print('[%d, %5d] (%s) loss: %.3f' %
#                   (epoch + 1, i * batch_size, timeSince(start), running_loss / 2000))
#             running_loss = 0.0
        print('[%d, %5d] (%s) loss: %.6f' %
              (epoch + 1, i * batch_size, timeSince(start), running_loss / 2000))
        running_loss = 0.0

        if i * batch_size % 20000 == 0:
            break

print('Finished Training')

[1,     4] (0m 1s) loss: 0.001149
[1,     8] (0m 3s) loss: 0.001150
[1,    12] (0m 5s) loss: 0.001160
[1,    16] (0m 7s) loss: 0.001152
[1,    20] (0m 9s) loss: 0.001156
[1,    24] (0m 11s) loss: 0.001152
[1,    28] (0m 12s) loss: 0.001144
[1,    32] (0m 14s) loss: 0.001159
[1,    36] (0m 16s) loss: 0.001143
[1,    40] (0m 18s) loss: 0.001152
[1,    44] (0m 20s) loss: 0.001159
[1,    48] (0m 21s) loss: 0.001152
[1,    52] (0m 23s) loss: 0.001150
[1,    56] (0m 25s) loss: 0.001144
[1,    60] (0m 27s) loss: 0.001144
[1,    64] (0m 29s) loss: 0.001144
[1,    68] (0m 31s) loss: 0.001150
[1,    72] (0m 33s) loss: 0.001159


KeyboardInterrupt: 

In [136]:
# W^{(r)}の勾配の確認
list(sru.parameters())[0].grad

Variable containing:
1.00000e-03 *
-0.0000 -0.0012 -0.0002  ...  -0.0020 -0.0028 -0.0021
 0.0000  0.0518  0.0415  ...   0.0036  0.0047  0.0019
 0.0000 -0.0760 -0.0139  ...   0.0016  0.0008 -0.0008
          ...             ⋱             ...          
 0.0000  0.0307  0.0079  ...  -0.0101 -0.0002  0.0069
-0.0000 -0.1016 -0.0454  ...  -0.0029 -0.0176 -0.0139
 0.0000  0.7504  0.2278  ...   0.0224  0.0398  0.0186
[torch.cuda.FloatTensor of size 60x800 (GPU 0)]

In [34]:
''' テスト '''

correct = 0
total = 0

for i, data in enumerate(trainloader, 1):
    inputs, labels = data
    inputs = Variable(inputs.cuda())
    labels = labels.cuda()
    # inputs.size => (seq_size, batch_size, n_features) に変形
    inputs = torch.transpose(inputs, 0, 1)
    # 隠れ変数の初期化
    mu = Variable(torch.zeros(batch_size, mu_dim).cuda())

    for x in inputs:
        outputs, mu = sru(x, mu)
    torch.t(outputs)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum()
    
    if i * batch_size % 2000 == 0:
        break

print('Accuracy of the test images: %d %%' % (
    100 * correct / total))

Accuracy of the test images: 10 %
