# MNISTで比較

In [1]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_mldata
import numpy as np

import torch
torch.cuda.set_device(2)

## 1. MNISTデータセット準備

In [2]:
def load_mnist():
    mnist = fetch_mldata('MNIST original')
    mnist_X, mnist_y = shuffle(mnist.data, mnist.target, random_state=42)
    mnist_X = mnist_X / 255.0

    # pytorch用に型変換
    mnist_X, mnist_y = mnist_X.astype('float32'), mnist_y.astype('int64')

    # 2次元の画像を、各行を互い違いにして1次元に変換
    def flatten_img(images):
        '''
        images: shape => (n, rows, columns)
        output: shape => (n, rows*columns)
        '''
        n_rows    = images.shape[1]
        n_columns = images.shape[2]
        for num in range(n_rows):
            if num % 2 != 0:
                images[:, num, :] = images[:, num, :][:, ::-1]
        output = images.reshape(-1, n_rows*n_columns)
        return output

    mnist_X = mnist_X.reshape(-1, 28, 28)
    mnist_X = flatten_img(mnist_X) # X.shape => (n_samples, seq_len) 
    mnist_X = mnist_X[:, :, np.newaxis] # X.shape => (n_samples, seq_len, n_features) 

    # 訓練、テスト、検証データに分割
    train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y,
                                                        test_size=0.2,
                                                        random_state=42)
    train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y,
                                                          test_size=0.1,
                                                          random_state=42)

    return train_X, test_X, train_y, test_y, valid_X, valid_y

train_X, test_X, train_y, test_y, valid_X, valid_y = load_mnist()



## 2. モデル構築

In [3]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR
from models import SRU, GRU, LSTM

## 3. 訓練の準備

In [4]:
import time
import math
import torch.optim as optim

# 計算時間を表示させる
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


# batchあたりの訓練
def train(model, inputs, labels, optimizer, criterion, clip):
    batch_size = inputs.size(1)
    # 隠れ変数の初期化
    model.initHidden(batch_size)
    # 勾配の初期化
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    accuracy = (torch.max(outputs, 1)[1] == labels).sum().data[0] / batch_size
    loss.backward()
    torch.nn.utils.clip_grad_norm(model.parameters(), clip)
    optimizer.step()
    return loss.data[0], accuracy


# 検証
def validate(model, inputs, labels, optimizer, criterion):
    # 隠れ変数の初期化
    batch_size = inputs.size(1)
    model.initHidden(batch_size)
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    accuracy = (torch.max(outputs, 1)[1] == labels).sum().data[0] / batch_size
    return loss.data[0], accuracy

## 4. パラメータの設定

In [5]:
input_size = train_X.shape[2]
output_size = np.unique(train_y).size

In [6]:
''' GRU '''
hidden_size = 100
dropout = 0.2
lr = 0.05
lr_decay = 0.99
init_forget_bias = 1
clip = 1

# インスタンスの作成
model = GRU(input_size, hidden_size, output_size, dropout=dropout)
model.initWeight(init_forget_bias)

In [27]:
''' LSTM '''
hidden_size = 100
dropout = 0.3
lr = 0.01
lr_decay = 1
init_forget_bias = 1
clip = 1

# インスタンスの作成
model = LSTM(input_size, hidden_size, output_size, dropout=dropout)
model.initWeight(init_forget_bias)

In [8]:
''' SRU '''
phi_size = 200
r_size = 60
cell_out_size = 200
lr = 0.1
lr_decay = 0.99
clip = 1

torch.cuda.manual_seed(0)
model = SRU(input_size, phi_size, r_size, cell_out_size,
            output_size, A=[0, 0.5, 0.9, 0.99, 0.999],
            dropout=0.2)
model.initWeight()

In [28]:
''' loss, optimizerの定義 '''
model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr) # SRUはRMSprop等と相性が悪い?
# 10ステップごとに学習率をdecayさせる
scheduler = StepLR(optimizer, step_size=10, gamma=lr_decay)

In [31]:
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.0001)

## 5. 訓練

In [None]:
''' 訓練 '''
n_epochs = 200
batch_size = 1024
n_batches = train_X.shape[0]//batch_size
n_batches_v = valid_X.shape[0]//batch_size
all_acc = []
start_time = time.time()

for epoch in range(n_epochs):
    train_cost, valid_cost, train_acc, valid_acc  = 0, 0, 0, 0
    train_X, train_y = shuffle(train_X, train_y, random_state=epoch)

    # 訓練
    model.train()
    train_X_t = np.transpose(train_X, (1, 0, 2)) # X.shape => (seq_len, n_samples, n_features) に変換
    for i in range(n_batches):
        scheduler.step()
        start = i * batch_size
        end = start + batch_size
        inputs, labels = train_X_t[:, start:end, :], train_y[start:end]
        inputs, labels = Variable(torch.from_numpy(inputs).cuda()
                         ), Variable(torch.from_numpy(labels).cuda())
        cost, accuracy = train(model, inputs, labels, optimizer, criterion, clip)
        train_cost += cost / n_batches
        train_acc  += accuracy / n_batches

    # 検証
    model.eval()
    valid_X_t = np.transpose(valid_X, (1, 0, 2)) # X.shape => (seq_len, n_samples, n_features) に変換
    for i in range(n_batches_v):
        start = i * batch_size
        end = start + batch_size
        inputs, labels = valid_X_t[:, start:end, :], valid_y[start:end]
        inputs, labels = Variable(torch.from_numpy(inputs).cuda()
                         ), Variable(torch.from_numpy(labels).cuda())
        cost, accuracy = validate(model, inputs, labels, optimizer, criterion)
        valid_cost += cost / n_batches_v
        valid_acc += accuracy / n_batches_v

    all_acc.append(valid_acc)
    print('EPOCH:: %i, (%s) train_cost: %.3f, valid_cost: %.3f, train_acc: %.3f, valid_acc: %.3f' % (epoch + 1,
                       timeSince(start_time), train_cost, valid_cost, train_acc, valid_acc))

print('Finished Training')

EPOCH:: 1, (0m 9s) train_cost: 0.099, valid_cost: 0.107, train_acc: 0.220, valid_acc: 0.218
EPOCH:: 2, (0m 19s) train_cost: 0.087, valid_cost: 0.096, train_acc: 0.224, valid_acc: 0.220
EPOCH:: 3, (0m 28s) train_cost: 0.069, valid_cost: 0.084, train_acc: 0.230, valid_acc: 0.228
EPOCH:: 4, (0m 38s) train_cost: 0.097, valid_cost: 0.090, train_acc: 0.222, valid_acc: 0.224
EPOCH:: 5, (0m 47s) train_cost: 0.067, valid_cost: 0.080, train_acc: 0.230, valid_acc: 0.229
EPOCH:: 6, (0m 57s) train_cost: 0.143, valid_cost: 0.127, train_acc: 0.205, valid_acc: 0.211
EPOCH:: 7, (1m 7s) train_cost: 0.102, valid_cost: 0.115, train_acc: 0.218, valid_acc: 0.214
EPOCH:: 8, (1m 17s) train_cost: 0.080, valid_cost: 0.396, train_acc: 0.226, valid_acc: 0.129
EPOCH:: 9, (1m 27s) train_cost: 0.103, valid_cost: 0.087, train_acc: 0.219, valid_acc: 0.224
EPOCH:: 10, (1m 37s) train_cost: 0.064, valid_cost: 0.074, train_acc: 0.232, valid_acc: 0.229
EPOCH:: 11, (1m 47s) train_cost: 0.064, valid_cost: 0.084, train_acc: 0

EPOCH:: 89, (15m 10s) train_cost: 0.056, valid_cost: 0.079, train_acc: 0.234, valid_acc: 0.228
EPOCH:: 90, (15m 19s) train_cost: 0.054, valid_cost: 0.084, train_acc: 0.234, valid_acc: 0.225
EPOCH:: 91, (15m 29s) train_cost: 0.059, valid_cost: 0.074, train_acc: 0.232, valid_acc: 0.229
EPOCH:: 92, (15m 39s) train_cost: 0.064, valid_cost: 0.076, train_acc: 0.232, valid_acc: 0.229
EPOCH:: 93, (15m 48s) train_cost: 0.054, valid_cost: 0.069, train_acc: 0.234, valid_acc: 0.232
EPOCH:: 94, (15m 58s) train_cost: 0.089, valid_cost: 0.166, train_acc: 0.224, valid_acc: 0.192
EPOCH:: 95, (16m 8s) train_cost: 0.102, valid_cost: 0.087, train_acc: 0.219, valid_acc: 0.224
EPOCH:: 96, (16m 17s) train_cost: 0.059, valid_cost: 0.069, train_acc: 0.232, valid_acc: 0.228
EPOCH:: 97, (16m 27s) train_cost: 0.076, valid_cost: 0.074, train_acc: 0.228, valid_acc: 0.229
EPOCH:: 98, (16m 37s) train_cost: 0.051, valid_cost: 0.087, train_acc: 0.234, valid_acc: 0.226
EPOCH:: 99, (16m 47s) train_cost: 0.067, valid_cost

## Sand Box

In [9]:
''' テスト '''
test_X_t = np.transpose(test_X[5000:5500], (1, 0, 2))
inputs, labels = test_X_t, test_y[5000:5500]
inputs, labels = Variable(torch.from_numpy(inputs).cuda()
                 ), Variable(torch.from_numpy(labels).cuda())
model.initHidden(inputs.size(1))
outputs = model(inputs)

# 正解数
(torch.max(outputs, 1)[1] == labels).sum()[0]

Variable containing:
 236
[torch.cuda.ByteTensor of size 1 (GPU 2)]

In [10]:
''' 勾配の確認 '''
loss = criterion(outputs, labels)
loss.backward()

In [11]:
list(model.parameters())[5].grad

Variable containing:
1.00000e-03 *
 -1.3640
 -3.9899
  3.9046
 -0.2410
  2.9663
 -7.2542
  1.3611
  3.0439
 -0.1566
  1.7300
[torch.cuda.FloatTensor of size 10 (GPU 2)]

In [12]:
"""
outputsが全てのサンプルで同じになる理由: XWが0, bが≠0
"""
outputs

Variable containing:
-1.6677e+00 -2.1592e+00  1.1355e+01  ...  -3.4879e+00 -1.7871e-01 -2.5049e+00
-6.0168e+00  1.2347e+01 -1.9971e+00  ...  -9.8720e-01 -2.5509e+00 -2.5539e+00
-4.8645e+00  1.1795e+01 -5.2879e+00  ...  -4.3604e-01 -6.9174e-01 -2.8169e+00
                ...                   ⋱                   ...                
-5.4314e+00 -9.9449e-01 -1.3796e+00  ...   1.0591e+01 -1.0177e+00  3.4657e-01
-3.5827e-02 -4.4396e-01 -2.2040e+00  ...  -7.9230e+00 -1.8895e+00 -4.1059e+00
 6.9418e-01 -6.3189e-01 -1.8002e+00  ...  -8.9616e+00 -1.7312e+00 -4.9029e+00
[torch.cuda.FloatTensor of size 500x10 (GPU 2)]