# GRUでMNIST（ベンチマーク）

In [1]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_mldata
import numpy as np

import torch
torch.cuda.set_device(2)

## 1. MNISTデータセット準備

In [2]:
def set_data():
    mnist = fetch_mldata('MNIST original')
    mnist_X, mnist_y = shuffle(mnist.data, mnist.target, random_state=42)
    mnist_X = mnist_X / 255.0


    # pytorch用に型変換
    mnist_X, mnist_y = mnist_X.astype('float32'), mnist_y.astype('int64')


    # 2次元の画像を、各行を互い違いにして1次元に変換
    def flatten_img(images):
        '''
        images: shape => (n, rows, columns)
        output: shape => (n, rows*columns)
        '''
        n_rows    = images.shape[1]
        n_columns = images.shape[2]
        for num in range(n_rows):
            if num % 2 != 0:
                images[:, num, :] = images[:, num, :][:, ::-1]
        output = images.reshape(-1, n_rows*n_columns)
        return output

    mnist_X = mnist_X.reshape(-1, 28, 28)
    mnist_X = flatten_img(mnist_X)


    # X.shape => (n_samples, seq_len, n_features) に変換
    mnist_X = mnist_X[:, :, np.newaxis]


    # 訓練、テスト、検証データに分割
    train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y,
                                                        test_size=0.2,
                                                        random_state=42)
    train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y,
                                                          test_size=0.1,
                                                          random_state=42)

    return train_X, test_X, train_y, test_y, valid_X, valid_y

train_X, test_X, train_y, test_y, valid_X, valid_y = set_data()

In [3]:
def set_data2():
    mnist = fetch_mldata('MNIST original')
    mnist_X, mnist_y = shuffle(mnist.data, mnist.target, random_state=42)
    mnist_X = mnist_X / 255.0


    # pytorch用に型変換
    mnist_X, mnist_y = mnist_X.astype('float32'), mnist_y.astype('int64')

    mnist_X = mnist_X.reshape(-1, 28, 28)

    # 訓練、テスト、検証データに分割
    train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y,
                                                        test_size=0.2,
                                                        random_state=42)
    train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y,
                                                          test_size=0.1,
                                                          random_state=42)

    return train_X, test_X, train_y, test_y, valid_X, valid_y

train_X, test_X, train_y, test_y, valid_X, valid_y = set_data2()

## 2. モデル構築

In [4]:
import torch
import torch.nn as nn
from torch.autograd import Variable

class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, hidden_layers=1, drop_rate=0.6):
        super(GRU, self).__init__()

        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers

        self.gru = nn.GRU(input_size, hidden_size, num_layers=hidden_layers, dropout=drop_rate)
        self.linear = nn.Linear(hidden_size, output_size)
        self.log_softmax = nn.LogSoftmax()

    def forward(self, inputs, hidden):
        _, self.ht = self.gru(inputs, hidden)
        ## extract the last hidden layer from ht(n_layers, n_samples, hidden_size)
        htL = self.ht[-1]
        outputs = self.linear(htL)
        return outputs

## 3. 訓練

In [5]:

import time
import math
import torch.optim as optim

# 計算時間を表示させる
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


# mini-batchあたりの訓練
def train(model, inputs, labels, optimizer, criterion):
    # 隠れ変数の初期化
    hidden = Variable(torch.randn(
        model.hidden_layers, inputs.size(1), model.hidden_size).cuda())
    # 勾配の初期化
    optimizer.zero_grad()
    outputs = model(inputs, hidden)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    return loss.data[0]


# 検証
def validate(model, inputs, labels, optimizer, criterion):
    # 隠れ変数の初期化
    hidden = Variable(torch.randn(
        model.hidden_layers, inputs.size(1), model.hidden_size).cuda())
    outputs = model(inputs, hidden)
    loss = criterion(outputs, labels)
    return loss.data[0]

In [39]:
# パラメータの設定
input_size = train_X.shape[2]
hidden_size = 200
output_size = np.unique(train_y).size
drop_rate = 0.5
lr = 0.1

# インスタンスの作成
gru = GRU(input_size, hidden_size, output_size, drop_rate=drop_rate)
gru.cuda()

# loss, optimizerの定義
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(gru.parameters(), lr=lr)


In [44]:
''' 訓練 '''
n_epochs = 20
batch_size = 300
n_batches = train_X.shape[0]//batch_size
n_batches_v = valid_X.shape[0]//batch_size

start_time = time.time()

for epoch in range(n_epochs):
    train_cost, valid_cost = 0, 0
    
    train_X, train_y = shuffle(train_X, train_y, random_state=42)

    # 訓練
    gru.train()
    train_X_t = np.transpose(train_X, (1, 0, 2)) # X.shape => (seq_len, n_samples, n_features) に変換
    for i in range(n_batches):
        start = i * batch_size
        end = start + batch_size
        inputs, labels = train_X_t[:, start:end, :], train_y[start:end]
        inputs, labels = Variable(torch.from_numpy(inputs).cuda()
                         ), Variable(torch.from_numpy(labels).cuda())
        train_cost += train(gru, inputs, labels, optimizer, criterion) / train_X.shape[0]

    # 検証
    gru.eval()
    valid_X_t = np.transpose(valid_X, (1, 0, 2)) # X.shape => (seq_len, n_samples, n_features) に変換
    for i in range(n_batches_v):
        start = i * batch_size
        end = start + batch_size
        inputs, labels = valid_X_t[:, start:end, :], valid_y[start:end]
        inputs, labels = Variable(torch.from_numpy(inputs).cuda()
                         ), Variable(torch.from_numpy(labels).cuda())
        valid_cost += validate(gru, inputs, labels, optimizer, criterion) / valid_X.shape[0]

    print('EPOCH:: %i, (%s) Training cost: %.5f, Validation cost: %.5f' % (epoch + 1,
                       timeSince(start_time), train_cost, valid_cost))

print('Finished Training')

EPOCH:: 1, (0m 2s) Training cost: 0.00647, Validation cost: 0.00460
EPOCH:: 2, (0m 4s) Training cost: 0.00387, Validation cost: 0.00319
EPOCH:: 3, (0m 6s) Training cost: 0.00313, Validation cost: 0.00301
EPOCH:: 4, (0m 8s) Training cost: 0.00260, Validation cost: 0.00207
EPOCH:: 5, (0m 11s) Training cost: 0.00199, Validation cost: 0.00159
EPOCH:: 6, (0m 13s) Training cost: 0.00155, Validation cost: 0.00127
EPOCH:: 7, (0m 15s) Training cost: 0.00123, Validation cost: 0.00098
EPOCH:: 8, (0m 17s) Training cost: 0.00100, Validation cost: 0.00091
EPOCH:: 9, (0m 20s) Training cost: 0.00086, Validation cost: 0.00073
EPOCH:: 10, (0m 22s) Training cost: 0.00073, Validation cost: 0.00063
EPOCH:: 11, (0m 24s) Training cost: 0.00065, Validation cost: 0.00054
EPOCH:: 12, (0m 26s) Training cost: 0.00062, Validation cost: 0.00051
EPOCH:: 13, (0m 28s) Training cost: 0.00053, Validation cost: 0.00045
EPOCH:: 14, (0m 31s) Training cost: 0.00049, Validation cost: 0.00042
EPOCH:: 15, (0m 33s) Training cos

In [58]:
''' テスト '''
test_X_t = np.transpose(test_X[:200], (1, 0, 2))
inputs, labels = test_X_t, test_y[:200]
inputs, labels = Variable(torch.from_numpy(inputs).cuda()
                 ), Variable(torch.from_numpy(labels).cuda())
# 隠れ変数の初期化
hidden = Variable(torch.randn(
    gru.hidden_layers, inputs.size(1), gru.hidden_size).cuda())
# 勾配の初期化
optimizer.zero_grad()
outputs = gru(inputs, hidden)

# 正解数
(torch.max(outputs, 1)[1] == labels).sum()

Variable containing:
 193
[torch.cuda.ByteTensor of size 1 (GPU 2)]

## 4. ハイパーパラメタのチューニング

In [6]:
from hyperopt import fmin, tpe, hp, rand

parameter_space = {
	'hidden_size':hp.quniform('hidden_size', 1, 256, q=1),
    'hidden_layers':hp.quniform('hidden_layers', 1, 5, q=1),
	'drop_rate':hp.uniform('drop_rate', 0, 1),
	'lr': hp.loguniform("lr", -10, 0),
}


In [7]:
# 目的関数
def objective(args):
    print(args)
    hidden_size   = int(args['hidden_size'])
    hidden_layers = int(args['hidden_layers'])
    drop_rate     = args['drop_rate']
    lr            = args['lr']

    train_X, test_X, train_y, test_y, valid_X, valid_y = set_data2()
    input_size = train_X.shape[2]
    output_size = np.unique(train_y).size

    # インスタンスの作成
    gru = GRU(input_size, hidden_size, output_size, hidden_layers, drop_rate=drop_rate)
    gru.cuda()

    # loss, optimizerの定義
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(gru.parameters(), lr=lr)

    n_epochs = 3
    batch_size = 256
    n_batches = train_X.shape[0]//batch_size
    n_batches_v = valid_X.shape[0]//batch_size
    start_time = time.time()

    for epoch in range(n_epochs):
        train_cost, valid_cost = 0, 0

        train_X, train_y = shuffle(train_X, train_y, random_state=42)

        # 訓練
        gru.train()
        train_X_t = np.transpose(train_X, (1, 0, 2)) # X.shape => (seq_len, n_samples, n_features) に変換
        for i in range(n_batches):
            start = i * batch_size
            end = start + batch_size
            inputs, labels = train_X_t[:, start:end, :], train_y[start:end]
            inputs, labels = Variable(torch.from_numpy(inputs).cuda()
                             ), Variable(torch.from_numpy(labels).cuda())
            train_cost += train(gru, inputs, labels, optimizer, criterion) / train_X.shape[0]

    # 検証
    gru.eval()
    valid_X_t = np.transpose(valid_X, (1, 0, 2)) # X.shape => (seq_len, n_samples, n_features) に変換
    for i in range(n_batches_v):
        start = i * batch_size
        end = start + batch_size
        inputs, labels = valid_X_t[:, start:end, :], valid_y[start:end]
        inputs, labels = Variable(torch.from_numpy(inputs).cuda()
                         ), Variable(torch.from_numpy(labels).cuda())
        valid_cost += train(gru, inputs, labels, optimizer, criterion) / valid_X.shape[0]

    # 最後のepochのvalidation_costを返す
    print('(%s) Validation cost: %.7f' % (timeSince(start_time), valid_cost))
    return valid_cost

    del train_X, test_X, train_y, test_y, valid_X, valid_y


In [8]:
best = fmin(objective, parameter_space, algo=rand.suggest, max_evals=100)
print(best)

{'lr': 0.012006039094902665, 'hidden_size': 138.0, 'hidden_layers': 2.0, 'drop_rate': 0.5227762390918008}
(0m 9s) Validation cost: 0.0085947
{'lr': 0.0011715281103349756, 'hidden_size': 243.0, 'hidden_layers': 1.0, 'drop_rate': 0.9441586620847976}
(0m 7s) Validation cost: 0.0086349
{'lr': 0.06007910407078869, 'hidden_size': 143.0, 'hidden_layers': 2.0, 'drop_rate': 0.14618554393035188}
(0m 9s) Validation cost: 0.0059856
{'lr': 0.12184383541904328, 'hidden_size': 118.0, 'hidden_layers': 5.0, 'drop_rate': 0.31704438123716316}
(0m 29s) Validation cost: 0.0027788
{'lr': 0.0006462170286378042, 'hidden_size': 178.0, 'hidden_layers': 4.0, 'drop_rate': 0.10630280784984647}
(0m 28s) Validation cost: 0.0086363
{'lr': 0.00038840833399271513, 'hidden_size': 99.0, 'hidden_layers': 4.0, 'drop_rate': 0.1663584234845228}
(0m 18s) Validation cost: 0.0086320
{'lr': 0.0034080043851282787, 'hidden_size': 11.0, 'hidden_layers': 2.0, 'drop_rate': 0.13500015329653148}
(0m 5s) Validation cost: 0.0086397
{'lr'

(0m 6s) Validation cost: 0.0086393
{'lr': 0.00047789713266875003, 'hidden_size': 244.0, 'hidden_layers': 2.0, 'drop_rate': 0.19675360688662924}
(0m 14s) Validation cost: 0.0086337
{'lr': 0.0010515609729733497, 'hidden_size': 88.0, 'hidden_layers': 4.0, 'drop_rate': 0.3994557894193824}
(0m 16s) Validation cost: 0.0086401
{'lr': 1.0721147374301465, 'hidden_size': 163.0, 'hidden_layers': 4.0, 'drop_rate': 0.311273462722372}
(0m 26s) Validation cost: 0.0270726
{'lr': 0.0006142400149215317, 'hidden_size': 139.0, 'hidden_layers': 1.0, 'drop_rate': 0.09970694422469706}
(0m 5s) Validation cost: 0.0086348
{'lr': 0.008621807522827577, 'hidden_size': 108.0, 'hidden_layers': 2.0, 'drop_rate': 0.9171596251393057}
(0m 8s) Validation cost: 0.0086140
{'lr': 0.7191596416598559, 'hidden_size': 228.0, 'hidden_layers': 5.0, 'drop_rate': 0.959293069191341}
(0m 45s) Validation cost: 0.1041266
{'lr': 1.2201471597603473, 'hidden_size': 84.0, 'hidden_layers': 3.0, 'drop_rate': 0.5284482964012549}
(0m 10s) Vali