# GRUでMNIST（ベンチマーク）

In [9]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_mldata
import numpy as np

cuda_id = 1

## 1. MNISTデータセット準備

In [10]:
mnist = fetch_mldata('MNIST original')
mnist_X, mnist_y = shuffle(mnist.data, mnist.target, random_state=42)
mnist_X = mnist_X / 255.0


# pytorch用に型変換
mnist_X, mnist_y = mnist_X.astype('float32'), mnist_y.astype('int64')


# 2次元の画像を、各行を互い違いにして1次元に変換
def flatten_img(images):
    '''
    images: shape => (n, rows, columns)
    output: shape => (n, rows*columns)
    '''
    n_rows    = images.shape[1]
    n_columns = images.shape[2]
    for num in range(n_rows):
        if num % 2 != 0:
            images[:, num, :] = images[:, num, :][:, ::-1]
    output = images.reshape(-1, n_rows*n_columns)
    return output

mnist_X = mnist_X.reshape(-1, 28, 28)
mnist_X = flatten_img(mnist_X)


# X.shape => (n_samples, seq_len, n_features) に変換
mnist_X = mnist_X[:, :, np.newaxis]


# 訓練、テスト、検証データに分割
train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y,
                                                    test_size=0.2,
                                                    random_state=42)
train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y,
                                                      test_size=0.1,
                                                      random_state=42)


## 2. モデル構築

In [11]:
import torch
import torch.nn as nn
from torch.autograd import Variable

class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, hidden_layers=1, drop_rate=0.6):
        super(GRU, self).__init__()

        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers

        self.gru = nn.GRU(input_size, hidden_size, dropout=drop_rate)
        self.linear = nn.Linear(hidden_size, output_size)
        self.log_softmax = nn.LogSoftmax()

    def forward(self, inputs, hidden):
        _, ht = self.gru(inputs, hidden)
        ## extract the last hidden layer from ht(n_layers, n_samples, hidden_size)
        htL = ht[-1]
        outputs = self.linear(htL)
        return outputs

## 3. 訓練

In [12]:
import time
import math
import torch.optim as optim


# 計算時間を表示させる
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


# mini-batchあたりの訓練
def train(model, inputs, labels):
    # 隠れ変数の初期化
    hidden = Variable(torch.randn(
        model.hidden_layers, inputs.size(1), model.hidden_size).cuda(cuda_id))
    # 勾配の初期化
    optimizer.zero_grad()
    outputs = gru(inputs, hidden)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    return loss.data[0]


# 検証
def validate(model, inputs, labels):
    # 隠れ変数の初期化
    hidden = Variable(torch.randn(
        model.hidden_layers, inputs.size(1), model.hidden_size).cuda(cuda_id))
    # 勾配の初期化
    optimizer.zero_grad()
    outputs = gru(inputs, hidden)
    loss = criterion(outputs, labels)
    return loss.date[0]

In [14]:
# パラメータの設定
input_size = train_X.shape[2]
hidden_size = 200
output_size = np.unique(train_y).size
drop_rate = 0.5
lr = 0.1

# インスタンスの作成
gru = GRU(input_size, hidden_size, output_size, drop_rate=drop_rate)
gru.cuda(cuda_id)

# loss, optimizerの定義
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(gru.parameters(), lr=lr)


RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1502008109146/work/torch/lib/THC/generic/THCStorage.cu:66

In [9]:
''' 訓練 '''
n_epochs = 10
batch_size = 500
n_batches = train_X.shape[0]//batch_size
n_batches_v = valid_X.shape[0]//batch_size

start_time = time.time()

for epoch in range(n_epochs):
    train_cost, valid_cost = 0, 0
    
    train_X, train_y = shuffle(train_X, train_y, random_state=42)

    # 訓練
    gru.train()
    train_X_t = np.transpose(train_X, (1, 0, 2)) # X.shape => (seq_len, n_samples, n_features) に変換
    for i in range(n_batches):
        start = i * batch_size
        end = start + batch_size
        inputs, labels = train_X_t[:, start:end, :], train_y[start:end]
        inputs, labels = Variable(torch.from_numpy(inputs).cuda(cuda_id)
                         ), Variable(torch.from_numpy(labels).cuda(cuda_id))
        train_cost += train(gru, inputs, labels) / train_X.shape[0]

    # 検証
    gru.eval()
    valid_X_t = np.transpose(valid_X, (1, 0, 2)) # X.shape => (seq_len, n_samples, n_features) に変換
    for i in range(n_batches_v):
        start = i * batch_size
        end = start + batch_size
        inputs, labels = valid_X_t[:, start:end, :], valid_y[start:end]
        inputs, labels = Variable(torch.from_numpy(inputs).cuda(cuda_id)
                         ), Variable(torch.from_numpy(labels).cuda(cuda_id))
        valid_cost += train(gru, inputs, labels) / valid_X.shape[0]

    print('EPOCH:: %i, (%s) Training cost: %.5f, Validation cost: %.5f' % (epoch + 1,
                       timeSince(start_time), train_cost, valid_cost))

print('Finished Training')

EPOCH:: 1, (0m 44s) Training cost: 0.00457, Validation cost: 0.00452
EPOCH:: 2, (1m 29s) Training cost: 0.00457, Validation cost: 0.00452
EPOCH:: 3, (2m 14s) Training cost: 0.00457, Validation cost: 0.00452
EPOCH:: 4, (2m 58s) Training cost: 0.00457, Validation cost: 0.00452
EPOCH:: 5, (3m 43s) Training cost: 0.00457, Validation cost: 0.00452
EPOCH:: 6, (4m 28s) Training cost: 0.00457, Validation cost: 0.00452
EPOCH:: 7, (5m 12s) Training cost: 0.00457, Validation cost: 0.00452
EPOCH:: 8, (5m 57s) Training cost: 0.00457, Validation cost: 0.00452
EPOCH:: 9, (6m 42s) Training cost: 0.00457, Validation cost: 0.00452
EPOCH:: 10, (7m 27s) Training cost: 0.00457, Validation cost: 0.00452
Finished Training


## 4. ハイパーパラメタのチューニング

In [21]:
from hyperopt import fmin, tpe, hp, rand

parameter_space = {
	'hidden_size':hp.quniform('hidden_size', 1, 256, q=1),
	'drop_rate':hp.uniform('drop_rate', 0, 1),
	'lr': hp.loguniform("lr", -10, 1),
}


In [32]:
def objective(args):
    hidden_size = int(args['hidden_size'])
    drop_rate   = args['drop_rate']
    lr          = args['lr']
    
    # インスタンスの作成
    gru = GRU(input_size, hidden_size, output_size, drop_rate=drop_rate)
    gru.cuda(cuda_id)

    # loss, optimizerの定義
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(gru.parameters(), lr=lr)

    n_epochs = 2
    batch_size = 500
    n_batches = train_X.shape[0]//batch_size
    n_batches_v = valid_X.shape[0]//batch_size

    for epoch in range(n_epochs):
        train_cost, valid_cost = 0, 0

        train_X, train_y = shuffle(train_X, train_y, random_state=42)

        # 訓練
        gru.train()
        train_X_t = np.transpose(train_X, (1, 0, 2)) # X.shape => (seq_len, n_samples, n_features) に変換
        for i in range(n_batches):
            start = i * batch_size
            end = start + batch_size
            inputs, labels = train_X_t[:, start:end, :], train_y[start:end]
            inputs, labels = Variable(torch.from_numpy(inputs).cuda(cuda_id)
                             ), Variable(torch.from_numpy(labels).cuda(cuda_id))
            train_cost += train(gru, inputs, labels) / train_X.shape[0]

        # 検証
        gru.eval()
        valid_X_t = np.transpose(valid_X, (1, 0, 2)) # X.shape => (seq_len, n_samples, n_features) に変換
        for i in range(n_batches_v):
            start = i * batch_size
            end = start + batch_size
            inputs, labels = valid_X_t[:, start:end, :], valid_y[start:end]
            inputs, labels = Variable(torch.from_numpy(inputs).cuda(cuda_id)
                             ), Variable(torch.from_numpy(labels).cuda(cuda_id))
            valid_cost += train(gru, inputs, labels) / valid_X.shape[0]
    
    # 最後のepochのvalidation_costを返す
    print('Validation cost: %.5f' % (valid_cost))
    return valid_cost

In [33]:
best = fmin(objective, parameter_space, algo=rand.suggest, max_evals=100)
print(best)

RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1502008109146/work/torch/lib/THC/generic/THCStorage.cu:66