# MNISTでSRU

In [1]:
import os
import time
import math
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_mldata
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from models import SRU, GRU, LSTM

gpu = False
# torch.cuda.set_device(2)
seed = 42
torch.manual_seed(seed)
dir_path = './trained_models/ipynb'

## 1. MNISTデータセット準備

In [2]:
def load_mnist():
    mnist = fetch_mldata('MNIST original')
    mnist_X, mnist_y = shuffle(mnist.data, mnist.target, random_state=seed)
    mnist_X = mnist_X / 255.0

    # pytorch用に型変換
    mnist_X, mnist_y = mnist_X.astype('float32'), mnist_y.astype('int64')

    # 2次元の画像を、各行を互い違いにして1次元に変換
    def flatten_img(images):
        '''
        images: shape => (n, rows, columns)
        output: shape => (n, rows*columns)
        '''
        n_rows    = images.shape[1]
        n_columns = images.shape[2]
        for num in range(n_rows):
            if num % 2 != 0:
                images[:, num, :] = images[:, num, :][:, ::-1]
        output = images.reshape(-1, n_rows*n_columns)
        return output

    mnist_X = mnist_X.reshape(-1, 28, 28)
    mnist_X = flatten_img(mnist_X) # X.shape => (n_samples, seq_len)
    mnist_X = mnist_X[:, :, np.newaxis] # X.shape => (n_samples, seq_len, n_features)

    # 訓練、テストデータに分割
    train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y,
                                                        test_size=0.2,
                                                        random_state=seed)
    return train_X, test_X, train_y, test_y

In [3]:
train_X, test_X, train_y, test_y = load_mnist()
# デモ用にサンプルを小さくする
train_X, test_X, train_y, test_y = train_X[:256], test_X[:256], train_y[:256], test_y[:256]

## 2. 訓練の準備

In [4]:
# 計算時間を表示させる
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

# batchあたりの訓練
def train(model, inputs, labels, optimizer, criterion, clip):
    batch_size = inputs.size(1)
    model.initHidden(batch_size) # 隠れ変数の初期化
    optimizer.zero_grad() # 勾配の初期化
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    torch.nn.utils.clip_grad_norm(model.parameters(), clip) # gradient clipping
    loss.backward()
    optimizer.step()
    acc = (torch.max(outputs, 1)[1] == labels).float().sum().data[0] / batch_size
    return loss.data[0], acc

# 検証
def test(model, inputs, labels, criterion):
    batch_size = inputs.size(1)
    model.initHidden(batch_size)
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    acc = (torch.max(outputs, 1)[1] == labels).float().sum().data[0] / batch_size
    return outputs, loss.data[0], acc

# モデルの保存
def checkpoint(model, optimizer, acc):
    filename = os.path.join(dir_path, '%s_acc-%d' % (model.__class__.__name__, acc))
    # modelの状態保存
    torch.save(model.state_dict(), filename + '.model')
    # optimizerの状態保存
    torch.save(optimizer.state_dict(), filename + '.state')

## 3. モデル、パラメータの設定

In [5]:
input_size = train_X.shape[2]
output_size = np.unique(train_y).size

# パラメータの設定
lr = 0.0005
weight_decay = 0.0005
dropout = 0.8
clip = 1

In [6]:
def load_model(model_name):
    if model_name == 'sru':
        phi_size      = 200
        r_size        = 60
        cell_out_size = 200
    elif model_name in ['gru', 'lstm']:
        hidden_size = 200
        num_layers  = 1
        init_forget_bias = 1
    
    # モデルのインスタンス作成
    if model_name == 'sru':
        model = SRU(input_size, phi_size, r_size, cell_out_size, output_size, dropout=dropout, gpu=gpu)
        model.initWeight()
    elif model_name == 'gru':
        model = GRU(input_size, hidden_size, output_size, num_layers, dropout, gpu=gpu)
        model.initWeight(init_forget_bias)
    elif model_name == 'lstm':
        model = LSTM(input_size, hidden_size, output_size, num_layers, dropout, gpu=gpu)
        model.initWeight(init_forget_bias)
    if gpu == True:
        model.cuda()

    return model

In [7]:
model = load_model('sru')
# loss, optimizerの定義
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

## 4. 訓練

In [8]:
''' 訓練 '''

n_epochs = 10
batch_size = 64
n_batches = train_X.shape[0]//batch_size
n_batches_test = test_X.shape[0]//batch_size
all_acc = []
start_time = time.time()

for epoch in range(n_epochs):
    train_cost, test_cost, train_acc, test_acc  = 0, 0, 0, 0
    train_X, train_y = shuffle(train_X, train_y, random_state=seed)

    # 訓練
    model.train()
    train_X_t = np.transpose(train_X, (1, 0, 2)) # X.shape => (seq_len, n_samples, n_features) に変換
    for i in range(n_batches):
        start = i * batch_size
        end = start + batch_size
        inputs, labels = train_X_t[:, start:end, :], train_y[start:end]
        inputs, labels = Variable(torch.from_numpy(inputs)
                         ), Variable(torch.from_numpy(labels))
        if gpu == True:
            inputs, labels = inputs.cuda(), labels.cuda()
        cost, accuracy = train(model, inputs, labels, optimizer, criterion, clip)
        train_cost += cost / n_batches
        train_acc  += accuracy / n_batches

    # 検証
    model.eval()
    test_X_t = np.transpose(test_X, (1, 0, 2))
    for i in range(n_batches_test):
        start = i * batch_size
        end = start + batch_size
        inputs, labels = test_X_t[:, start:end, :], test_y[start:end]
        inputs, labels = Variable(torch.from_numpy(inputs)
                         ), Variable(torch.from_numpy(labels))
        if gpu == True:
            inputs, labels = inputs.cuda(), labels.cuda()
        _, cost, accuracy = test(model, inputs, labels, criterion)
        test_cost += cost / n_batches_test
        test_acc += accuracy / n_batches_test

    print('EPOCH:: %i, (%s) train_cost: %.3f, test_cost: %.3f, train_acc: %.3f, test_acc: %.3f' % (epoch + 1,
                       timeSince(start_time), train_cost, test_cost, train_acc, test_acc))

#     # 過去のエポックのtest_accを上回った時だけモデルの保存
#     if len(all_acc) == 0 or test_acc > max(all_acc):
#         checkpoint(model, optimizer, test_acc*10000)
#     all_acc.append(test_acc)

print('Finished Training')

EPOCH:: 1, (0m 18s) train_cost: 2.523, test_cost: 2.301, train_acc: 0.090, test_acc: 0.113
EPOCH:: 2, (0m 36s) train_cost: 2.296, test_cost: 2.300, train_acc: 0.102, test_acc: 0.078
EPOCH:: 3, (0m 55s) train_cost: 2.301, test_cost: 2.300, train_acc: 0.074, test_acc: 0.078
EPOCH:: 4, (1m 15s) train_cost: 2.280, test_cost: 2.301, train_acc: 0.141, test_acc: 0.078
EPOCH:: 5, (1m 34s) train_cost: 2.311, test_cost: 2.300, train_acc: 0.105, test_acc: 0.113
EPOCH:: 6, (1m 52s) train_cost: 2.261, test_cost: 2.298, train_acc: 0.172, test_acc: 0.113
EPOCH:: 7, (2m 10s) train_cost: 2.269, test_cost: 2.298, train_acc: 0.148, test_acc: 0.133
EPOCH:: 8, (2m 29s) train_cost: 2.278, test_cost: 2.300, train_acc: 0.105, test_acc: 0.148
EPOCH:: 9, (2m 46s) train_cost: 2.274, test_cost: 2.301, train_acc: 0.117, test_acc: 0.141
EPOCH:: 10, (3m 3s) train_cost: 2.274, test_cost: 2.300, train_acc: 0.141, test_acc: 0.129
Finished Training


## 5. 訓練済みモデルのロード

In [9]:
model.load_state_dict(torch.load(dir_path + '/SRU_sample.model'))
if gpu == True:
    model.cuda()
    model._gpu = True
    model.A_mask = model.A_mask.cuda()
    optimizer.load_state_dict(torch.load(dir_path + '/SRU_sample.state'))
else:
    model.cpu()
    model._gpu = False
    model.A_mask = model.A_mask.cpu()
    # ToDo: optimizerをcpu, gpuともに対応させてloadできるようにする
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

## 6. テスト

In [10]:
model.eval()
test_X_t = np.transpose(test_X, (1, 0, 2))
inputs, labels = test_X_t, test_y
inputs, labels = Variable(torch.from_numpy(inputs)
                 ), Variable(torch.from_numpy(labels))
if gpu == True:
    inputs, labels = inputs.cuda(), labels.cuda()
outputs, cost, accuracy = test(model, inputs, labels, criterion)
print(accuracy)

0.97265625
