In [1]:
import numpy as np
def cross_entropy_error(y, t):
    """Return the cross-entropy loss averaged over the batch.

    Args:
        y: Predicted class probabilities. Either a 2-D array with one row per
            sample, or a 1-D array holding a single sample.
        t: Targets. Either one-hot encoded (same number of elements as ``y``)
            or integer class indices, one per sample.

    Returns:
        The summed negative log-probability of the correct classes divided by
        the number of samples.
    """
    if y.ndim == 1:
        # Promote a single sample to a batch of one.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # One-hot targets are converted to the index of the correct label.
    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    # Probability assigned to the correct class of every sample.
    picked = y[np.arange(batch_size), t]
    # A probability of exactly 0 would make log() return -inf and the whole
    # loss infinite; the small offset keeps the result finite.
    eps = 1e-7
    return -np.sum(np.log(picked + eps)) / batch_size

In [2]:
class SoftmaxWithLoss:
    """Final layer that applies ``softmax`` and computes cross-entropy loss.

    ``forward`` caches the softmax output and the targets; ``backward`` reads
    them to produce the gradient with respect to the layer input.
    """

    def __init__(self):
        self.loss = None  # loss value from the most recent forward pass
        self.y = None     # softmax output
        self.t = None     # target data (one-hot or class indices)

    def forward(self, x, t):
        """Compute and cache the loss for scores ``x`` and targets ``t``.

        Args:
            x: Input scores passed to ``softmax``.
            t: Targets, one-hot encoded or integer class indices.

        Returns:
            The value returned by ``cross_entropy_error`` for this batch.
        """
        self.t = t
        self.y = softmax(x)
        # Forward formula: -sum(t * log(y))
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the input scores.

        Assumes batched data: ``self.y`` is 2-D and ``self.t`` has one entry
        (row or label) per sample.

        Args:
            dout: Upstream gradient; the result is scaled by it.

        Returns:
            Array with the same shape as the cached softmax output.

        Raises:
            RuntimeError: If ``forward`` has not been called yet.
        """
        if self.y is None or self.t is None:
            raise RuntimeError("forward() must be called before backward()")

        batch_size = self.t.shape[0]
        # Backward formula: (y_i - t_i) / batch_size
        if self.t.size == self.y.size:
            # One-hot targets: subtract directly.
            dx = (self.y - self.t) / batch_size
        else:
            # Label-index targets: subtract 1 only at the correct class.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx * dout

In [3]:
import sys, os
sys.path.append(os.pardir)  # 親ディレクトリのファイルをインポートするための設定
from common.functions import *
from common.gradient import numerical_gradient

class TwoLayerNet:
    """Two-layer fully connected classifier: affine, sigmoid, affine, softmax.

    Weights and biases are stored in ``self.params`` under the keys
    'W1', 'b1', 'W2' and 'b2'. Targets may be one-hot encoded or integer
    class indices in ``loss``, ``accuracy`` and ``gradient``.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters.

        Weights are standard-normal samples scaled by ``weight_init_std``;
        biases start at zero.
        """
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def _forward(self, x):
        """Run the forward pass and return ``(a1, z1, y)``.

        ``a1`` is the first affine output, ``z1`` the hidden activation and
        ``y`` the output of ``softmax``.
        """
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']

        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        return a1, z1, y

    def predict(self, x):
        """Return the softmax output of the network for input ``x``."""
        _, _, y = self._forward(x)
        return y

    # x: input data, t: target data
    def loss(self, x, t):
        """Return ``cross_entropy_error`` between the prediction and ``t``."""
        y = self.predict(x)
        return cross_entropy_error(y, t)

    def accuracy(self, x, t):
        """Return the fraction of samples whose predicted class matches ``t``."""
        predicted = np.argmax(self.predict(x), axis=1)
        # One-hot targets are reduced to class indices; 1-D targets are
        # already class indices and are used as they are.
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        return np.sum(predicted == t) / float(x.shape[0])

    # x: input data, t: target data
    def numerical_gradient(self, x, t):
        """Return the gradients computed with the imported ``numerical_gradient``.

        Inside this method the name ``numerical_gradient`` resolves to the
        module-level function, not to this method.
        """
        # The argument W is ignored: presumably numerical_gradient perturbs the
        # parameter array in place so self.loss sees the change -- TODO confirm
        # against common.gradient.
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        for key in ('W1', 'b1', 'W2', 'b2'):
            grads[key] = numerical_gradient(loss_W, self.params[key])
        return grads

    def gradient(self, x, t):
        """Return the gradients of the loss computed by backpropagation.

        Assumes ``x`` is a 2-D batch with one row per sample.

        Returns:
            Dict with keys 'W1', 'b1', 'W2', 'b2', each shaped like the
            matching entry of ``self.params``.
        """
        W2 = self.params['W2']
        batch_num = x.shape[0]

        # forward
        a1, z1, y = self._forward(x)

        # backward: gradient with respect to the scores fed to softmax
        if t.size == y.size:
            # One-hot targets.
            dy = (y - t) / batch_num
        else:
            # Label-index targets: subtract 1 only at the correct class.
            dy = y.copy()
            dy[np.arange(batch_num), t] -= 1
            dy = dy / batch_num

        grads = {}
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)

        # Propagate through the second affine layer, then the sigmoid.
        dz1 = np.dot(dy, W2.T)
        da1 = sigmoid_grad(a1) * dz1
        grads['W1'] = np.dot(x.T, da1)
        grads['b1'] = np.sum(da1, axis=0)

        return grads

In [4]:
from dataset.mnist import load_mnist

# Load MNIST with normalize=True and one-hot labels.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Only the first three training samples are used for the comparison.
x_batch, t_batch = x_train[:3], t_train[:3]

# Gradients by numerical differentiation.
grad_numerical = network.numerical_gradient(x_batch, t_batch)

# Gradients by backpropagation.
grad_backprop = network.gradient(x_batch, t_batch)

# Print the mean absolute difference between the two for every parameter.
for key in grad_numerical:
    diff = np.mean(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))

W1:4.067192341865417e-13
b1:1.416935710608783e-12
W2:1.532551507921198e-11
b2:1.203481675426943e-10


数値微分との差が十分に小さいとわかる。

In [5]:
def gradient(network, x, t):
    """Compute parameter gradients by backpropagation through ``network.layers``.

    The loss is evaluated with a ``SoftmaxWithLoss`` layer placed after the
    network's own layers.

    Args:
        network: Object exposing ``layers``, an ordered mapping whose values
            provide ``forward(x)`` and ``backward(dout)``. The entries
            'Affine1' and 'Affine2' must expose ``dW`` and ``db`` after the
            backward pass. The ``TwoLayerNet`` defined in this notebook has
            no ``layers`` attribute, so it cannot be passed here as-is.
        x: Input data.
        t: Target data.

    Returns:
        Dict with keys 'W1', 'b1', 'W2', 'b2'.
    """
    last_layer = SoftmaxWithLoss()

    # forward: the loss layer must see the scores and targets itself,
    # otherwise it has nothing cached when backward() is called.
    out = x
    for layer in network.layers.values():
        out = layer.forward(out)
    last_layer.forward(out, t)

    # backward: start from the loss layer, then walk the layers in reverse.
    dout = last_layer.backward(1)
    for layer in reversed(list(network.layers.values())):
        dout = layer.backward(dout)

    # Collect the gradients stored on the affine layers.
    affine1 = network.layers['Affine1']
    affine2 = network.layers['Affine2']
    grads = {}
    grads['W1'], grads['b1'] = affine1.dW, affine1.db
    grads['W2'], grads['b2'] = affine2.dW, affine2.db
    return grads

In [6]:
from dataset.mnist import load_mnist

# Load MNIST with normalize=True and one-hot labels.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []
# Integer number of iterations per epoch. With true division, a train_size
# that is not a multiple of batch_size gives a fractional value and the
# `i % iter_per_epoch == 0` check below would almost never be true.
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Sample a mini-batch of indices (np.random.choice default: with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradients by backpropagation.
    grad = network.gradient(x_batch, t_batch)

    # Gradient-descent update of every parameter.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Record the loss on the current batch after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch, evaluate accuracy on the full train and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.11236666666666667 0.1135
0.7889 0.795
0.8772666666666666 0.8818
0.89905 0.9026
0.9087166666666666 0.9097
0.9140833333333334 0.9136
0.9193 0.9188
0.923 0.9234
0.9272 0.9267
0.9300666666666667 0.929
0.9332 0.9326
0.9363833333333333 0.936
0.93925 0.9383
0.94185 0.9414
0.9440166666666666 0.9424
0.94575 0.9437
0.94715 0.9457


### 感想
逆伝播による微分の簡単さがなんとなくわかった。
本当はTwoLayerNetにレイヤーを生成して(+lastLayerをSoftmaxWithLossにして)forで回したかったが、何度やっても(恐らくself.tがNoneになってしまって)エラーが出てしまったため断念した。残念。少し悔しい。
### 参考文献
[深層学習による画像認識・生成](https://www.inf.uec.ac.jp/kobo2021/?%E6%B7%B1%E5%B1%A4%E5%AD%A6%E7%BF%92%E3%81%AB%E3%82%88%E3%82%8B%E7%94%BB%E5%83%8F%E8%AA%8D%E8%AD%98%E3%83%BB%E7%94%9F%E6%88%90#tdb683e4)

[【ゼロから作るDeep Learning】5章 誤差逆伝播法](https://yusuke-ujitoko.hatenablog.com/entry/2016/12/28/155150)