# Backpropagation

In [1]:
import numpy as np

In [2]:
class MulLayer:
    def __init__(self):
        self.x = None
        self.y = None
    
    def forward(self, x, y):
        self.x = x
        self.y = y
        out = x * y
        return out
    
    def backward(self, dout):
        dx = dout * self.y
        dy = dout * self.x
        return dx, dy

In [3]:
apple = 100
apple_num = 2
tax = 1.1

mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

price

220.00000000000003

In [4]:
dprice = 1

dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print('dapple_price: {}'.format(dapple_price))
print('dtax: {}'.format(dtax))
print('dapple: {}'.format(dapple))
print('dapple_num: {}'.format(dapple_num))

dapple_price: 1.1
dtax: 200
dapple: 2.2
dapple_num: 110.00000000000001


In [5]:
class AddLayer:
    def __init__(self):
        pass
    
    def forward(self, x, y):
        return x + y
    
    def backward(self, dout):
        return dout, dout

In [6]:
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

mul_apple_layer, mul_orange_layer, mul_tax_layer = MulLayer(), MulLayer(), MulLayer()
add_apple_orange_layer = AddLayer()

apple_price, orange_price = mul_apple_layer.forward(apple, apple_num), mul_orange_layer.forward(orange, orange_num)
price_before_tax = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(price_before_tax, tax)

print('apple_price: {}'.format(apple_price))
print('orange_price: {}'.format(orange_price))
print('price_before_tax: {}'.format(price_before_tax))
print('price: {}'.format(price))
print()

dprice = 1
dprice_before_tax, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dprice_before_tax)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)

print('dprice: {}'.format(dprice))
print('dprice_before_tax: {}'.format(dprice_before_tax))
print('dapple_price: {}'.format(dapple_price))
print('dorange_price: {}'.format(dorange_price))
print('dapple: {}'.format(dapple))
print('dorange: {}'.format(dorange))
print('dapple_num: {}'.format(dapple_num))
print('dorange_num: {}'.format(dorange_num))

apple_price: 200
orange_price: 450
price_before_tax: 650
price: 715.0000000000001

dprice: 1
dprice_before_tax: 1.1
dapple_price: 1.1
dorange_price: 1.1
dapple: 2.2
dorange: 3.3000000000000003
dapple_num: 110.00000000000001
dorange_num: 165.0


In [7]:
class Relu:
    def __init__(self):
        self.mask = None
        
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out
    
    def backward(self, dout):
        dx = dout.copy()
        dx[self.mask] = 0
        return dx

In [8]:
class Sigmoid:
    def __init__(self):
        self.out = None
        
    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out
        return out
    
    def backward(self, dout):
        dx = dout * self.out * (1 - self.out)
        return dx

In [9]:
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.X = None
        self.dW = None
        self.db = None
        
    def forward(self, X):
        out = X.dot(self.W) + b
        self.X = X
        return out
    
    def backward(self, dout):
        dx = dout.dot(self.W.T)
        self.dW = self.X.T.dot(dout)
        self.db = dout.sum(axis=0)
        return dx

In [10]:
def cross_entropy_error(y, t):
    if y.ndim == 1:
        y = y.reshape(1, -1)
        t = t.reshape(1, -1)
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + 1e-7)) / batch_size

In [11]:
def softmax(a):
    c = np.max(a)
    exp_a = np.exp(a - c)
    sum_exp_a = np.sum(exp_a)
    return exp_a / sum_exp_a

In [12]:
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None
        
    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss
    
    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx

In [13]:
import os
import sys
sys.path.append(os.pardir)
import numpy as np
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict

In [14]:
class TwoLayerNet:
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        self.params = params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)
        
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        
        self.lastlayer = SoftmaxWithLoss()
    
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x
    
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastlayer.forward(y, t)
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        accuracy = np.mean(y == t)
        return accuracy
    
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads
    
    def gradient(self, x, t):
        
        self.loss(x, t)
        
        dout = 1
        dout = self.lastlayer.backward(dout)
        layers = list(self.layers.values())
        layers.reverse()
        
        for layer in layers:
            dout = layer.backward(dout)
            
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db
        
        return grads

In [15]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label = True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
x_batch = x_train[:3]
t_batch = t_train[:3]
grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

for key in grad_numerical.keys():
    diff = np.average( np.abs(grad_backprop[key] - grad_numerical[key]) )
    print(key + ":" + str(diff))

W1:4.2554946990302684e-10
b1:2.9282044488395004e-09
W2:5.054496531598168e-09
b2:1.3999609940057445e-07


In [16]:
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1
train_loss_list = []
train_acc_list = []
test_acc_list = []
iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    # 通过误差反向传播法求梯度
    grad = network.gradient(x_batch, t_batch)
    # 更新
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
        loss = network.loss(x_batch, t_batch)
        train_loss_list.append(loss)
        if i % iter_per_epoch == 0:
            train_acc = network.accuracy(x_train, t_train)
            test_acc = network.accuracy(x_test, t_test)
            train_acc_list.append(train_acc)
            test_acc_list.append(test_acc)
            print(train_acc, test_acc)

0.12006666666666667 0.1147
0.1201 0.1147
0.13161666666666666 0.1282
0.12365 0.1157
0.9025333333333333 0.9044
0.9025333333333333 0.9046
0.90305 0.9058
0.9031 0.9058
0.9218666666666666 0.9243
0.9218833333333334 0.9243
0.9205166666666666 0.9236
0.9204833333333333 0.9235
0.9353 0.9344
0.9352166666666667 0.9343
0.9338833333333333 0.9336
0.9338666666666666 0.9337
0.9411 0.939
0.9410833333333334 0.9389
0.9414833333333333 0.9388
0.9414833333333333 0.9388
0.9492333333333334 0.9458
0.9492666666666667 0.9459
0.9497333333333333 0.9464
0.9497333333333333 0.9464
0.9557166666666667 0.9534
0.9557333333333333 0.9532
0.95585 0.9528
0.9558666666666666 0.9528
0.9593333333333334 0.9557
0.9592666666666667 0.9557
0.9592166666666667 0.9558
0.9592666666666667 0.9558
0.9622333333333334 0.9579
0.9622166666666667 0.9579
0.96235 0.958
0.96235 0.9579
0.9665666666666667 0.9629
0.9665833333333333 0.9629
0.96675 0.9629
0.96675 0.9628
0.9680166666666666 0.9616
0.9680166666666666 0.9615
0.9681833333333333 0.9616
0.96818