In [2]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from books.yubook.common.functions import softmax, cross_entropy_error
from books.yubook.datasets import mnist
from books.yubook.project3.two_layer_net import TwoLayerNet

链式法则：

$$
\frac{\delta z}{\delta x} = \frac{\delta z}{\delta t} \frac{\delta t}{\delta x}
$$

**加法节点的反向传播只是将信号反向传播到下一个节点**。

**乘法节点的反向传播会将上游传来的梯度乘以正向传播时输入信号的“翻转值”后传给下一个节点**，例如，$z = xy$，那么就有 $\frac{\delta L}{\delta x} = \frac{\delta L}{\delta z} y$，$\frac{\delta L}{\delta y} = \frac{\delta L}{\delta z} x$

In [5]:
class MulLayer:
    """Multiplication node of a computational graph.

    The forward pass caches both operands because the backward pass needs
    them: the gradient w.r.t. one operand is the upstream gradient times the
    *other* operand.
    """

    def __init__(self):
        # Operands of the most recent forward call (None until forward runs).
        self.x, self.y = None, None

    def forward(self, x, y):
        """Remember both operands and return their product ``x * y``."""
        self.x, self.y = x, y
        product = x * y
        return product

    def backward(self, dout):
        """Return ``(dx, dy)`` for the upstream gradient ``dout``.

        Each gradient is ``dout`` scaled by the opposite cached operand.
        """
        grad_x = dout * self.y
        grad_y = dout * self.x
        return grad_x, grad_y

# Worked example: price = (apple * apple_num) * tax, evaluated with two
# chained MulLayer nodes, then differentiated by running them in reverse.
apple, apple_num, tax = 100, 2, 1.1

mul_apple_layer, mul_tax_layer = MulLayer(), MulLayer()

# Forward pass: unit price * quantity, then apply the tax factor.
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)
print(price)

# Backward pass: start from d(price)/d(price) = 1 and visit the layers in
# the opposite order of the forward pass.
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)
print(dapple, dapple_num, dtax)

220.00000000000003
2.2 110.00000000000001 200


In [9]:
class AddLayer:
    """Addition node of a computational graph.

    Addition has a local derivative of 1 for both operands, so the backward
    pass forwards the upstream gradient unchanged to each input.
    """

    def __init__(self):
        # Kept for symmetry with MulLayer; forward() never writes to these
        # because the backward pass does not need the operands.
        self.x = self.y = None

    def forward(self, x, y):
        """Return the sum ``x + y``."""
        total = x + y
        return total

    def backward(self, dout):
        """Return ``(dx, dy)``, each equal to ``dout * 1``."""
        return dout * 1, dout * 1

# Worked example:
#   price = (apple * apple_num + orange * orange_num) * tax
# built from three MulLayer nodes and one AddLayer node.
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# One layer object per graph node; each MulLayer caches its own operands.
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()  # was misspelled "mul_organge_layer"
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# Forward pass.
apple_price = mul_apple_layer.forward(apple, apple_num)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax)
print(price)

# Backward pass: seed with d(price)/d(price) = 1 and walk the graph in the
# reverse order of the forward pass.
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print(price, dapple, dapple_num, dorange, dorange_num, dtax)

715.0000000000001
715.0000000000001 2.2 110.00000000000001 3.3000000000000003 165.0 650


ReLU Layer:

$$
y = \left\{\begin{aligned}
x \quad (x \gt 0) \\
0 \quad (x \le 0)
\end{aligned}\right.
$$

$$
\frac{\delta y}{\delta x} = \left\{\begin{aligned}
1 \quad (x \gt 0) \\
0 \quad (x \le 0)
\end{aligned}\right.
$$

In [10]:
class ReLU:
    """ReLU activation layer: keeps positive inputs and zeroes the rest.

    forward() caches a boolean mask of the non-positive input positions;
    backward() reuses that mask to block the gradient at the same positions.
    Both methods index with the mask and call ``.copy()``, so inputs are
    expected to be NumPy arrays.
    """

    def __init__(self):
        # Boolean array, True where the last forward input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every element ``<= 0`` set to 0.

        The input array itself is left untouched.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return the gradient w.r.t. the forward input.

        The upstream gradient passes through where the forward input was
        positive and is zero elsewhere. Works on a copy so the caller's
        ``dout`` array is not modified (the previous version zeroed entries
        of ``dout`` in place).
        """
        dx = dout.copy()
        dx[self.mask] = 0
        return dx

Sigmoid Layer:

$$
y = \frac{1}{1 + e^{-x}}
$$

$$
\begin{aligned}
\frac{\delta L}{\delta x} &= \frac{\delta L}{\delta y}y^2e^{-x} \\
    &= \frac{\delta L}{\delta y}\frac{1}{(1 + e^{-x})^2}e^{-x} \\
    &= \frac{\delta L}{\delta y}\frac{1}{1 + e^{-x}}\frac{e^{-x}}{1 + e^{-x}} \\
    &= \frac{\delta L}{\delta y}y(1 - y)
\end{aligned}
$$

In [None]:
class Sigmoid:
    """Sigmoid activation layer, ``y = 1 / (1 + exp(-x))``.

    The forward output is cached because the derivative can be written purely
    in terms of it: ``dy/dx = y * (1 - y)``.
    """

    def __init__(self):
        # Output of the most recent forward call (None until forward runs).
        self.out = None

    def forward(self, x):
        """Apply the sigmoid to ``x``, cache the result and return it."""
        activation = 1 / (1 + np.exp(-x))
        self.out = activation
        return activation

    def backward(self, dout):
        """Return ``dout * y * (1 - y)`` using the cached forward output."""
        y = self.out
        return dout * y * (1.0 - y)

Affine Layer:

$$
Y = X \cdot W + B
$$

$$
\frac{\delta L}{\delta X} = \frac{\delta L}{\delta Y} \cdot W^T
$$

$$
\frac{\delta L}{\delta W} = X^T \cdot \frac{\delta L}{\delta Y}
$$

$$
\frac{\delta L}{\delta B} = \sum_{i=1}^{N} \frac{\delta L}{\delta Y_i}
$$

其中 $N$ 为 batch 大小，$Y_i$ 为 $Y$ 的第 $i$ 行（即沿 batch 维度求和）。

In [None]:
class Affine:
    """Fully connected layer computing ``np.dot(x, W) + b``.

    backward() stores the parameter gradients on the instance (``dW``,
    ``db``) and returns the gradient w.r.t. the input.
    """

    def __init__(self, W, b):
        # Parameters are stored by reference, not copied.
        self.W, self.b = W, b
        # Input of the last forward call, needed to compute dW.
        self.x = None
        # Parameter gradients, filled in by backward().
        self.dW = self.db = None

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        projected = np.dot(x, self.W)
        return projected + self.b

    def backward(self, dout):
        """Propagate the upstream gradient ``dout`` through the layer.

        Sets ``self.dW = x.T . dout`` and ``self.db`` to ``dout`` summed over
        axis 0, then returns ``dout . W.T``.
        """
        grad_input = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        # The bias is added to every row in forward(), so its gradient
        # accumulates over axis 0.
        self.db = np.sum(dout, axis=0)
        return grad_input

经典多层感知机:

$$
Input \longrightarrow (Affine \rightarrow ReLU) \times N \longrightarrow Affine \longrightarrow Softmax
\longrightarrow Output
$$

经典多分类概率问题

Softmax Layer(Softmax with Loss Layer):

$$
\frac{\delta L}{\delta a} = y - t
$$

In [None]:
class SoftmaxWithLoss:
    """Softmax activation fused with a cross-entropy loss.

    forward() delegates to the imported ``softmax`` and
    ``cross_entropy_error`` helpers and caches their results; backward()
    uses the cached values to return the gradient w.r.t. the softmax input,
    ``(y - t) / batch_size``.
    """

    def __init__(self):
        self.loss = None  # value returned by the last forward()
        self.y = None     # softmax output of the last forward()
        self.t = None     # targets passed to the last forward()

    def forward(self, x, t):
        """Cache ``t`` and ``softmax(x)``, then return the loss.

        NOTE(review): which target encodings are accepted here depends on
        ``cross_entropy_error``, which is defined outside this notebook.
        """
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss w.r.t. the input of forward().

        ``dout`` is the upstream gradient; the result is scaled by it so the
        chain rule still holds when the loss feeds into further computation
        (the previous version silently ignored it).

        ``self.t`` may be one-hot (same number of elements as ``self.y``) or
        an integer array of class indices with one entry per row of
        ``self.y``. The batch size is taken from ``self.t.shape[0]``.
        """
        batch_size = self.t.shape[0]

        if self.t.size == self.y.size:
            # One-hot targets: plain element-wise difference.
            dx = (self.y - self.t) / batch_size
        else:
            # Class-index targets: subtract 1 only at the true class of each
            # row, which equals subtracting the corresponding one-hot matrix.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        return dx * dout

In [15]:
from books.yubook.datasets import mnist
from books.yubook.project3.two_layer_net import TwoLayerNet

# Mini-batch SGD training of a two-layer network on MNIST.
# NOTE(review): normalize / one_hot_label are passed straight to the project
# loader; their exact effect is defined in books.yubook.datasets.mnist.
(x_train, t_train), (x_test, t_test) = mnist.load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=100, output_size=10)

# Hyper-parameters.
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

# Histories: loss is recorded every iteration, accuracies once per epoch.
train_loss_list = []
train_acc_list = []
test_acc_list = []

# Number of iterations that make up one pass over the training set.
# Integer division keeps this an int, so the modulo check below fires once
# per epoch even when train_size is not a multiple of batch_size (true
# division would produce a fractional period that is almost never hit).
iter_per_epoch = max(train_size // batch_size, 1)

# NOTE: despite its name, `epoch` counts mini-batch iterations, not epochs.
for epoch in range(iters_num):
    # Draw a random mini-batch of row indices (np.random.choice samples with
    # replacement by default).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    grad = network.gradient(x_batch, t_batch)

    # Plain SGD step, applied in place to every parameter array.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Loss on the same mini-batch, evaluated after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch: evaluate on the full training and test sets.
    if epoch % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train_acc={:.4f}, test_acc={:.4f}, loss={:.4f}".format(train_acc, test_acc, loss))

# x_batch = x_train[:3]
# t_batch = t_train[:3]
#
# grad_backprop = network.gradient(x_batch, t_batch)
# print(grad_backprop)

train_acc=0.1135, test_acc=0.1159, loss=2.2990
train_acc=0.9070, test_acc=0.9116, loss=0.1892
train_acc=0.9229, test_acc=0.9273, loss=0.2193
train_acc=0.9404, test_acc=0.9392, loss=0.1694
train_acc=0.9499, test_acc=0.9479, loss=0.1567
train_acc=0.9570, test_acc=0.9539, loss=0.2089
train_acc=0.9610, test_acc=0.9578, loss=0.1355
train_acc=0.9661, test_acc=0.9612, loss=0.0901
train_acc=0.9697, test_acc=0.9646, loss=0.0989
train_acc=0.9728, test_acc=0.9684, loss=0.1183
train_acc=0.9751, test_acc=0.9687, loss=0.0903
train_acc=0.9770, test_acc=0.9702, loss=0.0564
train_acc=0.9791, test_acc=0.9719, loss=0.0631
train_acc=0.9802, test_acc=0.9719, loss=0.0331
train_acc=0.9818, test_acc=0.9738, loss=0.0483
train_acc=0.9825, test_acc=0.9738, loss=0.0706
train_acc=0.9833, test_acc=0.9732, loss=0.0359
