In [1]:
import numpy as np

In [2]:
def sigmoid(x):
  return 1 / (1 + np.exp(-x))

In [3]:
class Sigmoid:
  def __init__(self):
    self.y = None

  def forward(self, x):
    self.y = sigmoid(x)
    return self.y

  def backward(self, dout):
    dx = self.y * (1 - self.y)
    return dx

In [4]:
def relu(x):
  if x > 0:
    return x
  else:
    return 0

In [16]:
class Relu:
  def __init__(self):
    self.mask = None

  def forward(self, x):
    self.mask = (x <= 0)
    out = x.copy()
    out[self.mask] = 0
    return out

  def backward(self, dout):
    dout[self.mask] = 0
    dx = dout
    return dx

In [17]:
class Affine:
  def __init__(self, w, b):
    self.w = w
    self.b = b
    self.dw = None
    self.db = None
    self.x = None
  
  def forward(self, x):
    self.x = x
    out = np.dot(x, self.w) + self.b
    return out

  def backward(self, dout):
    dx = np.dot(dout, self.w.T)
    self.dw = np.dot(self.x.T, dout)
    self.db = np.sum(dout, axis=0)
    return dx

In [7]:
def softmax(x):
  c = np.max(x)
  exp_x = np.exp(x-c)
  sum_exp_x = np.sum(exp_x)

  return exp_x / sum_exp_x

In [8]:
def cross_entropy_error(y, t):
  delta = 1e-7
  if y.ndim == 1:
    y = y.reshape(1, y.size)
    t = t.reshape(1, t.size)
  batch_size = y.shape[0]
  return np.sum(t * np.log(y + delta)) / batch_size

In [9]:
class SoftmaxWithLoss: # softmax와 cross_entropy_error
  def __init__(self):
    self.y = None
    self.t = None
    self.loss = None
  
  def forward(self, x, t):
    self.y = softmax(x)
    self.t = t
    self.loss = cross_entropy_error(self.y, t)
    return self.loss

  def backward(self, dout):
    batch_size = self.t.shape[0]
    dx = (self.y - self.t) / batch_size
    return dx

In [22]:
def numerical_gradient_general(f, x): # numerical_gradient(f, x)에서 x를 다차원 배열로 받을 수 있도록 수정.
  h = 1e-4
  grad = np.zeros_like(x)

  it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
  while not it.finished:
      idx = it.multi_index
      tmp_val = x[idx]

      # f(x+h)
      x[idx] = tmp_val + h
      fxh1 = f(x[idx])

      # f(x-h)
      x[idx] = tmp_val - h
      fxh2 = f(x[idx])

      grad[idx] = (fxh1 - fxh2) / (2 * h)
          
      x[idx] = tmp_val
      it.iternext()

  return grad

In [10]:
from collections import OrderedDict

In [26]:
class TwoLayerNet:
  def __init__(self, input_size, hidden_size, output_size, weight_std=0.01):
    self.params = {}
    self.params['w1'] = weight_std * np.random.randn(input_size, hidden_size)
    self.params['b1'] = np.zeros(hidden_size)
    self.params['w2'] = weight_std * np.random.randn(hidden_size, output_size)
    self.params['b2'] = np.zeros(output_size)

    self.layers = OrderedDict()
    self.layers['Affine1'] = Affine(self.params['w1'], self.params['b1'])
    self.layers['relu1'] = Relu()
    self.layers['Affine2'] = Affine(self.params['w2'], self.params['b2'])
    self.lastLayer = SoftmaxWithLoss()

  def predict(self, x):
    layers = self.layers
    for layer in layers.values():
      x = layer.forward(x)
    return x

  def loss(self, x, t):
    y = self.predict(x)
    loss = self.lastLayer.forward(y, t)
    return loss

  def numerical_gradient(self, x, t):
      loss_w = lambda w : self.loss(x, t)

      grads = {}
      grads['w1'] = numerical_gradient_general(loss_w, self.params['w1'])
      grads['b1'] = numerical_gradient_general(loss_w, self.params['b1'])
      grads['w2'] = numerical_gradient_general(loss_w, self.params['w2'])
      grads['b2'] = numerical_gradient_general(loss_w, self.params['b2'])

      return grads

  def gradient(self, x, t):
    # 순전파 먼저 연산
    self.loss(x, t)

    # 역전파
    dout = 1
    dout = self.lastLayer.backward(dout)
    grads = {}
    layers = list(self.layers.values())
    layers.reverse()
    for layer in layers:
      dout = layer.backward(dout)
    
    grads['w1'] = self.layers['Affine1'].dw
    grads['b1'] = self.layers['Affine1'].db
    grads['w2'] = self.layers['Affine2'].dw
    grads['b2'] = self.layers['Affine2'].db

    return grads

In [27]:
x_train = np.random.randn(1000, 784)

t_train = np.zeros((1000, 10))
t_train[np.arange(1000), np.random.randint(0, 9, size=(1000))] = 1

# print(x_train)
print(t_train)
print(t_train.shape)

[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(1000, 10)


In [28]:
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
network.gradient(x_train, t_train)

(1000, 50)
(1000, 50)
(1000, 784)


{'w1': array([[-1.89287329e-04,  1.60599637e-04,  1.91065462e-04, ...,
          1.53217406e-04, -8.60468236e-05,  3.80597900e-05],
        [-5.20152970e-04, -1.21279484e-04, -1.73406028e-04, ...,
          2.13196230e-04,  1.10462252e-05, -2.87976270e-04],
        [-1.08309908e-04, -2.55186358e-05, -5.94385577e-05, ...,
         -7.85458033e-05,  2.24783574e-05,  3.37002750e-06],
        ...,
        [-2.10964473e-04, -4.14073482e-04,  3.47505721e-04, ...,
          1.30888281e-04,  7.40818771e-05, -2.81849029e-04],
        [-1.56025202e-04, -4.92964334e-04,  2.69029263e-04, ...,
         -2.53004339e-04,  3.77263630e-04,  9.85926948e-05],
        [-1.83616099e-04,  7.21689447e-04, -9.30845909e-05, ...,
         -2.99527727e-04,  3.40892829e-04,  3.27093632e-05]]),
 'b1': array([ 2.12599872e-03, -2.84835372e-04,  6.21966947e-04, -1.25715657e-03,
        -1.15474525e-03, -1.07050838e-03,  1.68910869e-03, -2.24228306e-03,
         9.18356039e-04,  8.73582485e-05, -6.65666531e-04, -3.403

In [29]:
network.numerical_gradient(x_train, t_train)

{'w1': array([[ 1.58460596e-04, -1.84328748e-04, -1.87216331e-04, ...,
         -1.39960008e-04,  6.14035400e-05, -2.86928792e-05],
        [ 4.18475450e-04,  1.31348417e-04,  1.65124590e-04, ...,
         -2.13300115e-04, -6.09226181e-05,  2.89333588e-04],
        [ 1.00495177e-04, -5.52722312e-06,  6.56903154e-05, ...,
          3.27753025e-05, -6.19218987e-05,  2.95295166e-05],
        ...,
        [ 2.21324381e-04,  4.56172868e-04, -3.29614105e-04, ...,
         -1.40650114e-04, -6.33598152e-05,  2.25639507e-04],
        [ 1.96626484e-04,  5.03382438e-04, -2.57052291e-04, ...,
          1.45570107e-04, -4.02467322e-04, -5.56111424e-05],
        [ 1.28071518e-04, -7.57429319e-04,  9.08331987e-05, ...,
          2.51490704e-04, -3.15681064e-04, -1.04174802e-05]]),
 'b1': array([-9.32185733e-04,  8.69094814e-04, -5.62758862e-04,  9.00495332e-04,
        -5.09146263e-04, -2.84010229e-04, -2.06122497e-04,  2.03204866e-04,
         5.51437260e-04, -3.71890643e-04,  3.73580935e-04,  6.173

In [None]:
# numerical_gradient b2 : 
# [ 0.02995705,  0.00709242,  0.00349201,  0.01073253,  0.00965313,
#         0.01196627,  0.0071168 ,  0.00294501,  0.01755373, -0.10050895]

# backpropagation gradient b2 :
# [-1.29899987e-01, -1.07899098e-01, -1.02900496e-01, -1.10899743e-01,
#        -1.08900664e-01, -1.10900979e-01, -1.07899123e-01, -1.02899948e-01,
#        -1.16900572e-01,  1.00609552e-04]

뭐가 틀린걸까.. 생각보다 오차가 너무 큰데

In [30]:
a = np.array([ 0.02995705,  0.00709242,  0.00349201,  0.01073253,  0.00965313,
         0.01196627,  0.0071168 ,  0.00294501,  0.01755373, -0.10050895])

b = np.array([-1.29899987e-01, -1.07899098e-01, -1.02900496e-01, -1.10899743e-01,
        -1.08900664e-01, -1.10900979e-01, -1.07899123e-01, -1.02899948e-01,
        -1.16900572e-01,  1.00609552e-04])

np.average(np.abs(a-b)) # 오차 평균

0.1200219119552