# 역전파의 구현과 확률적 경사하강법 구현

In [3]:
import matplotlib.pyplot as plt
import numpy as np

저번 시간에 구현했던 MNIST 데이터셋은 다음과 같습니다.

In [4]:
import os, cv2
class mnist_dataset():
    """MNIST images stored as image files in per-digit sub-directories.

    ``path`` must contain ten sub-directories named ``0`` .. ``9``. Every
    file inside sub-directory ``d`` is treated as an image with label ``d``.
    Indexing returns ``(pixels, label)`` where ``pixels`` is the grayscale
    image flattened to 1-D and divided by 255.
    """

    def __init__(self, path):
        """Collect a ``[file_path, label]`` pair for every file under *path*."""
        data_list = []
        for label in range(10):
            label_path = os.path.join(path, str(label))
            # os.listdir returns entries in arbitrary order; sorting makes the
            # sample order reproducible across runs and machines.
            for file in sorted(os.listdir(label_path)):
                data_list.append([os.path.join(label_path, file), label])
        self.data_list = data_list

    def __len__(self):
        """Return the number of collected samples."""
        return len(self.data_list)

    def __getitem__(self, i):
        """Load sample *i* from disk and return ``(flattened_pixels, label)``.

        Raises:
            IOError: if the file cannot be decoded as an image.
        """
        png_path, label = self.data_list[i]
        img_loaded = cv2.imread(png_path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than at .reshape below.
        if img_loaded is None:
            raise IOError(f'Could not read image file: {png_path}')
        return img_loaded.reshape(-1)/255, label
# Build the training and test datasets from their respective directories
# (paths are relative to the current working directory).
training_dataset = mnist_dataset('../data/mnist_png/training/')
test_dataset = mnist_dataset('../data/mnist_png/testing/')

이를 이용해 배치를 생성하는 클래스 dataloader를 다음과 같이 구현합니다.     
dataloader는 임의의 dataset을 입력으로 받아 배치들을 생성해 주는 클래스입니다.

In [5]:
# Demonstrate np.random.shuffle: it permutes the array in place, so the
# same variable is printed afterwards.
x = np.arange(10)
np.random.shuffle(x)
print(x)

[2 3 5 4 8 7 9 0 6 1]


In [9]:
class dataloader():
    """Serve shuffled mini-batches drawn from an indexable dataset.

    The dataset only needs ``__len__`` and ``__getitem__``. A batch is a list
    of ``batch_size`` dataset items. The trailing
    ``len(dataset) % batch_size`` samples of the current permutation are
    dropped, so every batch has exactly ``batch_size`` items.

    Iterating with ``for batch in loader`` uses Python's sequence protocol:
    ``__getitem__`` is called with 0, 1, 2, ... until it raises IndexError.
    """

    def __init__(self, dataset, batch_size):
        """Store the dataset and draw the first random permutation.

        Raises:
            ValueError: if ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError('batch_size must be a positive integer')
        self.dataset = dataset
        self.batch_size = batch_size
        self.length = len(dataset)
        self.index = np.arange(self.length)
        np.random.shuffle(self.index)

    def __getitem__(self, i):
        """Return batch *i* of the current permutation as a list of items.

        Raises:
            IndexError: if *i* is outside ``range(-len(self), len(self))``.
        """
        num_batches = len(self)
        if i < 0:
            i += num_batches
        if not 0 <= i < num_batches:
            # Explicit IndexError is what terminates ``for batch in loader``.
            raise IndexError('batch index out of range')
        start = i * self.batch_size
        batch = [self.dataset[j] for j in self.index[start:start + self.batch_size]]
        # Reshuffle only AFTER the last batch has been built. Shuffling first
        # would draw the last batch from a different permutation, so some
        # samples would repeat within the epoch and others would be skipped.
        if i == num_batches - 1:
            self.shuffle()
        return batch

    def __len__(self):
        """Return the number of full batches per epoch."""
        return self.length//self.batch_size

    def shuffle(self):
        """Draw a new random permutation of the sample indices in place."""
        np.random.shuffle(self.index)
        
# Wrap the training set in a loader that serves mini-batches of 16 samples.
training_dataloader = dataloader(training_dataset, 16)

In [10]:
# Walk through every batch once without using it, so the tqdm progress bar
# shows how quickly batches can be loaded.
# NOTE: tqdm is imported in a later cell of this file; that import must have
# been executed before this loop runs.
for batch in tqdm(training_dataloader):
    pass

100%|█████████████████████████████████████████████████████████████████████████████| 3750/3750 [00:08<00:00, 448.72it/s]


In [8]:
from tqdm import tqdm

저번 시간에 구현했던 MLP 네트워크와 각종 함수들은 다음과 같습니다

In [7]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def ReLU(x):
    """Return *x* where it is positive and zero elsewhere (element-wise)."""
    # Multiplying by the boolean mask zeroes out the non-positive entries.
    positive_mask = x > 0
    return x * positive_mask

def no_activation(x):
    """Identity activation: return *x* unchanged."""
    return x

def softmax_temp(z):
    """Naive softmax: exp(z) divided by the sum of exp(z) over all entries.

    No shift is applied before exponentiation, so large entries of *z* can
    overflow in ``np.exp``.
    """
    exponentials = np.exp(z)
    return exponentials / np.sum(exponentials)

def softmax(z):
    """Softmax over all entries of *z*, shifted by the maximum for stability.

    Subtracting ``np.max(z)`` leaves the result mathematically unchanged but
    keeps every exponent at or below zero.
    """
    shifted_exp = np.exp(z - np.max(z))
    return shifted_exp / np.sum(shifted_exp)

def cross_entropy_loss(y, y_hat, eps=1e-12):
    """Return the cross-entropy ``sum(-y * log(y_hat))``.

    Args:
        y: target distribution (e.g. a one-hot vector).
        y_hat: predicted probabilities, same shape as *y*.
        eps: lower bound applied to *y_hat* before taking the logarithm.

    Returns:
        The summed cross-entropy as a scalar.
    """
    # A predicted probability that underflows to exactly 0 would give
    # log(0) = -inf, and 0 * -inf = NaN even where the target is 0.
    # Clamping keeps the loss finite.
    y_hat = np.maximum(y_hat, eps)
    return np.sum(-y * np.log(y_hat))

class MLP_Network:
    """Fully connected feed-forward network with configurable activations.

    ``sizes`` lists the layer widths, input layer first. Layer ``i`` has a
    weight matrix of shape ``(sizes[i+1], sizes[i])`` and a bias vector of
    length ``sizes[i+1]``, both drawn from a standard normal distribution.
    """

    def __init__(self, sizes, activation_function = ReLU, last_activation = no_activation):
        """Create randomly initialised parameters.

        Args:
            sizes: layer widths, input layer first.
            activation_function: activation applied after every layer except
                the last.
            last_activation: activation applied after the last layer.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y) for y in sizes[1:]]
        self.weights = []
        for i in range(self.num_layers-1):
            self.weights.append(np.random.randn(self.sizes[i+1], self.sizes[i]))
        # Honour the caller's choice; previously ReLU was always used here,
        # silently ignoring the argument.
        self.activation_function = activation_function
        self.last_activation = last_activation

    def forward(self, a):
        """Propagate the input vector *a* through every layer and return the output."""
        for i, [bias, weight] in enumerate(zip(self.biases, self.weights)):
            # There are num_layers - 1 weight matrices, so index
            # num_layers - 2 is the output layer.
            if i == self.num_layers - 2:
                activation = self.last_activation
            else:
                activation = self.activation_function
            a = activation(np.dot(weight, a)+bias)

        return a

    def accuracy(self, dataset):
        """Print and return the fraction of samples classified correctly.

        *dataset* yields ``(img, label)`` pairs. The prediction is the index
        of the largest network output.
        """
        num_correct = 0
        for img, label in dataset:
            y_hat = self.forward(img.reshape(-1))
            if np.argmax(y_hat) == label:
                num_correct += 1
        accuracy = num_correct/len(dataset)
        print(f'Accuracy is {accuracy}' )
        return accuracy
        

    
# Without a final activation the ten outputs are unnormalised scores, so
# their sum is arbitrary.
net = MLP_Network([2, 3, 6, 4, 10, 10], activation_function = sigmoid)
output = net.forward(np.random.randn((2)))
print(output)
print(np.sum(output))

# With softmax as the final activation the outputs sum to 1.
net = MLP_Network([2, 3, 6, 4, 10, 10], activation_function = sigmoid, last_activation = softmax)
output = net.forward(np.random.randn((2)))
print(output)
print(np.sum(output))

[ 2.48026437  4.16774204 -7.50866318  5.09699204 -3.80197689 -1.49635146
  2.08351428  0.48658552 -3.3346366  -2.26909525]
-4.095625143264587
[1.95928009e-07 4.80767971e-02 1.49119696e-06 2.88211066e-06
 8.77502455e-01 7.95170055e-09 3.32523136e-05 1.43003059e-07
 4.43998822e-03 6.99427870e-02]
1.0


역전파의 구현의 편의를 위해 활성화 함수는 ReLU로 마지막 활성화 함수는 소프트맥스로 고정한 후 구현합니다.

In [8]:
class MLP_Network:
    """Fully connected network with ReLU hidden layers and a softmax output.

    ``sizes`` lists the layer widths, input layer first. Layer ``i`` has a
    weight matrix of shape ``(sizes[i+1], sizes[i])`` and a bias vector of
    length ``sizes[i+1]``, both drawn from a standard normal distribution.
    """

    def __init__(self, sizes):
        """Create randomly initialised parameters for the given layer widths."""
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y) for y in sizes[1:]]
        self.weights = []
        for i in range(self.num_layers-1):
            self.weights.append(np.random.randn(self.sizes[i+1], self.sizes[i]))
        self.activation_function = ReLU
        self.last_activation = softmax

    def forward(self, a):
        """Propagate the input vector *a* through every layer and return the output."""
        for i, [bias, weight] in enumerate(zip(self.biases, self.weights)):
            # There are num_layers - 1 weight matrices, so index
            # num_layers - 2 is the output layer.
            if i == self.num_layers - 2:
                activation = self.last_activation
            else:
                activation = self.activation_function
            a = activation(np.dot(weight, a)+bias)

        return a

    def accuracy(self, dataset):
        """Print and return the fraction of samples classified correctly.

        *dataset* yields ``(img, label)`` pairs. The prediction is the index
        of the largest network output.
        """
        num_correct = 0
        for img, label in dataset:
            # The one-hot target is not needed here; the previous lookup in
            # an undefined global ``eye`` raised NameError.
            y_hat = self.forward(img.reshape(-1))
            if np.argmax(y_hat) == label:
                num_correct += 1
        accuracy = num_correct/len(dataset)
        print(f'Accuracy is {accuracy}' )
        return accuracy

이제 역전파를 위한 함수를 구현합니다.    
수업시간에 다뤘던 대로 $\frac{\partial C}{\partial a^{L}}$를 계산해야 합니다.    
수업시간에는 L2 손실함수를 사용하였기 때문에 $\frac{\partial C}{\partial a^{L}} = -(y-a^L)$였습니다.    
하지만 교차 엔트로피 함수를 손실함수로 사용하면 이 미분값도 다시 계산해야 합니다.

마지막 $L$층에서의 출력값은 $a^L = softmax(z^L)$가 되고 이를 교차 엔트로피 손실함수에 입력하면 $C = \sum ^n _{i=1} -y_{i}\textrm{log}(a^L_i)$가 됩니다.     
 $\frac{\partial C}{\partial z^{L}}$을 계산하면, $y$가 원-핫 벡터($\sum_i y_i = 1$)이므로 모든 $i$에 대해 $\frac{\partial C}{\partial z^{L}_i} = a^L_i - y_i$를 얻을 수 있습니다.

먼저 역전파를 진행하기에 앞서 순전파를 진행하여 필요한 $z^l$과 $a^l$를 계산합니다.

In [9]:
def backward(self, x, y):
    """Back-propagate one sample and return ``[grad_bs, grad_ws]``.

    Args:
        x: input vector.
        y: integer class label, used to select a row of ``self.eye`` as the
            target vector.

    Returns:
        A two-element list: per-layer bias gradients and per-layer weight
        gradients, ordered like ``self.biases`` and ``self.weights``.
    """
    target = self.eye[y]
    grad_bs = [np.zeros(b.shape) for b in self.biases]
    grad_ws = [np.zeros(w.shape) for w in self.weights]

    # Forward pass, recording every pre-activation and activation.
    output_index = self.num_layers - 2
    pre_activations = []
    activations = [x]
    for idx, (b, w) in enumerate(zip(self.biases, self.weights)):
        act = self.last_activation if idx == output_index else self.activation_function
        pre = np.matmul(w, activations[-1]) + b
        pre_activations.append(pre)
        activations.append(act(pre))

    # Output layer: the error term is (output - target).
    delta = activations[-1] - target
    grad_bs[-1] = delta
    grad_ws[-1] = np.matmul(delta.reshape(-1, 1), activations[-2].reshape(1, -1))

    # Hidden layers, walking backwards; negative indices count from the end.
    for l in range(2, self.num_layers):
        propagated = np.matmul(self.weights[-l + 1].transpose(), delta)
        delta = propagated * self.ReLU_prime(pre_activations[-l])
        grad_bs[-l] = delta
        grad_ws[-l] = np.matmul(delta.reshape(-1, 1), activations[-l - 1].reshape(1, -1))
    return [grad_bs, grad_ws]

    
def ReLU_prime(self, z):
    """Return the derivative of ReLU at *z*: 1 where ``z > 0``, else 0."""
    # The ``np.int`` alias has been removed from NumPy; the builtin ``int``
    # is the direct replacement.
    return (z>0).astype(int)



각각의 $(x_i, y_i)$에 대해 경사를 계산한 후 이를 배치 전체의 데이터에 대해 더해 전체 경사를 계산합니다.

In [10]:
def grad_mini_batch(self, batch):
    """Sum the per-sample gradients over every ``(x, y)`` pair in *batch*.

    Returns:
        ``(sum_grad_b, sum_grad_w)``: lists of arrays shaped like
        ``self.biases`` and ``self.weights``.
    """
    sum_grad_b = [np.zeros(b.shape) for b in self.biases]
    sum_grad_w = [np.zeros(w.shape) for w in self.weights]
    # Iterate the ``batch`` parameter (the old loop read an undefined name)
    # and call ``backward``, which is what the back-propagation routine is
    # named in this file.
    for x, y in batch:
        grad_b, grad_w = self.backward(x, y)
        sum_grad_b = [sgb + gb for sgb, gb in zip(sum_grad_b, grad_b)]
        sum_grad_w = [sgw + gw for sgw, gw in zip(sum_grad_w, grad_w)]
    return sum_grad_b, sum_grad_w

이를 이용해 배치와 학습률 $\lambda$를 입력으로 받아 파라미터를 업데이트하는 함수를 다음과 같이 구현합니다.

In [11]:
def update_mini_batch(self, batch, lambda_):
    """Apply one gradient-descent step using the mean gradient of *batch*.

    Args:
        batch: sequence of ``(x, y)`` samples.
        lambda_: learning rate.
    """
    n = len(batch)
    if n == 0:
        # Nothing to learn from; also avoids dividing by zero below.
        return
    sum_grad_b, sum_grad_w = self.grad_mini_batch(batch)
    # Weights take the weight gradients and biases take the bias gradients.
    # The earlier version had them swapped and assigned to a misspelled
    # attribute, so ``self.weights`` was never updated.
    self.weights = [w - (lambda_/n) * sgw for w, sgw in zip(self.weights, sum_grad_w)]
    self.biases = [b - (lambda_/n) * sgb for b, sgb in zip(self.biases, sum_grad_b)]

배치에 대해 계산한 경사를 이용해 확률적 경사 하강법을 다음과 같이 구현합니다.

In [12]:
def SGD(self, dataset, epochs, batch_size, lambda_):
    """Train with mini-batch stochastic gradient descent.

    Args:
        dataset: indexable collection of ``(x, y)`` samples.
        epochs: number of passes over the batches served by the loader.
        batch_size: number of samples per mini-batch.
        lambda_: learning rate.
    """
    # The unused ``n = len(training_data)`` was dropped: ``training_data`` is
    # not defined anywhere, so the line raised NameError.
    training_dataloader = dataloader(dataset, batch_size)
    for j in range(epochs):
        for mini_batch in training_dataloader:
            self.update_mini_batch(mini_batch, lambda_)
        print(f'Epoch {j} complete')

지금까지 구현한 결과를 통합하면 다음과 같습니다.

In [13]:
class MLP_Network:
    """Fully connected classifier trained with mini-batch SGD.

    Hidden layers use ReLU and the output layer uses softmax. ``sizes`` lists
    the layer widths, input layer first. Layer ``i`` has a weight matrix of
    shape ``(sizes[i+1], sizes[i])`` and a bias vector of length
    ``sizes[i+1]``, both drawn from a standard normal distribution.
    """

    def __init__(self, sizes):
        """Create randomly initialised parameters for the given layer widths."""
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y) for y in sizes[1:]]
        self.weights = []
        for i in range(self.num_layers-1):
            self.weights.append(np.random.randn(self.sizes[i+1], self.sizes[i]))
        self.activation_function = ReLU
        self.last_activation = softmax
        # Row k of the identity matrix is the one-hot target for class k.
        self.eye = np.eye(sizes[-1])

    def forward(self, a):
        """Propagate the input vector *a* through every layer and return the output."""
        for i, [bias, weight] in enumerate(zip(self.biases, self.weights)):
            # There are num_layers - 1 weight matrices, so index
            # num_layers - 2 is the output layer.
            if i == self.num_layers - 2:
                activation = self.last_activation
            else:
                activation = self.activation_function
            a = activation(np.matmul(weight, a)+bias)
        return a

    def accuracy(self, dataset):
        """Print and return the fraction of samples classified correctly.

        *dataset* yields ``(img, label)`` pairs. The prediction is the index
        of the largest network output.
        """
        num_correct = 0
        for img, label in dataset:
            y_hat = self.forward(img)
            if np.argmax(y_hat) == label:
                num_correct += 1
        accuracy = num_correct/len(dataset)
        print(f'Accuracy is {accuracy}' )
        return accuracy

    def backward(self, x, y):
        """Back-propagate one sample and return ``[grad_bs, grad_ws]``.

        Args:
            x: input vector.
            y: integer class label.

        Returns:
            A two-element list: per-layer bias gradients and per-layer weight
            gradients, ordered like ``self.biases`` and ``self.weights``.
        """
        y = self.eye[y]
        grad_bs = [np.zeros(b.shape) for b in self.biases]
        grad_ws = [np.zeros(w.shape) for w in self.weights]
        # Forward pass, recording every pre-activation z and activation a.
        z_s = []
        a_s = [x]
        a = x
        for i, [bias, weight] in enumerate(zip(self.biases, self.weights)):
            if i == self.num_layers - 2:
                activation = self.last_activation
            else:
                activation = self.activation_function
            z = np.matmul(weight, a)+bias
            a = activation(z)
            z_s.append(z)
            a_s.append(a)
        # Softmax output with cross-entropy loss: dC/dz at the output layer
        # is (output - target).
        dz = a_s[-1] - y
        grad_bs[-1] = dz
        grad_ws[-1] = np.matmul(dz.reshape(-1, 1), a_s[-2].reshape(1, -1))
        # Hidden layers, walking backwards; negative indices count from the end.
        for l in range(2, self.num_layers):
            z = z_s[-l]
            dz = np.matmul(self.weights[-l+1].transpose(), dz) * self.ReLU_prime(z)
            grad_bs[-l] = dz
            grad_ws[-l] = np.matmul(dz.reshape(-1,1), a_s[-l-1].reshape(1,-1))
        return [grad_bs, grad_ws]

    def ReLU_prime(self, z):
        """Return the derivative of ReLU at *z*: 1 where ``z > 0``, else 0."""
        # The ``np.int`` alias has been removed from NumPy; the builtin
        # ``int`` is the direct replacement.
        return (z>0).astype(int)

    def grad_batch(self, batch):
        """Sum the per-sample gradients over every ``(x, y)`` pair in *batch*.

        Returns:
            ``(sum_grad_b, sum_grad_w)``: lists of arrays shaped like
            ``self.biases`` and ``self.weights``.
        """
        sum_grad_b = [np.zeros(b.shape) for b in self.biases]
        sum_grad_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in batch:
            grad_b, grad_w = self.backward(x, y)
            # Accumulate in place instead of rebuilding both lists per sample.
            for acc, grad in zip(sum_grad_b, grad_b):
                acc += grad
            for acc, grad in zip(sum_grad_w, grad_w):
                acc += grad
        return sum_grad_b, sum_grad_w

    def update_batch(self, batch, lambda_):
        """Apply one gradient-descent step using the mean gradient of *batch*.

        Args:
            batch: sequence of ``(x, y)`` samples.
            lambda_: learning rate.
        """
        n = len(batch)
        if n == 0:
            # Nothing to learn from; also avoids dividing by zero below.
            return
        sum_grad_b, sum_grad_w = self.grad_batch(batch)
        step = lambda_/n
        self.weights = [w - step * sgw for w, sgw in zip(self.weights, sum_grad_w)]
        self.biases = [b - step * sgb for b, sgb in zip(self.biases, sum_grad_b)]

    def SGD(self, dataset, epochs, batch_size, lambda_):
        """Train with mini-batch SGD, reporting accuracy and loss per epoch.

        Args:
            dataset: indexable collection of ``(img, label)`` samples.
            epochs: number of passes over the batches served by the loader.
            batch_size: number of samples per mini-batch.
            lambda_: learning rate.
        """
        training_dataloader = dataloader(dataset, batch_size)
        for j in range(epochs):
            for mini_batch in training_dataloader:
                self.update_batch(mini_batch, lambda_)

            print(f'Epoch {j} complete')
            self.accuracy(dataset)
            self.loss_dataset(dataset)

    def loss(self, img, label):
        """Return the cross-entropy loss of the network on one sample."""
        y = self.eye[label]
        y_hat = self.forward(img)
        loss = cross_entropy_loss(y, y_hat)
        return loss

    def loss_dataset(self, dataset):
        """Print and return the mean per-sample loss over *dataset*."""
        loss_sum = 0
        for img, label in dataset:
            loss_sum += self.loss(img, label)
        mean_loss = loss_sum/len(dataset)
        print(f'Loss is {mean_loss}')
        return mean_loss

In [14]:
# Network with 784 inputs, one hidden layer of 512 units and 10 outputs,
# trained for 3 epochs with batch size 10 and learning rate 1.0.
mlp_network = MLP_Network([784, 512, 10])
mlp_network.SGD(training_dataset, 3, 10, 1.0)