In [5]:
import numpy as np
import sklearn.datasets

In [31]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

def d_sigmoid(x):
    # derivative of sigmoid
    exp = np.exp(-x)
    return (exp)/((1+exp)**2)

def softmax(x):
    exp = np.exp(x)
    return exp/np.sum(exp, axis=0)

def compute_loss(y_true, y_pred):
    # loss calculation

    num_sample = y_true.shape[1]
    Li = -1 * np.sum(y_true * np.log(y_pred))

    return Li/num_sample

def foward_pass(x, params, activation):
    if activation == 'relu':
        params["S1"] = np.dot(params["W1"], x) + params["b1"]
        params["A1"] = relu(params["S1"])
        params["S2"] = np.dot(params["W2"], params["A1"]) + params["b2"]
        params["A2"] = relu(params["S2"])
        params["S3"] = np.dot(params["W3"], params["A2"]) + params["b3"]
        params["A3"] = softmax(params["S3"])
    elif activation == 'sigmoid':
        params["S1"] = np.dot(params["W1"], x) + params["b1"]
        params["A1"] = sigmoid(params["S1"])
        params["S2"] = np.dot(params["W2"], params["A1"]) + params["b2"]
        params["A2"] = sigmoid(params["S2"])
        params["S3"] = np.dot(params["W3"], params["A2"]) + params["b3"]
        params["A3"] = softmax(params["S3"])
    return params

def foward_pass_test(x, params, activation):

    params_test = {}
    if activation == 'relu':
        params_test["S1"] = np.dot(params["W1"], x) + params["b1"]
        params_test["A1"] = relu(params_test["S1"])
        params_test["S2"] = np.dot(params["W2"], params_test["A1"]) + params["b2"]
        params_test["A2"] = relu(params_test["S2"])
        params_test["S3"] = np.dot(params["W3"], params_test["A2"]) + params["b3"]
        params_test["A3"] = softmax(params_test["S3"])
    if activation == 'sigmoid':
        params_test["S1"] = np.dot(params["W1"], x) + params["b1"]
        params_test["A1"] = sigmoid(params_test["S1"])
        params_test["S2"] = np.dot(params["W2"], params_test["A1"]) + params["b2"]
        params_test["A2"] = sigmoid(params_test["S2"])
        params_test["S3"] = np.dot(params["W3"], params_test["A2"]) + params["b3"]
        params_test["A3"] = softmax(params_test["S3"])
    return params_test

def compute_accuracy(y_true, y_pred):
    y_true_idx = np.argmax(y_true, axis = 0)
    y_pred_idx = np.argmax(y_pred, axis = 0)
    num_correct = np.sum(y_true_idx==y_pred_idx)

    accuracy = num_correct / y_true.shape[1] * 100

    return accuracy

## 과제 1
ReLu activation function과 derivative function을 구현해보세요
- Hint : np.maximum 함수 사용하면 편리합니다
- 다른 방법 사용하셔도 무방합니다


In [32]:
def relu(x):
    return np.maximum(x, 0)

In [33]:
def d_relu(x):
    return np.where(x>0, 1, 0)

## 과제 2
Deep Learning Basic 코드 파일의 MLP implementation with Numpy library using MNIST dataset 코드 참고해서
Three layer MLP 일 때의 backward_pass 함수를 완성해주세요.   
- Hint : 코드 파일의 예시는 Two layer MLP


In [34]:
def backward_pass(x, y_true, params, activation):
    dS3 = params["A3"] - y_true
    
    grads = {}
    if activation == 'relu':
        grads['dW3'] = np.dot(dS3, params['A2'].T) / x.shape[1]
        grads['db3'] = (1 / x.shape[1]) * np.sum(dS3, axis = 1, keepdims = True) / x.shape[1]

        dA2 = np.dot(params['W3'].T, dS3)
        dS2 = dA2 * d_relu(params['S2'])

        grads['dW2'] = np.dot(dS2, params['A1'].T) / x.shape[1]
        grads['db2'] = (1 / x.shape[1]) * np.sum(dS2, axis = 1, keepdims = True) / x.shape[1]

        dA1 = np.dot(params['W2'].T, dS2)
        dS1 = dA1 * d_relu(params['S1'])

        grads['dW1'] = np.dot(dS1, x.T) / x.shape[1]
        grads['db1'] = np.sum(dS1, axis = 1, keepdims = True) / x.shape[1]
    elif activation == 'sigmoid':
        grads['dW3'] = np.dot(dS3, params['A2'].T) / x.shape[1]
        grads['db3'] = (1 / x.shape[1]) * np.sum(dS3, axis = 1, keepdims = True) / x.shape[1]

        dA2 = np.dot(params['W3'].T, dS3)
        dS2 = dA2 * d_sigmoid(params['S2'])

        grads['dW2'] = np.dot(dS2, params['A1'].T) / x.shape[1]
        grads['db2'] = (1 / x.shape[1]) * np.sum(dS2, axis = 1, keepdims = True) / x.shape[1]

        dA1 = np.dot(params['W2'].T, dS2)
        dS1 = dA1 * d_sigmoid(params['S1'])

        grads['dW1'] = np.dot(dS1, x.T) / x.shape[1]
        grads['db1'] = np.sum(dS1, axis = 1, keepdims = True) / x.shape[1]
    return grads

## 과제 3
Deep Learning Basic 코드 파일의 MLP implementation with Pytorch library using MNIST dataset 코드 참고해서
Three layer MLP를 구한후, 학습을 돌려 보세요

hyperparameter는 다음과 같이 설정

- epochs : 100
- hiddensize : 128, 64 (two layer)
- learning_rate : 0.5

In [35]:
mnist = sklearn.datasets.fetch_openml('mnist_784', data_home="mnist_784")

In [36]:
# data preprocessing

num_train = 60000
num_class = 10

x_train = np.float32(mnist.data[:num_train]).T
y_train_index = np.int32(mnist.target[:num_train]).T
x_test = np.float32(mnist.data[num_train:]).T
y_test_index = np.int32(mnist.target[num_train:]).T

# Normalization

x_train /= 255
x_test /= 255
x_size = x_train.shape[0]

y_train = np.zeros((num_class, y_train_index.shape[0]))
for idx in range(y_train_index.shape[0]):
    y_train[y_train_index[idx], idx] = 1

y_test = np.zeros((num_class, y_test_index.shape[0]))
for idx in range(y_test_index.shape[0]):
    y_test[y_test_index[idx], idx] = 1    

In [39]:
#parameter initialization

hidden_size_1 = 128
hidden_size_2 = 64 # hidden unit size

# three-layer neural network

params = {"W1": np.random.randn(hidden_size_1, x_size) * np.sqrt(1/ x_size),
          "b1": np.zeros((hidden_size_1, 1)) * np.sqrt(1/ x_size),
          "W2": np.random.randn(hidden_size_2, hidden_size_1) * np.sqrt(1/ hidden_size_1),
          "b2": np.zeros((hidden_size_2, 1)) * np.sqrt(1/ hidden_size_1),
          "W3": np.random.randn(num_class, hidden_size_2) * np.sqrt(1/ hidden_size_2),
          "b3": np.zeros((num_class, 1)) * np.sqrt(1/ hidden_size_2)
          
          }
# Xavier initialization: https://reniew.github.io/13/

In [40]:
epochs = 100
learning_rate = 0.5
activation = 'sigmoid'
for i in range(epochs):

    if i == 0:
        params = foward_pass(x_train, params, activation)

    grads = backward_pass(x_train, y_train, params, activation)

    params["W1"] -= learning_rate * grads["dW1"]
    params["b1"] -= learning_rate * grads["db1"]
    params["W2"] -= learning_rate * grads["dW2"]
    params["b2"] -= learning_rate * grads["db2"]
    params["W3"] -= learning_rate * grads["dW3"]
    params["b3"] -= learning_rate * grads["db3"]

    params = foward_pass(x_train, params, activation)
    train_loss = compute_loss(y_train, params["A3"])
    train_acc = compute_accuracy(y_train, params["A3"])

    params_test = foward_pass_test(x_test, params, activation)
    test_loss = compute_loss(y_test, params_test["A3"])
    test_acc = compute_accuracy(y_test, params_test["A3"])

    print("Epoch {}: training loss = {}, training acuracy = {}%, test loss = {}, training acuracy = {}%"
    .format(i + 1, np.round(train_loss, 6), np.round(train_acc, 2), np.round(test_loss, 6), np.round(test_acc, 2)))

Epoch 1: training loss = 2.299048, training acuracy = 12.5%, test loss = 2.299367, training acuracy = 12.94%
Epoch 2: training loss = 2.295244, training acuracy = 11.24%, test loss = 2.29485, training acuracy = 11.35%
Epoch 3: training loss = 2.292805, training acuracy = 11.24%, test loss = 2.292325, training acuracy = 11.35%
Epoch 4: training loss = 2.29036, training acuracy = 11.24%, test loss = 2.289819, training acuracy = 11.35%
Epoch 5: training loss = 2.287899, training acuracy = 11.24%, test loss = 2.287299, training acuracy = 11.35%
Epoch 6: training loss = 2.285418, training acuracy = 11.26%, test loss = 2.284759, training acuracy = 11.35%
Epoch 7: training loss = 2.282912, training acuracy = 11.33%, test loss = 2.282193, training acuracy = 11.45%
Epoch 8: training loss = 2.280376, training acuracy = 11.56%, test loss = 2.279596, training acuracy = 11.77%
Epoch 9: training loss = 2.277806, training acuracy = 11.93%, test loss = 2.276965, training acuracy = 12.32%
Epoch 10: tra

## 과제 4
과제 3 부분의 성능을 지금까지 배운 지식을 바탕으로 향상시켜보세요

- Hint : Activation function, hyperparameter setting

In [52]:
#parameter initialization

hidden_size_1 = 128
hidden_size_2 = 64 # hidden unit size

# three-layer neural network

params = {"W1": np.random.randn(hidden_size_1, x_size) * np.sqrt(1/ x_size),
          "b1": np.zeros((hidden_size_1, 1)) * np.sqrt(1/ x_size),
          "W2": np.random.randn(hidden_size_2, hidden_size_1) * np.sqrt(1/ hidden_size_1),
          "b2": np.zeros((hidden_size_2, 1)) * np.sqrt(1/ hidden_size_1),
          "W3": np.random.randn(num_class, hidden_size_2) * np.sqrt(1/ hidden_size_2),
          "b3": np.zeros((num_class, 1)) * np.sqrt(1/ hidden_size_2)
          
          }
# Xavier initialization: https://reniew.github.io/13/

In [53]:
# Assignment 4 구현은 여기서 ()
epochs = 500
learning_rate = 0.1
activation = 'relu'
for i in range(epochs):

    if i == 0:
        params = foward_pass(x_train, params, activation)

    grads = backward_pass(x_train, y_train, params, activation)

    params["W1"] -= learning_rate * grads["dW1"]
    params["b1"] -= learning_rate * grads["db1"]
    params["W2"] -= learning_rate * grads["dW2"]
    params["b2"] -= learning_rate * grads["db2"]
    params["W3"] -= learning_rate * grads["dW3"]
    params["b3"] -= learning_rate * grads["db3"]

    params = foward_pass(x_train, params, activation)
    train_loss = compute_loss(y_train, params["A3"])
    train_acc = compute_accuracy(y_train, params["A3"])

    params_test = foward_pass_test(x_test, params, activation)
    test_loss = compute_loss(y_test, params_test["A3"])
    test_acc = compute_accuracy(y_test, params_test["A3"])

    print("Epoch {}: training loss = {}, training acuracy = {}%, test loss = {}, training acuracy = {}%"
    .format(i + 1, np.round(train_loss, 6), np.round(train_acc, 2), np.round(test_loss, 6), np.round(test_acc, 2)))

Epoch 1: training loss = 2.289795, training acuracy = 16.4%, test loss = 2.287241, training acuracy = 17.03%
Epoch 2: training loss = 2.264904, training acuracy = 21.51%, test loss = 2.261363, training acuracy = 22.86%
Epoch 3: training loss = 2.242305, training acuracy = 26.53%, test loss = 2.237864, training acuracy = 28.29%
Epoch 4: training loss = 2.22084, training acuracy = 31.47%, test loss = 2.21558, training acuracy = 33.32%
Epoch 5: training loss = 2.199529, training acuracy = 36.17%, test loss = 2.193455, training acuracy = 38.18%
Epoch 6: training loss = 2.177743, training acuracy = 40.18%, test loss = 2.170841, training acuracy = 42.08%
Epoch 7: training loss = 2.155159, training acuracy = 43.48%, test loss = 2.14741, training acuracy = 45.5%
Epoch 8: training loss = 2.131483, training acuracy = 46.07%, test loss = 2.122914, training acuracy = 47.95%
Epoch 9: training loss = 2.106488, training acuracy = 48.17%, test loss = 2.097118, training acuracy = 49.93%
Epoch 10: train

**무엇을 보완하였고, 왜 보완되었는지에 대한 자유 서술 (아래에)**

In [47]:
'''
3 layers를 사용하기 때문에 sigmoid function의 단점인 vanishing gradient 현상이 더 드러났을 것이라고 생각 됨. 
따라서 vanishing gradient 효과가 발생하지 않는 relu를 사용했을 때 더 좋은 성능이 나온 것으로 생각함.
lr을 0.5로하면 loss가 fluctuate하기 때문에 lr을 낮추고 epoch을 늘렸다.
'''

'\n3 layers를 사용하기 때문에 sigmoid function의 단점인 vanishing gradient 현상이 더 드러났을 것이라고 생각 됨. \n따라서 vanishing gradient 효과가 발생하지 않는 relu를 사용했을 때 더 좋은 성능이 나온 것으로 생각함.\n'