In [1]:
import numpy as np

## 과제 1
ReLu activation function과 derivative function을 구현해보세요
- Hint : np.maximum 함수 사용하면 편리합니다
- 다른 방법 사용하셔도 무방합니다


In [2]:
def relu(x):

    return np.maximun(x,0)

In [34]:
def d_relu(x):
    return np.maximum(x/abs(x),0)

## 과제 2
Deep Learning Basic 코드 파일의 MLP implementation with Numpy library using MNIST dataset 코드 참고해서
Three layer MLP 일 때의 backward_pass 함수를 완성해주세요.   
- Hint : 코드 파일의 예시는 Two layer MLP


In [16]:
def backward_pass(x, y_true, params):
    dS3 = params["A3"] - y_true
    grads = {}

    grads["dW3"] =  np.dot(dS3, params["A2"].T)/x.shape[1]
    grads["db3"] =  (1/x.shape[1])*np.sum(dS3, axis=1, keepdims=True)/x.shape[1]

    dA2 = np.dot(params["W3"].T, dS3)
    dS2 = dA2 * d_sigmoid(params["S2"])

    grads["dW2"] = np.dot(dS2, params["A1"].T)/x.shape[1]
    grads["db2"] = (1/x.shape[1])*np.sum(dS2, axis=1, keepdims=True)/x.shape[1]

    dA1 = np.dot(params["W2"].T, dS2)
    dS1 = dA1 * d_sigmoid(params["S1"])

    grads["dW1"] = np.dot(dS1, x.T)/x.shape[1]
    grads["db1"] = np.sum(dS1, axis=1, keepdims=True)/x.shape[1]

    return grads

## 과제 3
Deep Learning Basic 코드 파일의 MLP implementation with Pytorch library using MNIST dataset 코드 참고해서
Three layer MLP를 구한후, 학습을 돌려 보세요

hyperparameter는 다음과 같이 설정

- epochs : 100
- hiddensize : 128, 64 (two layer)
- learning_rate : 0.5

In [6]:
from IPython import get_ipython
get_ipython().magic('reset -sf')

In [7]:
import numpy as np
import sklearn.datasets

mnist = sklearn.datasets.fetch_openml('mnist_784', data_home="mnist_784")

In [8]:
# data preprocessing

num_train = 60000
num_class = 10

x_train = np.float32(mnist.data[:num_train]).T
y_train_index = np.int32(mnist.target[:num_train]).T
x_test = np.float32(mnist.data[num_train:]).T
y_test_index = np.int32(mnist.target[num_train:]).T

# Normalization

x_train /= 255
x_test /= 255
x_size = x_train.shape[0]

y_train = np.zeros((num_class, y_train_index.shape[0]))
for idx in range(y_train_index.shape[0]):
  y_train[y_train_index[idx], idx] = 1

y_test = np.zeros((num_class, y_test_index.shape[0]))
for idx in range(y_test_index.shape[0]):
  y_test[y_test_index[idx], idx] = 1    

In [9]:
#parameter initialization

hidden_size_1 = 128
hidden_size_2 = 64 

# two-layer neural network

params = {"W1": np.random.randn(hidden_size_1, x_size) * np.sqrt(1/ x_size),
          "b1": np.zeros((hidden_size_1, 1)) * np.sqrt(1/ x_size),
          "W2": np.random.randn(hidden_size_2, hidden_size_1) * np.sqrt(1/ hidden_size_1),
          "b2": np.zeros((hidden_size_2, 1)) * np.sqrt(1/ hidden_size_1),
          "W3": np.random.randn(num_class, hidden_size_2) * np.sqrt(1/ hidden_size_2),
          "b3": np.zeros((num_class, 1)) * np.sqrt(1/ hidden_size_2)
          }
# Xavier initialization: https://reniew.github.io/13/

In [10]:
def sigmoid(x):
  return 1/(1+np.exp(-x))

def d_sigmoid(x):
  # derivative of sigmoid
  exp = np.exp(-x)
  return (exp)/((1+exp)**2)

def softmax(x):
  exp = np.exp(x)
  return exp/np.sum(exp, axis=0)

In [11]:
def compute_loss(y_true, y_pred):
  # loss calculation

  num_sample = y_true.shape[1]
  Li = -1 * np.sum(y_true * np.log(y_pred))
  
  return Li/num_sample

In [12]:
def foward_pass(x, params):
  
  params["S1"] = np.dot(params["W1"], x) + params["b1"]
  params["A1"] = sigmoid(params["S1"])
  params["S2"] = np.dot(params["W2"], params["A1"]) + params["b2"]
  params["A2"] = sigmoid(params["S2"])
  params["S3"] = np.dot(params["W3"], params["A2"]) + params["b3"]
  params["A3"] = softmax(params["S3"])

  return params

In [13]:
def foward_pass_test(x, params):

  params_test = {}
  
  params_test["S1"] = np.dot(params["W1"], x) + params["b1"]
  params_test["A1"] = sigmoid(params_test["S1"])
  params_test["S2"] = np.dot(params["W2"], params_test["A1"]) + params["b2"]
  params_test["A2"] = softmax(params_test["S2"])
  params_test["S3"] = np.dot(params["W3"], params_test["A2"]) + params["b3"]
  params_test["A3"] = softmax(params_test["S3"])

  return params_test

In [14]:
def compute_accuracy(y_true, y_pred):
  y_true_idx = np.argmax(y_true, axis = 0)
  y_pred_idx = np.argmax(y_pred, axis = 0)
  num_correct = np.sum(y_true_idx==y_pred_idx)

  accuracy = num_correct / y_true.shape[1] * 100

  return accuracy

In [28]:
def backward_pass(x, y_true, params):
    dS3 = params["A3"] - y_true
    grads = {}

    grads["dW3"] =  np.dot(dS3, params["A2"].T)/x.shape[1]
    grads["db3"] =  (1/x.shape[1])*np.sum(dS3, axis=1, keepdims=True)/x.shape[1]

    dA2 = np.dot(params["W3"].T, dS3)
    dS2 = dA2 * d_sigmoid(params["S2"])

    grads["dW2"] = np.dot(dS2, params["A1"].T)/x.shape[1]
    grads["db2"] = (1/x.shape[1])*np.sum(dS2, axis=1, keepdims=True)/x.shape[1]

    dA1 = np.dot(params["W2"].T, dS2)
    dS1 = dA1 * d_sigmoid(params["S1"])

    grads["dW1"] = np.dot(dS1, x.T)/x.shape[1]
    grads["db1"] = np.sum(dS1, axis=1, keepdims=True)/x.shape[1]

    return grads

In [27]:
epochs = 100
learning_rate = 0.5

for i in range(epochs):

  if i == 0:
    params = foward_pass(x_train, params)
    
  grads = backward_pass(x_train, y_train, params)

  params["W1"] -= learning_rate * grads["dW1"]
  params["b1"] -= learning_rate * grads["db1"]
  params["W2"] -= learning_rate * grads["dW2"]
  params["b2"] -= learning_rate * grads["db2"]
  params["W3"] -= learning_rate * grads["dW3"]
  params["b3"] -= learning_rate * grads["db3"]

  params = foward_pass(x_train, params)
  train_loss = compute_loss(y_train, params["A3"])
  train_acc = compute_accuracy(y_train, params["A3"])

  params_test = foward_pass_test(x_test, params)
  test_loss = compute_loss(y_test, params_test["A3"])
  test_acc = compute_accuracy(y_test, params_test["A3"])

  print("Epoch {}: training loss = {}, training acuracy = {}%, test loss = {}, training acuracy = {}%"
  .format(i + 1, np.round(train_loss, 6), np.round(train_acc, 2), np.round(test_loss, 6), np.round(test_acc, 2)))

Epoch 1: training loss = 2.293128, training acuracy = 11.24%, test loss = 2.30182, training acuracy = 10.61%
Epoch 2: training loss = 2.290366, training acuracy = 11.24%, test loss = 2.301669, training acuracy = 11.11%
Epoch 3: training loss = 2.287844, training acuracy = 11.25%, test loss = 2.301507, training acuracy = 11.77%
Epoch 4: training loss = 2.285312, training acuracy = 11.4%, test loss = 2.301344, training acuracy = 12.33%
Epoch 5: training loss = 2.282763, training acuracy = 11.97%, test loss = 2.30118, training acuracy = 13.04%
Epoch 6: training loss = 2.280192, training acuracy = 12.91%, test loss = 2.301014, training acuracy = 13.77%
Epoch 7: training loss = 2.277596, training acuracy = 13.96%, test loss = 2.300846, training acuracy = 14.7%
Epoch 8: training loss = 2.274968, training acuracy = 15.07%, test loss = 2.300677, training acuracy = 15.7%
Epoch 9: training loss = 2.272305, training acuracy = 16.22%, test loss = 2.300506, training acuracy = 17.19%
Epoch 10: train

## 과제 4
과제 3 부분의 성능을 지금까지 배운 지식을 바탕으로 향상시켜보세요

- Hint : Activation function, hyperparameter setting

In [113]:
from torchvision import transforms, datasets
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [114]:
# 이미지를 텐서로 변경
transform = transforms.Compose([
    transforms.ToTensor()
])

In [115]:
trainset = datasets.MNIST(
    root      = './.data/', 
    train     = True,
    download  = True,
    transform = transform
)
testset = datasets.MNIST(
    root      = './.data/', 
    train     = False,
    download  = True,
    transform = transform
)

In [116]:
BATCH_SIZE = 512
# train set과 test set 각각에 대하여 DataLoader를 생성합니다.
# shuffle=True 매개변수를 넣어 데이터를 섞어주세요.
train_loader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
test_loader =  torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=True)

### MLP 구현

In [117]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = nn.Linear(784,128)
        self.layer2 = nn.Linear(128,64)
        self.layer3 = nn.Linear(64,10)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = x.view(-1, 784)
        out = self.layer1(x)
        out = self.relu(out)
        out = self.layer2(out)
        out = self.relu(out)
        out = self.layer3(out)

        return out

In [118]:
model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [119]:
def train(model, train_loader, optimizer):
    model.train()

    batch_losses = []

    for data, target in train_loader:
        optimizer.zero_grad()

        
        output = model(data)

        loss = criterion(output, target)
        batch_losses.append(loss)

        loss.backward()

        optimizer.step()
    
    avg_loss = sum(batch_losses) / len(batch_losses)
    
    return avg_loss

def evaluate(model, test_loader):
    # 모델을 평가 모드로 전환
    model.eval()

    batch_losses = []
    correct = 0 

    with torch.no_grad(): 
        for data, target in test_loader:
            # 예측값 생성
            output = model(data)


            loss = criterion(output, target)
            batch_losses.append(loss)


            pred = output.max(1, keepdim=True)[1]

            # eq() 함수는 값이 일치하면 1을, 아니면 0을 출력.
            correct += pred.eq(target.view_as(pred)).sum().item()

    ######## 배치 당 평균 loss 계산 ############
    avg_loss =  sum(batch_losses) / len(batch_losses)

    #정확도 계산
    accuracy = 100. * correct / len(test_loader.dataset)

    return avg_loss, accuracy

### 학습 진행

In [121]:
EPOCHS = 10

for epoch in range(1, EPOCHS + 1):
    train_loss = train(model, train_loader, optimizer)
    test_loss, test_accuracy = evaluate(model, test_loader)
    
    print('[{}] Train Loss: {:.4f}\tTest Loss: {:.4f}\tAccuracy: {:.2f}%'.format(
          epoch, train_loss, test_loss, test_accuracy))

[1] Train Loss: 0.0215	Test Loss: 0.1235	Accuracy: 97.58%
[2] Train Loss: 0.0194	Test Loss: 0.1214	Accuracy: 97.48%
[3] Train Loss: 0.0255	Test Loss: 0.1291	Accuracy: 97.60%
[4] Train Loss: 0.0353	Test Loss: 0.1147	Accuracy: 97.52%
[5] Train Loss: 0.0264	Test Loss: 0.1271	Accuracy: 97.70%
[6] Train Loss: 0.0247	Test Loss: 0.1384	Accuracy: 97.50%
[7] Train Loss: 0.0173	Test Loss: 0.1270	Accuracy: 97.83%
[8] Train Loss: 0.0161	Test Loss: 0.1683	Accuracy: 97.30%
[9] Train Loss: 0.0196	Test Loss: 0.1261	Accuracy: 97.68%
[10] Train Loss: 0.0282	Test Loss: 0.1303	Accuracy: 97.58%


**무엇을 보완하였고, 왜 보완되었는지에 대한 자유 서술 (아래에)**

activation function을 relu로 변경하여 gradient vanishing 문제를 해결함
또한 optimizer를 Adam으로 설정하여 global local minimum에 빠지는 오류를 최소화함