## 과제 1
ReLu activation function과 derivative function을 구현해보세요
- Hint : np.maximum 함수 사용하면 편리합니다
- 다른 방법 사용하셔도 무방합니다


In [1]:
import numpy as np
from sympy import *

In [2]:
def relu(x):

  return np.maximum(x,0)

In [3]:
def d_relu(x):

  return 1. * (x > 0)

In [4]:
ex_list = [-1.3, -2.3, 0, 2, 3]

In [5]:
relu(ex_list)

array([0., 0., 0., 2., 3.])

## 과제 2
Deep Learning Basic 코드 파일의 MLP implementation with Numpy library using MNIST dataset 코드 참고해서
Three layer MLP 일 때의 backward_pass 함수를 완성해주세요.   
- Hint : 코드 파일의 예시는 Two layer MLP


In [6]:
from IPython import get_ipython
get_ipython().magic('reset -sf')

In [7]:
import numpy as np
import sklearn.datasets

mnist = sklearn.datasets.fetch_openml('mnist_784', data_home="mnist_784")

In [8]:
# data preprocessing

num_train = 60000
num_class = 10

x_train = np.float32(mnist.data[:num_train]).T
y_train_index = np.int32(mnist.target[:num_train]).T
x_test = np.float32(mnist.data[num_train:]).T
y_test_index = np.int32(mnist.target[num_train:]).T

# Normalization

x_train /= 255
x_test /= 255
x_size = x_train.shape[0]

y_train = np.zeros((num_class, y_train_index.shape[0]))
for idx in range(y_train_index.shape[0]):
  y_train[y_train_index[idx], idx] = 1

y_test = np.zeros((num_class, y_test_index.shape[0]))
for idx in range(y_test_index.shape[0]):
  y_test[y_test_index[idx], idx] = 1    

In [9]:
x_train.shape

(784, 60000)

In [10]:
#parameter initialization

hidden_size_1 = 128 # hidden unit size
hidden_size_2 = 64

# two-layer neural network

params = {"W1": np.random.randn(hidden_size_1, x_size) * np.sqrt(1/ x_size),
          "b1": np.zeros((hidden_size_1, 1)) * np.sqrt(1/ x_size),

          "W2": np.random.randn(hidden_size_2, hidden_size_1) * np.sqrt(1/ hidden_size_1),
          "b2": np.zeros((hidden_size_2, 1)) * np.sqrt(1/ hidden_size_1),
          
          "W3": np.random.randn(num_class, hidden_size_2) * np.sqrt(1/ hidden_size_2),
          "b3": np.zeros((num_class, 1)) * np.sqrt(1/ hidden_size_2)
          }
# Xavier initialization: https://reniew.github.io/13/

In [11]:
def sigmoid(x):
  return 1/(1+np.exp(-x))

def d_sigmoid(x):
  # derivative of sigmoid
  exp = np.exp(-x)
  return (exp)/((1+exp)**2)

def softmax(x):
  exp = np.exp(x)
  return exp/np.sum(exp, axis=0)

In [12]:
def compute_loss(y_true, y_pred):
  # loss calculation

  num_sample = y_true.shape[1]
  Li = -1 * np.sum(y_true * np.log(y_pred))
  
  return Li/num_sample

In [13]:
def foward_pass(x, params):
  
  params["S1"] = np.dot(params["W1"], x) + params["b1"]
  params["A1"] = sigmoid(params["S1"])

  params["S2"] = np.dot(params["W2"], params["A1"]) + params["b2"]
  params["A2"] = sigmoid(params["S2"])
  
  params["S3"] = np.dot(params["W3"], params["A2"]) + params["b3"]
  params["A3"] = softmax(params["S3"])

  return params

In [14]:
def foward_pass_test(x, params):

  params_test = {}
  
  params_test["S1"] = np.dot(params["W1"], x) + params["b1"]
  params_test["A1"] = sigmoid(params_test["S1"])
  params_test["S2"] = np.dot(params["W2"], params_test["A1"]) + params["b2"]
  params_test["A2"] = sigmoid(params_test["S2"])
  params_test["S3"] = np.dot(params["W3"], params_test["A2"]) + params["b3"]
  params_test["A3"] = softmax(params_test["S3"])

  return params_test

In [15]:
def compute_accuracy(y_true, y_pred):
  y_true_idx = np.argmax(y_true, axis = 0)
  y_pred_idx = np.argmax(y_pred, axis = 0)
  num_correct = np.sum(y_true_idx==y_pred_idx)

  accuracy = num_correct / y_true.shape[1] * 100

  return accuracy

In [16]:
def backward_pass(x, y_true, params):
  dS3 = params["A3"] - y_true
  # Please check http://machinelearningmechanic.com/deep_learning/2019/09/04/cross-entropy-loss-derivative.html
  # dS2 is softmax + CE loss derivative

  grads = {}

  grads["dW3"] =  np.dot(dS3, params["A2"].T)/x.shape[1]
  grads["db3"] =  np.sum(dS3, axis=1, keepdims=True)/x.shape[1] #*(1/x.shape[1])

  dA2 = np.dot(params["W3"].T, dS3)
  dS2 = dA2 * d_sigmoid(params["S2"])

  grads["dW2"] = np.dot(dS2, params["A1"].T)/x.shape[1]
  grads["db2"] = np.sum(dS2, axis=1, keepdims=True)/x.shape[1]

  dA1 = np.dot(params["W2"].T, dS2)
  dS1 = dA1 * d_sigmoid(params["S1"])

  grads["dW1"] = np.dot(dS1, x.T)/x.shape[1]
  grads["db1"] = np.sum(dS1, axis=1, keepdims=True)/x.shape[1]

  return grads

In [54]:
epochs = 100
learning_rate = 0.5

for i in range(epochs):

  if i == 0:
    params = foward_pass(x_train, params)
    
  grads = backward_pass(x_train, y_train, params)

  params["W1"] -= learning_rate * grads["dW1"]
  params["b1"] -= learning_rate * grads["db1"]
  params["W2"] -= learning_rate * grads["dW2"]
  params["b2"] -= learning_rate * grads["db2"]
  params["W3"] -= learning_rate * grads["dW3"]
  params["b3"] -= learning_rate * grads["db3"]

  params = foward_pass(x_train, params)
  train_loss = compute_loss(y_train, params["A3"])
  train_acc = compute_accuracy(y_train, params["A3"])

  params_test = foward_pass_test(x_test, params)
  test_loss = compute_loss(y_test, params_test["A3"])
  test_acc = compute_accuracy(y_test, params_test["A3"])

  print("Epoch {}: training loss = {}, training acuracy = {}%, test loss = {}, training acuracy = {}%"
  .format(i + 1, np.round(train_loss, 6), np.round(train_acc, 2), np.round(test_loss, 6), np.round(test_acc, 2)))

Epoch 1: training loss = 2.280842, training acuracy = 14.21%, test loss = 2.280371, training acuracy = 14.82%
Epoch 2: training loss = 2.278588, training acuracy = 14.75%, test loss = 2.278057, training acuracy = 15.44%
Epoch 3: training loss = 2.2763, training acuracy = 15.45%, test loss = 2.275706, training acuracy = 16.15%
Epoch 4: training loss = 2.273972, training acuracy = 16.29%, test loss = 2.273316, training acuracy = 17.03%
Epoch 5: training loss = 2.2716, training acuracy = 17.2%, test loss = 2.27088, training acuracy = 17.97%
Epoch 6: training loss = 2.26918, training acuracy = 18.25%, test loss = 2.268395, training acuracy = 18.92%
Epoch 7: training loss = 2.266706, training acuracy = 19.19%, test loss = 2.265855, training acuracy = 19.97%
Epoch 8: training loss = 2.264174, training acuracy = 20.31%, test loss = 2.263255, training acuracy = 20.99%
Epoch 9: training loss = 2.261578, training acuracy = 21.42%, test loss = 2.260591, training acuracy = 22.11%
Epoch 10: trainin

## 과제 3
Deep Learning Basic 코드 파일의 MLP implementation with Pytorch library using MNIST dataset 코드 참고해서
Three layer MLP를 구한후, 학습을 돌려 보세요

hyperparameter는 다음과 같이 설정

- epochs : 100
- hiddensize : 128, 64 (two layer)
- learning_rate : 0.5

In [18]:
from torchvision import transforms, datasets
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [19]:
# 이미지를 텐서로 변경
transform = transforms.Compose([
    transforms.ToTensor()
])

In [20]:
trainset = datasets.MNIST(
    root      = './.data/', 
    train     = True,
    download  = True,
    transform = transform
)
testset = datasets.MNIST(
    root      = './.data/', 
    train     = False,
    download  = True,
    transform = transform
)

In [21]:
BATCH_SIZE = 512
# train set과 test set 각각에 대하여 DataLoader를 생성합니다.
# shuffle=True 매개변수를 넣어 데이터를 섞어주세요.
train_loader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
test_loader =  DataLoader(testset, batch_size=BATCH_SIZE, shuffle=True)

In [22]:
# Assignment 3 구현은 여기서 ()
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = nn.Linear(784,128)
        self.layer2 = nn.Linear(128,64)
        self.layer3 = nn.Linear(64,10)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = x.view(-1, 784)
        out = self.layer1(x)
        out = self.relu(out)
        out = self.layer2(out)
        out = self.relu(out)
        out = self.layer3(out)

        return out

In [23]:
model = Net()
model

Net(
  (layer1): Linear(in_features=784, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=10, bias=True)
  (relu): ReLU()
)

In [24]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.5)

In [25]:
def train(model, train_loader, optimizer):
    model.train()
    # 배치 당 loss 값을 담을 리스트 생성
    batch_losses = []

    for data, target in train_loader:
        # 옵티마이저의 기울기 초기화
        optimizer.zero_grad()

        # y pred 값 산출
        output = model(data)
        # loss 계산
        # 정답 데이터와의 cross entropy loss 계산
        # 이 loss를 배치 당 loss로 보관
        loss = criterion(output, target)
        batch_losses.append(loss)

        # 기울기 계산
        loss.backward()

        # 가중치 업데이트!
        optimizer.step()
        
    # 배치당 평균 loss 계산
    avg_loss = sum(batch_losses) / len(batch_losses)
    
    return avg_loss

In [26]:
def evaluate(model, test_loader):
    # 모델을 평가 모드로 전환
    model.eval()

    batch_losses = []
    correct = 0 

    with torch.no_grad(): 
        for data, target in test_loader:
            # 예측값 생성
            output = model(data)

            # loss 계산 (이전과 동일)
            loss = criterion(output, target)
            batch_losses.append(loss)

           # Accuracy 계산
           # y pred와 y가 일치하면 correct에 1을 더해주기
            pred = output.max(1, keepdim=True)[1]

            # eq() 함수는 값이 일치하면 1을, 아니면 0을 출력.
            correct += pred.eq(target.view_as(pred)).sum().item()

    # 배치 당 평균 loss 계산 
    avg_loss =  sum(batch_losses) / len(batch_losses)

    #정확도 계산
    accuracy = 100. * correct / len(test_loader.dataset)

    return avg_loss, accuracy

In [27]:
EPOCHS = 100

for epoch in(range(1, EPOCHS + 1)):
    train_loss = train(model, train_loader, optimizer)
    test_loss, test_accuracy = evaluate(model, test_loader)
    
    print('[{}] Train Loss: {:.4f}\tTest Loss: {:.4f}\tAccuracy: {:.2f}%'.format(
          epoch, train_loss, test_loss, test_accuracy))

[1] Train Loss: 0.7900	Test Loss: 0.3325	Accuracy: 89.35%
[2] Train Loss: 0.2408	Test Loss: 0.2499	Accuracy: 91.88%
[3] Train Loss: 0.1679	Test Loss: 0.1756	Accuracy: 94.37%
[4] Train Loss: 0.1288	Test Loss: 0.1684	Accuracy: 94.19%
[5] Train Loss: 0.1066	Test Loss: 0.1443	Accuracy: 95.51%
[6] Train Loss: 0.0970	Test Loss: 0.2304	Accuracy: 92.82%
[7] Train Loss: 0.0970	Test Loss: 0.0970	Accuracy: 97.03%
[8] Train Loss: 0.0674	Test Loss: 0.0807	Accuracy: 97.47%
[9] Train Loss: 0.0591	Test Loss: 0.0781	Accuracy: 97.57%
[10] Train Loss: 0.0537	Test Loss: 0.1360	Accuracy: 95.80%
[11] Train Loss: 0.0473	Test Loss: 0.1103	Accuracy: 96.40%
[12] Train Loss: 0.2672	Test Loss: 0.1043	Accuracy: 96.85%
[13] Train Loss: 0.0778	Test Loss: 0.1370	Accuracy: 95.62%
[14] Train Loss: 0.0615	Test Loss: 0.0999	Accuracy: 97.05%
[15] Train Loss: 0.0524	Test Loss: 0.0874	Accuracy: 97.44%
[16] Train Loss: 0.0452	Test Loss: 0.0960	Accuracy: 96.99%
[17] Train Loss: 0.0398	Test Loss: 0.1042	Accuracy: 97.08%
[18] T

## 과제 4
과제 3 부분의 성능을 지금까지 배운 지식을 바탕으로 향상시켜보세요

- Hint : Activation function, hyperparameter setting

In [28]:
# Assignment 4 구현은 여기서 ()

In [48]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = nn.Linear(784,128)
        self.layer2 = nn.Linear(128,64)
        self.layer3 = nn.Linear(64,10)
        self.leakyrelu = nn.LeakyReLU(0.1)
        
    def forward(self, x):
        x = x.view(-1, 784)
        out = self.layer1(x)
        out = self.leakyrelu(out)
        out = self.layer2(out)
        out = self.leakyrelu(out)
        out = self.layer3(out)

        return out

In [49]:
model = Net()
model

Net(
  (layer1): Linear(in_features=784, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=10, bias=True)
  (leakyrelu): LeakyReLU(negative_slope=0.1)
)

In [52]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.5)

In [53]:
EPOCHS = 100

for epoch in(range(1, EPOCHS + 1)):
    train_loss = train(model, train_loader, optimizer)
    test_loss, test_accuracy = evaluate(model, test_loader)
    
    print('[{}] Train Loss: {:.4f}\tTest Loss: {:.4f}\tAccuracy: {:.2f}%'.format(
          epoch, train_loss, test_loss, test_accuracy))

[1] Train Loss: 1975.9628	Test Loss: 439.9503	Accuracy: 87.37%
[2] Train Loss: 232.4814	Test Loss: 179.4079	Accuracy: 92.67%
[3] Train Loss: 140.7368	Test Loss: 165.9926	Accuracy: 92.36%
[4] Train Loss: 99.4893	Test Loss: 120.2303	Accuracy: 93.39%
[5] Train Loss: 64.4371	Test Loss: 98.0465	Accuracy: 93.29%
[6] Train Loss: 77.3351	Test Loss: 131.5321	Accuracy: 92.10%
[7] Train Loss: 66.5808	Test Loss: 91.3434	Accuracy: 94.12%
[8] Train Loss: 42.6393	Test Loss: 91.6166	Accuracy: 93.03%
[9] Train Loss: 47.8854	Test Loss: 139.5611	Accuracy: 88.22%
[10] Train Loss: 36.8201	Test Loss: 104.1044	Accuracy: 90.04%
[11] Train Loss: 39.9807	Test Loss: 103.6024	Accuracy: 93.18%
[12] Train Loss: 41.3582	Test Loss: 63.3776	Accuracy: 94.93%
[13] Train Loss: 47.7359	Test Loss: 150.7091	Accuracy: 93.06%
[14] Train Loss: 60.6608	Test Loss: 87.8861	Accuracy: 95.39%
[15] Train Loss: 53.8337	Test Loss: 52.2955	Accuracy: 94.88%
[16] Train Loss: 43.0411	Test Loss: 76.4864	Accuracy: 94.25%
[17] Train Loss: 48.

**무엇을 보완하였고, 왜 보완되었는지에 대한 자유 서술 (아래에)**

1. 위의 기존 모델에서 learning rate이 커서 오히려 정확도가 떨어지는 문제가 있었기 때문에 0.25로 줄여보았습니다. 그러나 오히려 정확도가 낮아졌기 때문에 다시 0.5로 학습률을 조정했습니다. 
2. optimizer로는 Adam을 사용하였습니다. 
3. Relu 대신 leakyrelu를 사용해보았습니다. 
4. elu나 gelu등은 시간관계상 사용해보지 못했습니다. 
5. leakyrelu의 기울기 값을 0.01로 둬보았지만 성능이 좋지않아 0.1로 설정했습니다.
6. 결과적으로 큰 차이는 보이지 않았습니다.