## 과제 1
ReLu activation function과 derivative function을 구현해보세요
- Hint : np.maximum 함수 사용하면 편리합니다
- 다른 방법 사용하셔도 무방합니다


In [50]:
def relu(x):
  return np.maximum(0,x)

In [51]:
def d_relu(x):
  return np.diff(relu(x)) # 0 혹은 1

## 과제 2
Deep Learning Basic 코드 파일의 MLP implementation with Numpy library using MNIST dataset 코드 참고해서
Three layer MLP 일 때의 backward_pass 함수를 완성해주세요.   
- Hint : 코드 파일의 예시는 Two layer MLP


In [52]:
from IPython import get_ipython
get_ipython().magic('reset -sf')

import numpy as np
import sklearn.datasets

mnist = sklearn.datasets.fetch_openml('mnist_784', data_home="mnist_784")

In [53]:
# data preprocessing

num_train = 60000
num_class = 10

x_train = np.float32(mnist.data[:num_train]).T
y_train_index = np.int32(mnist.target[:num_train]).T
x_test = np.float32(mnist.data[num_train:]).T
y_test_index = np.int32(mnist.target[num_train:]).T

# Normalization

x_train /= 255
x_test /= 255
x_size = x_train.shape[0]

y_train = np.zeros((num_class, y_train_index.shape[0]))
for idx in range(y_train_index.shape[0]):
  y_train[y_train_index[idx], idx] = 1

y_test = np.zeros((num_class, y_test_index.shape[0]))
for idx in range(y_test_index.shape[0]):
  y_test[y_test_index[idx], idx] = 1    

In [54]:
print(x_size)
print(num_class)

784
10


In [55]:
#parameter initialization

hidden_size1 = 128 # hidden unit size1
hidden_size2 = 64

# two-layer neural network

params = {"W1": np.random.randn(hidden_size1, x_size) * np.sqrt(1/ x_size),
          "b1": np.zeros((hidden_size1, 1)) * np.sqrt(1/ x_size),
          "W2": np.random.randn(hidden_size2, hidden_size1) * np.sqrt(1/ hidden_size1),
          "b2": np.zeros((hidden_size2, 1)) * np.sqrt(1/ hidden_size1),
          "W3": np.random.randn(num_class, hidden_size2) * np.sqrt(1/ hidden_size2),
          "b3": np.zeros((num_class, 1)) * np.sqrt(1/ hidden_size2)
          }
# Xavier initialization: https://reniew.github.io/13/

In [56]:
def sigmoid(x):
  return 1/(1+np.exp(-x))

def d_sigmoid(x):
  # derivative of sigmoid
  exp = np.exp(-x)
  return (exp)/((1+exp)**2)

def softmax(x):
  exp = np.exp(x)
  return exp/np.sum(exp, axis=0)

In [57]:
def compute_loss(y_true, y_pred):
  # loss calculation

  num_sample = y_true.shape[1]
  Li = -1 * np.sum(y_true * np.log(y_pred))
  
  return Li/num_sample

In [58]:
def foward_pass(x, params):
  
  params["S1"] = np.dot(params["W1"], x) + params["b1"]
  params["A1"] = sigmoid(params["S1"])
  params["S2"] = np.dot(params["W2"], params["A1"]) + params["b2"]
  params["A2"] = sigmoid(params["S2"])
  params["S3"] = np.dot(params["W3"], params["A2"]) + params["b3"]
  params["A3"] = softmax(params["S3"])

  return params

In [59]:
def foward_pass_test(x, params):

  params_test = {}
  
  params_test["S1"] = np.dot(params["W1"], x) + params["b1"]
  params_test["A1"] = sigmoid(params_test["S1"])
  params_test["S2"] = np.dot(params["W2"], params_test["A1"]) + params["b2"]
  params_test["A2"] = sigmoid(params_test["S2"])
  params_test["S3"] = np.dot(params["W3"], params_test["A2"]) + params["b3"]
  params_test["A3"] = softmax(params_test["S32"])

  return params_test

In [60]:
def compute_accuracy(y_true, y_pred):
  y_true_idx = np.argmax(y_true, axis = 0)
  y_pred_idx = np.argmax(y_pred, axis = 0)
  num_correct = np.sum(y_true_idx==y_pred_idx)

  accuracy = num_correct / y_true.shape[1] * 100

  return accuracy

In [61]:
# def backward_pass(x, y_true, params):

#   return 

def backward_pass(x, y_true, params):

  dS3 = params["A3"] - y_true
  # Please check http://machinelearningmechanic.com/deep_learning/2019/09/04/cross-entropy-loss-derivative.html
  # dS3 is softmax + CE loss derivative

  grads = {}

  grads["dW3"] =  np.dot(dS3, params["A2"].T)/x.shape[1]
  grads["db3"] =  (1/x.shape[1])*np.sum(dS3, axis=1, keepdims=True)/x.shape[1]

  dA2 = np.dot(params["W3"].T, dS3)
  dS2 = dA2 * d_sigmoid(params["S2"])

  grads["dW2"] =  np.dot(dS2, params["A1"].T)/x.shape[1]
  grads["db2"] =  (1/x.shape[1])*np.sum(dS2, axis=1, keepdims=True)/x.shape[1]

  dA1 = np.dot(params["W2"].T, dS2)
  dS1 = dA1 * d_sigmoid(params["S1"])

  grads["dW1"] = np.dot(dS1, x.T)/x.shape[1]
  grads["db1"] = np.sum(dS1, axis=1, keepdims=True)/x.shape[1]

  return grads

## 과제 3
Deep Learning Basic 코드 파일의 MLP implementation with Pytorch library using MNIST dataset 코드 참고해서
Three layer MLP를 구한후, 학습을 돌려 보세요

hyperparameter는 다음과 같이 설정

- epochs : 100
- hiddensize : 128, 64 (two layer)
- learning_rate : 0.5

In [62]:
# Assignment 3 구현은 여기서 ()

In [63]:
from torchvision import transforms, datasets
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [64]:
# 이미지를 텐서로 변경
transform = transforms.Compose([
    transforms.ToTensor()
])

In [65]:
trainset = datasets.MNIST(
    root      = './.data/', 
    train     = True,
    download  = True,
    transform = transform
)
testset = datasets.MNIST(
    root      = './.data/', 
    train     = False,
    download  = True,
    transform = transform
)

In [81]:
BATCH_SIZE = 512
# train set과 test set 각각에 대하여 DataLoader를 생성합니다.
# shuffle=True 매개변수를 넣어 데이터를 섞어주세요.
train_loader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
test_loader =  DataLoader(testset, batch_size=BATCH_SIZE, shuffle=True)

In [67]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = nn.Linear(784,128)
        self.layer2 = nn.Linear(128,64)
        self.layer3 = nn.Linear(64,10)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = x.view(-1, 784)
        out = self.layer1(x)
        out = self.relu(out)
        out = self.layer2(out)
        out = self.relu(out)
        out = self.layer3(out)

        return out

In [82]:
model = Net()
model

Net(
  (layer1): Linear(in_features=784, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=128, bias=True)
  (layer3): Linear(in_features=128, out_features=10, bias=True)
  (relu): ReLU()
)

In [69]:
list(model.parameters()) # 행렬들을 직접 살펴볼 수 있음
                         # require_true 얘는 학습되는 애구나 알 수 있음

[Parameter containing:
 tensor([[ 1.8535e-02, -3.9359e-03,  1.5177e-02,  ...,  1.4075e-02,
          -1.5928e-02,  2.1335e-02],
         [ 3.3270e-02,  2.0526e-02,  2.2271e-02,  ..., -2.4958e-02,
          -1.3841e-02,  1.1795e-02],
         [ 3.0407e-05, -1.7983e-02,  1.0255e-02,  ..., -2.7280e-02,
          -1.2003e-02, -1.7709e-02],
         ...,
         [ 8.1233e-03,  2.3497e-02,  1.0214e-02,  ..., -7.7300e-03,
          -2.4493e-02, -2.6134e-02],
         [-2.7089e-02, -1.7551e-02, -2.4000e-02,  ..., -2.5035e-02,
           1.0469e-02,  1.3631e-02],
         [-2.0965e-02,  1.5920e-02,  3.1924e-02,  ..., -2.9084e-02,
           7.9370e-03, -2.5765e-02]], requires_grad=True),
 Parameter containing:
 tensor([-2.7589e-02, -9.2600e-03, -2.2104e-02,  2.3236e-02,  3.1727e-02,
          4.2281e-03, -2.5305e-02,  1.8140e-02,  1.6208e-03,  1.2046e-02,
         -6.4479e-03,  1.0262e-02, -3.5409e-02, -6.8497e-03,  5.2444e-03,
          1.9292e-03,  5.0846e-03,  3.3134e-02,  2.0722e-02, -6.07

In [83]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.5)

In [84]:
def train(model, train_loader, optimizer):
    model.train()
    # 배치 당 loss 값을 담을 리스트 생성
    batch_losses = []

    for data, target in train_loader:
        # 옵티마이저의 기울기 초기화
        optimizer.zero_grad()

        # y pred 값 산출
        output = model(data)
        # loss 계산
        # 정답 데이터와의 cross entropy loss 계산
        # 이 loss를 배치 당 loss로 보관
        loss = criterion(output, target)
        batch_losses.append(loss)

        # 기울기 계산
        loss.backward()

        # 가중치 업데이트!
        optimizer.step()
        
    # 배치당 평균 loss 계산
    avg_loss = sum(batch_losses) / len(batch_losses)
    
    return avg_loss

In [85]:
def evaluate(model, test_loader):
    # 모델을 평가 모드로 전환
    model.eval()

    batch_losses = []
    correct = 0 

    with torch.no_grad(): 
        for data, target in test_loader:
            # 예측값 생성
            output = model(data)

            # loss 계산 (이전과 동일)
            loss = criterion(output, target)
            batch_losses.append(loss)

           # Accuracy 계산
           # y pred와 y가 일치하면 correct에 1을 더해주기
            pred = output.max(1, keepdim=True)[1]

            # eq() 함수는 값이 일치하면 1을, 아니면 0을 출력.
            correct += pred.eq(target.view_as(pred)).sum().item()

    # 배치 당 평균 loss 계산 
    avg_loss =  sum(batch_losses) / len(batch_losses)

    #정확도 계산
    accuracy = 100. * correct / len(test_loader.dataset)

    return avg_loss, accuracy

In [86]:
EPOCHS = 30

for epoch in range(1, EPOCHS + 1):
    train_loss = train(model, train_loader, optimizer)
    test_loss, test_accuracy = evaluate(model, test_loader)
    
    print('[{}] Train Loss: {:.4f}\tTest Loss: {:.4f}\tAccuracy: {:.2f}%'.format(
          epoch, train_loss, test_loss, test_accuracy))

[1] Train Loss: 0.8140	Test Loss: 0.3978	Accuracy: 86.39%
[2] Train Loss: 0.2286	Test Loss: 0.2098	Accuracy: 93.69%
[3] Train Loss: 0.1585	Test Loss: 0.1681	Accuracy: 95.02%
[4] Train Loss: 0.1211	Test Loss: 0.1157	Accuracy: 96.35%
[5] Train Loss: 0.0971	Test Loss: 0.4150	Accuracy: 87.28%
[6] Train Loss: 0.1816	Test Loss: 0.1285	Accuracy: 95.67%
[7] Train Loss: 0.0796	Test Loss: 0.0882	Accuracy: 97.17%
[8] Train Loss: 0.0679	Test Loss: 0.0796	Accuracy: 97.44%
[9] Train Loss: 0.0576	Test Loss: 0.1063	Accuracy: 96.81%
[10] Train Loss: 0.0498	Test Loss: 0.0886	Accuracy: 97.29%
[11] Train Loss: 0.0681	Test Loss: 0.0809	Accuracy: 97.53%
[12] Train Loss: 0.0432	Test Loss: 0.0884	Accuracy: 97.40%
[13] Train Loss: 0.0348	Test Loss: 0.0980	Accuracy: 97.15%
[14] Train Loss: 0.0300	Test Loss: 0.0651	Accuracy: 97.80%
[15] Train Loss: 0.0254	Test Loss: 0.0758	Accuracy: 97.68%
[16] Train Loss: 0.0231	Test Loss: 0.0822	Accuracy: 97.48%
[17] Train Loss: 0.0196	Test Loss: 0.0692	Accuracy: 97.80%
[18] T

## 과제 4
과제 3 부분의 성능을 지금까지 배운 지식을 바탕으로 향상시켜보세요

- Hint : Activation function, hyperparameter setting

In [74]:
# Assignment 4 구현은 여기서 ()
# 4.1 학습률 0.05
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.05)
#충분히 좋은 결과를 내고 있기는 하나 학습률을 조금 낮춰서 천천히 학습되게 한 번 시도

In [75]:
#4.2 활성화 함수 시그모이드로 변경. 학습률은 원래대로 0.5
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = nn.Linear(784,128)
        self.layer2 = nn.Linear(128,64)
        self.layer3 = nn.Linear(64,10)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = x.view(-1, 784)
        out = self.layer1(x)
        out = self.sigmoid(out)
        out = self.layer2(out)
        out = self.sigmoid(out)
        out = self.layer3(out)

        return out

In [76]:
model = Net()
model

Net(
  (layer1): Linear(in_features=784, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=10, bias=True)
  (sigmoid): Sigmoid()
)

In [77]:
#4.3 배치 크기 256
BATCH_SIZE = 256
# train set과 test set 각각에 대하여 DataLoader를 생성합니다.
# shuffle=True 매개변수를 넣어 데이터를 섞어주세요.
train_loader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
test_loader =  DataLoader(testset, batch_size=BATCH_SIZE, shuffle=True)

In [78]:
# 4.4 optimizer Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.5)
#충분히 좋은 결과를 내고 있기는 하나 학습률을 조금 낮춰서 천천히 학습되게 한 번 시도

In [80]:
#4.5 hidden size 256,128
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = nn.Linear(784,256)
        self.layer2 = nn.Linear(256,128)
        self.layer3 = nn.Linear(128,10)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = x.view(-1, 784)
        out = self.layer1(x)
        out = self.relu(out)
        out = self.layer2(out)
        out = self.relu(out)
        out = self.layer3(out)

        return out

**무엇을 보완하였고, 왜 보완되었는지에 대한 자유 서술 (아래에)**

In [79]:
'''
epoch는 30으로 줄이기(충분히 좋은 결과를 내고 있음) 97.61%
1.학습률을 조금 낮춰서 천천히 학습되게 한 번 시도 lr=0.05 accuracy = 96.54%
2.활성화 함수 시그모이드로 변경해서 학습시켜보기 lr = 0.5 accuracy = 96.17%
3.배치 사이즈 256으로 조절 accuracy = 98.13%
4.optimizer Adam으로 변경 accuracy = 10.30% 매우 안 좋은 성능. 이상하게 SGD optimizer가 더 좋은 성능
5.hidden size 258,126으로 변경 accuracy = 98.20%
'''

'\nepoch는 30으로 줄이기(충분히 좋은 결과를 내고 있음)\n1.학습률을 조금 낮춰서 천천히 학습되게 한 번 시도 lr=0.05 accuracy = 96.54%\n2.활성화 함수 시그모이드로 변경해서 학습시켜보기 lr = 0.5 accuracy = 96.17%\n3.배치 사이즈 256으로 조절 accuracy = 98.13%\n4.optimizer Adam으로 변경 accuracy = 10.30% 매우 안 좋은 성능\n'