## 과제 1
ReLU activation function과 그 derivative function을 구현해보세요
- Hint : np.maximum 함수 사용하면 편리합니다
- 다른 방법 사용하셔도 무방합니다


In [None]:
import numpy as np

def relu(x):
  """Apply the rectified linear unit element-wise.

  Accepts a scalar or a NumPy array; every negative entry becomes 0 and
  every non-negative entry is returned unchanged.
  """
  return np.maximum(x, 0)

# Quick check: a positive input passes through, a negative input is clamped to 0.
print(relu(5))
print(relu(-4))

5
0


In [None]:
def d_relu(x):
  """Return the derivative of ReLU evaluated at ``x``.

  The derivative is 1 where ``x >= 0`` and 0 elsewhere (the value at exactly
  0 is taken to be 1, matching the comparison used here).

  Args:
    x: A scalar or an array-like of numbers.

  Returns:
    A plain ``int`` (0 or 1) for scalar input; otherwise an integer NumPy
    array with the same shape as ``x``.
  """
  if np.ndim(x) == 0:
    # Scalar path: return a plain Python int.
    return 1 if x >= 0 else 0

  # Array path: a bare `if x >= 0` is ambiguous for multi-element arrays and
  # raises ValueError, so build the 0/1 mask element-wise instead.
  return (np.asarray(x) >= 0).astype(int)

# Quick check: the derivative is 1 for a positive input and 0 for a negative one.
print(d_relu(2.5))
print(d_relu(-3))

1
0


## 과제 2
Deep Learning Basic 코드 파일의 MLP implementation with Numpy library using MNIST dataset 코드 참고해서
Three layer MLP 일 때의 backward_pass 함수를 완성해주세요.   
- Hint : 코드 파일의 예시는 Two layer MLP


In [None]:
from IPython import get_ipython
# Clear all user-defined names before the next exercise ("-s" soft reset,
# "-f" skip the confirmation prompt). `run_line_magic` replaces the
# deprecated `InteractiveShell.magic` entry point.
get_ipython().run_line_magic('reset', '-sf')

In [None]:
import numpy as np
import sklearn.datasets

# Fetch MNIST from OpenML, caching it under ./mnist_784.
# Request plain NumPy arrays explicitly: recent scikit-learn versions may
# return pandas objects by default for this dataset, while the preprocessing
# that follows relies on array-style slicing and `.T`.
mnist = sklearn.datasets.fetch_openml('mnist_784', data_home="mnist_784", as_frame=False)

In [None]:
# Data preprocessing

num_train = 60000  # number of training samples
num_class = 10  # number of classes a label can take

# Split into train/test and transpose so that each column is one sample.
x_train = np.float32(mnist.data[:num_train]).T
x_test = np.float32(mnist.data[num_train:]).T
y_train_index = np.int32(mnist.target[:num_train]).T
y_test_index = np.int32(mnist.target[num_train:]).T

# Normalization: divide the inputs by 255 in place.
x_train /= 255
x_test /= 255
x_size = x_train.shape[0]

# One-hot encode the labels into (num_class, num_samples) matrices:
# entry [label, sample] is set to 1, everything else stays 0.
y_train = np.zeros((num_class, y_train_index.shape[0]))
y_train[y_train_index, np.arange(y_train_index.shape[0])] = 1

y_test = np.zeros((num_class, y_test_index.shape[0]))
y_test[y_test_index, np.arange(y_test_index.shape[0])] = 1

In [None]:
# Display the training matrix dimensions (rows = features, columns = samples).
x_train.shape

(784, 60000)

In [None]:
# Parameter initialization

hidden_size1 = 64  # number of units in the first hidden layer
hidden_size2 = 32  # number of units in the second hidden layer

# Three-layer neural network: input -> hidden1 -> hidden2 -> output.
# Each weight matrix is drawn from a standard normal and scaled by
# sqrt(1 / fan_in), where fan_in is that layer's input size.
# Biases start at exactly zero, so no scale factor is applied to them.
params = {"W1": np.random.randn(hidden_size1, x_size) * np.sqrt(1 / x_size),
          "b1": np.zeros((hidden_size1, 1)),
          "W2": np.random.randn(hidden_size2, hidden_size1) * np.sqrt(1 / hidden_size1),
          "b2": np.zeros((hidden_size2, 1)),
          "W3": np.random.randn(num_class, hidden_size2) * np.sqrt(1 / hidden_size2),
          "b3": np.zeros((num_class, 1)),
          }
# Xavier initialization: https://reniew.github.io/13/

In [None]:
def sigmoid(x):
  """Logistic function 1 / (1 + e^(-x)), applied element-wise."""
  denominator = 1 + np.exp(-x)
  return 1 / denominator

def d_sigmoid(x):
  """Derivative of the logistic sigmoid, applied element-wise.

  Computes e^(-x) / (1 + e^(-x))^2. The derivative is an even function of
  ``x``, so it is evaluated at ``|x|``: the exponential then stays in (0, 1]
  and cannot overflow. Evaluating e^(-x) directly overflows to inf for large
  negative ``x`` and turns the quotient into NaN (inf / inf).

  Args:
    x: A scalar or NumPy array of pre-activation values.

  Returns:
    The derivative, with the same shape as ``x``.
  """
  decay = np.exp(-np.abs(x))
  return decay / ((1 + decay) ** 2)

def softmax(x):
  """Column-wise softmax (normalizes along axis 0).

  The maximum of each column is subtracted before exponentiating. This does
  not change the result mathematically, but keeps every exponent <= 0 so
  large logits cannot overflow to inf and produce NaN.

  Args:
    x: Array of logits; for 2-D input each column is treated as one sample.

  Returns:
    Array of the same shape as ``x`` whose entries along axis 0 sum to 1.
  """
  shifted = x - np.max(x, axis=0, keepdims=True)
  exp = np.exp(shifted)
  return exp / np.sum(exp, axis=0, keepdims=True)

In [None]:
def compute_loss(y_true, y_pred, eps=1e-12):
  """Mean cross-entropy loss over all samples.

  Args:
    y_true: One-hot targets with one sample per column.
    y_pred: Predicted probabilities, same shape as ``y_true``.
    eps: Lower bound applied to ``y_pred`` before taking the log. Without
      it, a probability of exactly 0 gives log(0) = -inf, and
      0 * -inf = NaN poisons the whole sum.

  Returns:
    The summed cross-entropy divided by the number of samples
    (``y_true.shape[1]``).
  """
  num_sample = y_true.shape[1]
  safe_pred = np.maximum(y_pred, eps)
  total = -1 * np.sum(y_true * np.log(safe_pred))

  return total / num_sample

In [None]:
def foward_pass(x, params):
  """Run the three-layer forward pass and cache the results in ``params``.

  For each layer k the pre-activation ``S{k} = W{k} . input + b{k}`` and the
  activation ``A{k}`` are written back into ``params`` (sigmoid for layers 1
  and 2, softmax for layer 3). The same dict is mutated and returned.
  """
  activation = x
  for layer, nonlinearity in ((1, sigmoid), (2, sigmoid), (3, softmax)):
    pre_activation = np.dot(params["W" + str(layer)], activation) + params["b" + str(layer)]
    activation = nonlinearity(pre_activation)
    params["S" + str(layer)] = pre_activation
    params["A" + str(layer)] = activation

  return params

In [None]:
def foward_pass_test(x, params):
  """Forward pass for evaluation that leaves ``params`` untouched.

  Reads only the weights and biases from ``params`` and returns a new dict
  holding the pre-activations (S1..S3) and activations (A1..A3) for ``x``.
  """
  s1 = np.dot(params["W1"], x) + params["b1"]
  a1 = sigmoid(s1)
  s2 = np.dot(params["W2"], a1) + params["b2"]
  a2 = sigmoid(s2)
  s3 = np.dot(params["W3"], a2) + params["b3"]
  a3 = softmax(s3)

  return {"S1": s1, "A1": a1, "S2": s2, "A2": a2, "S3": s3, "A3": a3}

In [None]:
def compute_accuracy(y_true, y_pred):
  """Return the percentage of columns whose arg-max class matches.

  Both inputs hold one sample per column; the predicted and true class of a
  sample are the row indices of the largest entry in its column.
  """
  hits = np.argmax(y_true, axis=0) == np.argmax(y_pred, axis=0)
  return np.sum(hits) / y_true.shape[1] * 100

In [None]:
def backward_pass(x, y_true, params):
  """Compute gradients of the mean cross-entropy loss for the three-layer MLP.

  Expects ``params`` to contain the weights ``W2``/``W3`` and the activations
  ``A1``/``A2``/``A3`` cached by ``foward_pass`` (sigmoid hidden layers,
  softmax output), with one sample per column.

  Args:
    x: Input matrix, one sample per column.
    y_true: One-hot targets, same shape as ``params["A3"]``.
    params: Dict of weights and cached forward-pass values.

  Returns:
    Dict with keys ``dW3, db3, dW2, db2, dW1, db1``; each gradient is
    averaged over the samples and has the shape of its parameter.
  """
  num_sample = x.shape[1]
  grads = {}

  # Softmax combined with cross-entropy: dL/dS3 = A3 - y.
  dS3 = params["A3"] - y_true

  grads["dW3"] = np.dot(dS3, params["A2"].T) / num_sample
  # Bias gradients are divided by the sample count exactly once, the same
  # 1/m scaling used for the weight gradients.
  grads["db3"] = np.sum(dS3, axis=1, keepdims=True) / num_sample

  dA2 = np.dot(params["W3"].T, dS3)
  # sigmoid'(S2) = A2 * (1 - A2); reusing the cached activation avoids
  # recomputing the exponential.
  dS2 = dA2 * params["A2"] * (1 - params["A2"])

  grads["dW2"] = np.dot(dS2, params["A1"].T) / num_sample
  grads["db2"] = np.sum(dS2, axis=1, keepdims=True) / num_sample

  dA1 = np.dot(params["W2"].T, dS2)
  dS1 = dA1 * params["A1"] * (1 - params["A1"])

  grads["dW1"] = np.dot(dS1, x.T) / num_sample
  grads["db1"] = np.sum(dS1, axis=1, keepdims=True) / num_sample

  return grads


In [None]:
epochs = 100
learning_rate = 0.5

# Run one forward pass up front so the first backward pass has cached
# activations to work with.
params = foward_pass(x_train, params)

for i in range(epochs):

  grads = backward_pass(x_train, y_train, params)

  # Full-batch gradient-descent step on every weight and bias, in place.
  for name in ("W1", "b1", "W2", "b2", "W3", "b3"):
    params[name] -= learning_rate * grads["d" + name]

  # Refresh the cached activations with the updated parameters and score
  # the training set.
  params = foward_pass(x_train, params)
  train_loss = compute_loss(y_train, params["A3"])
  train_acc = compute_accuracy(y_train, params["A3"])

  # Score the held-out set without touching the training cache.
  params_test = foward_pass_test(x_test, params)
  test_loss = compute_loss(y_test, params_test["A3"])
  test_acc = compute_accuracy(y_test, params_test["A3"])

  report = "Epoch {}: training loss = {}, training acuracy = {}%, test loss = {}, test acuracy = {}%"
  print(report.format(i + 1, np.round(train_loss, 6), np.round(train_acc, 2),
                      np.round(test_loss, 6), np.round(test_acc, 2)))

Epoch 1: training loss = 2.371842, training acuracy = 9.9%, test loss = 2.375679, training acuracy = 10.27%
Epoch 2: training loss = 2.332802, training acuracy = 11.24%, test loss = 2.33546, training acuracy = 11.36%
Epoch 3: training loss = 2.313462, training acuracy = 11.24%, test loss = 2.315108, training acuracy = 11.35%
Epoch 4: training loss = 2.303396, training acuracy = 11.24%, test loss = 2.304251, training acuracy = 11.35%
Epoch 5: training loss = 2.298107, training acuracy = 11.24%, test loss = 2.298388, training acuracy = 11.35%
Epoch 6: training loss = 2.295128, training acuracy = 11.24%, test loss = 2.295014, training acuracy = 11.35%
Epoch 7: training loss = 2.293149, training acuracy = 11.24%, test loss = 2.292772, training acuracy = 11.35%
Epoch 8: training loss = 2.29156, training acuracy = 11.24%, test loss = 2.291008, training acuracy = 11.35%
Epoch 9: training loss = 2.290109, training acuracy = 11.25%, test loss = 2.289435, training acuracy = 11.35%
Epoch 10: trai

## 과제 3
Deep Learning Basic 코드 파일의 MLP implementation with Pytorch library using MNIST dataset 코드 참고해서
Three layer MLP를 구현한 후, 학습을 돌려 보세요

hyperparameter는 다음과 같이 설정

- epochs : 100
- hiddensize : 128, 64 (two layer)
- learning_rate : 0.5

In [None]:
from torchvision import transforms, datasets
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [None]:
# Convert each image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

In [None]:
# Build the MNIST train and test splits; both are stored under ./.data/,
# downloaded if missing, and passed through `transform`.
trainset = datasets.MNIST(root='./.data/', train=True, download=True, transform=transform)
testset = datasets.MNIST(root='./.data/', train=False, download=True, transform=transform)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./.data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./.data/MNIST/raw/train-images-idx3-ubyte.gz to ./.data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./.data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./.data/MNIST/raw/train-labels-idx1-ubyte.gz to ./.data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./.data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./.data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./.data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./.data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./.data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./.data/MNIST/raw



In [None]:
# Number of samples per mini-batch for both loaders.
BATCH_SIZE = 512
# Create a DataLoader for each of the train and test sets.
# shuffle=True makes each loader shuffle its data.
train_loader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
test_loader =  DataLoader(testset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
class Net(nn.Module):
    """Fully connected classifier: 784 -> 128 -> 64 -> 10 with ReLU in between."""

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(784, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Flatten the input to (-1, 784) and return the 10 raw output scores."""
        flat = x.view(-1, 784)
        hidden1 = self.relu(self.layer1(flat))
        hidden2 = self.relu(self.layer2(hidden1))
        return self.layer3(hidden2)

In [None]:
# Instantiate the network; the bare `model` expression displays its layers.
model = Net()
model

Net(
  (layer1): Linear(in_features=784, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=10, bias=True)
  (relu): ReLU()
)

In [None]:
list(model.parameters()) # Lets you inspect the parameter tensors directly;
                         # requires_grad=True in the output marks a tensor as trainable.

[Parameter containing:
 tensor([[-0.0354,  0.0022, -0.0296,  ...,  0.0228,  0.0289, -0.0352],
         [-0.0044, -0.0308, -0.0116,  ...,  0.0316,  0.0183,  0.0253],
         [-0.0095, -0.0296, -0.0299,  ..., -0.0045, -0.0122, -0.0325],
         ...,
         [-0.0030,  0.0301, -0.0337,  ...,  0.0073, -0.0335, -0.0143],
         [-0.0311, -0.0218, -0.0331,  ..., -0.0018, -0.0137,  0.0016],
         [ 0.0138, -0.0274,  0.0111,  ..., -0.0340,  0.0242,  0.0053]],
        requires_grad=True), Parameter containing:
 tensor([ 0.0150, -0.0297,  0.0307, -0.0355,  0.0235,  0.0296, -0.0064, -0.0091,
         -0.0252,  0.0179,  0.0195, -0.0248,  0.0078,  0.0186,  0.0029,  0.0137,
          0.0149,  0.0276,  0.0241, -0.0091, -0.0046, -0.0202,  0.0196,  0.0334,
         -0.0125, -0.0176, -0.0035,  0.0132, -0.0124,  0.0223, -0.0066, -0.0051,
          0.0249, -0.0308, -0.0110,  0.0236,  0.0155, -0.0255, -0.0011, -0.0256,
          0.0333,  0.0284, -0.0107, -0.0250,  0.0036,  0.0170,  0.0296,  0.0341,

In [None]:
# Cross-entropy loss on the model outputs, optimized with plain SGD at lr=0.5.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.5)

In [None]:
def train(model, train_loader, optimizer):
    """Train ``model`` for one pass over ``train_loader``.

    Uses the module-level ``criterion`` as the loss function.

    Args:
        model: The network to train; switched to training mode.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer holding the model's parameters.

    Returns:
        The mean of the per-batch losses as a tensor detached from the
        autograd graph.
    """
    model.train()
    # Per-batch loss values collected over the epoch.
    batch_losses = []

    for data, target in train_loader:
        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass and loss against the ground-truth labels.
        output = model(data)
        loss = criterion(output, target)
        # Store a detached copy: keeping the graph-attached tensor would hold
        # every batch's autograd graph in memory until the epoch ends.
        batch_losses.append(loss.detach())

        # Backpropagate, then update the weights.
        loss.backward()
        optimizer.step()

    # Average loss per batch.
    avg_loss = sum(batch_losses) / len(batch_losses)

    return avg_loss

In [None]:
def evaluate(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` without tracking gradients.

    Uses the module-level ``criterion`` as the loss function.

    Returns:
        A tuple ``(avg_loss, accuracy)``: the mean of the per-batch losses and
        the percentage of correct predictions over ``test_loader.dataset``.
    """
    # Switch the model to evaluation mode.
    model.eval()

    losses = []
    num_correct = 0

    with torch.no_grad():
        for data, target in test_loader:
            logits = model(data)
            losses.append(criterion(logits, target))

            # The predicted class is the index of the largest score per row.
            _, predicted = logits.max(1, keepdim=True)
            # eq() marks matching entries; summing them counts correct predictions.
            matches = predicted.eq(target.view_as(predicted))
            num_correct += matches.sum().item()

    # Average loss per batch.
    avg_loss = sum(losses) / len(losses)
    # Accuracy as a percentage of the whole dataset.
    accuracy = 100. * num_correct / len(test_loader.dataset)

    return avg_loss, accuracy

In [None]:
EPOCHS = 100

# One training pass followed by one evaluation per epoch, with a summary line.
for epoch in range(1, EPOCHS + 1):
    train_loss = train(model, train_loader, optimizer)
    test_loss, test_accuracy = evaluate(model, test_loader)

    summary = '[{}] Train Loss: {:.4f}\tTest Loss: {:.4f}\tAccuracy: {:.2f}%'
    print(summary.format(epoch, train_loss, test_loss, test_accuracy))

[1] Train Loss: 0.8473	Test Loss: 0.3494	Accuracy: 88.84%
[2] Train Loss: 0.2382	Test Loss: 0.3438	Accuracy: 88.89%
[3] Train Loss: 0.1841	Test Loss: 0.3928	Accuracy: 88.14%
[4] Train Loss: 0.1373	Test Loss: 0.1258	Accuracy: 96.02%
[5] Train Loss: 0.1026	Test Loss: 0.1321	Accuracy: 95.73%
[6] Train Loss: 0.0873	Test Loss: 0.1590	Accuracy: 95.15%
[7] Train Loss: 0.0751	Test Loss: 0.0892	Accuracy: 97.14%
[8] Train Loss: 0.0649	Test Loss: 0.1052	Accuracy: 96.76%
[9] Train Loss: 0.0585	Test Loss: 0.1203	Accuracy: 96.15%
[10] Train Loss: 0.0514	Test Loss: 0.1603	Accuracy: 95.16%
[11] Train Loss: 0.0463	Test Loss: 0.0944	Accuracy: 97.01%
[12] Train Loss: 0.0391	Test Loss: 0.0943	Accuracy: 97.16%
[13] Train Loss: 0.0349	Test Loss: 0.0790	Accuracy: 97.56%
[14] Train Loss: 0.0309	Test Loss: 0.0764	Accuracy: 97.76%
[15] Train Loss: 0.0277	Test Loss: 0.1327	Accuracy: 96.12%
[16] Train Loss: 0.0264	Test Loss: 0.0746	Accuracy: 97.70%
[17] Train Loss: 0.0221	Test Loss: 0.0952	Accuracy: 96.94%
[18] T

## 과제 4
과제 3 부분의 성능을 지금까지 배운 지식을 바탕으로 향상시켜보세요

- Hint : Activation function, hyperparameter setting

In [None]:
class Net(nn.Module):
    """Fully connected classifier: 784 -> 128 -> 64 -> 10 with LeakyReLU(0.1).

    A plain ReLU module is also registered as ``relu``; ``forward`` does not
    use it.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(784, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 10)
        self.leakyrelu = nn.LeakyReLU(0.1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Flatten the input to (-1, 784) and return the 10 raw output scores."""
        flat = x.view(-1, 784)
        hidden1 = self.leakyrelu(self.layer1(flat))
        hidden2 = self.leakyrelu(self.layer2(hidden1))
        return self.layer3(hidden2)

In [None]:
# Instantiate the LeakyReLU variant; the bare `model` expression displays its layers.
model = Net()
model

Net(
  (layer1): Linear(in_features=784, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=10, bias=True)
  (leakyrelu): LeakyReLU(negative_slope=0.1)
  (relu): ReLU()
)

In [None]:
# Cross-entropy loss, optimized with SGD at lr=0.3 and momentum 0.9.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.3, momentum = 0.9)

In [None]:
EPOCHS = 100

# One training pass followed by one evaluation per epoch, with a summary line.
for epoch in range(1, EPOCHS + 1):
    train_loss = train(model, train_loader, optimizer)
    test_loss, test_accuracy = evaluate(model, test_loader)

    summary = '[{}] Train Loss: {:.4f}\tTest Loss: {:.4f}\tAccuracy: {:.2f}%'
    print(summary.format(epoch, train_loss, test_loss, test_accuracy))

[1] Train Loss: 0.6055	Test Loss: 0.1884	Accuracy: 94.26%
[2] Train Loss: 0.1535	Test Loss: 0.1251	Accuracy: 96.05%
[3] Train Loss: 0.1146	Test Loss: 0.1164	Accuracy: 96.44%
[4] Train Loss: 0.0890	Test Loss: 0.1108	Accuracy: 96.80%
[5] Train Loss: 0.0759	Test Loss: 0.1036	Accuracy: 97.07%
[6] Train Loss: 0.0629	Test Loss: 0.1030	Accuracy: 97.14%
[7] Train Loss: 0.0564	Test Loss: 0.0988	Accuracy: 97.10%
[8] Train Loss: 0.0474	Test Loss: 0.1025	Accuracy: 97.36%
[9] Train Loss: 0.0477	Test Loss: 0.1018	Accuracy: 97.30%
[10] Train Loss: 0.0392	Test Loss: 0.1024	Accuracy: 97.32%
[11] Train Loss: 0.0395	Test Loss: 0.1098	Accuracy: 97.18%
[12] Train Loss: 0.0330	Test Loss: 0.0958	Accuracy: 97.62%
[13] Train Loss: 0.0279	Test Loss: 0.1112	Accuracy: 97.32%
[14] Train Loss: 0.0295	Test Loss: 0.1040	Accuracy: 97.57%
[15] Train Loss: 0.0186	Test Loss: 0.1001	Accuracy: 97.78%
[16] Train Loss: 0.0163	Test Loss: 0.1056	Accuracy: 97.68%
[17] Train Loss: 0.0137	Test Loss: 0.1075	Accuracy: 97.62%
[18] T

**무엇을 보완하였고, 왜 보완되었는지에 대한 자유 서술 (아래에)**

1) 기존 모형의 accuracy가 0.95~0.97에서 증감을 반복<br>
 -> local minima에 갇히는 문제가 있을 수도 있어서 <br>
    momentum 파라미터를 추가하였습니다.
<br>

2) Dying ReLU 문제 <br>
 -> Dying ReLU 문제가 있을 수도 있으므로 <br> 
    Leaky ReLU를 대신 사용해보았습니다.
<br>

결과적으로 Accuracy가 98% 이상으로 높아졌고, <br>
train 과정에서 성능이 감소하는 경우가 <br>
줄어들었습니다. 하지만 큰 차이는 없었습니다.