## 과제 1
ReLU activation function과 derivative function을 구현해보세요
- Hint : np.maximum 함수 사용하면 편리합니다
- 다른 방법 사용하셔도 무방합니다


In [2]:
import numpy as np
from torchvision import transforms, datasets
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [13]:
def relu(x):
  """Apply the ReLU activation: the element-wise maximum of 0 and ``x``.

  Accepts whatever ``np.maximum`` accepts (plain scalars or arrays).
  """
  return np.maximum(0, x)

In [16]:
def d_relu(x):
  """Return the derivative of ReLU evaluated at ``x``.

  The derivative is 1 where ``x > 0`` and 0 everywhere else (including at
  ``x == 0``). The comparison is element-wise, so NumPy arrays are accepted
  as well as plain scalars; a bare ``if x > 0`` raises ``ValueError`` for
  arrays with more than one element.

  Args:
    x: Scalar or array-like of pre-activation values.

  Returns:
    Integer 0/1 values with the same shape as ``x`` (a NumPy integer scalar
    when ``x`` is a scalar).
  """
  return (np.asarray(x) > 0).astype(int)

In [17]:
# Sanity check: ReLU and then its derivative, each on a positive and a negative input
print(*(fn(value) for fn in (relu, d_relu) for value in (3, -3)), sep="\n")

3
0
1
0


## 과제 2
Deep Learning Basic 코드 파일의 MLP implementation with Numpy library using MNIST dataset 코드 참고해서
Three layer MLP 일 때의 backward_pass 함수를 완성해주세요.   
- Hint : 코드 파일의 예시는 Two layer MLP


In [48]:
def backward_pass(x, y_true, params):
  """Backpropagate through a three-layer MLP with ReLU hidden layers.

  Samples are laid out column-wise: every gradient is averaged over
  ``x.shape[1]``.

  Args:
    x: Input batch of shape (features, samples).
    y_true: Targets with the same shape as ``params["A3"]``.
    params: Dict providing the weights ``"W2"`` and ``"W3"``, the hidden
      pre-activations ``"S1"`` and ``"S2"``, and the activations ``"A1"``,
      ``"A2"`` and ``"A3"``.
      NOTE(review): assumes the forward pass stores the second hidden
      activation under ``"A2"`` (like ``"A1"``/``"A3"``) -- confirm.

  Returns:
    Dict with the batch-averaged gradients ``"dW1"``, ``"db1"``, ``"dW2"``,
    ``"db2"``, ``"dW3"`` and ``"db3"``.
  """
  m = x.shape[1]  # number of samples in the batch
  grads = {}

  # Output layer error.
  # NOTE(review): A3 - y_true is the gradient w.r.t. S3 only if A3 is a
  # softmax output trained with cross-entropy -- confirm against the forward pass.
  dS3 = params["A3"] - y_true

  # The input of layer 3 is A2 (not A1), so dW3 pairs dS3 with A2.
  grads["dW3"] = np.dot(dS3, params["A2"].T) / m
  # Average over the batch exactly once.
  grads["db3"] = np.sum(dS3, axis=1, keepdims=True) / m

  # Propagate through W3, then gate by the ReLU derivative of layer 2.
  # The derivative is an element-wise mask so whole activation matrices work.
  dA2 = np.dot(params["W3"].T, dS3)
  dS2 = dA2 * (params["S2"] > 0)

  grads["dW2"] = np.dot(dS2, params["A1"].T) / m
  grads["db2"] = np.sum(dS2, axis=1, keepdims=True) / m

  # Propagate through W2, then gate by the ReLU derivative of layer 1.
  dA1 = np.dot(params["W2"].T, dS2)
  dS1 = dA1 * (params["S1"] > 0)

  grads["dW1"] = np.dot(dS1, x.T) / m
  grads["db1"] = np.sum(dS1, axis=1, keepdims=True) / m

  return grads

## 과제 3
Deep Learning Basic 코드 파일의 MLP implementation with Pytorch library using MNIST dataset 코드 참고해서
Three layer MLP를 구현한 후, 학습을 돌려 보세요

hyperparameter는 다음과 같이 설정

- epochs : 100
- hiddensize : 128, 64 (two layer)
- learning_rate : 0.5

In [30]:
from torchvision import transforms, datasets
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [31]:
# Preprocessing pipeline: convert each image to a tensor
transform = transforms.Compose([transforms.ToTensor()])

In [32]:
# MNIST train and test splits, downloaded on demand into ./.data/ and
# preprocessed with the same transform.
trainset, testset = (
    datasets.MNIST(
        root='./.data/',
        train=is_train,
        download=True,
        transform=transform,
    )
    for is_train in (True, False)
)

In [33]:
BATCH_SIZE = 512
# One DataLoader per split (train first, then test), both created with
# shuffle=True so the samples are shuffled.
train_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE, shuffle=True)
    for split in (trainset, testset)
)

In [34]:
class Net(nn.Module):
    """Three-layer MLP (784 -> 128 -> 64 -> 10) with ReLU after the first two layers."""

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(784, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Flatten ``x`` into rows of 784 values and return the 10 raw output scores."""
        flat = x.view(-1, 784)
        hidden1 = self.relu(self.layer1(flat))
        hidden2 = self.relu(self.layer2(hidden1))
        # No activation on the last layer: the raw scores are returned as-is.
        return self.layer3(hidden2)

In [35]:
# Instantiate the three-layer network; the bare `model` expression makes the
# notebook display its layer summary.
model = Net()
model

Net(
  (layer1): Linear(in_features=784, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=10, bias=True)
  (relu): ReLU()
)

In [36]:
list(model.parameters()) # lets you inspect the weight matrices and bias vectors directly
                         # requires_grad=True in the output marks a tensor as trainable

[Parameter containing:
 tensor([[-0.0093, -0.0057,  0.0309,  ..., -0.0231,  0.0118,  0.0155],
         [ 0.0173,  0.0223,  0.0060,  ...,  0.0044,  0.0307,  0.0142],
         [ 0.0312,  0.0067,  0.0244,  ...,  0.0203, -0.0123, -0.0250],
         ...,
         [ 0.0270, -0.0056,  0.0139,  ...,  0.0078,  0.0293, -0.0171],
         [-0.0240, -0.0301,  0.0191,  ..., -0.0102,  0.0320, -0.0180],
         [ 0.0233, -0.0222,  0.0061,  ...,  0.0218,  0.0128,  0.0099]],
        requires_grad=True), Parameter containing:
 tensor([-0.0056, -0.0282,  0.0297,  0.0225, -0.0174,  0.0254, -0.0090,  0.0216,
          0.0345, -0.0172,  0.0190,  0.0134, -0.0248, -0.0110,  0.0100,  0.0300,
          0.0127,  0.0285,  0.0185, -0.0028,  0.0283,  0.0073,  0.0332, -0.0248,
          0.0231,  0.0242,  0.0039,  0.0252, -0.0035, -0.0294,  0.0150, -0.0260,
         -0.0169, -0.0276, -0.0033,  0.0061,  0.0031,  0.0334, -0.0071, -0.0322,
          0.0084,  0.0234,  0.0310, -0.0108, -0.0204,  0.0154, -0.0306,  0.0254,

In [37]:
# Cross-entropy loss on the model's raw output scores, optimized with plain SGD
# at the learning rate required by the assignment (0.5).
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.5)

In [38]:
def train(model, train_loader, optimizer):
    """Run one training epoch and return the mean of the per-batch losses.

    The loss function is read from the module-level ``criterion``.

    Args:
        model: Network to train; switched to training mode.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer that updates the parameters of ``model``.

    Returns:
        Mean of the per-batch losses as a 0-dim tensor without autograd history.
    """
    model.train()
    # One detached loss value per batch.
    batch_losses = []

    for data, target in train_loader:
        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass and cross-entropy loss against the targets.
        output = model(data)
        loss = criterion(output, target)

        # Backward pass, then parameter update.
        loss.backward()
        optimizer.step()

        # Store only the detached value: keeping ``loss`` itself would hold
        # every batch's autograd graph in memory until the epoch ends.
        batch_losses.append(loss.detach())

    # Mean loss per batch.
    avg_loss = sum(batch_losses) / len(batch_losses)

    return avg_loss

In [39]:
def evaluate(model, test_loader):
    """Evaluate ``model`` on ``test_loader``.

    The loss function is read from the module-level ``criterion``.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the mean of the per-batch losses and
        the percentage of samples in ``test_loader.dataset`` classified
        correctly.
    """
    # Evaluation mode, and no gradient tracking while scoring.
    model.eval()

    losses = []
    n_correct = 0

    with torch.no_grad():
        for data, target in test_loader:
            scores = model(data)
            losses.append(criterion(scores, target))

            # Predicted class = index of the largest score in each row.
            predicted = scores.argmax(dim=1, keepdim=True)
            n_correct += (predicted == target.view_as(predicted)).sum().item()

    mean_loss = sum(losses) / len(losses)
    accuracy = 100. * n_correct / len(test_loader.dataset)

    return mean_loss, accuracy

In [40]:
# Number of passes over the training set (value required by the assignment).
EPOCHS = 100

# Train for one epoch, then evaluate on the test loader, EPOCHS times.
for epoch in range(1, EPOCHS + 1):
    train_loss = train(model, train_loader, optimizer)
    test_loss, test_accuracy = evaluate(model, test_loader)
    
    # One summary line per epoch: epoch number, mean train loss, mean test loss, test accuracy (%).
    print('[{}] Train Loss: {:.4f}\tTest Loss: {:.4f}\tAccuracy: {:.2f}%'.format(
          epoch, train_loss, test_loss, test_accuracy))

[1] Train Loss: 0.8366	Test Loss: 0.3117	Accuracy: 90.36%
[2] Train Loss: 0.2432	Test Loss: 0.2620	Accuracy: 91.71%
[3] Train Loss: 0.1716	Test Loss: 0.1963	Accuracy: 93.74%
[4] Train Loss: 0.1311	Test Loss: 0.1516	Accuracy: 95.44%
[5] Train Loss: 0.2099	Test Loss: 0.1535	Accuracy: 95.20%
[6] Train Loss: 0.1121	Test Loss: 0.2499	Accuracy: 92.00%
[7] Train Loss: 0.1100	Test Loss: 0.1855	Accuracy: 94.51%
[8] Train Loss: 0.0837	Test Loss: 0.1163	Accuracy: 96.24%
[9] Train Loss: 0.0677	Test Loss: 0.1187	Accuracy: 96.29%
[10] Train Loss: 0.0711	Test Loss: 0.0957	Accuracy: 96.94%
[11] Train Loss: 0.0520	Test Loss: 0.0857	Accuracy: 97.36%
[12] Train Loss: 0.0467	Test Loss: 0.0873	Accuracy: 97.24%
[13] Train Loss: 0.0412	Test Loss: 0.0733	Accuracy: 97.79%
[14] Train Loss: 0.0381	Test Loss: 0.1079	Accuracy: 96.60%
[15] Train Loss: 0.0342	Test Loss: 0.0724	Accuracy: 97.76%
[16] Train Loss: 0.0298	Test Loss: 0.0838	Accuracy: 97.56%
[17] Train Loss: 0.0271	Test Loss: 0.1024	Accuracy: 97.00%
[18] T

## 과제 4
과제 3 부분의 성능을 지금까지 배운 지식을 바탕으로 향상시켜보세요

- Hint : Activation function, hyperparameter setting

In [41]:
class Net(nn.Module):
    """Two-layer MLP (784 -> 128 -> 10) with a ReLU after the hidden layer."""

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(784, 128)
        self.layer2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Flatten ``x`` into rows of 784 values and return the 10 raw output scores."""
        flat = x.view(-1, 784)
        hidden = self.relu(self.layer1(flat))
        # No activation on the last layer: the raw scores are returned as-is.
        return self.layer2(hidden)

In [42]:
# Instantiate the two-layer network; the bare `model` expression makes the
# notebook display its layer summary.
model = Net()
model

Net(
  (layer1): Linear(in_features=784, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=10, bias=True)
  (relu): ReLU()
)

In [43]:
# Same cross-entropy loss as before; SGD now uses a learning rate of 0.05
# (ten times smaller than the 0.5 used for the three-layer model).
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.05)

In [44]:
def train(model, train_loader, optimizer):
    """Run one training epoch and return the mean of the per-batch losses.

    The loss function is read from the module-level ``criterion``.

    Args:
        model: Network to train; switched to training mode.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer that updates the parameters of ``model``.

    Returns:
        Mean of the per-batch losses as a 0-dim tensor without autograd history.
    """
    model.train()
    # One detached loss value per batch.
    batch_losses = []

    for data, target in train_loader:
        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass and cross-entropy loss against the targets.
        output = model(data)
        loss = criterion(output, target)

        # Backward pass, then parameter update.
        loss.backward()
        optimizer.step()

        # Store only the detached value: keeping ``loss`` itself would hold
        # every batch's autograd graph in memory until the epoch ends.
        batch_losses.append(loss.detach())

    # Mean loss per batch.
    avg_loss = sum(batch_losses) / len(batch_losses)

    return avg_loss

In [45]:
def evaluate(model, test_loader):
    """Evaluate ``model`` on ``test_loader``.

    The loss function is read from the module-level ``criterion``.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the mean of the per-batch losses and
        the percentage of samples in ``test_loader.dataset`` classified
        correctly.
    """
    # Evaluation mode, and no gradient tracking while scoring.
    model.eval()

    losses = []
    n_correct = 0

    with torch.no_grad():
        for data, target in test_loader:
            scores = model(data)
            losses.append(criterion(scores, target))

            # Predicted class = index of the largest score in each row.
            predicted = scores.argmax(dim=1, keepdim=True)
            n_correct += (predicted == target.view_as(predicted)).sum().item()

    mean_loss = sum(losses) / len(losses)
    accuracy = 100. * n_correct / len(test_loader.dataset)

    return mean_loss, accuracy

In [47]:
# Number of passes over the training set (halved from the 100 used for the three-layer model).
EPOCHS = 50

# Train for one epoch, then evaluate on the test loader, EPOCHS times.
# NOTE(review): the logged epoch-1 train loss below (0.2699) is already low, which
# suggests `model` was not freshly initialized before this run (e.g. the cell was
# executed more than once) -- confirm before comparing against the other run.
for epoch in range(1, EPOCHS + 1):
    train_loss = train(model, train_loader, optimizer)
    test_loss, test_accuracy = evaluate(model, test_loader)
    
    # One summary line per epoch: epoch number, mean train loss, mean test loss, test accuracy (%).
    print('[{}] Train Loss: {:.4f}\tTest Loss: {:.4f}\tAccuracy: {:.2f}%'.format(
          epoch, train_loss, test_loss, test_accuracy))

[1] Train Loss: 0.2699	Test Loss: 0.2592	Accuracy: 92.61%
[2] Train Loss: 0.2635	Test Loss: 0.2529	Accuracy: 92.77%
[3] Train Loss: 0.2572	Test Loss: 0.2493	Accuracy: 92.89%
[4] Train Loss: 0.2514	Test Loss: 0.2435	Accuracy: 93.12%
[5] Train Loss: 0.2455	Test Loss: 0.2379	Accuracy: 93.33%
[6] Train Loss: 0.2402	Test Loss: 0.2321	Accuracy: 93.44%
[7] Train Loss: 0.2352	Test Loss: 0.2271	Accuracy: 93.52%
[8] Train Loss: 0.2302	Test Loss: 0.2242	Accuracy: 93.64%
[9] Train Loss: 0.2253	Test Loss: 0.2208	Accuracy: 93.84%
[10] Train Loss: 0.2206	Test Loss: 0.2145	Accuracy: 93.87%
[11] Train Loss: 0.2163	Test Loss: 0.2102	Accuracy: 93.94%
[12] Train Loss: 0.2116	Test Loss: 0.2081	Accuracy: 94.05%
[13] Train Loss: 0.2078	Test Loss: 0.2031	Accuracy: 94.13%
[14] Train Loss: 0.2044	Test Loss: 0.1997	Accuracy: 94.27%
[15] Train Loss: 0.2005	Test Loss: 0.1961	Accuracy: 94.28%
[16] Train Loss: 0.1973	Test Loss: 0.1937	Accuracy: 94.41%
[17] Train Loss: 0.1927	Test Loss: 0.1919	Accuracy: 94.53%
[18] T

**무엇을 보완하였고, 왜 보완되었는지에 대한 자유 서술 (아래에)**

과제 3번 모델의 출력 결과를 살펴보자. 훈련세트의 손실함수 값은 작은 값으로 계속 줄어들고 있다. 그러나 테스트세트의 손실함수 값은 일정 구간부터 증가와 감소를 반복하고 있다. 모델의 정확도 역시 13번째 에포크 이후 유의미한 증가추세를 보이고 있지 않다. 이는 모델이 훈련세트에만 과적합되었음을 의미한다.

이를 해결하기 위해 다음과 같이 보완하였다.   
1) learning rate = 0.05  
학습률이 너무 크면 손실함수 값이 오히려 더 커지는 오버슈팅 문제가 발생한다. 조금씩 가중치를 업데이트해가면서 이를 방지하도록 하겠다.    
2) epoch = 50: 에포크 횟수를 줄여 과적합 문제를 완화해보겠다.   
3) layer 개수 : 2  
레이어 개수가 많아질수록 모델이 더 정교해진다는 장점이 있지만, 과적합되기도 쉬워진다.    
레이어 개수를 2개로 줄여 일반화 성능을 높여보겠다. 대신 첫 레이어 노드 수는 128개를 그대로 유지하겠다.
