In [5]:
# 평범한 tensorflow 활용한 mnist 코드

import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

# MNIST 데이터를 로드. 다운로드하지 않았다면 다운로드까지 자동으로 진행됩니다. 
mnist = keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()   

# 모델에 맞게 데이터 가공
x_train_norm, x_test_norm = x_train / 255.0, x_test / 255.0
x_train_reshaped = x_train_norm.reshape(-1, x_train_norm.shape[1]*x_train_norm.shape[2])
x_test_reshaped = x_test_norm.reshape(-1, x_test_norm.shape[1]*x_test_norm.shape[2])

# 딥러닝 모델 구성 - 2 Layer Perceptron
model=keras.models.Sequential()
model.add(keras.layers.Dense(50, activation='sigmoid', input_shape=(784,)))  # 입력층 d=784, 은닉층 레이어 H=50
model.add(keras.layers.Dense(10, activation='softmax'))   # 출력층 레이어 K=10
model.summary()

# 모델 구성과 학습
model.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])
model.fit(x_train_reshaped, y_train, epochs=10)

# 모델 테스트 결과
test_loss, test_accuracy = model.evaluate(x_test_reshaped,y_test, verbose=2)
print("test_loss: {} ".format(test_loss))
print("test_accuracy: {}".format(test_accuracy))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                39250     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                510       
Total params: 39,760
Trainable params: 39,760
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
313/313 - 1s - loss: 0.1060 - accuracy: 0.9684
test_loss: 0.10600697249174118 
test_accuracy: 0.9684000015258789


In [6]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                39250     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                510       
Total params: 39,760
Trainable params: 39,760
Non-trainable params: 0
_________________________________________________________________


2개 이상의 레이어를 쌓아 만든 것을 다층 퍼셉트론 (Multio-Layer Perceptron; MLP) 더 깊어질 수록 deep 하다   
보통 이것을 DNN 이라고 부른다   
Fully-Connected Neural Network - 서로 다른 층에 위치한 노드와는 연결 관계가 없고, 인접한 층의 노드들만 연결   

In [3]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

# MNIST 데이터를 로드. 다운로드하지 않았다면 다운로드까지 자동으로 진행됩니다. 
mnist = keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()   

# 모델에 맞게 데이터 가공
x_train_norm, x_test_norm = x_train / 255.0, x_test / 255.0
x_train_reshaped = x_train_norm.reshape(-1, x_train_norm.shape[1]*x_train_norm.shape[2])
x_test_reshaped = x_test_norm.reshape(-1, x_test_norm.shape[1]*x_test_norm.shape[2])

2021-11-20 15:30:06.497623: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [4]:
# 입력층 데이터의 모양(shape)
print(x_train_reshaped.shape)

# 테스트를 위해 x_train_reshaped의 앞 5개의 데이터를 가져온다.
X=x_train_reshaped[:5]
print(X.shape)

(60000, 784)
(5, 784)


In [5]:
WEIGT_INIT_STD = 0.1
INPUT_SIZE = 784
HIDDEN_SIZE = 50

# 인접 레이어간 관계를 나타내는 파라미터 W를 생성하고 random 초기화
# np.random.randn(m, n) : 평균 0, 표준편차 1의 가우시안 표준정규분포 난수를
# matrix array(m, n) 생성
W1 = WEIGT_INIT_STD * np.random.randn(INPUT_SIZE, HIDDEN_SIZE)  
# bias 파라미터 b를 생성하고 Zero로 초기화
b1 = np.zeros(HIDDEN_SIZE)

a1 = np.dot(X, W1) + b1   # 은닉층 출력

print(W1.shape)
print(b1.shape)
print(a1.shape)

(784, 50)
(50,)
(5, 50)


In [6]:
# 첫 번째 데이터의 은닉층 출력을 확인해보자. 50 dim
a1[0]

array([ 4.63315340e-01,  4.12567543e-01, -3.04813543e-01,  1.36268976e+00,
        3.22801035e-01, -5.44425574e-01, -4.90577199e-01, -3.26182010e-01,
        1.66256334e-01,  2.62710612e-02,  9.19342698e-01,  9.59103027e-01,
       -2.50801740e-01,  1.22884698e+00, -6.32247972e-01, -6.08466103e-01,
        6.64044197e-01,  1.73433455e-01,  9.51240857e-01,  8.36139134e-02,
        7.17880412e-01, -2.01048774e+00, -4.88704375e-01,  7.54879846e-04,
        7.77322713e-01,  4.66057293e-01, -3.38112922e+00,  1.68680236e+00,
        7.66138233e-01,  7.77960435e-01, -8.74642537e-01,  1.38583591e+00,
        1.93723113e-01, -3.91851803e-01,  1.93209934e-01,  2.12234064e-01,
       -1.14647865e+00,  3.33189651e-01, -7.28379379e-01,  1.32925949e+00,
       -1.56580560e-02,  6.39346601e-01,  1.14255703e+00, -4.74488630e-01,
       -1.37163974e-01,  3.63165311e-01,  1.27462102e+00, -9.52524069e-01,
       -3.28961356e-02, -1.75095378e-01])

## 활성화 함수 (Activation Functions)
nonlinear한 출력으로 만들어줄 수 있다.

### Sigmoid

In [7]:
def sigmoid(x):
    return 1 / (1+np.exp(-x))

z1 = sigmoid(a1)
print(z1[0])

[0.61380037 0.60170337 0.4243812  0.79619651 0.58000673 0.36715868
 0.3797576  0.41916989 0.54146861 0.50656739 0.71490816 0.72294218
 0.43762617 0.7736167  0.34700099 0.35240918 0.66016828 0.54325001
 0.72136466 0.52089131 0.6721401  0.11810617 0.38019883 0.50018872
 0.68510281 0.61445015 0.03289046 0.84380318 0.68268492 0.68524037
 0.29428921 0.79992664 0.54827988 0.4032716  0.54815278 0.55286025
 0.24113286 0.58253526 0.32555046 0.79071812 0.49608557 0.65460574
 0.7581488  0.3835544  0.46576267 0.58980645 0.78153276 0.27837749
 0.49177671 0.45633765]


https://reniew.github.io/12/
https://pozalabs.github.io/Activation_Function/

1. 하이퍼볼릭 탄젠트 (tanh)
2. ReLU

In [8]:
#단일 레이어 구현 함수
def affine_layer_forward(X, W, b):
    y = np.dot(X, W) + b
    cache = (X, W, b)
    return y, cache

In [9]:
INPUT_SIZE = 784
HIDDEN_SIZE = 50
OUTPUT_SIZE = 10

W1 = WEIGT_INIT_STD * np.random.randn(INPUT_SIZE, HIDDEN_SIZE)
b1 = np.zeros(HIDDEN_SIZE)
W2 = WEIGT_INIT_STD * np.random.randn(HIDDEN_SIZE, OUTPUT_SIZE)
b2 = np.zeros(OUTPUT_SIZE)

a1, cache1 = affine_layer_forward(X, W1, b1)
z1 = sigmoid(a1)
a2, cache2 = affine_layer_forward(z1, W2, b2)    # z1이 다시 두번째 레이어의 입력이 됩니다. 

print(a2[0])  # 최종 출력이 OUTPUT_SIZE만큼의 벡터가 되었습니다.

[-0.21283341 -0.32247143  0.25920851 -0.17063848 -0.6178833   0.67250877
 -0.10709499  0.19553683  0.64657685  0.37855783]


In [10]:
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T 

    x = x - np.max(x) # 오버플로 대책
    return np.exp(x) / np.sum(np.exp(x))

In [11]:
y_hat = softmax(a2)
y_hat[0]  # 10개의 숫자 중 하나일 확률이 되었습니다.

array([0.06936044, 0.06215794, 0.11120314, 0.07234972, 0.0462595 ,
       0.16811678, 0.07709628, 0.10434335, 0.16381323, 0.12529963])

## 손실함수
https://towardsdatascience.com/understanding-different-loss-functions-for-neural-networks-dd1ed0274718   
Loss / Cost function

In [12]:
# 정답 - 원 핫 인코딩
# 정답 라벨을 One-hot 인코딩하는 함수
def _change_one_hot_label(X, num_category):
    T = np.zeros((X.size, num_category))
    for idx, row in enumerate(T):
        row[X[idx]] = 1
        
    return T

Y_digit = y_train[:5]
t = _change_one_hot_label(Y_digit, 10)
t     # 정답 라벨의 One-hot 인코딩

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [13]:
print(y_hat[0])
print(t[0])
# 아직은 정답과는 멀다. 대부분 10%

[0.06936044 0.06215794 0.11120314 0.07234972 0.0462595  0.16811678
 0.07709628 0.10434335 0.16381323 0.12529963]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


In [14]:
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    # 훈련 데이터가 원-핫 벡터라면 정답 레이블의 인덱스로 반환
    if t.size == y.size:
        t = t.argmax(axis=1)
             
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t])) / batch_size

Loss = cross_entropy_error(y_hat, t)
print(Loss)
# 이제 이 오차를 줄여보자

2.510560713162438


## 경사하강법
Gradient Descent   

https://aileen93.tistory.com/71   
https://reniew.github.io/13/

In [15]:
batch_num = y_hat.shape[0]
dy = (y_hat - t) / batch_num
dy    # softmax값의 출력으로 Loss를 미분한 값

array([[ 0.01387209,  0.01243159,  0.02224063,  0.01446994,  0.0092519 ,
        -0.16637664,  0.01541926,  0.02086867,  0.03276265,  0.02505993],
       [-0.1839906 ,  0.01433841,  0.02333718,  0.01413361,  0.00992812,
         0.03940503,  0.01583154,  0.01850919,  0.03088004,  0.01762748],
       [ 0.01509773,  0.01301962,  0.02357189,  0.01569102, -0.19018926,
         0.03591147,  0.01345941,  0.01888795,  0.03198581,  0.02256437],
       [ 0.01423034, -0.18919411,  0.02218519,  0.01512545,  0.01024672,
         0.03258061,  0.0157972 ,  0.0199486 ,  0.03718204,  0.02189795],
       [ 0.01488176,  0.0142834 ,  0.0220065 ,  0.01622971,  0.0109531 ,
         0.03471935,  0.01856496,  0.01808552,  0.0304532 , -0.18017751]])

In [16]:
batch_num = y_hat.shape[0]
dy = (y_hat -t) / batch_num
dy # softmax값의 출력으로 Loss를 미분한 값

array([[ 0.01387209,  0.01243159,  0.02224063,  0.01446994,  0.0092519 ,
        -0.16637664,  0.01541926,  0.02086867,  0.03276265,  0.02505993],
       [-0.1839906 ,  0.01433841,  0.02333718,  0.01413361,  0.00992812,
         0.03940503,  0.01583154,  0.01850919,  0.03088004,  0.01762748],
       [ 0.01509773,  0.01301962,  0.02357189,  0.01569102, -0.19018926,
         0.03591147,  0.01345941,  0.01888795,  0.03198581,  0.02256437],
       [ 0.01423034, -0.18919411,  0.02218519,  0.01512545,  0.01024672,
         0.03258061,  0.0157972 ,  0.0199486 ,  0.03718204,  0.02189795],
       [ 0.01488176,  0.0142834 ,  0.0220065 ,  0.01622971,  0.0109531 ,
         0.03471935,  0.01856496,  0.01808552,  0.0304532 , -0.18017751]])

In [17]:
dW2 = np.dot(z1.T, dy)    
dW2

array([[-0.01022122, -0.0973856 ,  0.0422367 ,  0.02889826, -0.04104455,
         0.02272346,  0.03048704,  0.03602699,  0.06234168, -0.07406277],
       [-0.09766386, -0.10080256,  0.07604881,  0.05064675, -0.08220126,
        -0.01585542,  0.053239  ,  0.06470221,  0.10983811, -0.0579518 ],
       [-0.03780634, -0.10747497,  0.07298723,  0.0488542 , -0.1428833 ,
        -0.0183457 ,  0.04995405,  0.06224179,  0.10595596, -0.03348291],
       [-0.05166253, -0.07385317,  0.04441939,  0.02991886, -0.06171192,
         0.03915615,  0.03133707,  0.03727777,  0.06415531, -0.05903693],
       [-0.07047946, -0.08387616,  0.06262421,  0.04183992, -0.10277732,
         0.01605836,  0.04338249,  0.05285661,  0.09014291, -0.04977156],
       [-0.04716135, -0.10171745,  0.06399824,  0.04258134, -0.11930311,
        -0.02088364,  0.04353996,  0.05467023,  0.0931125 , -0.00883671],
       [-0.11629521, -0.09634314,  0.0721907 ,  0.04801194, -0.05038333,
        -0.00695382,  0.05107695,  0.06131665

In [18]:
dW2 = np.dot(z1.T, dy)
db2 = np.sum(dy, axis=0)

In [19]:
def sigmoid_grad(x):
    return (1.0 - sigmoid(x)) * sigmoid(x)

In [20]:
dz1 = np.dot(dy, W2.T)
da1 = sigmoid_grad(a1) * dz1
dW1 = np.dot(X.T, da1)
db1 = np.sum(dz1, axis=0)

In [21]:
learning_rate = 0.1

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    W1 = W1 - learning_rate*dW1
    b1 = b1 - learning_rate*db1
    W2 = W2 - learning_rate*dW2
    b2 = b2 - learning_rate*db2
    return W1, b1, W2, b2

In [22]:
def affine_layer_backward(dy, cache):
    X, W, b = cache
    dX = np.dot(dy, W.T)
    dW = np.dot(X.T, dy)
    db = np.sum(dy, axis=0)
    return dX, dW, db

In [25]:
# 파라미터 초기화
W1 = WEIGT_INIT_STD * np.random.randn(INPUT_SIZE, HIDDEN_SIZE)
b1 = np.zeros(HIDDEN_SIZE)
W2 = WEIGT_INIT_STD * np.random.randn(HIDDEN_SIZE, OUTPUT_SIZE)
b2 = np.zeros(OUTPUT_SIZE)

# Forward Propagation
a1, cache1 = affine_layer_forward(X, W1, b1)
z1 = sigmoid(a1)
a2, cache2 = affine_layer_forward(z1, W2, b2)

# 추론과 오차(Loss) 계산
y_hat = softmax(a2)
t = _change_one_hot_label(Y_digit, 10)   # 정답 One-hot 인코딩
Loss = cross_entropy_error(y_hat, t)

print(y_hat)
print(t)
print('Loss: ', Loss)
        
dy = (y_hat - t) / X.shape[0]
dz1, dW2, db2 = affine_layer_backward(dy, cache2)
da1 = sigmoid_grad(a1) * dz1
dX, dW1, db1 = affine_layer_backward(da1, cache1)

# 경사하강법을 통한 파라미터 업데이트    
learning_rate = 0.1
W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)

[[0.10948105 0.1098637  0.10721648 0.0765648  0.1284041  0.19490329
  0.04847601 0.10367837 0.05369728 0.06771493]
 [0.09775628 0.11266899 0.10090375 0.06121997 0.13763614 0.21295546
  0.04875051 0.09919768 0.06398848 0.06492273]
 [0.12563775 0.11649893 0.10457832 0.07721386 0.11725622 0.17420178
  0.0577359  0.10297777 0.0545546  0.06934486]
 [0.11961497 0.10103441 0.08865831 0.07807109 0.12503143 0.21391614
  0.05888162 0.09579128 0.04977706 0.06922369]
 [0.11009738 0.096251   0.12540097 0.08209934 0.14224166 0.17196689
  0.04966646 0.0994686  0.05284783 0.06995987]]
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
Loss:  2.211210200285573


In [26]:
W1 = WEIGT_INIT_STD * np.random.randn(INPUT_SIZE, HIDDEN_SIZE)
b1 = np.zeros(HIDDEN_SIZE)
W2 = WEIGT_INIT_STD * np.random.randn(HIDDEN_SIZE, OUTPUT_SIZE)
b2 = np.zeros(OUTPUT_SIZE)

def train_step(X, Y, W1, b1, W2, b2, learning_rate=0.1, verbose=False):
    a1, cache1 = affine_layer_forward(X, W1, b1)
    z1 = sigmoid(a1)
    a2, cache2 = affine_layer_forward(z1, W2, b2)
    y_hat = softmax(a2)
    t = _change_one_hot_label(Y, 10)
    Loss = cross_entropy_error(y_hat, t)

    if verbose:
        print('---------')
        print(y_hat)
        print(t)
        print('Loss: ', Loss)
        
    dy = (y_hat - t) / X.shape[0]
    dz1, dW2, db2 = affine_layer_backward(dy, cache2)
    da1 = sigmoid_grad(a1) * dz1
    dX, dW1, db1 = affine_layer_backward(da1, cache1)
    
    W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)
    
    return W1, b1, W2, b2, Loss

In [27]:
X = x_train_reshaped[:5]
Y = y_train[:5]

# train_step을 다섯 번 반복 돌립니다.
for i in range(5):
    W1, b1, W2, b2, _ = train_step(X, Y, W1, b1, W2, b2, learning_rate=0.1, verbose=True)

---------
[[0.05329784 0.18300357 0.07334883 0.07266068 0.11142243 0.04807258
  0.11299143 0.12410736 0.11765101 0.10344427]
 [0.04704412 0.19584562 0.05930816 0.06631135 0.1179936  0.03477827
  0.12305053 0.14118924 0.12020245 0.09427666]
 [0.0545041  0.17387154 0.0803206  0.06931697 0.12576656 0.05502122
  0.11463467 0.12795124 0.11236331 0.0862498 ]
 [0.05728375 0.20350763 0.07331705 0.06420148 0.10141083 0.03603776
  0.13149275 0.12989703 0.11254147 0.09031025]
 [0.05723744 0.17528686 0.05191575 0.07001755 0.11793383 0.03656092
  0.12156407 0.15103921 0.12184859 0.09659577]]
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
Loss:  2.418862535455883
---------
[[0.0688426  0.18406041 0.0675754  0.06675389 0.12570096 0.06376175
  0.09662957 0.10362442 0.10096901 0.12208201]
 [0.06539252 0.19913323 0.05440873 0.06032941 0.13588917 0.04616033
  0.10441974 0.11637903 0.1022

In [28]:
def predict(W1, b1, W2, b2, X):
    a1 = np.dot(X, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    y = softmax(a2)

    return y

In [29]:
# X = x_train[:100] 에 대해 모델 추론을 시도합니다. 
X = x_train_reshaped[:100]
Y = y_test[:100]
result = predict(W1, b1, W2, b2, X)
result[0]

array([0.12667082, 0.16385654, 0.04583853, 0.04493287, 0.1472176 ,
       0.13722601, 0.05640605, 0.05782246, 0.05952545, 0.16050365])

In [30]:
def accuracy(W1, b1, W2, b2, x, y):
    y_hat = predict(W1, b1, W2, b2, x)
    y_hat = np.argmax(y_hat, axis=1)

    accuracy = np.sum(y_hat == y) / float(x.shape[0])
    return accuracy

In [31]:
acc = accuracy(W1, b1, W2, b2, X, Y)

t = _change_one_hot_label(Y, 10)
print(result[0])
print(t[0])
print(acc)

[0.12667082 0.16385654 0.04583853 0.04493287 0.1472176  0.13722601
 0.05640605 0.05782246 0.05952545 0.16050365]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
0.14


In [32]:
# 하이퍼파라미터
iters_num = 50000  # 반복 횟수를 적절히 설정한다.
train_size = x_train.shape[0]
batch_size = 100   # 미니배치 크기
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# 1에폭당 반복 수
iter_per_epoch = max(train_size / batch_size, 1)

W1, b1, W2, b2 = init_params(784, 50, 10)

for i in range(iters_num):
    # 미니배치 획득
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train_reshaped[batch_mask]
    y_batch = y_train[batch_mask]
    
    W1, b1, W2, b2, Loss = train_step(x_batch, y_batch, W1, b1, W2, b2, learning_rate=0.1, verbose=False)

    # 학습 경과 기록
    train_loss_list.append(Loss)
    
    # 1에폭당 정확도 계산
    if i % iter_per_epoch == 0:
        print('Loss: ', Loss)
        train_acc = accuracy(W1, b1, W2, b2, x_train_reshaped, y_train)
        test_acc = accuracy(W1, b1, W2, b2, x_test_reshaped, y_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))

NameError: name 'init_params' is not defined