# 과제 1

### L2 regularization

- compute_cost 함수에 regularization을 위한 부분 추가
- gradient 계산에 변경된 부분 추가

### Regularization을 위한 가중치 lambda 튜닝

- lambda가 0일 때 점수가 0.9점으로 가장 높았고 lambda값을 키울수록 점수가 낮아지는 것으로 보아 이 모델은 overfitting이 되어있지 않았다고 생각할 수 있다.

In [124]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

In [125]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [126]:
X, y = mnist["data"], mnist["target"]

In [127]:
y = y.astype(np.uint8)

In [128]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [129]:
enc.fit(y[:,np.newaxis])

  enc.fit(y[:,np.newaxis])


OneHotEncoder()

In [130]:
Y = enc.transform(y[:,np.newaxis]).toarray()

  Y = enc.transform(y[:,np.newaxis]).toarray()


In [131]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

In [132]:
X_train = X_train / 255
X_test = X_test / 255

In [133]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [134]:
def softmax(X, W):
    K = np.size(W, 1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1)))
    Y = B @ A
    return Y

In [135]:
def compute_cost(X, T, W, lamb):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1)
    cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1)) + (lamb * 0.5 * np.linalg.norm(W) ** 2)
    return cost

# + (0.5) * (W.T @ W)

In [136]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

In [146]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]
    
    lamb = 0.5

    for i in range(iterations):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        # w 뒷 부분이 gradient 계산 부분
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch) + lamb * W)
        cost_history[i] = compute_cost(X_batch, T_batch, W, lamb)
        if i % 1000 == 0:
            print(cost_history[i][0])

    return (cost_history, W)

In [138]:
# X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
# T = y_train

# K = np.size(T, 1)
# M = np.size(X, 1)
# W = np.zeros((M,K))

# iterations = 50000
# learning_rate = 0.01

# initial_cost = compute_cost(X, T, W)
# print(initial_cost)

#print("Initial Cost is: {} \n".format(initial_cost[0][0]))

#(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64)

In [147]:
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01

initial_cost = compute_cost(X, T, W, 0.5)

print("Initial Cost is: {} \n".format(initial_cost[0][0]))

(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64)

Initial Cost is: 2.3024850979937166 

2.2827187120330423
3.3800105618636906
4.9959607494039835
6.043634767127019
6.7458471273084974
7.169421370643374
7.674860354557423
7.823776966884948
8.138082058237092
8.317466965461742
8.53864958448371
8.63489740090746
8.68638432324448
8.745857123470007
8.785015462244697
8.979236925894398
9.062519305990566
8.902882513958327
9.069262256087905
8.996625655502104
8.97630454559014
9.068843409178278
9.060354789284876
9.098343044773472
9.112138734963114
8.958269988244723
9.209524035856916
9.283111214158616
9.306469027503832
9.268199373661458
9.387220049240932
9.330178738991481
9.316643172946698
9.175694783295343
9.166058112529667
9.163391785282062
9.255940957532317
9.407530356483022
9.328625478527062
9.376926026522522
9.33462851573931
9.423758549257066
9.147655954492981
9.224748057358166
9.31815070969632
9.303611934385728
9.274414363199604
9.170391505578818
9.471512421588274
9.280225933281107


In [148]:
## Accuracy
X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
T_ = y_test
y_pred = predict(X_, W_optimal)
score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_test))

print(score)

0.8983
