In [1]:
import numpy as np

def softmax(a):
    """Return the softmax of ``a``, normalized over all of its elements.

    Args:
        a: Array-like of scores.

    Returns:
        numpy.ndarray of the same shape as ``a`` whose entries sum to 1.
    """
    a = np.asarray(a)
    # Shifting by the maximum leaves the result mathematically unchanged but
    # keeps np.exp from overflowing to inf (inf / inf -> nan) for large scores.
    exp_a = np.exp(a - np.max(a))
    return exp_a / np.sum(exp_a)

In [3]:
# Sanity check: the softmax outputs of a small score vector should add up to 1.
a = np.array([1, 2, 3, 4])
probs = softmax(a)
print(sum(probs))


1.0


In [7]:
# Subtracting the global maximum shifts every entry so that the largest is 0.
a = np.array([[1, 2, 3, 4],
              [5, 6, 7, 8]])
a_max = a.max()
print(a_max)
print(a - a_max)

8
[[-7 -6 -5 -4]
 [-3 -2 -1  0]]


In [12]:
def softmax(a):
    """Compute a numerically stable softmax over the last axis of ``a``.

    Args:
        a: Array-like of scores with at least one dimension, e.g. ``(n,)`` for
            a single sample or ``(b, n)`` for a batch of ``b`` samples.

    Returns:
        numpy.ndarray with the same shape as ``a``; every slice along the last
        axis sums to 1.
    """
    a = np.asarray(a)
    # Shift by the per-sample maximum so np.exp never overflows.
    # keepdims=True keeps the reduced axis (e.g. (b, 1)) so the result
    # broadcasts against ``a`` for any number of leading dimensions.
    shifted = a - a.max(axis=-1, keepdims=True)
    exp_a = np.exp(shifted)
    return exp_a / exp_a.sum(axis=-1, keepdims=True)

# One 1-D score vector and one (3, 4) batch of score vectors.
a_1 = np.array([1, 2, 3, 4])
a_2 = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 13]])

# Print each case under its own heading.
for heading, scores in (("ndim: 1", a_1), ("\nndim:2", a_2)):
    print(heading)
    print(softmax(scores))

ndim: 1
[0.0320586  0.08714432 0.23688282 0.64391426]

ndim:2
[[0.0320586  0.08714432 0.23688282 0.64391426]
 [0.0320586  0.08714432 0.23688282 0.64391426]
 [0.01521943 0.0413707  0.11245721 0.83095266]]


In [13]:
def softmax_2(a):
    """Softmax over the last axis of ``a``, implemented via transposition.

    Args:
        a: Array-like of scores with at least one dimension, e.g. ``(n,)`` or
            ``(b, n)``.

    Returns:
        numpy.ndarray with the same shape as ``a``; every slice along the last
        axis sums to 1.
    """
    a = np.asarray(a)
    # After .T the original last axis comes first, e.g. (b, n) -> (n, b).
    # Reducing over axis=0 then yields an array of the remaining shape
    # (e.g. (b,)), which broadcasts against the transposed array without
    # needing keepdims. A 1-D input is its own transpose, so no separate
    # branch is required.
    a_t = a.T
    a_t = a_t - a_t.max(axis=0)
    exp_t = np.exp(a_t)
    y_t = exp_t / exp_t.sum(axis=0)
    return y_t.T  # back to the caller's layout, e.g. (n, b) -> (b, n)

# Inputs for the demo: a single score vector and a (3, 4) batch.
a_1 = np.array([1, 2, 3, 4])
a_2 = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 13]])

# Run softmax_2 on both inputs, printing a heading before each result.
demo_cases = [("ndim: 1", a_1), ("\nndim:2", a_2)]
for title, batch in demo_cases:
    print(title)
    print(softmax_2(batch))

ndim: 1
[0.0320586  0.08714432 0.23688282 0.64391426]

ndim:2
[[0.0320586  0.08714432 0.23688282 0.64391426]
 [0.0320586  0.08714432 0.23688282 0.64391426]
 [0.01521943 0.0413707  0.11245721 0.83095266]]


In [17]:
# Column-wise softmax of the transposed batch; the result is printed in the
# transposed layout (it is not transposed back).
k = a_2.T - a_2.T.max(axis=0)
exp_k = np.exp(k)
print(exp_k / exp_k.sum(axis=0))

[[0.0320586  0.0320586  0.01521943]
 [0.08714432 0.08714432 0.0413707 ]
 [0.23688282 0.23688282 0.11245721]
 [0.64391426 0.64391426 0.83095266]]


In [18]:
class Softmax:
    """Softmax layer with a forward pass and its matching backward pass."""

    def __init__(self):
        # Both lists are stored on the instance and start empty; nothing in
        # this class appends to them.
        self.params, self.grads = [], []
        self.out = None  # softmax output cached by forward() for backward()

    def forward(self, a):
        """Apply softmax to ``a``, cache the result, and return it."""
        self.out = softmax(a)
        return self.out

    def backward(self, dout):
        """Propagate the upstream gradient through the softmax.

        Args:
            dout: Gradient of the loss w.r.t. the softmax output (dL/dy),
                with the same shape as the cached output.

        Returns:
            Gradient of the loss w.r.t. the softmax input:
            ``y * (dout - sum(y * dout))`` with the sum taken over the last axis.
        """
        dx = self.out * dout
        # axis=-1 matches axis=1 for batched (b, n) outputs and also works
        # for a single (n,) output.
        sumdx = np.sum(dx, axis=-1, keepdims=True)
        dx -= self.out * sumdx
        return dx

In [None]:
def cross_entropy_error(y, t):
    """Mean cross-entropy between predicted probabilities and targets.

    Args:
        y: numpy.ndarray of predicted probabilities, shape ``(n,)`` for one
            sample or ``(b, n)`` for a batch.
        t: numpy.ndarray of targets, either one-hot (same size as ``y``) or
            class indices (one per sample).

    Returns:
        The cross-entropy averaged over the samples.
    """
    if y.ndim == 1:  # promote a single sample to a batch of one
        y = y.reshape(1, y.size)
        t = t.reshape(1, t.size)

    if t.size == y.size:  # one-hot targets -> class indices
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    # Pick each sample's predicted probability for its true class; the small
    # constant keeps np.log away from log(0).
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

In [None]:
class SoftmaxWithLoss:
    """Softmax followed by cross-entropy loss, with a combined backward pass."""

    def __init__(self):
        # Both lists are stored on the instance and start empty; nothing in
        # this class appends to them.
        self.params, self.grads = [], []
        self.y = None  # softmax output
        self.t = None  # target class indices

    def forward(self, x, t):
        """Return the cross-entropy loss of ``softmax(x)`` against ``t``.

        Args:
            x: Scores passed to ``softmax``.
            t: Targets, either one-hot (same size as the softmax output) or
                class indices.

        Returns:
            The loss computed by ``cross_entropy_error``.
        """
        self.t = t
        self.y = softmax(x)

        if self.t.size == self.y.size:  # one-hot targets -> class indices
            self.t = self.t.argmax(axis=1)

        loss = cross_entropy_error(self.y, self.t)
        return loss

    def backward(self, dout=1):
        """Return the gradient of the loss w.r.t. the scores given to forward().

        Args:
            dout: Upstream gradient multiplier; defaults to 1.

        Returns:
            ``(y - onehot(t)) * dout / batch_size``.
        """
        batch_size = self.t.shape[0]  # number of samples

        dx = self.y.copy()  # copy so the cached output is not modified
        dx[np.arange(batch_size), self.t] -= 1  # subtract 1 at each true-class position
        dx *= dout  # apply the upstream gradient
        dx = dx / batch_size  # average over the samples

        return dx
        

In [21]:
# Predicted probabilities for a batch of two samples over three classes.
y = np.array([[0.1, 0.2, 0.7],
              [0.2, 0.3, 0.5]])

batch_size = len(y)
print(f"batch size: {batch_size}") # 2

# One-hot targets collapsed straight to class indices.
t = np.array([[1, 0, 0],
              [0, 1, 0]]).argmax(axis=1)
print(f"t: {t}") # [0 1]

# Integer-array indexing pairs row i with column t[i], i.e. it selects each
# sample's probability for its true class.
print(f"y[np.arange(batch_size), t]: {y[np.arange(batch_size), t]}") # [0.1 0.3]

batch size: 2
t: [0 1]
y[np.arange(batch_size), t]: [0.1 0.3]


In [3]:
import numpy as np
np.random.seed(1)

T, H = 5, 4
hs = np.random.randn(T, H)
a = np.array([0.8, 0.1, 0.03, 0.05, 0.02])

# Broadcasting would do the same job without repeat, but it would be less
# visible. The backward pass also has to be implemented: the gradient of
# repeat is the sum of the gradients of the copies.
ar = np.repeat(a.reshape(T, 1), H, axis=1)
print(ar)
print(f"ar: {ar.shape}")
print()

# Weight each of the T rows of hs, then add the rows up.
t = hs * ar
c = t.sum(axis=0)
print(f"c: {c.shape}")

[[0.8  0.8  0.8  0.8 ]
 [0.1  0.1  0.1  0.1 ]
 [0.03 0.03 0.03 0.03]
 [0.05 0.05 0.05 0.05]
 [0.02 0.02 0.02 0.02]]
ar: (5, 4)

c: (4,)


In [4]:
import numpy as np
np.random.seed(1)

T, H = 5, 4
hs = np.random.randn(T, H)
a = np.array([0.8, 0.1, 0.03, 0.05, 0.02])

# Broadcasting would do the same job without repeat, but it would be less
# visible. The backward pass also has to be implemented: the gradient of
# repeat is the sum of the gradients of the copies.
ar = a.reshape(T, 1).repeat(H, axis=1)
print(ar)
print(f"ar: {ar.shape}")
print()

# (T,) @ (T, H) -> (H,): a single matmul performs both the weighting and the
# sum over the T rows. np.matmul(hs, a) would raise here because the last
# axis of hs (H) does not match the length of a (T).
c = np.matmul(a, hs)
print(f"c: {c.shape}")

[[0.8  0.8  0.8  0.8 ]
 [0.1  0.1  0.1  0.1 ]
 [0.03 0.03 0.03 0.03]
 [0.05 0.05 0.05 0.05]
 [0.02 0.02 0.02 0.02]]
ar: (5, 4)

c: (4,)


## 24.01.16(화)

In [1]:
def AND(x1, x2):
    """AND gate as a perceptron with weights (0.5, 0.5) and threshold 0.7.

    Returns 0 when the weighted sum is below the threshold, otherwise 1.
    """
    weights = (0.5, 0.5)
    threshold = 0.7
    activation = x1 * weights[0] + x2 * weights[1]
    return 0 if activation < threshold else 1
    
# Print the AND output for every input combination.
for bits in ((1, 1), (1, 0), (0, 1), (0, 0)):
    print(AND(*bits))

1
0
0
0


In [3]:
def OR(x1, x2):
    """OR gate as a perceptron with weights (0.5, 0.5) and threshold 0.5.

    Returns 0 when the weighted sum is below the threshold, otherwise 1.
    """
    w1 = w2 = 0.5
    theta = 0.5
    # Guard clause: only a weighted sum strictly below the threshold gives 0.
    if x1 * w1 + x2 * w2 < theta:
        return 0
    return 1
    
# Print the OR output for every input combination.
for bits in ((1, 1), (1, 0), (0, 1), (0, 0)):
    print(OR(*bits))

1
1
1
0


In [4]:
def NAND(x1, x2):
    """NAND gate as a perceptron with weights (0.5, 0.5) and threshold 0.5.

    Returns 0 when the weighted sum exceeds the threshold, otherwise 1.
    """
    weight_1, weight_2 = 0.5, 0.5
    limit = 0.5
    total = x1 * weight_1 + x2 * weight_2
    return 0 if total > limit else 1
    
# Print the NAND output for every input combination.
for bits in ((1, 1), (1, 0), (0, 1), (0, 0)):
    print(NAND(*bits))

0
1
1
1


In [5]:
def XOR(x1, x2):
    """XOR gate composed from the NAND, OR and AND gates.

    The NAND and OR outputs of the inputs are combined with AND.
    """
    return AND(NAND(x1, x2), OR(x1, x2))

# Print the XOR output for every input combination.
for bits in ((0, 0), (0, 1), (1, 0), (1, 1)):
    print(XOR(*bits))

0
1
1
0


## 24.01.17(수)

In [6]:
import numpy as np

def step_function(x):
    """Step activation for a single number: 1 if ``x`` is positive, else 0."""
    return 1 if x > 0 else 0
    
# Scalar input: 1 is positive, so this evaluates to 1.
step_function(1)

1

In [7]:
def step_function(x):
    """Elementwise step activation: 1 where ``x > 0``, otherwise 0.

    Args:
        x: A number or array-like of numbers.

    Returns:
        The int32 result of the comparison, with the same shape as ``x``.
    """
    # np.asarray lets plain Python numbers and lists through: comparing those
    # directly gives a bool (or raises for a list), and neither has .astype.
    y = np.asarray(x) > 0
    return y.astype(np.int32)

In [9]:
# Array input: the comparison is applied to each element.
step_function(np.array([1,-1,-2]))

array([1, 0, 0], dtype=int32)