# NumPy로 신경망 구현하기

이 노트북에서는 NumPy만으로 간단한 신경망을 구현한다.
Forward pass, Backward pass (backpropagation), Gradient Descent를 직접 구현해본다.

## 학습 목표
- 행렬 연산으로 신경망 forward pass 구현
- Chain rule을 적용한 backward pass 구현
- Gradient descent로 학습 수행

In [None]:
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

## 1. 활성화 함수 구현

In [None]:
def sigmoid(x):
    """Apply the logistic sigmoid 1 / (1 + exp(-x)) element-wise.

    The input is clipped to [-500, 500] before exponentiation so that
    ``np.exp`` does not overflow for large-magnitude values.
    """
    bounded = np.clip(x, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def sigmoid_derivative(x):
    """Return the derivative of the sigmoid evaluated at ``x``.

    Uses the identity sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)).
    """
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``."""
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Return the ReLU derivative: 1.0 where ``x > 0``, otherwise 0.0.

    The derivative at exactly 0 is taken to be 0.

    Args:
        x: Array, nested sequence, or scalar of pre-activation values.

    Returns:
        Float array (or NumPy float scalar for scalar input) with the
        same shape as ``x``.
    """
    # np.greater accepts Python scalars and lists as well as arrays, whereas
    # a plain ``x > 0`` on a Python float yields a bool without ``.astype``.
    return np.greater(x, 0).astype(float)

def softmax(x):
    """Compute the softmax over the last axis.

    The per-row maximum is subtracted before exponentiation; this leaves
    the result unchanged mathematically but keeps ``np.exp`` from
    overflowing on large logits.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    weights = np.exp(shifted)
    normalizer = np.sum(weights, axis=-1, keepdims=True)
    return weights / normalizer

## 2. Loss 함수 구현

In [None]:
def mse_loss(y_pred, y_true):
    """Mean squared error, averaged over every element."""
    residual = y_pred - y_true
    return np.mean(residual * residual)

def mse_loss_derivative(y_pred, y_true):
    """Gradient of the mean squared error with respect to ``y_pred``.

    Divides by ``y_true.size`` because the loss averages over every element.
    """
    doubled_residual = 2 * (y_pred - y_true)
    return doubled_residual / y_true.size

def cross_entropy_loss(y_pred, y_true):
    """Cross-entropy between predicted probabilities and target distributions.

    ``y_pred`` is expected to already be softmax output. The loss is summed
    over the last (class) axis and averaged over the remaining axes.
    """
    eps = 1e-10  # keeps log() finite when a predicted probability is 0
    log_probs = np.log(y_pred + eps)
    per_sample = np.sum(y_true * log_probs, axis=-1)
    return -np.mean(per_sample)

def softmax_cross_entropy_derivative(y_pred, y_true):
    """Gradient of softmax + cross-entropy with respect to the logits.

    For the combined operation the gradient simplifies to
    ``(y_pred - y_true) / n_samples``, where ``n_samples`` is the number of
    class distributions the loss is averaged over.

    Args:
        y_pred: Softmax probabilities with classes on the last axis.
        y_true: Target distributions (e.g. one-hot), same shape as ``y_pred``.

    Returns:
        Array with the same shape as ``y_pred``.
    """
    y_true = np.asarray(y_true)
    # cross_entropy_loss sums over the last axis and averages over all the
    # others, so the divisor is the product of the leading dimensions: 1 for
    # a single 1-D sample, shape[0] for the usual (batch, classes) case.
    n_samples = y_true.size // y_true.shape[-1]
    return (y_pred - y_true) / n_samples

## 3. 신경망 레이어 클래스

In [None]:
class Linear:
    """Fully connected (affine) layer computing ``y = x @ W + b``.

    Inputs are treated as row vectors: ``x`` has shape
    ``(..., in_features)`` and the output has shape ``(..., out_features)``.
    """

    def __init__(self, in_features, out_features):
        # He (Kaiming) initialization: standard deviation sqrt(2 / fan_in).
        self.W = np.random.randn(in_features, out_features) * np.sqrt(2.0 / in_features)
        self.b = np.zeros(out_features)

        # Gradients, filled in by backward().
        self.dW = None
        self.db = None

        # Input cached by forward() for use in backward().
        self.x = None

    def forward(self, x):
        """Compute ``x @ W + b`` and cache ``x`` for the backward pass."""
        self.x = x
        return x @ self.W + self.b

    def backward(self, dout):
        """Compute parameter gradients and propagate the gradient to the input.

        Args:
            dout: Gradient of the loss with respect to this layer's output
                (dL/dy), same shape as the output of ``forward``.

        Returns:
            Gradient of the loss with respect to the input (dL/dx), same
            shape as the cached input.

        Raises:
            RuntimeError: If ``forward`` has not been called yet.
        """
        if self.x is None:
            raise RuntimeError("Linear.backward() called before forward()")

        in_features, out_features = self.W.shape
        # Collapse any leading axes into one batch axis so a single 1-D
        # sample and N-D batches are handled like the usual 2-D case.
        x_rows = np.reshape(self.x, (-1, in_features))
        dout_rows = np.reshape(dout, (-1, out_features))

        # dL/dW = x^T @ dL/dy
        self.dW = x_rows.T @ dout_rows

        # dL/db = sum of dL/dy over the batch
        self.db = np.sum(dout_rows, axis=0)

        # dL/dx = dL/dy @ W^T
        return dout @ self.W.T

    def update(self, lr):
        """Apply one in-place gradient-descent step with learning rate ``lr``.

        Raises:
            RuntimeError: If ``backward`` has not been called yet.
        """
        if self.dW is None or self.db is None:
            raise RuntimeError("Linear.update() called before backward()")
        self.W -= lr * self.dW
        self.b -= lr * self.db

In [None]:
class ReLU:
    """ReLU activation layer."""

    def __init__(self):
        # Input cached by forward(); backward() needs it to build the mask.
        self.x = None

    def forward(self, x):
        """Cache the input and return ``relu(x)``."""
        self.x = x
        activated = relu(x)
        return activated

    def backward(self, dout):
        """Scale the upstream gradient by the ReLU derivative at the cached input."""
        gate = relu_derivative(self.x)
        return dout * gate

    def update(self, lr):
        """Do nothing: this layer has no trainable parameters."""
        return None

In [None]:
class Sigmoid:
    """Sigmoid activation layer."""

    def __init__(self):
        # Output cached by forward(); the derivative is expressed through it.
        self.out = None

    def forward(self, x):
        """Compute ``sigmoid(x)``, cache it, and return it."""
        activated = sigmoid(x)
        self.out = activated
        return activated

    def backward(self, dout):
        """Multiply the upstream gradient by ``out * (1 - out)``."""
        scaled = dout * self.out
        return scaled * (1 - self.out)

    def update(self, lr):
        """Do nothing: this layer has no trainable parameters."""
        return None

## 4. 신경망 클래스

In [None]:
class NeuralNetwork:
    """Multi-layer perceptron built from a sequence of layer objects.

    Each layer must provide ``forward(x)``, ``backward(dout)`` and
    ``update(lr)``.
    """

    def __init__(self, layers):
        """
        Args:
            layers: List of layer objects, in forward order.
        """
        self.layers = layers

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        activation = x
        for layer in self.layers:
            activation = layer.forward(activation)
        return activation

    def backward(self, dout):
        """Propagate ``dout`` through the layers from last to first.

        Returns the gradient with respect to the network input.
        """
        grad = dout
        for position in range(len(self.layers) - 1, -1, -1):
            grad = self.layers[position].backward(grad)
        return grad

    def update(self, lr):
        """Ask every layer to update its parameters with learning rate ``lr``."""
        for layer in self.layers:
            layer.update(lr)

    def train_step(self, x, y, lr=0.01):
        """Run one forward / backward / update cycle using the MSE loss.

        Returns the loss measured before the parameter update.
        """
        prediction = self.forward(x)
        loss_value = mse_loss(prediction, y)

        # Seed backpropagation with dL/dy_pred.
        self.backward(mse_loss_derivative(prediction, y))
        self.update(lr)

        return loss_value

## 5. 실습: XOR 문제 풀기

XOR은 선형으로 분리할 수 없는 문제로, 은닉층이 있는 신경망이 필요한 대표적인 예시다.

In [None]:
# XOR dataset: every combination of two binary inputs and its XOR label
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

print("XOR 데이터:")
for inputs, target in zip(X, y):
    print(f"  {inputs} -> {target[0]}")

In [None]:
# Hyperparameters
lr = 0.5
epochs = 1000

# Build the network: 2 inputs -> 8 hidden units -> 1 output
model = NeuralNetwork([
    Linear(2, 8),    # input 2 -> hidden 8
    ReLU(),          # non-linearity
    Linear(8, 1),    # hidden 8 -> output 1
    Sigmoid(),       # squash the output into (0, 1)
])

# Full-batch training; the loss of every epoch is kept for plotting
losses = []
for epoch in range(epochs):
    loss = model.train_step(X, y, lr=lr)
    losses.append(loss)

    should_log = epoch % 200 == 0
    if should_log:
        print(f"Epoch {epoch:4d} | Loss: {loss:.6f}")

In [None]:
# Plot the training loss on a logarithmic y-axis
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('XOR 학습 곡선')
ax.set_yscale('log')
ax.grid(True)
plt.show()

In [None]:
# Show the trained network's output for every XOR input
print("학습 후 예측:")
predictions = model.forward(X)
for inputs, target, output in zip(X, y, predictions):
    pred = output[0]
    print(f"  {inputs} -> {pred:.4f} (정답: {target[0]}, 반올림: {round(pred)})")

## 6. 실습: MNIST 손글씨 분류 (간단 버전)

실제 MNIST 대신, 4×4 크기의 합성 패턴(0 vs 1)으로 이미지 분류 문제를 연습해보자.

In [None]:
# 간단한 MNIST 대체 데이터 생성
# (실제로는 sklearn이나 tensorflow에서 로드)

def generate_simple_digit_data(n_samples=1000):
    """Generate a toy two-class "digit" dataset of noisy 4x4 patterns.

    Label 0 is a ring-shaped pattern ("0") and label 1 is a vertical bar
    ("1"). Each sample is the flattened 4x4 template plus Gaussian noise
    (standard deviation 0.1), clipped to [0, 1].

    Args:
        n_samples: Number of samples to draw. Zero or a negative value
            yields empty arrays.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a float array of shape
        ``(n_samples, 16)`` and ``y`` is an integer array of shape
        ``(n_samples,)`` with values in {0, 1}.
    """
    # Row-major 4x4 templates, built once rather than on every iteration.
    ring = np.array([
        0.1, 0.8, 0.8, 0.1,
        0.8, 0.1, 0.1, 0.8,
        0.8, 0.1, 0.1, 0.8,
        0.1, 0.8, 0.8, 0.1,
    ])
    bar = np.array([
        0.1, 0.8, 0.8, 0.1,
        0.1, 0.8, 0.8, 0.1,
        0.1, 0.8, 0.8, 0.1,
        0.1, 0.8, 0.8, 0.1,
    ])
    templates = (ring, bar)  # indexed by label

    samples = []
    labels = []
    for _ in range(n_samples):
        # One randint followed by one randn(16) per sample: keeping this
        # draw order means a fixed seed reproduces the same dataset.
        label = np.random.randint(0, 2)
        noise = np.random.randn(16) * 0.1
        samples.append(np.clip(templates[label] + noise, 0, 1))
        labels.append(label)

    # Explicit reshape and dtype keep the result well-formed when no samples
    # were drawn: shape (0, 16) and integer labels usable as indices.
    features = np.array(samples, dtype=float).reshape(-1, ring.size)
    return features, np.array(labels, dtype=int)

# Draw the training split first, then the test split, so the global NumPy
# random stream is consumed in the same order on every run
X_train, y_train = generate_simple_digit_data(n_samples=800)
X_test, y_test = generate_simple_digit_data(n_samples=200)

# One-hot encoding
def to_onehot(y, num_classes=2):
    """Convert integer class labels to one-hot rows.

    Row ``k`` of the identity matrix is the one-hot vector for class ``k``,
    so indexing the identity with the label array performs the conversion.
    """
    identity = np.eye(num_classes)
    return identity[y]

# One-hot encode both label vectors and report the resulting array shapes
y_train_oh, y_test_oh = to_onehot(y_train), to_onehot(y_test)

print(f"학습 데이터: {X_train.shape}, {y_train_oh.shape}")
print(f"테스트 데이터: {X_test.shape}, {y_test_oh.shape}")

In [None]:
class SoftmaxCrossEntropy:
    """Combined softmax + cross-entropy loss layer."""

    def __init__(self):
        # Both are cached by forward() and consumed by backward().
        self.y_pred = None
        self.y_true = None

    def forward(self, x, y_true):
        """Turn logits ``x`` into probabilities and return the loss against ``y_true``."""
        probabilities = softmax(x)
        self.y_pred = probabilities
        self.y_true = y_true
        return cross_entropy_loss(probabilities, y_true)

    def backward(self):
        """Return the loss gradient with respect to the logits of the last forward()."""
        return softmax_cross_entropy_derivative(self.y_pred, self.y_true)

In [None]:
class Classifier:
    """Neural network for classification, trained with softmax cross-entropy."""

    def __init__(self, layers):
        self.layers = layers
        self.loss_fn = SoftmaxCrossEntropy()

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the logits."""
        activation = x
        for layer in self.layers:
            activation = layer.forward(activation)
        return activation

    def predict(self, x):
        """Return class probabilities (softmax of the logits) for ``x``."""
        return softmax(self.forward(x))

    def train_step(self, x, y, lr=0.01):
        """Run one forward / backward / update cycle.

        Returns the loss measured before the parameter update.
        """
        # Forward pass through the layers, then the loss layer
        logits = self.forward(x)
        loss_value = self.loss_fn.forward(logits, y)

        # Backward pass, starting from the loss gradient, last layer first
        grad = self.loss_fn.backward()
        for position in range(len(self.layers) - 1, -1, -1):
            grad = self.layers[position].backward(grad)

        # Parameter update
        for layer in self.layers:
            layer.update(lr)

        return loss_value

    def accuracy(self, x, y):
        """Return the fraction of rows whose arg-max prediction matches the arg-max of ``y``."""
        predicted = np.argmax(self.predict(x), axis=1)
        expected = np.argmax(y, axis=1)
        hits = predicted == expected
        return np.mean(hits)

In [None]:
# Build the classifier: 16 -> 32 -> 16 -> 2 (one logit per class)
classifier = Classifier([
    Linear(16, 32),
    ReLU(),
    Linear(32, 16),
    ReLU(),
    Linear(16, 2),
])

# Per-epoch history, used for plotting afterwards
train_losses, train_accs, test_accs = [], [], []

# Hyperparameters
epochs = 500
batch_size = 32
lr = 0.1

n_train = len(X_train)

for epoch in range(epochs):
    # Visit the training set in a fresh random order every epoch
    indices = np.random.permutation(n_train)
    epoch_loss = 0
    n_batches = 0

    for start in range(0, n_train, batch_size):
        batch_idx = indices[start:start + batch_size]
        loss = classifier.train_step(X_train[batch_idx], y_train_oh[batch_idx], lr=lr)
        epoch_loss += loss
        n_batches += 1

    # Mean mini-batch loss for this epoch
    epoch_loss /= n_batches
    train_losses.append(epoch_loss)

    # Accuracy on both splits after this epoch's updates
    train_acc = classifier.accuracy(X_train, y_train_oh)
    test_acc = classifier.accuracy(X_test, y_test_oh)
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    should_log = epoch % 100 == 0
    if should_log:
        print(f"Epoch {epoch:3d} | Loss: {epoch_loss:.4f} | Train Acc: {train_acc:.3f} | Test Acc: {test_acc:.3f}")

In [None]:
# Plot the learning curves: loss on the left, accuracy on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
loss_ax, acc_ax = axes

loss_ax.plot(train_losses)
loss_ax.set(xlabel='Epoch', ylabel='Loss', title='Loss Curve')
loss_ax.grid(True)

acc_ax.plot(train_accs, label='Train')
acc_ax.plot(test_accs, label='Test')
acc_ax.set(xlabel='Epoch', ylabel='Accuracy', title='Accuracy Curve')
acc_ax.legend()
acc_ax.grid(True)

plt.tight_layout()
plt.show()

## 7. Gradient 검증 (Gradient Checking)

구현한 backward pass가 올바른지 수치 미분으로 검증한다.

In [None]:
def numerical_gradient(model, x, y, param, h=1e-5, loss_fn=None):
    """Estimate dLoss/dparam with central finite differences.

    Each element of ``param`` is perturbed in place by ``+h`` and ``-h``,
    the loss is re-evaluated through ``model.forward``, and the element is
    restored afterwards.

    Args:
        model: Object with a ``forward(x)`` method whose output depends on
            ``param``.
        x: Input passed to ``model.forward``.
        y: Target passed to the loss function.
        param: Parameter array owned by the model; perturbed in place, so
            it must be the very array the model reads from.
        h: Perturbation size.
        loss_fn: Callable ``loss_fn(y_pred, y_true)`` returning a scalar.
            Defaults to ``mse_loss``.

    Returns:
        Array shaped like ``param`` holding the estimated gradient.
    """
    if loss_fn is None:
        loss_fn = mse_loss

    grad = np.zeros_like(param)

    for idx in np.ndindex(param.shape):
        original = param[idx]
        try:
            # f(param + h)
            param[idx] = original + h
            loss_plus = loss_fn(model.forward(x), y)

            # f(param - h)
            param[idx] = original - h
            loss_minus = loss_fn(model.forward(x), y)
        finally:
            # Restore even if forward() or the loss raises, so a failed
            # check never leaves the model with a perturbed weight.
            param[idx] = original

        grad[idx] = (loss_plus - loss_minus) / (2 * h)

    return grad

In [None]:
# Gradient check: compare backprop against finite differences
print("Gradient Checking...")

# A small all-sigmoid network keeps the check cheap
test_model = NeuralNetwork([
    Linear(2, 3),
    Sigmoid(),
    Linear(3, 1),
    Sigmoid(),
])

# One sample and its target
test_x = np.array([[0.5, 0.3]])
test_y = np.array([[0.8]])

# One forward and backward pass fills in the analytical gradients
pred = test_model.forward(test_x)
dout = mse_loss_derivative(pred, test_y)
test_model.backward(dout)

# Compare the weight gradient of the first Linear layer
layer = test_model.layers[0]
analytical_grad = layer.dW
numerical_grad = numerical_gradient(test_model, test_x, test_y, layer.W)

# Element-wise relative error; the small constant guards against 0 / 0
diff = np.abs(analytical_grad - numerical_grad)
scale = np.abs(analytical_grad) + np.abs(numerical_grad) + 1e-8
rel_error = diff / scale
max_rel_error = np.max(rel_error)

print(f"\nAnalytical gradient:\n{analytical_grad}")
print(f"\nNumerical gradient:\n{numerical_grad}")
print(f"\nMax relative error: {max_rel_error:.2e}")

verdict = "\n✓ Gradient check PASSED!" if max_rel_error < 1e-5 else "\n✗ Gradient check FAILED!"
print(verdict)

## 8. 정리

이 노트북에서 배운 내용:

1. **Forward Pass**: 입력 → 출력 계산 (행렬 곱 + 활성화)
2. **Backward Pass**: Chain Rule로 gradient 전파
3. **Gradient Descent**: gradient 반대 방향으로 파라미터 업데이트
4. **Gradient Checking**: 수치 미분으로 구현 검증

### 핵심 수식

- Forward: $z = xW + b,\quad y = \sigma(z)$
- Backward: $\frac{\partial L}{\partial W} = x^T \left( \frac{\partial L}{\partial y} \odot \sigma'(z) \right)$
- Update: $W \leftarrow W - \eta \frac{\partial L}{\partial W}$