In [1]:
import numpy as np
import matplotlib.pyplot as plt
from typing import Optional, List

In [2]:
class Layer:

    def __init__(self):
        self.training = True

    def forward(self, x):
        pass

    def backward(self, grad_output):
        pass

    def train(self):
        self.training = True

    def eval(self):
        self.training = False

    def __call__(self, x):
        return self.forward(x)

In [3]:
class RBF(Layer):

    def __init__(self, input_size, output_size, bias=True):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.use_bias = bias

        self.weight = np.random.normal(loc=0, scale=2/input_size, size=(input_size, output_size)).astype(np.float32)

        if self.use_bias:
            self.bias = np.zeros(output_size, dtype=np.float32)
        else:
            self.bias = None

    def forward(self, x):
        self.input = x

        output = np.square(self.input - self.weight)
        if self.use_bias:
            output = output - np.square(self.bias)
        return output

    def backward(self, grad_output):
        grad_input = grad_output.T @ (2 * self.input - 2 * self.weight).T

        self.grad_weight = grad_output @ (-2 * self.input + 2 * self.weight).T

        if self.use_bias:
            self.grad_bias = np.sum(grad_output, axis=0) * (2 * self.bias).T

        return grad_input

    def update_weights(self, learning_rate=0.01):
        if self.grad_weight is not None:
            self.weight.T -= learning_rate * self.grad_weight

        if self.use_bias and self.grad_bias is not None:
            self.bias -= learning_rate * self.grad_bias

In [4]:
### –§—É–Ω–∫—Ü–∏—è –∞–∫—Ç–∏–≤–∞—Ü–∏–∏ ReLU

class ReLU(Layer):

    def __init__(self):
        super().__init__()
        self.input = None

    def forward(self, x):
        self.input = x
        output = np.maximum(0, x)
        return output

    def backward(self, grad_output):
        grad_input = grad_output.copy()
        grad_input[self.input <= 0] = 0 ### –ü—Ä–∏ –æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏—è—Ö –≤—Ö–æ–¥–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö ReLU –ø—Ä–µ–≤—Ä–∞—â–∞–µ—Ç—Å—è –≤ –≥–æ—Ä–∏–∑–æ–Ω—Ç–∞–ª—å–Ω—É—é –ø—Ä—è–º—É—é, –µ—ë –ø—Ä–æ–∏–∑–≤–æ–¥–Ω–∞—è –Ω–æ–ª—å
                                   ### –ü—Ä–∏ –∑–Ω–∞—á–µ–Ω–∏—è—Ö –±–æ–ª—å—à–µ –Ω—É–ª—è –ø—Ä–µ–¥—Å—Ç–∞–≤–ª—è–µ—Ç —Å–æ–±–æ–π –ø—Ä—è–º—É—é —Å –∫–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç–æ–º —É–≥–ª–∞ –Ω–∞–∫–ª–æ–Ω–∞ 1
                                   ### –ï—Å–ª–∏ —è –ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø–æ–º–Ω—é –≤ —á–µ–º —Å–º—ã—Å–ª backward —É —Ñ—É–Ω–∫—Ü–∏–∏ –∞–∫—Ç–∏–≤–∞—Ü–∏–∏, —Ç–æ –º—ã –≤–æ–∑–≤—Ä–∞—â–∞–µ–º –ø—Ä–æ–∏–∑–≤–µ–¥–µ–Ω–∏–µ –≤–µ—Å–æ–≤ –Ω–∞ grad
                                   ### –¢–æ –µ—Å—Ç—å –∫–æ–≥–¥–∞ –ø—Ä–æ–∏–∑–≤–æ–¥–Ω–∞—è —Ä–∞–≤–Ω–∞ 1, –º—ã –≤–æ–∑–≤—Ä–∞—â–∞–µ–º –ø—Ä–æ—Å—Ç–æ –≤–µ—Å–∞
        return grad_input

In [5]:
# –¢–µ—Å—Ç ReLU (–∑–∞–ø—É—Å—Ç–∏—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –ø–æ—Å–ª–µ —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ ReLU)
relu = ReLU()

# # –¢–µ—Å—Ç–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ
x_test = np.array([[-2, -1, 0, 1, 2]], dtype=np.float32)
expected_forward = np.array([[0, 0, 0, 1, 2]], dtype=np.float32)

# # Forward pass
output = relu.forward(x_test)
print(f"Input: {x_test}")
print(f"Output: {output}")
print(f"Expected: {expected_forward}")

# # –ü—Ä–æ–≤–µ—Ä–∫–∞ forward pass
assert np.allclose(output, expected_forward), "ReLU forward pass –Ω–µ —Ä–∞–±–æ—Ç–∞–µ—Ç –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ!"

# # Backward pass
grad_output = np.ones_like(output)
grad_input = relu.backward(grad_output)
expected_backward = np.array([[0, 0, 0, 1, 1]], dtype=np.float32)

print(f"Gradient output: {grad_output}")
print(f"Gradient input: {grad_input}")
print(f"Expected gradient: {expected_backward}")

# # –ü—Ä–æ–≤–µ—Ä–∫–∞ backward pass
assert np.allclose(grad_input, expected_backward), "ReLU backward pass –Ω–µ —Ä–∞–±–æ—Ç–∞–µ—Ç –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ!"

print("‚úÖ ReLU —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!")

#print("‚ö†Ô∏è –†–µ–∞–ª–∏–∑—É–π—Ç–µ ReLU –∫–ª–∞—Å—Å –≤—ã—à–µ, –∑–∞—Ç–µ–º —Ä–∞—Å–∫–æ–º–º–µ–Ω—Ç–∏—Ä—É–π—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è")


Input: [[-2. -1.  0.  1.  2.]]
Output: [[0. 0. 0. 1. 2.]]
Expected: [[0. 0. 0. 1. 2.]]
Gradient output: [[1. 1. 1. 1. 1.]]
Gradient input: [[0. 0. 0. 1. 1.]]
Expected gradient: [[0. 0. 0. 1. 1.]]
‚úÖ ReLU —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!


In [6]:
### –§—É–Ω–∫—Ü–∏—è –∞–∫—Ç–∏–≤–∞—Ü–∏–∏ Sigmoid

class Sigmoid(Layer):
    def __init__(self):
        super().__init__()
        self.output = None

    def forward(self, x):
        self.output = 1 / (1 + np.exp(-x))
        return self.output

    def backward(self, grad_output):
        ### –ò–∑ —á–∏—Å—Ç–æ–≥–æ –∏–Ω—Ç–µ—Ä–µ—Å–∞ –±—Ä–∞–ª –ø—Ä–æ–∏–∑–≤–æ–¥–Ω—É—é, –≤—ã—Ö–æ–¥–∏—Ç (–µ^(-x)/(1+–µ^(-x))^2)
        ### –¢–∞–∫ –ø–æ–Ω–∏–º–∞—é –∏–º–µ–Ω–Ω–æ –µ—ë –º—ã –Ω–µ –∏—Å–ø–æ–ª—å–∑—É–µ–º –≤ —Å–∏–ª—É —Ç–æ–≥–æ, —á—Ç–æ –ø–æ—è–≤–ª—è–µ—Ç—Å—è –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞—è –≤—ã—á–∏—Å–ª–∏—Ç–µ–ª—å–Ω–∞—è —Å–ª–æ–∂–Ω–æ—Å—Ç—å
        return grad_output * self.output * (1 - self.output)

In [7]:
# –¢–µ—Å—Ç Sigmoid (–∑–∞–ø—É—Å—Ç–∏—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –ø–æ—Å–ª–µ —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ Sigmoid)
#print("‚ö†Ô∏è –†–µ–∞–ª–∏–∑—É–π—Ç–µ Sigmoid –∫–ª–∞—Å—Å –≤—ã—à–µ, –∑–∞—Ç–µ–º —Ä–∞—Å–∫–æ–º–º–µ–Ω—Ç–∏—Ä—É–π—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è")

sigmoid = Sigmoid()

# # –¢–µ—Å—Ç–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ
x_test = np.array([[-10, -1, 0, 1, 10]], dtype=np.float32)

# # Forward pass
output = sigmoid.forward(x_test)
print(f"Input: {x_test}")
print(f"Output: {output}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ –≤—ã—Ö–æ–¥–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –≤ –¥–∏–∞–ø–∞–∑–æ–Ω–µ (0, 1)
assert np.all(output > 0) and np.all(output < 1), "Sigmoid –¥–æ–ª–∂–µ–Ω –≤–æ–∑–≤—Ä–∞—â–∞—Ç—å –∑–Ω–∞—á–µ–Ω–∏—è –≤ –¥–∏–∞–ø–∞–∑–æ–Ω–µ (0, 1)"

# # –ü—Ä–æ–≤–µ—Ä–∏–º —Å–∏–º–º–µ—Ç—Ä–∏—á–Ω–æ—Å—Ç—å: sigmoid(-x) = 1 - sigmoid(x)
x_sym = np.array([[1]], dtype=np.float32)
out_pos = sigmoid.forward(x_sym)
out_neg = sigmoid.forward(-x_sym)
assert np.allclose(out_neg, 1 - out_pos, atol=1e-6), "Sigmoid –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å —Å–∏–º–º–µ—Ç—Ä–∏—á–Ω—ã–º"

# # Backward pass
grad_output = np.ones_like(output)
grad_input = sigmoid.backward(grad_output)
print(f"Gradient input: {grad_input}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ –≥—Ä–∞–¥–∏–µ–Ω—Ç –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã–π (sigmoid –º–æ–Ω–æ—Ç–æ–Ω–Ω–æ –≤–æ–∑—Ä–∞—Å—Ç–∞–µ—Ç)
assert np.all(grad_input >= 0), "–ì—Ä–∞–¥–∏–µ–Ω—Ç Sigmoid –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å –Ω–µ–æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã–º"

print("‚úÖ Sigmoid —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!")


Input: [[-10.  -1.   0.   1.  10.]]
Output: [[4.5397872e-05 2.6894143e-01 5.0000000e-01 7.3105860e-01 9.9995458e-01]]
Gradient input: [[0.19661194 0.19661194 0.19661194 0.19661194 0.19661194]]
‚úÖ Sigmoid —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!


In [8]:
### –§—É–Ω–∫—Ü–∏—è –∞–∫—Ç–∏–≤–∞—Ü–∏–∏ Tanh

class Tanh(Layer):
    def __init__(self):
        super().__init__()
        self.output = None

    def forward(self, x):
        self.output = np.clip((np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x)), -0.9999, 0.9999)
        return self.output

    def backward(self, grad_output):
        return grad_output * (1 - np.square(self.output))

In [9]:
# –¢–µ—Å—Ç Tanh (–∑–∞–ø—É—Å—Ç–∏—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –ø–æ—Å–ª–µ —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ Tanh)
print("‚ö†Ô∏è –†–µ–∞–ª–∏–∑—É–π—Ç–µ Tanh –∫–ª–∞—Å—Å –≤—ã—à–µ, –∑–∞—Ç–µ–º —Ä–∞—Å–∫–æ–º–º–µ–Ω—Ç–∏—Ä—É–π—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è")

tanh = Tanh()

# # –¢–µ—Å—Ç–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ
x_test = np.array([[-10, -1, 0, 1, 10]], dtype=np.float32)

# # Forward pass
output = tanh.forward(x_test)
print(f"Input: {x_test}")
print(f"Output: {output}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ –≤—ã—Ö–æ–¥–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –≤ –¥–∏–∞–ø–∞–∑–æ–Ω–µ (-1, 1)
assert np.all(output > -1) and np.all(output < 1), "Tanh –¥–æ–ª–∂–µ–Ω –≤–æ–∑–≤—Ä–∞—â–∞—Ç—å –∑–Ω–∞—á–µ–Ω–∏—è –≤ –¥–∏–∞–ø–∞–∑–æ–Ω–µ (-1, 1)"

# # –ü—Ä–æ–≤–µ—Ä–∏–º –∞–Ω—Ç–∏—Å–∏–º–º–µ—Ç—Ä–∏—á–Ω–æ—Å—Ç—å: tanh(-x) = -tanh(x)
x_antisym = np.array([[2]], dtype=np.float32)
out_pos = tanh.forward(x_antisym)
out_neg = tanh.forward(-x_antisym)
assert np.allclose(out_neg, -out_pos, atol=1e-6), "Tanh –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å –∞–Ω—Ç–∏—Å–∏–º–º–µ—Ç—Ä–∏—á–Ω—ã–º"

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ tanh(0) = 0
zero_out = tanh.forward(np.array([[0]], dtype=np.float32))
assert np.allclose(zero_out, 0, atol=1e-6), "tanh(0) –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å —Ä–∞–≤–µ–Ω 0"

# # Backward pass
grad_output = np.ones_like(output)
grad_input = tanh.backward(grad_output)
print(f"Gradient input: {grad_input}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ –≥—Ä–∞–¥–∏–µ–Ω—Ç –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã–π (tanh –º–æ–Ω–æ—Ç–æ–Ω–Ω–æ –≤–æ–∑—Ä–∞—Å—Ç–∞–µ—Ç)
assert np.all(grad_input >= 0), "–ì—Ä–∞–¥–∏–µ–Ω—Ç Tanh –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å –Ω–µ–æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã–º"

print("‚úÖ Tanh —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!")


‚ö†Ô∏è –†–µ–∞–ª–∏–∑—É–π—Ç–µ Tanh –∫–ª–∞—Å—Å –≤—ã—à–µ, –∑–∞—Ç–µ–º —Ä–∞—Å–∫–æ–º–º–µ–Ω—Ç–∏—Ä—É–π—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è
Input: [[-10.  -1.   0.   1.  10.]]
Output: [[-0.9999    -0.7615942  0.         0.7615942  0.9999   ]]
Gradient input: [[1. 1. 1. 1. 1.]]
‚úÖ Tanh —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!


In [10]:
### –õ–∏–Ω–µ–π–Ω—ã–π —Å–ª–æ–π

class Linear(Layer):
    def __init__(self, input_size, output_size, bias=True):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.use_bias = bias

        ### Kaiming He –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –≤–µ—Å–æ–≤
        self.weight = np.random.normal(loc=0, scale=2/input_size, size=(input_size, output_size)).astype(np.float32)

        if self.use_bias:
            self.bias = np.zeros(output_size, dtype=np.float32)
        else:
            self.bias = None

    def forward(self, x):
        self.input = x

        output = self.weight.T @ x.T
        if self.use_bias:
            output = output.T + self.bias
        return output

    def backward(self, grad_output):
        grad_input = grad_output @ self.weight.T

        self.grad_weight = self.input.T @ grad_output

        if self.use_bias:
            self.grad_bias = np.sum(grad_output, axis=0)

        return grad_input

    def update_weights(self, learning_rate=0.01):
        if self.grad_weight is not None:
            self.weight.T -= learning_rate * self.grad_weight

        if self.use_bias and self.grad_bias is not None:
            self.bias -= learning_rate * self.grad_bias

In [11]:
# –¢–µ—Å—Ç Linear (–∑–∞–ø—É—Å—Ç–∏—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –ø–æ—Å–ª–µ —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ Linear)
#print("‚ö†Ô∏è –†–µ–∞–ª–∏–∑—É–π—Ç–µ Linear –∫–ª–∞—Å—Å –≤—ã—à–µ, –∑–∞—Ç–µ–º —Ä–∞—Å–∫–æ–º–º–µ–Ω—Ç–∏—Ä—É–π—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è")

linear = Linear(input_size=3, output_size=2, bias=True)

# # –ü—Ä–æ–≤–µ—Ä–∏–º —Ñ–æ—Ä–º—É –≤–µ—Å–æ–≤
assert linear.weight.shape == (3, 2), f"–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –≤–µ—Å–æ–≤: {linear.weight.shape}"
assert linear.bias.shape == (2,), f"–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ bias: {linear.bias.shape}"

print(f"–í–µ—Å–∞: \n{linear.weight}")
print(f"Bias: {linear.bias}")

# # –¢–µ—Å—Ç–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ
batch_size = 4
x_test = np.random.randn(batch_size, 3).astype(np.float32)

# # Forward pass
output = linear.forward(x_test)
expected_shape = (batch_size, 2)

print(f"Input shape: {x_test.shape}")
print(f"Output shape: {output.shape}")
print(f"Expected shape: {expected_shape}")

assert output.shape == expected_shape, f"–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –≤—ã—Ö–æ–¥–∞: {output.shape}"

# # Backward pass
grad_output = np.random.randn(*output.shape).astype(np.float32)
grad_input = linear.backward(grad_output)

print(f"Gradient input shape: {grad_input.shape}")
print(f"Gradient weight shape: {linear.grad_weight.shape}")
print(f"Gradient bias shape: {linear.grad_bias.shape}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º —Ñ–æ—Ä–º—ã –≥—Ä–∞–¥–∏–µ–Ω—Ç–æ–≤
assert grad_input.shape == x_test.shape, "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞ –ø–æ –≤—Ö–æ–¥—É"
assert linear.grad_weight.shape == linear.weight.shape, "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞ –ø–æ –≤–µ—Å–∞–º"
assert linear.grad_bias.shape == linear.bias.shape, "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞ –ø–æ bias"

print("‚úÖ Linear —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!")

–í–µ—Å–∞: 
[[-0.08106621 -1.4308075 ]
 [-0.1198379  -1.1728611 ]
 [ 0.21903415  1.1588073 ]]
Bias: [0. 0.]
Input shape: (4, 3)
Output shape: (4, 2)
Expected shape: (4, 2)
Gradient input shape: (4, 3)
Gradient weight shape: (3, 2)
Gradient bias shape: (2,)
‚úÖ Linear —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!


In [12]:
### –ü–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω—ã–π –∫–æ–Ω—Ç–µ–π–Ω–µ—Ä (Sequential)

class Sequential(Layer):
    def __init__(self, *layers):
        super().__init__()
        self.layers = list(layers)
        self.layer_outputs = [] ### –ù–µ –ø–æ–Ω—è–ª –∑–∞—á–µ–º —ç—Ç–æ –∑–¥–µ—Å—å –Ω—É–∂–Ω–æ, –≤–µ–¥—å –¥–∞–ª—å—à–µ –Ω–∏–≥–¥–µ –Ω–µ –∏—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è

    def add(self, layer):
        self.layers.append(layer)

    def forward(self, x):
        self.layer_outputs = []
        output = x
        for layer in self.layers:
            self.layer_outputs.append(output)
            output = layer.forward(output)

        return output

    def backward(self, grad_output):
        grad = grad_output
        for layer in reversed(self.layers):
            grad = layer.backward(grad)
        return grad

    def train(self):
        super().train()
        for layer in self.layers:
            layer.train()

    def eval(self):
        super().eval()
        for layer in self.layers:
            layer.eval()

    def __len__(self):
        return len(self.layers)

    def __getitem__(self, idx):
        return self.layers[idx]

In [13]:
### Dropout

class Dropout(Layer):
    def __init__(self, dropout_rate=0.5):
        super().__init__()
        self.dropout_rate = dropout_rate
        self.mask = None

    def forward(self, x):
        if self.training:
            self.mask = np.random.uniform(0, 1, size=np.shape(x))
            x[self.mask > self.dropout_rate] = 0
            output = x
        else:
            output = x
            self.mask = None
        return output

    def backward(self, grad_output):
        if self.training:
            grad_output[self.mask > self.dropout_rate] = 0
            grad_input = grad_output
        else:
            grad_input = grad_output
        return grad_input

In [14]:
# –¢–µ—Å—Ç Dropout (–∑–∞–ø—É—Å—Ç–∏—Ç–µ –ø–æ—Å–ª–µ —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ Dropout)
#print("‚ö†Ô∏è –†–µ–∞–ª–∏–∑—É–π—Ç–µ Dropout –∫–ª–∞—Å—Å –≤—ã—à–µ, –∑–∞—Ç–µ–º —Ä–∞—Å–∫–æ–º–º–µ–Ω—Ç–∏—Ä—É–π—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è")

dropout = Dropout(dropout_rate=0.5)

# # –¢–µ—Å—Ç–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ
x_test = np.ones((100, 10), dtype=np.float32)

# # –¢–µ—Å—Ç –≤ —Ä–µ–∂–∏–º–µ –æ–±—É—á–µ–Ω–∏—è
dropout.train()
output_train = dropout.forward(x_test)

print(f"–†–µ–∂–∏–º –æ–±—É—á–µ–Ω–∏—è:")
print(f"Input mean: {x_test.mean():.3f}")
print(f"Output mean: {output_train.mean():.3f}")
print(f"Proportion of zeros: {(output_train == 0).mean():.3f}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ —á–∞—Å—Ç—å –Ω–µ–π—Ä–æ–Ω–æ–≤ "–≤—ã–∫–ª—é—á–µ–Ω–∞"
zeros_ratio = (output_train == 0).mean()
expected_zeros = 0.5  # dropout_rate
assert abs(zeros_ratio - expected_zeros) < 0.1, f"–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è –¥–æ–ª—è –Ω—É–ª–µ–≤—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π: {zeros_ratio}"

# # –ü—Ä–æ–≤–µ—Ä–∏–º –º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏–µ
assert abs(output_train.mean() - x_test.mean()) < 0.1, "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ–µ –º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏–µ –≤ —Ä–µ–∂–∏–º–µ –æ–±—É—á–µ–Ω–∏—è"

# # –¢–µ—Å—Ç –≤ —Ä–µ–∂–∏–º–µ –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞
dropout.eval()
output_eval = dropout.forward(x_test)

print(f"\n–†–µ–∂–∏–º –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞:")
print(f"Output mean: {output_eval.mean():.3f}")
print(f"Proportion of zeros: {(output_eval == 0).mean():.3f}")

# # –í —Ä–µ–∂–∏–º–µ –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞ –≤—Å–µ –∑–Ω–∞—á–µ–Ω–∏—è –¥–æ–ª–∂–Ω—ã –æ—Å—Ç–∞—Ç—å—Å—è
assert np.allclose(output_eval, x_test), "–í —Ä–µ–∂–∏–º–µ –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞ –≤—ã—Ö–æ–¥ –¥–æ–ª–∂–µ–Ω —Å–æ–≤–ø–∞–¥–∞—Ç—å —Å –≤—Ö–æ–¥–æ–º"

# # –¢–µ—Å—Ç backward pass
dropout.train()
output_train = dropout.forward(x_test)
grad_output = np.ones_like(output_train)
grad_input = dropout.backward(grad_output)

print(f"\nGradient test:")
print(f"Grad input shape: {grad_input.shape}")
print(f"Grad input mean: {grad_input.mean():.3f}")

assert grad_input.shape == x_test.shape, "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞"

print("‚úÖ Dropout —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!")

–†–µ–∂–∏–º –æ–±—É—á–µ–Ω–∏—è:
Input mean: 0.521
Output mean: 0.521
Proportion of zeros: 0.479

–†–µ–∂–∏–º –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞:
Output mean: 0.521
Proportion of zeros: 0.479

Gradient test:
Grad input shape: (100, 10)
Grad input mean: 0.501
‚úÖ Dropout —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!


In [15]:
### Batch Normalization

class BatchNorm(Layer):
    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum

        self.gamma = np.ones(num_features, dtype=np.float32)
        self.beta = np.zeros(num_features, dtype=np.float32)

        self.running_mean = np.zeros(num_features, dtype=np.float32)
        self.running_var = np.ones(num_features, dtype=np.float32)

        self.batch_mean = None
        self.batch_var = None
        self.normalized = None
        self.input = None
        self.grad_gamma = None
        self.grad_beta = None

    def forward(self, x):
        self.input = x.astype(np.float32)
        #self.input = x

        if self.training:
            self.batch_mean = np.mean(x, axis=0, keepdims=True).astype(np.float32)
            self.batch_var = np.var(x, axis=0, keepdims=True).astype(np.float32)

            self.running_mean = (self.momentum * self.batch_mean + (1 - self.momentum) * self.running_mean).astype(np.float32)
            self.running_var = (self.momentum * self.batch_var + (1 - self.momentum) * self.running_var).astype(np.float32)

            mean = self.batch_mean
            var = self.batch_var
        else:
            mean = self.running_mean.reshape(1, -1)
            var = self.running_var.reshape(1, -1)

        self.normalized = ((x - mean) / np.sqrt(var + self.eps)).astype(np.float32)

        output = (self.gamma * self.normalized + self.beta).astype(np.float32)

        return output

    def backward(self, grad_output):
        batch_size = grad_output.shape[0]
        grad_output = grad_output.astype(np.float32)

        self.grad_gamma = np.sum(grad_output * self.normalized, axis=0).astype(np.float32)
        self.grad_beta = np.sum(grad_output, axis=0).astype(np.float32)
        ### –ò —Ç—É—Ç –Ω–∞—á–∞–ª—Å—è —Å—É—â–∏–π –∫–æ—à–º–∞—Ä
        ### –ß–µ—Å—Ç–Ω–æ –ø—Ä–∏–∑–Ω–∞—é—Å—å, —Å–ø–∏—Å–∞–ª, –Ω–æ –¥–∞–∂–µ —Ç–∞–∫ –ø—Ä–æ–∏–∑–≤–æ–¥–Ω–∞—è BatchNorm —ç—Ç–æ –∫–∞–∫–∞—è-—Ç–æ –∂–µ—Å—Ç—å
        ### –°–ø–∏—Å–∞–ª –æ—Ç—Å—é–¥–∞: https://blog.tnichols.org/posts/batchnorm-backward/

        if self.training:
            grad_normalized = grad_output * self.gamma
            grad_var = np.sum(grad_normalized * (self.input - self.batch_mean) * -0.5 * 
                            np.power(self.batch_var + self.eps, -1.5), axis=0, keepdims=True)
            grad_mean = np.sum(grad_normalized * -1.0 / np.sqrt(self.batch_var + self.eps), axis=0, keepdims=True) + \
                       grad_var * np.mean(-2.0 * (self.input - self.batch_mean), axis=0, keepdims=True)
            
            grad_input = (grad_normalized / np.sqrt(self.batch_var + self.eps) + 
                         grad_var * 2.0 * (self.input - self.batch_mean) / batch_size + 
                         grad_mean / batch_size).astype(np.float32)
        else:
            grad_input = (grad_output * self.gamma / np.sqrt(self.running_var.reshape(1, -1) + self.eps)).astype(np.float32)
        
        return grad_input

    def update_weights(self, learning_rate=0.001):
        """
        –û–±–Ω–æ–≤–ª–µ–Ω–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ gamma –∏ beta
        """
        if self.grad_gamma is not None:
            self.gamma -= learning_rate * self.grad_gamma
        
        if self.grad_beta is not None:
            self.beta -= learning_rate * self.grad_beta

In [16]:
# –¢–µ—Å—Ç BatchNorm (–∑–∞–ø—É—Å—Ç–∏—Ç–µ –ø–æ—Å–ª–µ —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ BatchNorm)
#print("‚ö†Ô∏è –†–µ–∞–ª–∏–∑—É–π—Ç–µ BatchNorm –∫–ª–∞—Å—Å –≤—ã—à–µ, –∑–∞—Ç–µ–º —Ä–∞—Å–∫–æ–º–º–µ–Ω—Ç–∏—Ä—É–π—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è")

batch_norm = BatchNorm(num_features=4)

# # –ü—Ä–æ–≤–µ—Ä–∏–º –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—é
assert np.allclose(batch_norm.gamma, 1.0), "Gamma –¥–æ–ª–∂–Ω–æ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞—Ç—å—Å—è –µ–¥–∏–Ω–∏—Ü–∞–º–∏"
assert np.allclose(batch_norm.beta, 0.0), "Beta –¥–æ–ª–∂–Ω–æ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞—Ç—å—Å—è –Ω—É–ª—è–º–∏"
assert np.allclose(batch_norm.running_mean, 0.0), "Running mean –¥–æ–ª–∂–Ω–æ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞—Ç—å—Å—è –Ω—É–ª—è–º–∏"
assert np.allclose(batch_norm.running_var, 1.0), "Running var –¥–æ–ª–∂–Ω–æ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞—Ç—å—Å—è –µ–¥–∏–Ω–∏—Ü–∞–º–∏"

print(f"Gamma: {batch_norm.gamma}")
print(f"Beta: {batch_norm.beta}")

# # –¢–µ—Å—Ç–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ —Å –∏–∑–≤–µ—Å—Ç–Ω–æ–π —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–æ–π
x_test = np.array([
     [1, 2, 3, 4],
     [2, 3, 4, 5],
     [3, 4, 5, 6]
 ], dtype=np.float32)

print(f"Input: \n{x_test}")
print(f"Input mean per feature: {x_test.mean(axis=0)}")
print(f"Input std per feature: {x_test.std(axis=0)}")

# # Forward pass –≤ —Ä–µ–∂–∏–º–µ –æ–±—É—á–µ–Ω–∏—è
batch_norm.train()
output = batch_norm.forward(x_test)

print(f"\nOutput: \n{output}")
print(f"Output mean per feature: {output.mean(axis=0)}")
print(f"Output std per feature: {output.std(axis=0)}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ –≤—ã—Ö–æ–¥ –Ω–æ—Ä–º–∞–ª–∏–∑–æ–≤–∞–Ω (—Å—Ä–µ–¥–Ω–µ–µ ‚âà 0, std ‚âà 1)
assert np.allclose(output.mean(axis=0), 0, atol=1e-6), "–°—Ä–µ–¥–Ω–µ–µ –¥–æ–ª–∂–Ω–æ –±—ã—Ç—å –±–ª–∏–∑–∫–æ –∫ 0"
assert np.allclose(output.std(axis=0), 1, atol=1e-6), "–°—Ç–∞–Ω–¥–∞—Ä—Ç–Ω–æ–µ –æ—Ç–∫–ª–æ–Ω–µ–Ω–∏–µ –¥–æ–ª–∂–Ω–æ –±—ã—Ç—å –±–ª–∏–∑–∫–æ –∫ 1"

# # –ü—Ä–æ–≤–µ—Ä–∏–º –æ–±–Ω–æ–≤–ª–µ–Ω–∏–µ running —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏
print(f"\nRunning mean: {batch_norm.running_mean}")
print(f"Running var: {batch_norm.running_var}")

# # Backward pass
grad_output = np.ones_like(output)
grad_input = batch_norm.backward(grad_output)

print(f"\nGradient input shape: {grad_input.shape}")
print(f"Gradient gamma shape: {batch_norm.grad_gamma.shape}")
print(f"Gradient beta shape: {batch_norm.grad_beta.shape}")

assert grad_input.shape == x_test.shape, "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞ –ø–æ –≤—Ö–æ–¥—É"
assert batch_norm.grad_gamma.shape == batch_norm.gamma.shape, "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞ gamma"
assert batch_norm.grad_beta.shape == batch_norm.beta.shape, "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞ beta"

# # –¢–µ—Å—Ç —Ä–µ–∂–∏–º–∞ –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞
batch_norm.eval()
output_eval = batch_norm.forward(x_test)
print(f"\nInference mode output mean: {output_eval.mean(axis=0)}")

print("‚úÖ BatchNorm —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!")

Gamma: [1. 1. 1. 1.]
Beta: [0. 0. 0. 0.]
Input: 
[[1. 2. 3. 4.]
 [2. 3. 4. 5.]
 [3. 4. 5. 6.]]
Input mean per feature: [2. 3. 4. 5.]
Input std per feature: [0.8164966 0.8164966 0.8164966 0.8164966]

Output: 
[[-1.2247356 -1.2247356 -1.2247356 -1.2247356]
 [ 0.         0.         0.         0.       ]
 [ 1.2247356  1.2247356  1.2247356  1.2247356]]
Output mean per feature: [0. 0. 0. 0.]
Output std per feature: [0.99999243 0.99999243 0.99999243 0.99999243]

Running mean: [[0.2 0.3 0.4 0.5]]
Running var: [[0.96666664 0.96666664 0.96666664 0.96666664]]

Gradient input shape: (3, 4)
Gradient gamma shape: (4,)
Gradient beta shape: (4,)

Inference mode output mean: [1.8307619 2.746143  3.6615238 4.5769053]
‚úÖ BatchNorm —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!


In [17]:
### Adam –û–ø—Ç–∏–º–∏–∑–∞—Ç–æ—Ä

class Adam:

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.learning_rate = np.float32(learning_rate)
        self.beta1 = np.float32(beta1)
        self.beta2 = np.float32(beta2)
        self.eps = np.float32(eps)
        
        # –°–ª–æ–≤–∞—Ä–∏ –¥–ª—è —Ö—Ä–∞–Ω–µ–Ω–∏—è –º–æ–º–µ–Ω—Ç–æ–≤ –¥–ª—è –∫–∞–∂–¥–æ–≥–æ —Å–ª–æ—è
        self.m = {}  # first moment
        self.v = {}  # second moment
        self.t = 0   # time step

    def update(self, layer, layer_id):
        self.t += 1

        if hasattr(layer, 'grad_weight') and layer.grad_weight is not None:
            if f"{layer_id}_weight" not in self.m:
                self.m[f"{layer_id}_weight"] = np.zeros_like(layer.grad_weight, dtype=np.float32)
                self.v[f"{layer_id}_weight"] = np.zeros_like(layer.grad_weight, dtype=np.float32)
            # –û–±–Ω–æ–≤–ª–µ–Ω–∏–µ –º–æ–º–µ–Ω—Ç–æ–≤
            self.m[f"{layer_id}_weight"] = (self.beta1 * self.m[f"{layer_id}_weight"] + (1 - self.beta1) * layer.grad_weight).astype(np.float32)
            self.v[f"{layer_id}_weight"] = (self.beta2 * self.v[f"{layer_id}_weight"] + (1 - self.beta2) * np.square(layer.grad_weight)).astype(np.float32)
            # –ö–æ—Ä—Ä–µ–∫—Ü–∏—è —Å–º–µ—â–µ–Ω–∏—è
            m_corrected = self.m[f"{layer_id}_weight"] / (1 - self.beta1 ** self.t).astype(np.float32)
            v_corrected = self.v[f"{layer_id}_weight"] / (1 - self.beta2 ** self.t).astype(np.float32)
            #–û–±–Ω–æ–≤–ª–µ–Ω–∏–µ –≤–µ—Å–æ–≤
            layer.weight -= self.learning_rate * m_corrected / (np.sqrt(v_corrected) + self.eps).astype(np.float32)

        if hasattr(layer, 'grad_bias') and layer.grad_bias is not None:
            if f"{layer_id}_bias" not in self.m:
                self.m[f"{layer_id}_bias"] = np.zeros_like(layer.grad_bias, dtype=np.float32)
                self.v[f"{layer_id}_bias"] = np.zeros_like(layer.grad_bias, dtype=np.float32)
            
            # TODO: –†–µ–∞–ª–∏–∑—É–π—Ç–µ –æ–±–Ω–æ–≤–ª–µ–Ω–∏–µ bias
            self.m[f"{layer_id}_bias"] = (self.beta1 * self.m[f"{layer_id}_bias"] + (1 - self.beta1) * layer.grad_bias).astype(np.float32)
            self.v[f"{layer_id}_bias"] = (self.beta2 * self.v[f"{layer_id}_bias"] + (1 - self.beta2) * np.square(layer.grad_bias)).astype(np.float32)
            
            m_corrected_bias = (self.m[f"{layer_id}_bias"] / ( 1 - self.beta1 ** self.t)).astype(np.float32)
            v_corrected_bias = (self.v[f"{layer_id}_bias"] / ( 1 - self.beta2 ** self.t)).astype(np.float32)
            
            layer.bias -= (self.learning_rate * m_corrected_bias / (np.sqrt(v_corrected_bias) + self.eps)).astype(np.float32) 

    def zero_grad(self, layers):
        """
        –û–±–Ω—É–ª–µ–Ω–∏–µ –≥—Ä–∞–¥–∏–µ–Ω—Ç–æ–≤
        """
        for layer in layers:
            if hasattr(layer, 'grad_weight'):
                layer.grad_weight = None
            if hasattr(layer, 'grad_bias'):
                layer.grad_bias = None
            if hasattr(layer, 'grad_gamma'):
                layer.grad_gamma = None
            if hasattr(layer, 'grad_beta'):
                layer.grad_beta = None


In [18]:
# –¢–µ—Å—Ç Adam (–∑–∞–ø—É—Å—Ç–∏—Ç–µ –ø–æ—Å–ª–µ —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ Adam)
#print("‚ö†Ô∏è –†–µ–∞–ª–∏–∑—É–π—Ç–µ Adam –∫–ª–∞—Å—Å –≤—ã—à–µ, –∑–∞—Ç–µ–º —Ä–∞—Å–∫–æ–º–º–µ–Ω—Ç–∏—Ä—É–π—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è")

# # –°–æ–∑–¥–∞–Ω–∏–µ —Ç–µ—Å—Ç–æ–≤–æ–≥–æ —Å–ª–æ—è
layer = Linear(3, 2)
adam = Adam(learning_rate=0.01)

# # –°–æ–∑–¥–∞–Ω–∏–µ —Ñ–∏–∫—Ç–∏–≤–Ω—ã—Ö –≥—Ä–∞–¥–∏–µ–Ω—Ç–æ–≤
layer.grad_weight = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]], dtype=np.float32)
layer.grad_bias = np.array([0.1, 0.2], dtype=np.float32)

# # –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –Ω–∞—á–∞–ª—å–Ω—ã—Ö –≤–µ—Å–æ–≤
initial_weight = layer.weight.copy()
initial_bias = layer.bias.copy()

print(f"Initial weight: \n{initial_weight}")
print(f"Initial bias: {initial_bias}")
print(f"Weight gradient: \n{layer.grad_weight}")
print(f"Bias gradient: {layer.grad_bias}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—é Adam
assert len(adam.m) == 0, "–ú–æ–º–µ–Ω—Ç—ã –¥–æ–ª–∂–Ω—ã –±—ã—Ç—å –ø—É—Å—Ç—ã–º–∏ –ø—Ä–∏ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏–∏"
assert len(adam.v) == 0, "–ú–æ–º–µ–Ω—Ç—ã –¥–æ–ª–∂–Ω—ã –±—ã—Ç—å –ø—É—Å—Ç—ã–º–∏ –ø—Ä–∏ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏–∏"
assert adam.t == 0, "Time step –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å —Ä–∞–≤–µ–Ω 0"

# # –í—ã–ø–æ–ª–Ω–∏–º –æ–¥–∏–Ω —à–∞–≥ –æ–ø—Ç–∏–º–∏–∑–∞—Ü–∏–∏
adam.update(layer, "test_layer")

print(f"\nAfter 1 step:")
print(f"Updated weight: \n{layer.weight}")
print(f"Updated bias: {layer.bias}")
print(f"Time step: {adam.t}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ –≤–µ—Å–∞ –∏–∑–º–µ–Ω–∏–ª–∏—Å—å
assert not np.allclose(layer.weight, initial_weight), "–í–µ—Å–∞ –¥–æ–ª–∂–Ω—ã –∏–∑–º–µ–Ω–∏—Ç—å—Å—è –ø–æ—Å–ª–µ –æ–±–Ω–æ–≤–ª–µ–Ω–∏—è"
assert not np.allclose(layer.bias, initial_bias), "Bias –¥–æ–ª–∂–µ–Ω –∏–∑–º–µ–Ω–∏—Ç—å—Å—è –ø–æ—Å–ª–µ –æ–±–Ω–æ–≤–ª–µ–Ω–∏—è"

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ –º–æ–º–µ–Ω—Ç—ã –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω—ã
assert "test_layer_weight" in adam.m, "–ú–æ–º–µ–Ω—Ç –¥–ª—è –≤–µ—Å–æ–≤ –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å —Å–æ–∑–¥–∞–Ω"
assert "test_layer_bias" in adam.m, "–ú–æ–º–µ–Ω—Ç –¥–ª—è bias –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å —Å–æ–∑–¥–∞–Ω"
assert "test_layer_weight" in adam.v, "–ú–æ–º–µ–Ω—Ç –¥–ª—è –≤–µ—Å–æ–≤ –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å —Å–æ–∑–¥–∞–Ω"
assert "test_layer_bias" in adam.v, "–ú–æ–º–µ–Ω—Ç –¥–ª—è bias –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å —Å–æ–∑–¥–∞–Ω"

# # –ü—Ä–æ–≤–µ—Ä–∏–º —Ñ–æ—Ä–º—ã –º–æ–º–µ–Ω—Ç–æ–≤
assert adam.m["test_layer_weight"].shape == layer.weight.shape, "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –º–æ–º–µ–Ω—Ç–∞ –≤–µ—Å–æ–≤"
assert adam.m["test_layer_bias"].shape == layer.bias.shape, "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –º–æ–º–µ–Ω—Ç–∞ bias"

# # –¢–µ—Å—Ç zero_grad
adam.zero_grad([layer])
assert layer.grad_weight is None, "–ì—Ä–∞–¥–∏–µ–Ω—Ç—ã –≤–µ—Å–æ–≤ –¥–æ–ª–∂–Ω—ã –±—ã—Ç—å –æ–±–Ω—É–ª–µ–Ω—ã"
assert layer.grad_bias is None, "–ì—Ä–∞–¥–∏–µ–Ω—Ç—ã bias –¥–æ–ª–∂–Ω—ã –±—ã—Ç—å –æ–±–Ω—É–ª–µ–Ω—ã"

print("‚úÖ Adam —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!")


Initial weight: 
[[0.32108837 0.34717318]
 [0.4521888  1.1152909 ]
 [0.02874159 0.9214675 ]]
Initial bias: [0. 0.]
Weight gradient: 
[[0.1 0.2]
 [0.3 0.4]
 [0.5 0.6]]
Bias gradient: [0.1 0.2]

After 1 step:
Updated weight: 
[[0.31108838 0.3371732 ]
 [0.4421888  1.1052909 ]
 [0.01874159 0.9114675 ]]
Updated bias: [-0.01 -0.01]
Time step: 1
‚úÖ Adam —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω —É—Å–ø–µ—à–Ω–æ!


In [19]:
### –§—É–Ω–∫—Ü–∏–∏ –ø–æ—Ç–µ—Ä—å

class CrossEntropyLoss:
    def __init__(self):
        self.predictions = None
        self.targets = None
    
    def forward(self, predictions, targets):

        self.predictions = predictions
        self.targets = targets
        
        # TODO: –ü—Ä–∏–º–µ–Ω–∏—Ç–µ softmax –∫ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è–º
        self.softmax_pred = softmax(predictions)
        
        # TODO: –í—ã—á–∏—Å–ª–∏—Ç–µ cross-entropy loss
        clipped_softmax = np.clip(self.softmax_pred, 1e-4, 1 + 1e-4)
        loss = - np.mean(np.log(clipped_softmax[np.arange(predictions.shape[0]), targets]))
        
        return loss
    
    def backward(self):
        """
        –í—ã—á–∏—Å–ª–µ–Ω–∏–µ –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞ Cross-Entropy Loss
        
        Returns:
            –≥—Ä–∞–¥–∏–µ–Ω—Ç –ø–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è–º
        """
        # TODO: –í—ã—á–∏—Å–ª–∏—Ç–µ –≥—Ä–∞–¥–∏–µ–Ω—Ç
        one_hot_targets = one_hot_encode(self.targets, self.predictions.shape[1])
        
        grad = (self.softmax_pred - one_hot_targets) / self.predictions.shape[0]
        
        return grad


class MSELoss:
    def __init__(self):
        self.predictions = None
        self.targets = None
    
    def forward(self, predictions, targets):
        """
        –í—ã—á–∏—Å–ª–µ–Ω–∏–µ Mean Squared Error
        
        Args:
            predictions: –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –º–æ–¥–µ–ª–∏
            targets: –∏—Å—Ç–∏–Ω–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è
        
        Returns:
            –∑–Ω–∞—á–µ–Ω–∏–µ —Ñ—É–Ω–∫—Ü–∏–∏ –ø–æ—Ç–µ—Ä—å
        """
        self.predictions = predictions
        self.targets = targets
        
        # TODO: –í—ã—á–∏—Å–ª–∏—Ç–µ MSE
        loss = np.mean(np.square(self.predictions - self.targets))
        
        return loss
    
    def backward(self):
        """
        –í—ã—á–∏—Å–ª–µ–Ω–∏–µ –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞ MSE
        
        Returns:
            –≥—Ä–∞–¥–∏–µ–Ω—Ç –ø–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è–º
        """
        # TODO: –í—ã—á–∏—Å–ª–∏—Ç–µ –≥—Ä–∞–¥–∏–µ–Ω—Ç MSE
        grad = 2 * (self.predictions - self.targets) / self.predictions.shape[0]
        
        return grad


def softmax(x):
    """
    –£—Å—Ç–æ–π—á–∏–≤–∞—è —Ä–µ–∞–ª–∏–∑–∞—Ü–∏—è softmax
    """
    # TODO: –†–µ–∞–ª–∏–∑—É–π—Ç–µ softmax —Ñ—É–Ω–∫—Ü–∏—é
    x_shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(x_shifted)
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)


def one_hot_encode(labels, num_classes):
    """
    –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –º–µ—Ç–æ–∫ –≤ one-hot –∫–æ–¥–∏—Ä–æ–≤–∫—É
    """
    # TODO: –°–æ–∑–¥–∞–π—Ç–µ one-hot –∫–æ–¥–∏—Ä–æ–≤–∫—É
    one_hot = np.zeros((len(labels), num_classes))
    one_hot[np.arange(len(labels)), labels] = 1
    return one_hot

In [20]:
# –¢–µ—Å—Ç —Ñ—É–Ω–∫—Ü–∏–π –ø–æ—Ç–µ—Ä—å (–∑–∞–ø—É—Å—Ç–∏—Ç–µ –ø–æ—Å–ª–µ —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ Loss —Ñ—É–Ω–∫—Ü–∏–π)
#print("‚ö†Ô∏è –†–µ–∞–ª–∏–∑—É–π—Ç–µ —Ñ—É–Ω–∫—Ü–∏–∏ –ø–æ—Ç–µ—Ä—å –≤—ã—à–µ, –∑–∞—Ç–µ–º —Ä–∞—Å–∫–æ–º–º–µ–Ω—Ç–∏—Ä—É–π—Ç–µ —ç—Ç–æ—Ç –∫–æ–¥ –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è")

# # –¢–µ—Å—Ç CrossEntropyLoss
print("üî• –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ CrossEntropyLoss...")
ce_loss = CrossEntropyLoss()

# # –¢–µ—Å—Ç–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ
predictions = np.array([[2.0, 1.0, 0.1], [1.0, 3.0, 0.2]], dtype=np.float32)
targets = np.array([0, 1], dtype=np.int32)

print(f"Predictions: \n{predictions}")
print(f"Targets: {targets}")

# # Forward pass
loss_value = ce_loss.forward(predictions, targets)
print(f"CrossEntropy Loss: {loss_value:.4f}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ loss –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã–π
assert loss_value > 0, "CrossEntropy loss –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã–º"

# # Backward pass
grad = ce_loss.backward()
print(f"Gradient shape: {grad.shape}")
print(f"Gradient: \n{grad}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º —Ñ–æ—Ä–º—É –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞
assert grad.shape == predictions.shape, "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞ CrossEntropy"

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ —Å—É–º–º–∞ –≥—Ä–∞–¥–∏–µ–Ω—Ç–æ–≤ –ø–æ –∫–ª–∞—Å—Å–∞–º —Ä–∞–≤–Ω–∞ 0 (—Å–≤–æ–π—Å—Ç–≤–æ softmax)
assert np.allclose(grad.sum(axis=1), 0, atol=1e-6), "–°—É–º–º–∞ –≥—Ä–∞–¥–∏–µ–Ω—Ç–æ–≤ –ø–æ –∫–ª–∞—Å—Å–∞–º –¥–æ–ª–∂–Ω–∞ –±—ã—Ç—å 0"

print("‚úÖ CrossEntropyLoss —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω!")

# # –¢–µ—Å—Ç MSELoss
print("\nüìä –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ MSELoss...")
mse_loss = MSELoss()

# # –¢–µ—Å—Ç–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ –¥–ª—è —Ä–µ–≥—Ä–µ—Å—Å–∏–∏
predictions_reg = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
targets_reg = np.array([[1.5, 2.5], [2.5, 3.5]], dtype=np.float32)

print(f"Predictions: \n{predictions_reg}")
print(f"Targets: \n{targets_reg}")

# # Forward pass
mse_value = mse_loss.forward(predictions_reg, targets_reg)
print(f"MSE Loss: {mse_value:.4f}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ loss –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã–π
assert mse_value >= 0, "MSE loss –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å –Ω–µ–æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã–º"

# # Backward pass
grad_mse = mse_loss.backward()
print(f"MSE Gradient shape: {grad_mse.shape}")
print(f"MSE Gradient: \n{grad_mse}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º —Ñ–æ—Ä–º—É –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞
assert grad_mse.shape == predictions_reg.shape, "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ –≥—Ä–∞–¥–∏–µ–Ω—Ç–∞ MSE"

print("‚úÖ MSELoss —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω!")

# # –¢–µ—Å—Ç softmax —Ñ—É–Ω–∫—Ü–∏–∏
print("\nüéØ –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ Softmax...")
x_softmax = np.array([[1.0, 2.0, 3.0], [1.0, 1.0, 1.0]], dtype=np.float32)
softmax_output = softmax(x_softmax)

print(f"Input: \n{x_softmax}")
print(f"Softmax output: \n{softmax_output}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ —Å—É–º–º–∞ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–µ–π —Ä–∞–≤–Ω–∞ 1
assert np.allclose(softmax_output.sum(axis=1), 1.0), "–°—É–º–º–∞ softmax –¥–æ–ª–∂–Ω–∞ –±—ã—Ç—å —Ä–∞–≤–Ω–∞ 1"

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ –≤—Å–µ –∑–Ω–∞—á–µ–Ω–∏—è –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã–µ
assert np.all(softmax_output > 0), "–í—Å–µ –∑–Ω–∞—á–µ–Ω–∏—è softmax –¥–æ–ª–∂–Ω—ã –±—ã—Ç—å –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã–º–∏"
assert np.all(softmax_output < 1), "–í—Å–µ –∑–Ω–∞—á–µ–Ω–∏—è softmax –¥–æ–ª–∂–Ω—ã –±—ã—Ç—å –º–µ–Ω—å—à–µ 1"

print("‚úÖ Softmax —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω!")

# # –¢–µ—Å—Ç one-hot encoding
print("\nüè∑Ô∏è –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ One-hot encoding...")
labels = np.array([0, 2, 1, 0])
one_hot = one_hot_encode(labels, num_classes=3)

print(f"Labels: {labels}")
print(f"One-hot: \n{one_hot}")

# # –ü—Ä–æ–≤–µ—Ä–∏–º —Ñ–æ—Ä–º—É
assert one_hot.shape == (4, 3), "–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ñ–æ—Ä–º–∞ one-hot –∫–æ–¥–∏—Ä–æ–≤–∫–∏"

# # –ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ –∫–∞–∂–¥–∞—è —Å—Ç—Ä–æ–∫–∞ —Å–æ–¥–µ—Ä–∂–∏—Ç —Ä–æ–≤–Ω–æ –æ–¥–Ω—É –µ–¥–∏–Ω–∏—Ü—É
assert np.all(one_hot.sum(axis=1) == 1), "–ö–∞–∂–¥–∞—è —Å—Ç—Ä–æ–∫–∞ –¥–æ–ª–∂–Ω–∞ —Å–æ–¥–µ—Ä–∂–∞—Ç—å —Ä–æ–≤–Ω–æ –æ–¥–Ω—É –µ–¥–∏–Ω–∏—Ü—É"

print("‚úÖ One-hot encoding —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω!")

print("\nüéâ –í—Å–µ —Ç–µ—Å—Ç—ã —Ñ—É–Ω–∫—Ü–∏–π –ø–æ—Ç–µ—Ä—å –ø—Ä–æ–π–¥–µ–Ω—ã —É—Å–ø–µ—à–Ω–æ!")


üî• –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ CrossEntropyLoss...
Predictions: 
[[2.  1.  0.1]
 [1.  3.  0.2]]
Targets: [0 1]
CrossEntropy Loss: 0.2981
Gradient shape: (2, 3)
Gradient: 
[[-0.17049941  0.1212165   0.04928295]
 [ 0.05657142 -0.08199063  0.02541918]]
‚úÖ CrossEntropyLoss —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω!

üìä –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ MSELoss...
Predictions: 
[[1. 2.]
 [3. 4.]]
Targets: 
[[1.5 2.5]
 [2.5 3.5]]
MSE Loss: 0.2500
MSE Gradient shape: (2, 2)
MSE Gradient: 
[[-0.5 -0.5]
 [ 0.5  0.5]]
‚úÖ MSELoss —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω!

üéØ –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ Softmax...
Input: 
[[1. 2. 3.]
 [1. 1. 1.]]
Softmax output: 
[[0.09003057 0.24472848 0.66524094]
 [0.33333334 0.33333334 0.33333334]]
‚úÖ Softmax —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω!

üè∑Ô∏è –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ One-hot encoding...
Labels: [0 2 1 0]
One-hot: 
[[1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]
‚úÖ One-hot encoding —Ç–µ—Å—Ç –ø—Ä–æ–π–¥–µ–Ω!

üéâ –í—Å–µ —Ç–µ—Å—Ç—ã —Ñ—É–Ω–∫—Ü–∏–π –ø–æ—Ç–µ—Ä—å –ø—Ä–æ–π–¥–µ–Ω—ã —É—Å–ø–µ—à–Ω–æ!


In [21]:
class NeuralNetwork:
    def __init__(self):
        # TODO: –°–æ–∑–¥–∞–π—Ç–µ –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä—É –Ω–µ–π—Ä–æ–Ω–Ω–æ–π —Å–µ—Ç–∏
        self.model = Sequential(Linear(784, 512),
                                BatchNorm(512),
                                ReLU(),
                                Dropout(dropout_rate=0.5),
                                Linear(512, 256),
                                BatchNorm(256),
                                ReLU(),
                                Dropout(dropout_rate=0.3),
                                Linear(256, 128),
                                BatchNorm(128),
                                ReLU(),
                                Dropout(dropout_rate=0.2),
                                Linear(128, 10)

        )
    
    def forward(self, x):
        return self.model.forward(x)
    
    def backward(self, grad_output):
        return self.model.backward(grad_output)
    
    def train(self):
        self.model.train()
    
    def eval(self):
        self.model.eval()
    
    def get_trainable_layers(self):
        """
        –ü–æ–ª—É—á–µ–Ω–∏–µ –≤—Å–µ—Ö —Å–ª–æ–µ–≤ —Å –æ–±—É—á–∞–µ–º—ã–º–∏ –ø–∞—Ä–∞–º–µ—Ç—Ä–∞–º–∏
        """
        trainable_layers = []
        for layer in self.model.layers:
            if hasattr(layer, 'update_weights'):
                trainable_layers.append(layer)
        return trainable_layers

In [22]:
import torch
torch.cuda.is_available()

False

In [23]:
device = torch.device('cuda') 

In [25]:
model = NeuralNetwork()  ### –ù–µ –ø–æ–Ω—è–ª –∫–∞–∫ –Ω–∞ device –ø–µ—Ä–µ–Ω–µ—Å—Ç–∏
criterion = CrossEntropyLoss()
optimizer = Adam(learning_rate=0.001)

import pandas as pd
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
X = dataset.drop('label', axis=1).values.astype(np.float32) / 255.0
y = dataset['label'].values

# –†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –Ω–∞ –æ–±—É—á–∞—é—â—É—é –∏ –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω—É—é –≤—ã–±–æ—Ä–∫–∏
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=123)

print(f"–†–∞–∑–º–µ—Ä –æ–±—É—á–∞—é—â–µ–π –≤—ã–±–æ—Ä–∫–∏: {X_train.shape}")
print(f"–†–∞–∑–º–µ—Ä –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–æ–π –≤—ã–±–æ—Ä–∫–∏: {X_val.shape}")

X_train = X_train.astype(np.float32)
X_val = X_val.astype(np.float32)
y_train = y_train.astype(np.int64)
y_val = y_val.astype(np.int64)

#/kaggle/input/digit-recognizer

–†–∞–∑–º–µ—Ä –æ–±—É—á–∞—é—â–µ–π –≤—ã–±–æ—Ä–∫–∏: (33600, 784)
–†–∞–∑–º–µ—Ä –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–æ–π –≤—ã–±–æ—Ä–∫–∏: (8400, 784)


In [26]:
def create_batches(X, y, batch_size, shuffle=True):
    """–°–æ–∑–¥–∞–Ω–∏–µ –±–∞—Ç—á–µ–π –¥–ª—è –æ–±—É—á–µ–Ω–∏—è"""
    n_samples = X.shape[0]
    indices = np.arange(n_samples)
    
    if shuffle:
        np.random.shuffle(indices)
    
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch_indices = indices[start_idx:end_idx]
        yield X[batch_indices], y[batch_indices]

def calculate_accuracy(predictions, targets):
    """–í—ã—á–∏—Å–ª–µ–Ω–∏–µ —Ç–æ—á–Ω–æ—Å—Ç–∏"""
    predicted_classes = np.argmax(predictions, axis=1)
    return np.mean(predicted_classes == targets)

In [None]:
def train_model(model, optimizer, criterion, X_train, y_train, X_val, y_val, 
                num_epochs=30, batch_size=128, print_every=100, max_grad_norm=1.0):
    """
    –¶–∏–∫–ª —Ç—Ä–µ–Ω–∏—Ä–æ–≤–∫–∏ –Ω–µ–π—Ä–æ—Å–µ—Ç–∏
    """
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    
    for epoch in range(num_epochs):
        # –†–µ–∂–∏–º –æ–±—É—á–µ–Ω–∏—è
        model.train()
        epoch_train_loss = 0.0
        epoch_train_acc = 0.0
        num_batches = 0

        indices = np.random.permutation(len(X_train))
        X_train_shuffled = X_train[indices]
        y_train_shuffled = y_train[indices]
        
        # –û–±—É—á–µ–Ω–∏–µ –Ω–∞ –±–∞—Ç—á–∞—Ö
        for batch_X, batch_y in create_batches(X_train_shuffled, y_train_shuffled, batch_size):
            # Forward pass
            predictions = model.forward(batch_X)
            loss = criterion.forward(predictions, batch_y)
            
            # Backward pass
            grad_output = criterion.backward()
            model.backward(grad_output)
            
            # –û–±–Ω–æ–≤–ª–µ–Ω–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤
            total_norm = 0
            trainable_layers = model.get_trainable_layers()
            for layer in trainable_layers:
                if hasattr(layer, 'grad_weight') and layer.grad_weight is not None:
                    total_norm += np.sum(layer.grad_weight ** 2)
                if hasattr(layer, 'grad_bias') and layer.grad_bias is not None:
                    total_norm += np.sum(layer.grad_bias ** 2)
            total_norm = np.sqrt(total_norm)

            if total_norm > max_grad_norm:
                clip_coef = max_grad_norm / (total_norm + 1e-6)
                for layer in trainable_layers:
                    if hasattr(layer, 'grad_weight') and layer.grad_weight is not None:
                        layer.grad_weight *= clip_coef
                    if hasattr(layer, 'grad_bias') and layer.grad_bias is not None:
                        layer.grad_bias *= clip_coef

            for i, layer in enumerate(trainable_layers):
                optimizer.update(layer, f"layer_{i}")
            
            # –û–±–Ω—É–ª–µ–Ω–∏–µ –≥—Ä–∞–¥–∏–µ–Ω—Ç–æ–≤
            optimizer.zero_grad(model.model.layers)
            
            
            # –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞
            epoch_train_loss += loss
            epoch_train_acc += calculate_accuracy(predictions, batch_y)
            num_batches += 1
            
            if num_batches % print_every == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{num_batches}], "
                      f"Loss: {loss:.4f}, Accuracy: {calculate_accuracy(predictions, batch_y):.4f}")
        
        # –°—Ä–µ–¥–Ω–∏–µ –∑–Ω–∞—á–µ–Ω–∏—è –∑–∞ —ç–ø–æ—Ö—É
        epoch_train_loss /= num_batches
        epoch_train_acc /= num_batches
        
        # –í–∞–ª–∏–¥–∞—Ü–∏—è
        model.eval()
        val_predictions = model.forward(X_val)
        val_loss = criterion.forward(val_predictions, y_val)
        val_acc = calculate_accuracy(val_predictions, y_val)
        
        # –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏
        train_losses.append(epoch_train_loss)
        train_accuracies.append(epoch_train_acc)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)
        
        print(f"Epoch [{epoch+1}/{num_epochs}] - "
              f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_acc:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
    
    return train_losses, train_accuracies, val_losses, val_accuracies

# –ó–∞–ø—É—Å–∫ –æ–±—É—á–µ–Ω–∏—è
print("–ù–∞—á–∏–Ω–∞–µ–º –æ–±—É—á–µ–Ω–∏–µ...")
train_losses, train_accs, val_losses, val_accs = train_model(
    model, optimizer, criterion, X_train, y_train, X_val, y_val,
    num_epochs=40, batch_size=256, print_every=50
)

print("–û–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ!")

In [None]:
import pandas as pd
import numpy as np

test_data = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

X_test = test_data.values.astype(np.float32) / 255.0  # –ù–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è –∫ [0, 1]

In [None]:
def predict_on_test_data(model, X_test, batch_size=256):

    model.eval()
    
    all_predictions = []
    num_samples = X_test.shape[0]

    for start_idx in range(0, num_samples, batch_size):
        end_idx = min(start_idx + batch_size, num_samples)
        batch_X = X_test[start_idx:end_idx]

        batch_predictions = model.forward(batch_X)
        
        batch_probs = softmax(batch_predictions)
        
        batch_classes = np.argmax(batch_probs, axis=1)
        
        all_predictions.extend(batch_classes)
    
    return np.array(all_predictions)

In [None]:
def create_submission_file(predictions, filename='submission.csv'):
    submission = pd.DataFrame({
        'ImageId': range(1, len(predictions) + 1),  # ImageId –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è —Å 1
        'Label': predictions
    })

    submission.to_csv(filename, index=False)
    return submission

submission_df = create_submission_file(predictions, 'mnist_submission.csv')