Задача 1: BatchNorm

In [1]:
import numpy as np

class BatchNorm:
    """Batch normalization over the feature/channel axis.

    Supports 2-D inputs of shape (N, D) and 4-D inputs of shape (N, C, H, W).
    Both are rearranged internally into a (features, samples) matrix so that
    statistics are computed per feature along axis 1.

    Attributes:
        gamma, beta: learnable scale and shift, shape (num_features,).
        running_mean, running_var: exponential moving averages of the batch
            statistics, used when ``forward`` is called with ``train=False``.
        grad_gamma, grad_beta: parameter gradients, set by ``backward``.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum

        # Learnable parameters
        self.gamma = np.ones(num_features)
        self.beta = np.zeros(num_features)

        # Inference buffers (not trained)
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)

        # Values cached by the training forward pass for backward
        self.x_norm = None
        self.x_centered = None
        self.std_inv = None
        self.input_shape = None

    def _to_feature_major(self, arr):
        """Rearrange an array shaped like the last input to (features, samples)."""
        # Dispatch on the rank of the stored shape. Comparing shape[1] with
        # num_features cannot tell the two layouts apart, because C equals
        # num_features for every valid 4-D input as well.
        if len(self.input_shape) == 2:
            return arr.T  # (D, N)
        channels = self.input_shape[1]
        return arr.transpose(1, 0, 2, 3).reshape(channels, -1)  # (C, N*H*W)

    def _restore_shape(self, arr):
        """Inverse of ``_to_feature_major``: back to the original input layout."""
        if len(self.input_shape) == 2:
            return arr.T
        N, C, H, W = self.input_shape
        return arr.reshape(C, N, H, W).transpose(1, 0, 2, 3)

    def forward(self, x, train=True):
        """Normalize ``x`` and apply the affine transform.

        Args:
            x: array of shape (N, D) or (N, C, H, W).
            train: if True, use batch statistics, update the running
                averages and cache intermediates for ``backward``; otherwise
                normalize with the running averages.

        Returns:
            Array with the same shape as ``x``.

        Raises:
            ValueError: if ``x`` is neither 2-D nor 4-D.
        """
        self.input_shape = x.shape
        if x.ndim not in (2, 4):
            raise ValueError("Unsupported input shape")

        x = self._to_feature_major(x)
        axis = 1

        if train:
            batch_mean = np.mean(x, axis=axis, keepdims=True)  # (C, 1)
            batch_var = np.var(x, axis=axis, keepdims=True)    # (C, 1)

            # Update the running statistics
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean.flatten()
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var.flatten()

            self.x_centered = x - batch_mean
            self.std_inv = 1.0 / np.sqrt(batch_var + self.eps)
            self.x_norm = self.x_centered * self.std_inv

            out = self.gamma[:, None] * self.x_norm + self.beta[:, None]
        else:
            # Use the accumulated running averages
            x_norm = (x - self.running_mean[:, None]) / np.sqrt(self.running_var[:, None] + self.eps)
            out = self.gamma[:, None] * x_norm + self.beta[:, None]

        return self._restore_shape(out)

    def backward(self, grad_output):
        """Backpropagate through the most recent training-mode forward pass.

        Relies on the intermediates cached by ``forward(..., train=True)``.

        Args:
            grad_output: gradient w.r.t. the output, same shape as the input
                given to ``forward``.

        Returns:
            Gradient w.r.t. the input, same shape as ``grad_output``.
            ``grad_gamma`` and ``grad_beta`` are stored on the instance.
        """
        grad_output = self._to_feature_major(grad_output)

        # Number of samples contributing to each feature's statistics
        N = grad_output.shape[1]

        # Gradients w.r.t. gamma and beta
        dgamma = np.sum(grad_output * self.x_norm, axis=1)
        dbeta = np.sum(grad_output, axis=1)

        # Gradient w.r.t. the input, chained through x_norm, variance and mean
        dx_norm = grad_output * self.gamma[:, None]
        dvar = np.sum(dx_norm * self.x_centered, axis=1, keepdims=True) * (-0.5) * (self.std_inv ** 3)
        dmean = np.sum(dx_norm * (-self.std_inv), axis=1, keepdims=True) + dvar * np.mean(-2.0 * self.x_centered, axis=1, keepdims=True)
        dx = dx_norm * self.std_inv + dvar * (2.0 * self.x_centered) / N + dmean / N

        dx = self._restore_shape(dx)

        # Keep the parameter gradients (e.g. for an optimizer step)
        self.grad_gamma = dgamma
        self.grad_beta = dbeta

        return dx

Задача 2: Linear

In [2]:
class Linear:
    """Fully connected layer computing ``x @ weight.T + bias``.

    Attributes:
        weight: array of shape (out_features, in_features).
        bias: boolean flag telling whether a bias term is used.
        bias_param: array of shape (out_features,), or None without bias.
        grad_weight, grad_bias: parameter gradients, set by ``backward``
            (``grad_bias`` only when the layer has a bias).
    """

    def __init__(self, in_features, out_features, bias=True):
        self.in_features = in_features
        self.out_features = out_features
        self.bias = bias

        # Weight initialisation: PyTorch defaults to Kaiming uniform; for
        # simplicity a scaled normal distribution is used here.
        self.weight = np.random.randn(out_features, in_features) * np.sqrt(2.0 / in_features)
        if bias:
            self.bias_param = np.zeros(out_features)
        else:
            self.bias_param = None

        self.x = None  # input cached for backward

    def forward(self, x):
        """Apply the affine map to ``x`` of shape (..., in_features).

        Returns:
            Array of shape (..., out_features).
        """
        self.x = x  # keep the input for backward
        out = x @ self.weight.T
        if self.bias:
            out += self.bias_param
        return out

    def backward(self, grad_output):
        """Backpropagate through the most recent ``forward`` call.

        Args:
            grad_output: gradient w.r.t. the output, shape (..., out_features).

        Returns:
            Gradient w.r.t. the input, shape (..., in_features).
            ``grad_weight`` (and ``grad_bias`` when the layer has a bias) are
            stored on the instance.
        """
        dx = grad_output @ self.weight  # (..., in_features)

        # Collapse all leading batch dimensions so the parameter gradients are
        # summed over every sample, matching the inputs forward() accepts
        # (1-D, 2-D, or higher-rank batches).
        grad_2d = np.reshape(grad_output, (-1, self.out_features))
        x_2d = np.reshape(self.x, (-1, self.in_features))

        dW = grad_2d.T @ x_2d  # (out_features, in_features)
        db = np.sum(grad_2d, axis=0) if self.bias else None

        self.grad_weight = dW
        if self.bias:
            self.grad_bias = db

        return dx

Задача 3: Dropout

In [3]:
class Dropout:
    """Inverted dropout: zero elements with probability ``p`` during training.

    Kept elements are divided by ``1 - p`` so the expected activation is
    unchanged and no rescaling is needed at inference time.
    """

    def __init__(self, p=0.5):
        """
        Args:
            p: probability of dropping an element, in [0, 1].

        Raises:
            ValueError: if ``p`` is outside [0, 1].
        """
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
        self.p = p
        self.mask = None

    def _zeros_like_scaled(self, arr):
        """Zeros with the dtype that ``arr * mask / (1 - p)`` would produce."""
        return np.zeros_like(arr, dtype=np.result_type(arr, 0.0))

    def forward(self, x, train=True):
        """Apply dropout to ``x``.

        Args:
            x: input array.
            train: if False (or if ``p == 0``) the input is returned unchanged.

        Returns:
            Array with the same shape as ``x``.
        """
        if not train or self.p == 0:
            # Forget any mask from an earlier training pass so that backward()
            # does not apply a stale mask to this pass-through forward.
            self.mask = None
            return x

        # Mask is True with probability (1 - p)
        self.mask = (np.random.rand(*x.shape) > self.p)

        if self.p == 1:
            # Everything is dropped; dividing by (1 - p) == 0 would give NaN.
            return self._zeros_like_scaled(x)

        # Scale by 1 / (1 - p) to preserve the expected value
        return x * self.mask / (1.0 - self.p)

    def backward(self, grad_output):
        """Propagate ``grad_output`` through the mask of the last forward pass."""
        if self.mask is None:
            return grad_output
        if self.p == 1:
            return self._zeros_like_scaled(grad_output)
        return grad_output * self.mask / (1.0 - self.p)

Задача 4: Активации

In [4]:
class ReLU:
    """Rectified linear unit: element-wise ``max(0, x)``."""

    def __init__(self):
        # Input of the most recent forward pass, needed by backward.
        self.x = None

    def forward(self, x):
        """Return ``x`` with negative entries replaced by zero; caches ``x``."""
        rectified = np.maximum(0, x)
        self.x = x
        return rectified

    def backward(self, grad_output):
        """Let the gradient through only where the cached input was positive."""
        is_positive = self.x > 0
        return grad_output * is_positive

Sigmoid

In [5]:
class Sigmoid:
    """Logistic sigmoid activation, ``1 / (1 + exp(-x))``."""

    def __init__(self):
        # Output of the most recent forward pass, needed by backward.
        self.out = None

    def forward(self, x):
        """Return the element-wise sigmoid of ``x`` and cache it.

        np.where evaluates both of its branch arguments for every element, so
        calling exp(-x) and exp(x) directly overflows for large |x| even
        though the overflowing value is discarded. exp(-|x|) never exceeds 1
        and equals exp(-x) for x >= 0 and exp(x) for x < 0, so both branches
        can share it safely.
        """
        decay = np.exp(-np.abs(x))
        self.out = np.where(x >= 0,
                            1 / (1 + decay),
                            decay / (1 + decay))
        return self.out

    def backward(self, grad_output):
        """Chain rule with d(sigmoid)/dx = out * (1 - out)."""
        return grad_output * self.out * (1 - self.out)

Softmax

In [6]:
class Softmax:
    """Softmax activation along a chosen axis."""

    def __init__(self):
        # Output and axis of the most recent forward pass, needed by backward.
        self.out = None
        self.axis = -1

    def forward(self, x, axis=-1):
        """Return the softmax of ``x`` along ``axis`` and cache the result.

        The per-slice maximum is subtracted before exponentiation so that
        exp() cannot overflow; this does not change the result.
        """
        self.axis = axis
        x_max = np.max(x, axis=axis, keepdims=True)
        exp_x = np.exp(x - x_max)
        self.out = exp_x / np.sum(exp_x, axis=axis, keepdims=True)
        return self.out

    def backward(self, grad_output, axis=None):
        """Backpropagate through the most recent ``forward`` call.

        Args:
            grad_output: gradient w.r.t. the output, same shape as ``self.out``.
            axis: axis the softmax was taken over. When omitted, the axis used
                in the last ``forward`` call is reused, so the two passes
                cannot silently disagree.

        Returns:
            Gradient w.r.t. the input, same shape as ``grad_output``.
        """
        if axis is None:
            axis = self.axis

        # Softmax Jacobian: J_ik = s_i (δ_ik - s_k)
        # => grad_input_i = s_i (grad_output_i - sum_k grad_output_k s_k)
        s = self.out
        sum_s_grad = np.sum(grad_output * s, axis=axis, keepdims=True)
        return s * (grad_output - sum_s_grad)