# ДЗ_10: Нейронные сети

In [201]:
import torch

## Задание 1: Написать на PyTorch forward и backward полносвязного слоя без использования autograd

In [237]:
def sigmoid(x):
    return 1. / (1 + torch.exp(-x))

def sigmoid_backward(da, x):
    sig = sigmoid(x)

    return da * sig * (1 - sig)

def relu(x):
    return torch.maximum(torch.zeros(1), x)

def relu_backward(da, x):
    da = torch.tensor(da)
    da[x <= 0] = 0
    return da

In [203]:
def mse_loss(t, y):
    return (t - y) ** 2

def d_mse_loss(t, y):
    return 2 * (y - t)


In [212]:
class LinearLayer:
    def __init__(self, n_inp, n_out, activation='sigmoid'):
        self.w = torch.rand(n_out, n_inp) * 0.1
        self.b = torch.rand(n_out, 1) * 0.1
        #print(n_inp, n_out, activation)
        if activation == 'sigmoid':
            self.activ = sigmoid
        elif activation == 'relu':
            self.activ = relu
        elif activation == 'None':
            self.activ = None
        else:
            raise Exception(f'Unknown activation "{activation}"')
        self._clear_state()

    def _clear_state(self):
        self.lin = None
        self.inp = None
        self.d_w = None
        self.d_b = None

    def forward(self, x):
        self.inp = x
        self.lin = self.w @ x + self.b
        activ = self.activ(self.lin) if self.activ is not None else self.lin

        return activ

    def backward(self, grad): # grad = d L / d z    Dout
        # grad * dz / d lin
        if self.activ == sigmoid:
            grad_lin = sigmoid_backward(grad, self.lin)
        elif self.activ == relu:
            grad_lin = relu_backward(grad, self.lin)
        else:
            grad_lin = grad
        # grad_lin * d lin / d w
        m = self.inp.shape[1]
        self.d_w = grad_lin @ self.inp.t() / m    # d_in dOut
        # grad_lin * d lin / d b
        self.d_b = torch.sum(grad_lin, axis=1, keepdims=True) / m
        grad = self.w.t() @ grad_lin

        return grad

# pred = model(x)
# loss = criterion(pred, target)
# grad = d loss / d pred
# model.backward(grad)

In [239]:
from typing import Tuple

class Model:
    def __init__(self, arch: Tuple[Tuple[int, int]], activation):
        self.layers = []
        for i, p in enumerate(arch):
            #print(i, p, activation)
            self.layers.append(
                LinearLayer(p[0], p[1], activation=activation if i < len(arch)-1 else 'None')
            )
        self._clear_state()

    def _clear_state(self):
        for l in self.layers:
            l._clear_state()

    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)

        return x

    def backward(self, grad):
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

        return grad

## Задание 2: Написать 1-2 адаптивных оптимизатора

In [252]:
# SGD Momentum optimizer для всей модели
# velocity = momentum * velocity - lr * gradient
# w = w + velocity
class SGDMomentum:
    def __init__(self, model: Model, lr= 0.0001, momentum=0.99):
        self.model = model
        self.lr = lr
        self.m = momentum
        self.vel = [[torch.zeros_like(layer.w),
                     torch.zeros_like(layer.b)] for layer in model.layers]

    def step(self):
        for i, layer in enumerate(self.model.layers):
            self.vel[i][0] = self.vel[i][0] * self.m - self.lr * layer.d_w
            self.vel[i][1] = self.vel[i][1] * self.m - self.lr * layer.d_b
            layer.w += self.vel[i][0]
            layer.b += self.vel[i][1]

    def zero_grad(self):
        self.model._clear_state()

In [253]:
# Adagrad optimizer для всей модели
# accumulated += gradient ** 2
# adapt_lr = lr/sqrt(accumulated)
# w = w - adapt_lr * gradient

class Adagrad:
    def __init__(self, model: Model, lr= 0.0001):
        self.model = model
        self.lr = lr
        self.acc = [[torch.zeros_like(layer.w),
                     torch.zeros_like(layer.b)] for layer in model.layers]

    def step(self):
        for i, layer in enumerate(self.model.layers):
            self.acc[i][0] += layer.d_w ** 2
            self.acc[i][1] += layer.d_b ** 2
            adapt_lr_w = self.lr / torch.sqrt(self.acc[i][0])
            adapt_lr_b = self.lr / torch.sqrt(self.acc[i][1])
            layer.w -= adapt_lr_w * layer.d_w
            layer.b -= adapt_lr_b * layer.d_b


    def zero_grad(self):
        self.model._clear_state()

In [277]:
# RMSProp optimizer для всей модели
# accumulated = rho * accumulated + (1 - rho) * gradient ** 2
# adapt_lr = lr/sqrt(accumulated)
# w = w - adapt_lr * gradient

class RMSProp:
    def __init__(self, model: Model, lr= 0.0001, rho= 0.99):
        self.model = model
        self.lr = lr
        self.rho = rho
        self.acc = [[torch.zeros_like(layer.w),
                     torch.zeros_like(layer.b)] for layer in model.layers]

    def step(self):
        for i, layer in enumerate(self.model.layers):
            self.acc[i][0] = self.rho * self.acc[i][0] + (1 - self.rho) * layer.d_w ** 2
            self.acc[i][1] = self.rho * self.acc[i][1] + (1 - self.rho) * layer.d_b ** 2
            adapt_lr_w = self.lr / torch.sqrt(self.acc[i][0])
            adapt_lr_b = self.lr / torch.sqrt(self.acc[i][1])
            layer.w -= adapt_lr_w * layer.d_w
            layer.b -= adapt_lr_b * layer.d_b


    def zero_grad(self):
        self.model._clear_state()

In [280]:
# Adam optimizer для всей модели
# velocity = beta1 * velocity + (1 - beta1) * gradient
# accumulated = beta2 * accumulated + (1 - beta2) * gradient ** 2
# adapt_lr = lr/sqrt(accumulated)
# w = w - adapt_lr * velocity

class Adam:
    def __init__(self, model: Model, lr= 0.0001, beta1 = 0.99, beta2 = 0.99):
        self.model = model
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.acc = [[torch.zeros_like(layer.w),
                     torch.zeros_like(layer.b)] for layer in model.layers]
        self.vel = [[torch.zeros_like(layer.w),
                     torch.zeros_like(layer.b)] for layer in model.layers]

    def step(self):
        for i, layer in enumerate(self.model.layers):
            self.vel[i][0] = self.vel[i][0] * self.beta1 + (1 - self.beta1) * layer.d_w
            self.vel[i][1] = self.vel[i][1] * self.beta1 + (1 - self.beta1) * layer.d_b
            self.acc[i][0] = self.beta2 * self.acc[i][0] + (1 - self.beta2) * layer.d_w ** 2
            self.acc[i][1] = self.beta2 * self.acc[i][1] + (1 - self.beta2) * layer.d_b ** 2
            adapt_lr_w = self.lr / torch.sqrt(self.acc[i][0])
            adapt_lr_b = self.lr / torch.sqrt(self.acc[i][1])
            layer.w -= adapt_lr_w * self.vel[i][0]
            layer.b -= adapt_lr_b * self.vel[i][1]


    def zero_grad(self):
        self.model._clear_state()

In [254]:
# pred = model(x)
# loss = criterion(pred, target)
# grad = d loss / d pred
# model.backward(grad)
# optim.step()

In [255]:
x = (torch.rand(2000)-0.5)*4
y = x**2 + torch.randn(1)*0.1


In [272]:
model1 = Model(((1, 100), (100, 1)), activation='sigmoid')
optim_sgd = SGDMomentum(model1, lr=0.001)
for e in range(20):
    for i, (val, t) in enumerate(zip(x, y)):
        optim_sgd.zero_grad()
        pred = model1.forward(torch.tensor([[val]]))
        loss = mse_loss(t, pred)
        grad = d_mse_loss(t, pred)
        model1.backward(grad)
        optim_sgd.step()

    print(e, model1.forward(torch.tensor([[1.]])), model1.forward(torch.tensor([[2.]])), model1.forward(torch.tensor([[-1.]])), model1.forward(torch.tensor([[-2.]])))

0 tensor([[0.7576]]) tensor([[3.4543]]) tensor([[1.0391]]) tensor([[3.1115]])
1 tensor([[0.9312]]) tensor([[3.7176]]) tensor([[0.9092]]) tensor([[3.6250]])
2 tensor([[0.9225]]) tensor([[3.8022]]) tensor([[0.9177]]) tensor([[3.7616]])
3 tensor([[0.9215]]) tensor([[3.8281]]) tensor([[0.9252]]) tensor([[3.8132]])
4 tensor([[0.9229]]) tensor([[3.8399]]) tensor([[0.9285]]) tensor([[3.8345]])
5 tensor([[0.9246]]) tensor([[3.8475]]) tensor([[0.9303]]) tensor([[3.8465]])
6 tensor([[0.9261]]) tensor([[3.8532]]) tensor([[0.9315]]) tensor([[3.8551]])
7 tensor([[0.9274]]) tensor([[3.8579]]) tensor([[0.9323]]) tensor([[3.8620]])
8 tensor([[0.9285]]) tensor([[3.8620]]) tensor([[0.9330]]) tensor([[3.8679]])
9 tensor([[0.9294]]) tensor([[3.8655]]) tensor([[0.9336]]) tensor([[3.8729]])
10 tensor([[0.9302]]) tensor([[3.8686]]) tensor([[0.9341]]) tensor([[3.8774]])
11 tensor([[0.9309]]) tensor([[3.8714]]) tensor([[0.9344]]) tensor([[3.8813]])
12 tensor([[0.9316]]) tensor([[3.8739]]) tensor([[0.9348]]) te

In [273]:
model2 = Model(((1, 100), (100, 1)), activation='sigmoid')
optim_ada = Adagrad (model2, lr=0.001)
for e in range(20):
    for i, (val, t) in enumerate(zip(x, y)):
        optim_ada.zero_grad()
        pred = model2.forward(torch.tensor([[val]]))
        loss = mse_loss(t, pred)
        grad = d_mse_loss(t, pred)
        model2.backward(grad)
        optim_ada.step()

    print(e, model2.forward(torch.tensor([[1.]])), model2.forward(torch.tensor([[2.]])), model2.forward(torch.tensor([[-1.]])), model2.forward(torch.tensor([[-2.]])))

0 tensor([[1.3472]]) tensor([[1.3716]]) tensor([[1.2982]]) tensor([[1.2737]])
1 tensor([[1.3097]]) tensor([[1.3326]]) tensor([[1.2638]]) tensor([[1.2409]])
2 tensor([[1.3005]]) tensor([[1.3226]]) tensor([[1.2562]]) tensor([[1.2340]])
3 tensor([[1.2966]]) tensor([[1.3181]]) tensor([[1.2534]]) tensor([[1.2318]])
4 tensor([[1.2943]]) tensor([[1.3154]]) tensor([[1.2520]]) tensor([[1.2309]])
5 tensor([[1.2927]]) tensor([[1.3134]]) tensor([[1.2512]]) tensor([[1.2305]])
6 tensor([[1.2915]]) tensor([[1.3119]]) tensor([[1.2507]]) tensor([[1.2303]])
7 tensor([[1.2905]]) tensor([[1.3106]]) tensor([[1.2504]]) tensor([[1.2303]])
8 tensor([[1.2897]]) tensor([[1.3094]]) tensor([[1.2501]]) tensor([[1.2304]])
9 tensor([[1.2890]]) tensor([[1.3084]]) tensor([[1.2500]]) tensor([[1.2305]])
10 tensor([[1.2883]]) tensor([[1.3075]]) tensor([[1.2499]]) tensor([[1.2306]])
11 tensor([[1.2878]]) tensor([[1.3067]]) tensor([[1.2498]]) tensor([[1.2308]])
12 tensor([[1.2872]]) tensor([[1.3059]]) tensor([[1.2497]]) te

In [279]:
model3 = Model(((1, 100), (100, 1)), activation='sigmoid')
optim_rms = RMSProp(model3, lr=0.001)
for e in range(20):
    for i, (val, t) in enumerate(zip(x, y)):
        optim_rms.zero_grad()
        pred = model3.forward(torch.tensor([[val]]))
        loss = mse_loss(t, pred)
        grad = d_mse_loss(t, pred)
        model3.backward(grad)
        optim_rms.step()

    print(e, model3.forward(torch.tensor([[1.]])), model3.forward(torch.tensor([[2.]])), model3.forward(torch.tensor([[-1.]])), model3.forward(torch.tensor([[-2.]])))

0 tensor([[1.1976]]) tensor([[1.2030]]) tensor([[1.1868]]) tensor([[1.1814]])
1 tensor([[1.1923]]) tensor([[1.1889]]) tensor([[1.1993]]) tensor([[1.2028]])
2 tensor([[1.1945]]) tensor([[1.1885]]) tensor([[1.2066]]) tensor([[1.2125]])
3 tensor([[1.1986]]) tensor([[1.1918]]) tensor([[1.2125]]) tensor([[1.2195]])
4 tensor([[1.2023]]) tensor([[1.1945]]) tensor([[1.2189]]) tensor([[1.2276]])
5 tensor([[1.2001]]) tensor([[1.1887]]) tensor([[1.2293]]) tensor([[1.2465]])
6 tensor([[1.1692]]) tensor([[1.1534]]) tensor([[1.2552]]) tensor([[1.3177]])
7 tensor([[1.0922]]) tensor([[1.1112]]) tensor([[1.3035]]) tensor([[1.4962]])
8 tensor([[1.0605]]) tensor([[1.2587]]) tensor([[1.3149]]) tensor([[1.7102]])
9 tensor([[1.1019]]) tensor([[1.7249]]) tensor([[1.2392]]) tensor([[2.0335]])
10 tensor([[1.1591]]) tensor([[2.4716]]) tensor([[1.1000]]) tensor([[2.6737]])
11 tensor([[1.1823]]) tensor([[3.0212]]) tensor([[1.0002]]) tensor([[3.2392]])
12 tensor([[1.1593]]) tensor([[3.1362]]) tensor([[0.9631]]) te

In [281]:
model4 = Model(((1, 100), (100, 1)), activation='sigmoid')
optim_adam = Adam(model4, lr=0.001)
for e in range(20):
    for i, (val, t) in enumerate(zip(x, y)):
        optim_adam.zero_grad()
        pred = model4.forward(torch.tensor([[val]]))
        loss = mse_loss(t, pred)
        grad = d_mse_loss(t, pred)
        model4.backward(grad)
        optim_adam.step()

    print(e, model4.forward(torch.tensor([[1.]])), model4.forward(torch.tensor([[2.]])), model4.forward(torch.tensor([[-1.]])), model4.forward(torch.tensor([[-2.]])))

0 tensor([[1.2384]]) tensor([[1.2441]]) tensor([[1.2270]]) tensor([[1.2213]])
1 tensor([[1.2263]]) tensor([[1.2230]]) tensor([[1.2328]]) tensor([[1.2361]])
2 tensor([[1.2219]]) tensor([[1.2163]]) tensor([[1.2331]]) tensor([[1.2386]])
3 tensor([[1.2200]]) tensor([[1.2142]]) tensor([[1.2319]]) tensor([[1.2379]])
4 tensor([[1.2205]]) tensor([[1.2165]]) tensor([[1.2295]]) tensor([[1.2345]])
5 tensor([[1.2309]]) tensor([[1.2411]]) tensor([[1.2201]]) tensor([[1.2208]])
6 tensor([[1.2712]]) tensor([[1.3424]]) tensor([[1.1875]]) tensor([[1.1898]])
7 tensor([[1.3294]]) tensor([[1.5286]]) tensor([[1.1475]]) tensor([[1.2234]])
8 tensor([[1.3336]]) tensor([[1.7530]]) tensor([[1.1644]]) tensor([[1.4988]])
9 tensor([[1.2958]]) tensor([[2.2393]]) tensor([[1.2439]]) tensor([[2.1038]])
10 tensor([[1.2091]]) tensor([[3.0710]]) tensor([[1.3141]]) tensor([[2.9273]])
11 tensor([[1.0799]]) tensor([[3.4133]]) tensor([[1.2526]]) tensor([[3.1776]])
12 tensor([[1.0373]]) tensor([[3.4970]]) tensor([[1.2309]]) te

## Задание 3: Решить задачу нахождения корней квадратного уравнения методом градиентного спуска

In [2]:
# Task 1
# Find the roots of square equation by gradient descent
# x ** 2 - 6 * x + 4 = 0


In [3]:
# возвести в квадрат
# посчитать производную
# надо начать движение от начальной точки в направлении антградиента с заданным шагом
# x = x - lr * grad(x)
# всегда ли сойдемся за приемлемое количество шагов?
# важна ли начальная точка?
# как найти второй корень?
# как вляет ЛР?

In [349]:
# ax2 + bx + c = 0
a = 1
b = -6
c = 4

# ax2 + bx + c = 0
# производная ф-ции 2ax + b
# То минимум ф-ции ax2 + bx + c находится в точке x = -b/2a. Тогда двигаясь вправо или влево можно найти 2 корня уравнения.

# для нахождения корней возведем ф-цию в квадрат
# (ax2 + bx + c)2 = 0
# a2x4 + b2x2 + c2 + 2abx3 + 2acx2 + 2bcx = 0
# найдем производную
# 4a2x3 + 2b2x + 6abx2 + 4acx + 2bc = 0
# 4a2x3 + 6abx2 + (2b2+4ac)x + 2bc = 0

def f(x):
    return a * x**2 + b * x + c

def grad(x):
    return 4 * a**2 * x**3 + 6 * a * b * x**2 + (2 * b**2 + 4 * a * c) * x + 2 * b * c

centr = -b / 2 / a
lr = 0.01
x1 = centr - lr
accuracy = 0.0001
e = 0
x = x1 - lr * grad(x1)
while abs(x - x1) > accuracy:
    x1 = x
    #print(e, x1)
    e += 1
    x = x1 - lr * grad(x1)
print(e, x1)

47 0.7641048017066163


In [350]:
x2 = centr + lr
e = 0
x = x2 - lr * grad(x2)
while abs(x - x2) > accuracy:
    x2 = x
    #print(e, x2)
    e += 1
    x = x2 - lr * grad(x2)
print(e, x2)


47 5.235895198293384


In [None]:
# всегда ли сойдемся за приемлемое количество шагов?
# Ответ: зависит от LR и заданной точности

# важна ли начальная точка?
# Ответ: для нахождения обоих корней начальная точка важна, от этого зависит в какой минимум будем спускаться


# как найти второй корень?
# Ответ: тк ф-ция симметрична, найдем центр и будем спускаться вправо и влево

# как вляет ЛР?
# Ответ: влияет на скорость спуска
