In [98]:
import numpy as np
import torch

In [74]:
# Task 1
# Find the roots of square equation by gradient descent
# x ** 2 - 6 * x + 4 = 0


In [75]:
#Найдем сначала корни аналитически.

In [76]:
x1 = (6 + 20**0.5)/2
x1

5.23606797749979

In [77]:
x2 = (6 - 20**0.5)/2
x2

0.7639320225002102

In [94]:
#а теперь перейдем к градиентному спуску.

In [79]:
lr = 0.0001

In [80]:
# x = 5
# f' = 4 * x**3 - 36 * x**2 + 88*x - 48

In [81]:
# x = 10
f = 123
xs = []
for x in range(-50,50,10):
    f = 123
    while round(f,4) != 0:
        f = 4 * x**3 - 36 * x**2 + 88*x - 48
        x = x - lr * f
    print(x)
    xs.append(round(x,3))
x12 = set(xs)
  

5.236069220150704
0.7639307791419458
0.7639307812727587
0.7639307820614789
0.7639307777509723
0.7639307806919634
5.236069220260979
5.2360692207463835
5.2360692194085665
5.236069220349526


In [82]:
print(x12)

{0.764, 5.236}


In [83]:
# оба корня выше. Если большие точки брать, или большие шаги, то получается переполнение при расчете из-за больших значений. 
# и мы не сходимся к корням, а наоборот - расходимся. из разных точек мы можем прийти к разным корням.

In [84]:
# возвести в квадрат
# посчитать производную
# надо начать движение от начальной точки в направлении антградиента с заданным шагом
# x = x - lr * grad(x)
# всегда ли сойдемся за приемлемое количество шагов?
# важна ли начальная точка?
# как найти второй корень?
# как вляет ЛР?

In [85]:
# Task 2
# Realize forward and backward pass for linear layer with sigmoid activation

In [86]:
def sigmoid(x):
    return 1. / (1 + torch.exp(-x))

def sigmoid_backward(da, x):
    sig = sigmoid(x)
    
    return da * sig * (1 - sig)

In [88]:
def mse_loss(t, y):
    return (t - y) ** 2

def d_mse_loss(t, y):
    return 2 * (y - t)

In [277]:
class LinearLayer:
    def __init__(self, n_inp, n_out, activation='sigmoid'):
        self.w = torch.randn(n_out, n_inp) * 0.1
        self.b = torch.randn(n_out, 1) * 0.1
        if activation == 'sigmoid':
            self.activ = sigmoid
        elif activation == 'None':
            self.activ = None
        else:
            raise Exception(f'Unknown activation "{activation}"')
        self._clear_state()

    def _clear_state(self):
        self.lin = None
        self.inp = None
        self.d_w = None
        self.d_b = None

    def forward(self, x):
        self.inp = x
        self.lin = self.w @ self.inp + self.b
        activ = self.activ(self.lin) if self.activ is not None else self.lin
#         print(self.w.shape)
        return activ

    def backward(self, grad): # grad = d L / d z    Dout 
        # grad * dz / d lin
        if self.activ == sigmoid:
            grad_lin = sigmoid_backward(grad, self.lin) 
        else:
            grad_lin = grad
        # grad_lin * d lin / d w 
        m = self.inp.shape[1]
        self.d_w = grad_lin @ self.inp.T / m    # d_in dOut
        # grad_lin * d lin / d b 
        self.d_b = torch.sum(grad_lin, axis=1, keepdims=True) / m

        grad = self.w.T @ grad_lin

        return grad

In [158]:
from typing import Tuple

class Model:
    def __init__(self, arch: Tuple[Tuple[int, int]], activation):
        self.layers = []
        for i, p in enumerate(arch):
            self.layers.append(
                LinearLayer(p[0], p[1], 
                            activation=activation if i < len(arch)-1 else 'None')
                )
        self._clear_state()
    
    def _clear_state(self):
        for l in self.layers:
            l._clear_state()

    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        
        return x

    def backward(self, grad):
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

        return grad 

In [280]:
class Adagrad_full:
    def __init__(self, model: Model, lr=0.1):
        self.lr = lr
        self.model = model
#         self.ac_w = np.zeros_like(model.w)
#         self.ac_b = np.zeros_like(model.b)
        
        self.acc = [[torch.zeros_like(layer.w), 
                     torch.zeros_like(layer.b)] for layer in self.model.layers]
        
    def step(self):
        for i, layer in enumerate(self.model.layers):
            self.acc[i][0] = self.acc[i][0] + layer.d_w**2
            self.acc[i][1] = self.acc[i][1] + layer.d_b**2
            
            adapt_lr_w = self.lr/torch.sqrt(self.acc[i][0])
            adapt_lr_b = self.lr/torch.sqrt(self.acc[i][1])
            
            layer.w = layer.w - layer.d_w * adapt_lr_w
            layer.b = layer.b - layer.d_b * adapt_lr_b
            
    def zero_grad(self):
        self.model._clear_state()

In [162]:
# pred = model(x)
# loss = criterion(pred, target)
# grad = d loss / d pred
# model.backward(grad)
# optim.step()

In [207]:
x = torch.rand(2000)*4-2
y = x**2 + (torch.rand(2000)*0.1 - 0.1)

In [133]:
# import matplotlib.pyplot as plt

In [208]:
# plt.scatter(x,y)

In [153]:
x

tensor([ 1.8311,  1.7189,  1.3752,  ...,  0.5330, -1.5569, -1.6533])

In [281]:
model = Model(((1, 100), (100, 1)), activation='sigmoid')
optim = Adagrad_full(model)
for e in range(20):
    for i, (val, t) in enumerate(zip(x, y)):
        optim.zero_grad()
        pred = model.forward(torch.tensor([[val]]))
        loss = mse_loss(t, pred)
        grad = d_mse_loss(t, pred)
        model.backward(grad)
        optim.step()
        
    print(e, model.forward(torch.tensor([[1.0]])), model.forward(torch.tensor([[2.0]])), 
          model.forward(torch.tensor([[-1.0]])), model.forward(torch.tensor([[-2.0]]))
         )

0 tensor([[1.3466]]) tensor([[2.0721]]) tensor([[1.2433]]) tensor([[1.8796]])
1 tensor([[1.3159]]) tensor([[2.9819]]) tensor([[1.0919]]) tensor([[3.0629]])
2 tensor([[1.2587]]) tensor([[3.1941]]) tensor([[1.0310]]) tensor([[3.3597]])
3 tensor([[1.2172]]) tensor([[3.2818]]) tensor([[1.0102]]) tensor([[3.4508]])
4 tensor([[1.1821]]) tensor([[3.3476]]) tensor([[1.0067]]) tensor([[3.4988]])
5 tensor([[1.1518]]) tensor([[3.4022]]) tensor([[1.0081]]) tensor([[3.5310]])
6 tensor([[1.1259]]) tensor([[3.4480]]) tensor([[1.0096]]) tensor([[3.5567]])
7 tensor([[1.1039]]) tensor([[3.4869]]) tensor([[1.0097]]) tensor([[3.5794]])
8 tensor([[1.0852]]) tensor([[3.5202]]) tensor([[1.0087]]) tensor([[3.6002]])
9 tensor([[1.0691]]) tensor([[3.5490]]) tensor([[1.0067]]) tensor([[3.6193]])
10 tensor([[1.0552]]) tensor([[3.5742]]) tensor([[1.0042]]) tensor([[3.6369]])
11 tensor([[1.0431]]) tensor([[3.5964]]) tensor([[1.0013]]) tensor([[3.6532]])
12 tensor([[1.0325]]) tensor([[3.6161]]) tensor([[0.9982]]) te

In [332]:
for i in range(-5,6):
    print(f'x = {i}, y_pred = {model.forward(torch.tensor([[float(i)]]))[0][0].numpy()}, y = {i**2}')
    #Где учили, там и выдает более ли менее приемлемые результаты. А за пределами совсем мимо.

x = -5, y_pred = 5.413887977600098, y = 25
x = -4, y_pred = 5.553648948669434, y = 16
x = -3, y_pred = 5.35898494720459, y = 9
x = -2, y_pred = 3.745281934738159, y = 4
x = -1, y_pred = 0.9768393039703369, y = 1
x = 0, y_pred = -0.08130151033401489, y = 0
x = 1, y_pred = 0.9854601621627808, y = 1
x = 2, y_pred = 3.709380626678467, y = 4
x = 3, y_pred = 5.851874351501465, y = 9
x = 4, y_pred = 6.875485420227051, y = 16
x = 5, y_pred = 7.402181625366211, y = 25
