# **Занятие 2.**

In [8]:
import torch
import random
import numpy as np
import matplotlib.pyplot as plt
from pytorch_lightning import seed_everything
%matplotlib inline

seed_everything(0)

Global seed set to 0


0

# Создание собственной библиотеки автоматического дифференцирования

## Собственное автоматическое дифференцирование

Сначала реализовал пример на бумаге, для того, чтобы не запутаться

![Image alt](./%D0%B3%D1%80%D0%B0%D1%84%20%D0%B1%D1%8D%D0%BA%D0%B2%D0%B0%D1%80%D0%B4.jpg)

Также здесь уже добавлена ф-ия Softmax из задания ниже

In [34]:
class Value:
    """ stores a single scalar value and its gradient """

    def __init__(self, data: float, _children=(), _op=''):
        self.data = data
        self.grad = 0
        # internal variables used for autograd graph construction
        self._backward = lambda: None # function 
        self._prev = set(_children) # set of Value objects
        self._op = _op # the op that produced this node, string ('+', '-', ....)

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data) # Standart expression

        def _backward():
            # Calculating the derivative of the sum
            self.grad += out.grad 
            other.grad += out.grad
        out._backward = _backward
        
        # Add children to resulting expression
        out._prev.add(other)
        out._prev.add(self)
        

        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data)

        def _backward():
            # Calculating the derivative of the product
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        # Add children to resulting expression
        out._prev.add(other)
        out._prev.add(self)
        
        
        return out

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data ** other)

        def _backward():
            # Calculating the derivative of a power func
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward
        
        # Add children to resulting expression
        out._prev.add(self)

        return out
    

    def relu(self):
        out = Value(self.data) if self.data > 0 else Value(0)

        def _backward():
            # Calculating the derivative of the ReLU
            self.grad += out.grad if out.data > 0 else 0
        out._backward = _backward

        # Add children to resulting expression
        out._prev.add(self)
        
        return out
    
    
    def exp(self):
        import math
        
        out = Value(math.e ** self.data)

        def _backward():
            # Calculating the derivative of a exp func
            self.grad += out.data * out.grad
        out._backward = _backward
        
        # Add children to resulting expression
        out._prev.add(self)

        return out
    
    
    def softmax(input):
        e = [item.exp() for item in input]
        s = sum(e)
        out = [item / s for item in e]
        
        return out
        

    def backward(self):

        # topological order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in range(len(topo) - 1, -1, -1):
            topo[v]._backward()

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"
    
    def __round__(self, n = 0):
        return Value(round(self.data, n))

In [10]:
def test_sanity_check():

    x = Value(-4.0)
    z = 2 * x + 2 + x
  
    q = z.relu() + z * x
    h = (z * z).relu()
    y = h + q + q * x
    y.backward()
    xmg, ymg = x, y

    x = torch.Tensor([-4.0]).double()
    x.requires_grad = True
    z = 2 * x + 2 + x
    q = z.relu() + z * x
    h = (z * z).relu()
    y = h + q + q * x
    y.backward()
    xpt, ypt = x, y

    
    # forward pass went well
    assert ymg.data == ypt.data.item()
    # backward pass went well
    print(xmg, xpt, xpt.grad)
    assert xmg.grad == xpt.grad.item()


def test_more_ops():

    a = Value(-4.0)
    b = Value(2.0)
    c = a + b
    d = a * b + b**3
    c += c + 1
    c += 1 + c + (-a)
    d += d * 2 + (b + a).relu()
    d += 3 * d + (b - a).relu()
    e = c - d
    f = e**2
    g = f / 2.0
    g += 10.0 / f
    g.backward()
    amg, bmg, gmg = a, b, g

    a = torch.Tensor([-4.0]).double()
    b = torch.Tensor([2.0]).double()
    a.requires_grad = True
    b.requires_grad = True
    c = a + b
    d = a * b + b**3
    c = c + c + 1
    c = c + 1 + c + (-a)
    d = d + d * 2 + (b + a).relu()
    d = d + 3 * d + (b - a).relu()
    e = c - d
    f = e**2
    g = f / 2.0
    g = g + 10.0 / f
    g.backward()
    apt, bpt, gpt = a, b, g

    tol = 1e-6
    # forward pass went well
    assert abs(gmg.data - gpt.data.item()) < tol
    # backward pass went well
    assert abs(amg.grad - apt.grad.item()) < tol
    assert abs(bmg.grad - bpt.grad.item()) < tol

In [11]:
test_sanity_check()

Value(data=-4.0, grad=46.0) tensor([-4.], dtype=torch.float64, requires_grad=True) tensor([46.], dtype=torch.float64)


In [12]:
test_more_ops()

# Обучение на основе собственной бибилотеки

## Многослойный перцептрон на основе класса Value

In [28]:
class Module:

    def zero_grad(self):
        for layer in self.parameters():
            for neuron in layer:
                for w in neuron:
                    w.grad = 0

    def parameters(self):
        _parameters = []
        for param in self.__dict__:
            try:
                for in_param in getattr(self, param):
                    if isinstance(in_param, Module):
                        _parameters.append(in_param.parameters())
                    elif isinstance(in_param, Value):
                        _parameters.append(in_param)
            except:
                if isinstance(getattr(self, param), Module):
                    _parameters.append(getattr(self, param).parameters())
                elif isinstance(getattr(self, param), Value):
                    _parameters.append(getattr(self, param))
        
        return _parameters
    

class Neuron(Module):

    def __init__(self, nin, nonlin=True):
        self.w = [Value(random.gauss(0, 1)) for _ in range(nin)]
        self.b = Value(random.gauss(0, 1))
        self.nonlin = nonlin

    def __call__(self, x):
        act = sum([self.w[i] * x[i] for i in range(len(x))]) + self.b
        return act.relu() if self.nonlin else act #act.step() для ступенчатой ф-ии

    def __repr__(self):
        return f"{'ReLU' if self.nonlin else 'Linear'}Neuron({len(self.w)})"


class Layer(Module):

    def __init__(self, nin, nout, **kwargs):
        self.neurons = [Neuron(nin, kwargs['nonlin']) for i in range(nout)]

    def __call__(self, x):
        out = []
        for item in x:
            res = [neuron(item) for neuron in self.neurons]
            out.append(res[0] if len(res) == 1 else res)
        return out[0] if len(out) == 1 else out

    def __repr__(self):
        return f"Layer of [{', '.join(str(n) for n in self.neurons)}]"
    
class MSELoss(Module):
    
    def __call__(self, y_true, y_pred):
        res = 0
        for true, pred in zip(y_true, y_pred):
            res += (true - pred)**2
        return res / len(y_pred)

class MLP(Module):

    def __init__(self, nin, nouts, learning_rate):
        self.learning_rate = learning_rate
        sz = [nin]
        sz.extend(nouts)
        self.layers = [Layer(sz[i], sz[i+1], nonlin=(i!=len(nouts)-1)) for i in range(len(nouts))]
        
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def __repr__(self):
        repr = '\n'.join(str(layer) for layer in self.layers)
        return f"MLP of [{repr}]"

 
    def step(self):
        for layer in self.layers:
            for neuron in layer.neurons:
                for w in neuron.w:
                    w -= self.learning_rate * w.grad
                neuron.b -= self.learning_rate * neuron.b.grad

## Обучение многослойного перцептрона

Сам перцептрон

In [7]:
model = MLP(3, [4, 4, 1], learning_rate = 0.01)
print(model)
print("number of parameters", len(model.parameters()))

MLP of [Layer of [ReLUNeuron(3), ReLUNeuron(3), ReLUNeuron(3), ReLUNeuron(3)]
Layer of [ReLUNeuron(4), ReLUNeuron(4), ReLUNeuron(4), ReLUNeuron(4)]
Layer of [LinearNeuron(4)]]
number of parameters 41


Набор данных

In [8]:
xs = [
  [2.0, 3.0, -1.0],
  [3.0, -1.0, 0.5],
  [0.5, 1.0, 1.0],
  [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets

Код был запущен несколько раз и были замечены различные результаты испытаний - иногда решение сходилось ко всем приблизительно одинаковым значениям как в примере ниже. Это происходило изза того, что скорость обучения слишком большая чтобы "поймать" глобальный минимум.

In [33]:
model = MLP(3, [4, 4, 1], learning_rate = 0.3)
loss = MSELoss()

xs = [
  [2.0, 3.0, -1.0],
  [3.0, -1.0, 0.5],
  [0.5, 1.0, 1.0],
  [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets

for k in range(20):
    
    model.zero_grad()
    
    # forward
    predict = model(xs)

    # calculate loss (mean square error)
    loss_val = loss(ys, predict)
    acc = sum([1 for i in range(len(ys)) if ys[i] == round(predict[i]).data]) / len(ys)
    
    # backward (zero_grad + backward)
    loss_val.backward()
    
    # update
    model.step()
    
    if k % 1 == 0:
        print(f"step {k} loss {loss_val.data}, accuracy {acc*100}%")
print([round(i.data, 2) for i in predict])

step 0 loss 3.8966812573745715, accuracy 0.0%
step 1 loss 3.155987972581905, accuracy 25.0%
step 2 loss 2.5759983831657722, accuracy 25.0%
step 3 loss 2.211519481281905, accuracy 0.0%
step 4 loss 1.927228902443592, accuracy 0.0%
step 5 loss 1.6925088873525087, accuracy 0.0%
step 6 loss 1.4967223750449228, accuracy 0.0%
step 7 loss 1.3376829402545805, accuracy 0.0%
step 8 loss 1.2236704997866708, accuracy 0.0%
step 9 loss 1.1298209637104708, accuracy 0.0%
step 10 loss 1.0518095132468426, accuracy 0.0%
step 11 loss 1.0023117800227048, accuracy 0.0%
step 12 loss 1.0003698848036329, accuracy 0.0%
step 13 loss 1.0000591815685813, accuracy 0.0%
step 14 loss 1.000009469050973, accuracy 0.0%
step 15 loss 1.0000015150481556, accuracy 0.0%
step 16 loss 1.0000002424077048, accuracy 0.0%
step 17 loss 1.0000000387852328, accuracy 0.0%
step 18 loss 1.0000000062056373, accuracy 0.0%
step 19 loss 1.000000000992902, accuracy 0.0%
[-0.0, -0.0, -0.0, -0.0]


При более тщательном подборе скорости обучения модель все же находит локальный минимум, но все еще не может определить истиное последнее значение. Скорее всего это связано с тем, что генерация весов происходит "слишком" случайно, а алгоритм не может выбраться из локального минимума без импульса.

In [48]:
model = MLP(3, [4, 4, 1], learning_rate = 0.1)
loss = MSELoss()

xs = [
  [2.0, 3.0, -1.0],
  [3.0, -1.0, 0.5],
  [0.5, 1.0, 1.0],
  [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets

for k in range(20):
    
    model.zero_grad()
    
    # forward
    predict = model(xs)

    # calculate loss (mean square error)
    loss_val = loss(ys, predict)
    acc = sum([1 for i in range(len(ys)) if ys[i] == round(predict[i]).data]) / len(ys)
    
    # backward (zero_grad + backward)
    loss_val.backward()
    
    # update
    model.step()
    
    if k % 1 == 0:
        print(f"step {k} loss {loss_val.data}, accuracy {acc*100}%")
print([round(i.data, 2) for i in predict])

step 0 loss 33.86298405833632, accuracy 0.0%
step 1 loss 4.81722371318773, accuracy 0.0%
step 2 loss 1.7752157942163795, accuracy 0.0%
step 3 loss 1.3048714082104031, accuracy 0.0%
step 4 loss 1.1009231262400276, accuracy 0.0%
step 5 loss 1.0124973300929416, accuracy 0.0%
step 6 loss 0.965073163128244, accuracy 0.0%
step 7 loss 0.9354672983150997, accuracy 0.0%
step 8 loss 0.9139371279924209, accuracy 0.0%
step 9 loss 0.8963596835168479, accuracy 25.0%
step 10 loss 0.8809641284122156, accuracy 25.0%
step 11 loss 0.8669664285716869, accuracy 25.0%
step 12 loss 0.8539986441121031, accuracy 25.0%
step 13 loss 0.8418701871935692, accuracy 25.0%
step 14 loss 0.8304677804685794, accuracy 25.0%
step 15 loss 0.8197133818929017, accuracy 25.0%
step 16 loss 0.8095463487939902, accuracy 25.0%
step 17 loss 0.8005845374983889, accuracy 25.0%
step 18 loss 0.7952903440240731, accuracy 25.0%
step 19 loss 0.79187986012931, accuracy 25.0%
[1.27, -0.41, -0.39, -0.54]


Попробуем вместо обычного рандома инициализировать веса с помощью нормального распределения. Тогда после нескольких попыток получим следующий результат

In [45]:
model = MLP(3, [4, 4, 1], learning_rate = 0.1)
loss = MSELoss()

xs = [
  [2.0, 3.0, -1.0],
  [3.0, -1.0, 0.5],
  [0.5, 1.0, 1.0],
  [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets

for k in range(20):
    
    model.zero_grad()
    
    # forward
    predict = model(xs)

    # calculate loss (mean square error)
    loss_val = loss(ys, predict)
    acc = sum([1 for i in range(len(ys)) if ys[i] == round(predict[i]).data]) / len(ys)
    
    # backward (zero_grad + backward)
    loss_val.backward()
    
    # update
    model.step()
    
    if k % 1 == 0:
        print(f"step {k} loss {loss_val.data}, accuracy {acc*100}%")
print([round(i.data, 2) for i in predict])

step 0 loss 29.918388624301553, accuracy 50.0%
step 1 loss 2.790502710119344, accuracy 0.0%
step 2 loss 1.0603250016812569, accuracy 50.0%
step 3 loss 0.39023987243692004, accuracy 50.0%
step 4 loss 0.13439485478025112, accuracy 75.0%
step 5 loss 0.046703407909980225, accuracy 100.0%
step 6 loss 0.01654572994558693, accuracy 100.0%
step 7 loss 0.006098044395967117, accuracy 100.0%
step 8 loss 0.002421350122013971, accuracy 100.0%
step 9 loss 0.0010848056356952817, accuracy 100.0%
step 10 loss 0.0005676501689844975, accuracy 100.0%
step 11 loss 0.0003453171282782078, accuracy 100.0%
step 12 loss 0.00023490194403016988, accuracy 100.0%
step 13 loss 0.00017116130280650483, accuracy 100.0%
step 14 loss 0.00012973506481878702, accuracy 100.0%
step 15 loss 0.00010074076897744155, accuracy 100.0%
step 16 loss 7.962508819328705e-05, accuracy 100.0%
step 17 loss 6.394376448344005e-05, accuracy 100.0%
step 18 loss 5.2190648554468004e-05, accuracy 100.0%
step 19 loss 4.3344324143291185e-05, accur

In [58]:
model = MLP(3, [4, 4, 1], learning_rate = 0.1)
loss = MSELoss()

xs = [
  [2.0, 3.0, -1.0],
  [3.0, -1.0, 0.5],
  [0.5, 1.0, 1.0],
  [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets

for k in range(20):
    
    model.zero_grad()
    
    # forward
    predict = model(xs)

    # calculate loss (mean square error)
    loss_val = loss(ys, predict)
    acc = sum([1 for i in range(len(ys)) if ys[i] == round(predict[i]).data]) / len(ys)
    
    # backward (zero_grad + backward)
    loss_val.backward()
    
    # update
    model.step()
    
    if k % 1 == 0:
        print(f"step {k} loss {loss_val.data}, accuracy {acc*100}%")
print([round(i.data, 2) for i in predict])

step 0 loss 0.9011933884730603, accuracy 0.0%
step 1 loss 0.6837866585010065, accuracy 0.0%
step 2 loss 0.5350544232591083, accuracy 0.0%
step 3 loss 0.4198922483330204, accuracy 0.0%
step 4 loss 0.33048998026963816, accuracy 50.0%
step 5 loss 0.26099000485673035, accuracy 50.0%
step 6 loss 0.20687659668129027, accuracy 75.0%
step 7 loss 0.16483796847553955, accuracy 75.0%
step 8 loss 0.13264408784718007, accuracy 100.0%
step 9 loss 0.10731112867263276, accuracy 100.0%
step 10 loss 0.09121794680658037, accuracy 100.0%
step 11 loss 0.08096749113731268, accuracy 100.0%
step 12 loss 0.07231016869308589, accuracy 100.0%
step 13 loss 0.06460009429411438, accuracy 100.0%
step 14 loss 0.05771896950834098, accuracy 100.0%
step 15 loss 0.051576537823210214, accuracy 100.0%
step 16 loss 0.04609288479694333, accuracy 100.0%
step 17 loss 0.041196818741252327, accuracy 100.0%
step 18 loss 0.0368248979153626, accuracy 100.0%
step 19 loss 0.03292058058643555, accuracy 100.0%
[0.9, -0.78, -1.24, 0.9]


Очевидно, что правильная инициализация весов может существенно снизить затраты на обучение и ускорить этот процесс

Оригинальный алгоритм библиотеки свойствами, похожими на свойства первоначальных моделей, не обладает. Опять же, скорее всего, это связано именно с инициализацией весов

In [29]:
class q(torch.nn.Module):
    
    def __init__(self):
        super(q, self).__init__()
        self.fc1 = torch.nn.Linear(3, 4)
        self.fc2 = torch.nn.Linear(4, 4)
        self.fc3 = torch.nn.Linear(4, 1)
        self.ac2 = torch.nn.ReLU()
        torch.nn.init.normal_(self.fc1.weight)
        torch.nn.init.normal_(self.fc2.weight)
        torch.nn.init.normal_(self.fc2.weight)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.ac2(x)
        x = self.fc2(x)
        x = self.ac2(x)
        x = self.fc3(x)
        return x

In [50]:
m = q()
loss_func = torch.nn.MSELoss()
sgd = torch.optim.SGD(m.parameters(), lr=0.1)
xs = torch.Tensor(xs)
ys = torch.Tensor(ys)

for epoch in range(50):
    
    m.zero_grad()
        
    preds = m.forward(xs)
    
    loss_value = loss(ys, preds)

    loss_value.backward()
    
    sgd.step()
    
    accuracy = (preds == ys).float().mean()
    print(loss_value)
print(preds)

tensor([1.6654], grad_fn=<DivBackward0>)
tensor([39.5903], grad_fn=<DivBackward0>)
tensor([1.6603], grad_fn=<DivBackward0>)
tensor([1.2690], grad_fn=<DivBackward0>)
tensor([1.0703], grad_fn=<DivBackward0>)
tensor([0.9725], grad_fn=<DivBackward0>)
tensor([0.9069], grad_fn=<DivBackward0>)
tensor([0.8572], grad_fn=<DivBackward0>)
tensor([0.8162], grad_fn=<DivBackward0>)
tensor([0.7827], grad_fn=<DivBackward0>)
tensor([0.7528], grad_fn=<DivBackward0>)
tensor([0.7280], grad_fn=<DivBackward0>)
tensor([0.6885], grad_fn=<DivBackward0>)
tensor([0.6450], grad_fn=<DivBackward0>)
tensor([0.5944], grad_fn=<DivBackward0>)
tensor([0.5527], grad_fn=<DivBackward0>)
tensor([0.5037], grad_fn=<DivBackward0>)
tensor([0.4649], grad_fn=<DivBackward0>)
tensor([0.4152], grad_fn=<DivBackward0>)
tensor([0.3954], grad_fn=<DivBackward0>)
tensor([0.3641], grad_fn=<DivBackward0>)
tensor([0.3065], grad_fn=<DivBackward0>)
tensor([0.2618], grad_fn=<DivBackward0>)
tensor([0.2158], grad_fn=<DivBackward0>)
tensor([0.1888]

# Домашнее задание

### Домашнее задание 1. Доделать практику. Оформить код в три отдельных модуля `autograd`, `nn`, `train`

Модули представлены в папке customtorch. Результаты работы:

![Image-alt](./%D0%A0%D0%B5%D0%B7%D1%83%D0%BB%D1%8C%D1%82%D0%B0%D1%82%20%D0%BC%D0%BE%D0%B4%D1%83%D0%BB%D0%B5%D0%B9.jpg)

### Домашнее задание 2 (Опционально). Создать свою функцию softmax, наследуемую от `torch.autograd.Function` и имплементировать forward и backward проход. Сравнить со стандартной функцией в Pytorch. 
[Создание функций](https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html) [Софтмакс](https://congyuzhou.medium.com/softmax-3408fb42d55a)

Расписал производную. Получилось, что `dPx/dx = Px(1-Px)`, а для `dPx/dy = -Px*Py`. По аналогии расписал остальные и собрал все это в матрицу, для удобного умножения.

![Image-alt](./%D0%9F%D1%80%D0%BE%D0%B8%D0%B7%D0%B2%D0%BE%D0%B4%D0%BD%D0%B0%D1%8F%20Softmax.jpg)

In [38]:
import torch
import numpy as np


class self_Softmax(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        result = torch.e ** input.flatten()
        result = result / result.sum()
        ctx.save_for_backward(result)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        # First matrix on image
        result, = ctx.saved_tensors
        result = -result.view(-1, 1).expand(-1, len(result))
        
        # Second matrix on image
        reverse_result = -result.clone().T
        diag_elem = reverse_result[np.diag_indices(len(result))] - 1
        reverse_result[np.diag_indices(len(result))] = diag_elem
        
        return grad_output @ (result * reverse_result)

In [39]:
def show_tensor_params(*tensors):
  for x in tensors:
    print('---')
    print(f"data - {x.data}")
    print(f"grad - {x.grad}")
    print(f"grad_fn - {x.grad_fn}")
    print(f"req_grad - {x.requires_grad}")
    print(f"is_leaf - {x.is_leaf}")

В качестве конечной функции рассмотрел MSE и просто сумму

Для MSE:

In [9]:
x1 = torch.tensor([2., 3. ,4.], requires_grad = True)
x2 = torch.tensor([2., 3. ,4.], requires_grad = True)
y = torch.Tensor([0.25, 0.35, 0.4])

sm = torch.nn.Softmax(dim = 0)
res = sm(x1)
loss = ((res - y)**2).mean()

self_sm = self_Softmax.apply
self_res = self_sm(x2)
self_loss = ((self_res - y)**2).mean()

loss.backward()
self_loss.backward()

show_tensor_params(x1, x2)
self_loss

---
data - tensor([2., 3., 4.])
grad - tensor([-0.0178, -0.0394,  0.0572])
grad_fn - None
req_grad - True
is_leaf - True
---
data - tensor([2., 3., 4.])
grad - tensor([-0.0178, -0.0394,  0.0572])
grad_fn - None
req_grad - True
is_leaf - True


tensor(0.0357, grad_fn=<MeanBackward0>)

Для суммы:

In [26]:
x1 = torch.tensor([2., 3. ,4.], requires_grad = True)
x2 = torch.tensor([2., 3. ,4.], requires_grad = True)
y = torch.Tensor([0.25, 0.35, 0.4])

sm = torch.nn.Softmax(dim = 0)
res = sm(x1)
loss = (res - y).sum()

self_sm = self_Softmax.apply
self_res = self_sm(x2)
self_loss = (self_res - y).sum()

loss.backward()
self_loss.backward()

show_tensor_params(x1, x2)
self_loss

---
data - tensor([2., 3., 4.])
grad - tensor([0., 0., 0.])
grad_fn - None
req_grad - True
is_leaf - True
---
data - tensor([2., 3., 4.])
grad - tensor([3.7253e-09, 1.4901e-08, 0.0000e+00])
grad_fn - None
req_grad - True
is_leaf - True


tensor(-2.9802e-08, grad_fn=<SumBackward0>)

Видно, что результаты совпадают, но из-за точности вычислений в собственном дифференцировании получаются значения отличные от нуля

### Домашнее задание 3 (Опционально). Добавить функцию софтмакс в собственну библиотеку автоматического дифференцирования. Сравнить с пунктом 2

Расписал вручную для отладки кода на примере MSE. `a6`, `a7`, `a8` - вычисленные вероятности для `x`, `y`, `z` соответственно

![Image-alt](./%D0%93%D1%80%D0%B0%D1%84%20Softmax%20%2B%20MSE.jpg)

Добавил две функции ниже в класс Value

In [None]:
def exp(self):
        import math
        
        out = Value(math.e ** self.data)

        def _backward():
            # Calculating the derivative of a exp func
            self.grad += out.data * out.grad
        out._backward = _backward
        
        # Add children to resulting expression
        out._prev.add(self)

        return out
    
    
def softmax(input):
    e = [item.exp() for item in input]
    s = sum(e)
    out = [item / s for item in e]
    
    return out

При проверке можно увидеть, что результаты вычислений совпадают с вычислениями torch при различных конечных функциях

Для суммы

In [22]:
x = Value(2.0)
y = Value(3.0)
z = Value(4.0)
sm = Value.softmax([x, y, z])
ys = [0.25, 0.35, 0.4]
s = sum(sm)
s.backward()
print(x, y, z)
print(s)

Value(data=2.0, grad=0.0) Value(data=3.0, grad=0.0) Value(data=4.0, grad=0.0)
Value(data=0.9999999999999999, grad=1)


Для MSE

In [23]:
x = Value(2.0)
y = Value(3.0)
z = Value(4.0)
sm = Value.softmax([x, y, z])
ys = [0.25, 0.35, 0.4]
s = 0
for item, y_true in zip(sm, ys):
    s += (item - y_true) ** 2
s /= 3
s.backward()
print(x, y, z)
print(s)

Value(data=2.0, grad=-0.01778124792726957) Value(data=3.0, grad=-0.039410354515245355) Value(data=4.0, grad=0.05719160244251492)
Value(data=0.035675025648999506, grad=1)


Проверка на большее количество значений

In [44]:
x = [Value(1), Value(-1), Value(4), Value(-7), Value(10)]
sm = Value.softmax(x)
ys = [0.1, 0.05, 0.3, 0.01, 0.54]
s = 0
for item, y_true in zip(sm, ys):
    s += (item - y_true) ** 2
s /= 5
s.backward()
print('Value:')
print([i for i in x])

x1 = torch.tensor([1., -1., 4., -7., 10.], requires_grad = True)
x2 = torch.tensor([1., -1., 4., -7., 10.], requires_grad = True)
y = torch.Tensor(ys)

sm = torch.nn.Softmax(dim = 0)
res = sm(x1)
loss = ((res - y)**2).mean()

self_sm = self_Softmax.apply
self_res = self_sm(x2)
self_loss = ((self_res - y)**2).mean()

loss.backward()
self_loss.backward()

print('\nTorch SM:')
show_tensor_params(x1)

print('\nCustom SM:')
show_tensor_params(x2)

Value:
[Value(data=1, grad=-2.734123913507679e-05), Value(data=-1, grad=-3.36778200831724e-06), Value(data=4, grad=-0.0007446225295809933), Value(data=-7, grad=-7.687511605069202e-09), Value(data=10, grad=0.0007753392382360059)]

Torch SM:
---
data - tensor([ 1., -1.,  4., -7., 10.])
grad - tensor([-2.7341e-05, -3.3678e-06, -7.4462e-04, -7.6875e-09,  7.7535e-04])
grad_fn - None
req_grad - True
is_leaf - True

Custom SM:
---
data - tensor([ 1., -1.,  4., -7., 10.])
grad - tensor([-2.7341e-05, -3.3678e-06, -7.4462e-04, -7.6875e-09,  7.7533e-04])
grad_fn - None
req_grad - True
is_leaf - True


Все результаты идентичны, что говорит о корректности дифференцирования

### Домашнее задание 4 (Опционально). Добавить визуализацию обучения. Потом мы пройдем более подробно.

https://docs.wandb.ai/guides/integrations/pytorch

https://docs.wandb.ai/ref/python/watch  

https://docs.wandb.ai/guides/track/jupyter

In [None]:
!pip install wandb

In [None]:
!wandb login

In [None]:
import wandb
run = wandb.init(project="polynom_learning_")

In [None]:
run.finish()