# **Занятие 2.**

### [Домашнее задание в Colab](https://colab.research.google.com/drive/1sEI3kxp9OvGztDkA-Qo0ALxyMldd1EA_#scrollTo=qXc4AbAMDhO8)

# [Pytorch autograd](https://pytorch.org/docs/stable/autograd.html)

[Tutorial](https://www.youtube.com/watch?v=MswxJw-8PvE)

[Slides](https://app.diagrams.net/#G1bq3akhmA5DGRCiFYJfNPSn7il2wvCkEY)

[Torch C++ Binary operations](https://github.com/pytorch/pytorch/blob/c5872e6d6d8fd9b8439b914c143d49488335f573/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp)

[Torch C++ Activations](https://github.com/pytorch/pytorch/blob/c5872e6d6d8fd9b8439b914c143d49488335f573/aten/src/ATen/native/cpu/Activation.cpp)

# Создание собственной библиотеки автоматического дифференцирования

In [None]:
import torch

Практическое задание: написать собственный движок автоматического дифференцирования, а именно: реализовать классы `Function`, `MultiFunction` и `Value` и сравнить результаты их работы с PyTorch.

### Создание собственного класса Function для реализации кастомных функций

In [None]:
class Function:
    """Differentiable scalar function of a single argument.

    Wraps a plain Python callable together with its derivative so that
    calling the instance on a ``Value`` yields a new ``Value`` that is
    wired into the autograd graph.
    """

    def __init__(self, func, deriv):
        # func: maps a raw number to a raw number (forward pass).
        # deriv: derivative of func, evaluated at the same raw number.
        self.func, self.deriv = func, deriv

    def __call__(self, input):
        """Apply the function to ``input`` and register the backward step."""
        label = type(self).__name__
        result = Value(self.func(input.data), _children=(input,))

        def propagate():
            # Chain rule: upstream gradient times the local derivative.
            local_slope = self.deriv(input.data)
            input.grad += result.grad * local_slope

        result._backward = propagate
        result.grad_fn = '<' + label + 'Backward>'
        return result

    def __repr__(self):
        return type(self).__name__

### Создание базовых функций через Function

In [None]:
from math import exp

class Exp(Function):
    """Exponential function e**x."""

    def __init__(self):
        # exp is its own derivative, so it fills both roles.
        forward = backward = exp
        super().__init__(forward, backward)

class ReLU(Function):
    """Rectified linear unit: passes positive inputs, clamps the rest to 0."""

    def __init__(self):
        def clamp(x):
            return max(x, 0)

        def slope(x):
            # The derivative at exactly 0 is taken to be 0.
            if x > 0:
                return 1
            return 0

        super().__init__(clamp, slope)

class LeakyReLU(Function):
    """Leaky ReLU: identity for positive inputs, ``slope * x`` otherwise.

    Args:
        slope: multiplier applied to non-positive inputs (default 0.01).
    """

    def __init__(self, slope=0.01):
        def func(x):
            # Branch on the sign instead of using max(x, x * slope): the
            # max-based form picks the wrong branch for positive inputs
            # whenever slope > 1, and then disagrees with `deriv` below.
            return x if x > 0 else x * slope

        def deriv(x):
            return 1 if x > 0 else slope

        super().__init__(func, deriv)

class Sigmoid(Function):
    """Logistic sigmoid 1 / (1 + e**-x), evaluated without overflow."""

    def __init__(self):
        def func(x):
            # For x >= 0, exp(-x) <= 1 and cannot overflow.
            if x >= 0:
                return 1 / (1 + exp(-x))
            # For x < 0 use the equivalent form e**x / (1 + e**x); exp(-x)
            # would raise OverflowError for strongly negative inputs.
            z = exp(x)
            return z / (1 + z)

        def deriv(x):
            # sigma'(x) = sigma(x) * (1 - sigma(x)); evaluate sigma once.
            s = func(x)
            return s * (1 - s)

        super().__init__(func, deriv)

class Tanh(Function):
    """Hyperbolic tangent."""

    def __init__(self):
        # Local import keeps this block self-contained. math.tanh saturates
        # to +/-1 for large |x|, whereas (e**x - e**-x) / (e**x + e**-x)
        # raises OverflowError there.
        from math import tanh

        def deriv(x):
            # d/dx tanh(x) = 1 - tanh(x)**2
            return 1 - tanh(x) ** 2

        super().__init__(tanh, deriv)

# Rebind each class name to a ready-to-use instance, so that e.g. `ReLU(v)`
# applies the activation to a Value instead of constructing a new object.
(Exp, ReLU, LeakyReLU, Sigmoid, Tanh) = (
    Exp(),
    ReLU(),
    LeakyReLU(0.01),
    Sigmoid(),
    Tanh(),
)

### Создадим класс MultiFunction для реализации простых арифметических операций с двумя переменными

In [None]:
class MultiFunction:
    """Differentiable scalar function of two arguments.

    ``func`` computes the forward value; ``deriv1`` and ``deriv2`` are its
    partial derivatives with respect to the first and second argument.
    """

    def __init__(self, func, deriv1, deriv2):
        self.func, self.deriv1, self.deriv2 = func, deriv1, deriv2

    def __call__(self, value, other):
        """Apply the operation to ``value`` and ``other``.

        ``other`` may be a ``Value`` or a plain constant.
        """
        if not isinstance(other, Value):
            # A plain constant needs no gradient, so only `value` becomes
            # a child of the result.
            result = Value(self.func(value.data, other), _children=(value,))

            def propagate():
                value.grad += result.grad * self.deriv1(value.data, other)
        else:
            result = Value(self.func(value.data, other.data),
                           _children=(value, other))

            def propagate():
                value.grad += result.grad * self.deriv1(value.data, other.data)
                other.grad += result.grad * self.deriv2(value.data, other.data)

        result._backward = propagate
        result.grad_fn = f'<{type(self).__name__}Backward>'
        return result

In [None]:
from math import log

class Add(MultiFunction):
    """Addition a + b; both partial derivatives equal 1."""

    def __init__(self):
        def total(lhs, rhs):
            return lhs + rhs

        def unit_slope(lhs, rhs):
            return 1

        # The same derivative is used for both operands.
        super().__init__(total, unit_slope, unit_slope)

class Mul(MultiFunction):
    """Multiplication a * b."""

    def __init__(self):
        def product(lhs, rhs):
            return lhs * rhs

        def wrt_lhs(lhs, rhs):
            # d(a*b)/da = b
            return rhs

        def wrt_rhs(lhs, rhs):
            # d(a*b)/db = a
            return lhs

        super().__init__(product, wrt_lhs, wrt_rhs)

class Div(MultiFunction):
    """Division a / b."""

    def __init__(self):
        def quotient(num, den):
            return num / den

        def wrt_num(num, den):
            # d(a/b)/da = 1/b
            return 1 / den

        def wrt_den(num, den):
            # d(a/b)/db = -a / b**2
            return -num / (den * den)

        super().__init__(quotient, wrt_num, wrt_den)

class Pow(MultiFunction):
    """Power a ** b with partial derivatives for base and exponent."""

    def __init__(self):
        def func(a, b):
            return a ** b

        def deriv1(a, b):
            # d(a**b)/da = b * a**(b-1). For b == 0 the result is constant,
            # so the derivative is 0; returning it directly avoids
            # ZeroDivisionError from 0 ** -1 when a == 0.
            if b == 0:
                return 0.0
            return b * a ** (b - 1)

        def deriv2(a, b):
            # d(a**b)/db = a**b * log(a). For a == 0 with a non-negative
            # exponent the gradient is taken as 0 instead of failing on
            # log(0). Negative bases still raise, as before.
            if a == 0 and b >= 0:
                return 0.0
            return a ** b * log(a)

        super().__init__(func, deriv1, deriv2)

# Rebind each operator class name to a single shared instance, so that
# `Add(a, b)` performs the operation rather than constructing an object.
(Add, Mul, Div, Pow) = (
    Add(),
    Mul(),
    Div(),
    Pow(),
)

### Реализация класса Value для автоматического дифференцирования

In [7]:
class Value:
    """Stores a single scalar value and its gradient.

    Each instance is a node of the computation graph. Arithmetic operators
    build new nodes; ``backward`` propagates gradients from a node to
    everything it was computed from.
    """

    def __init__(self, data, _children=()):
        self.data = data
        self.grad = 0
        # Internal variables used for autograd graph construction
        self._backward = lambda: None  # Backward function
        self._prev = set(_children)    # Set of Value objects this node uses
        self._childs = None            # Cached topological order (see get_childs)
        self.grad_fn = None            # Name of the gradient function used

    def relu(self):
        return ReLU(self)

    def exp(self):
        return Exp(self)

    def __add__(self, other):
        return Add(self, other)

    def __mul__(self, other):
        return Mul(self, other)

    def __pow__(self, other):
        return Pow(self, other)

    def get_childs(self):
        """Return this node and all its ancestors in topological order.

        The list starts with ``self``; every node appears before the nodes
        it was computed from. The result is cached on first use.
        """
        if self._childs is not None:
            return self._childs
        # Iterative post-order DFS. An explicit stack replaces recursion so
        # that deep graphs cannot exceed the interpreter recursion limit;
        # the visiting order matches the recursive formulation.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, pending = stack[-1]
            for child in pending:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break  # descend first; the parent's iterator resumes later
            else:
                # All children processed: emit the node (post-order).
                topo.append(node)
                stack.pop()
        self._childs = topo[::-1]
        return self._childs

    def backward(self):
        """Accumulate gradients of this node w.r.t. every node it depends on."""
        # topological order all of the children in the graph
        topo = self.get_childs()
        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in topo:
            v._backward()

    def zero_grad(self):
        """Reset the gradient of this node and of every node it depends on."""
        topo = self.get_childs()
        for v in topo:
            v.grad = 0

    def update(self, lr=0.01):
        """Take one gradient-descent step of size ``lr``.

        Applies ``data -= lr * grad`` to every node in the graph except this
        one (the first entry of the topological order).
        """
        topo = self.get_childs()
        for v in topo[1:]:
            v.data -= lr * v.grad

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return Mul(self, other)

    def __truediv__(self, other): # self / other
        return Div(self, other)

    def __rtruediv__(self, other): # other / self
        return other * self ** -1

    def __repr__(self):
        grad_fn = f', grad_fn={self.grad_fn}' if self.grad_fn else ''
        return f"Value(data={round(self.data, 4)}, grad={round(self.grad, 4)}{grad_fn})"

### Сравниваем работу нашей библиотеки с Pytorch

In [8]:
# Chain several activations on one input and backpropagate through them.
a = Value(-4.9)

b = a.relu()  # equivalent to ReLU(a)
c = LeakyReLU(a)
d = Sigmoid(c)
e = Tanh(d)  # not used in y below

y = b * c + d
y.backward()

# One node per line.
for node in (a, b, c, d, e, y):
    print(node)

Value(data=-4.9, grad=0.0025)
Value(data=0, grad=-0.049, grad_fn=<ReLUBackward>)
Value(data=-0.049, grad=0.2498, grad_fn=<LeakyReLUBackward>)
Value(data=0.4878, grad=1, grad_fn=<SigmoidBackward>)
Value(data=0.4524, grad=0, grad_fn=<TanhBackward>)
Value(data=0.4878, grad=1, grad_fn=<AddBackward>)


In [9]:
# The same computation expressed with PyTorch tensors, for comparison.
a = torch.tensor(-4.9, requires_grad=True)

leaky_relu = torch.nn.LeakyReLU()
b = a.relu()
c = leaky_relu(a)
d = c.sigmoid()
e = d.tanh()  # not used in y below

y = b * c + d
y.backward()

# One tensor per line.
for tensor in (a, b, c, d, e, y):
    print(tensor)

tensor(-4.9000, requires_grad=True)
tensor(0., grad_fn=<ReluBackward0>)
tensor(-0.0490, grad_fn=<LeakyReluBackward0>)
tensor(0.4878, grad_fn=<SigmoidBackward0>)
tensor(0.4524, grad_fn=<TanhBackward0>)
tensor(0.4878, grad_fn=<AddBackward0>)


### Рассмотрим более сложные тесты

In [11]:
def test_sanity_check():
    """Check a small expression against PyTorch, forward and backward.

    The same expression is built twice, first with ``Value`` and then with
    a double-precision torch tensor. The outputs and the input gradients
    are compared with exact equality.
    """

    # --- graph built with the custom Value class ---
    x = Value(-4.0)
    z = 2 * x + 2 + x
  
    q = z.relu() + z * x
    h = (z * z).relu()
    y = h + q + q * x
    y.backward()
    xmg, ymg = x, y

    # --- identical graph built with PyTorch ---
    x = torch.Tensor([-4.0]).double()
    x.requires_grad = True
    z = 2 * x + 2 + x
    q = z.relu() + z * x
    h = (z * z).relu()
    y = h + q + q * x
    y.backward()
    xpt, ypt = x, y

    
    # forward pass went well
    assert ymg.data == ypt.data.item()
    # backward pass went well
    # print(xmg, xpt, xpt.grad)
    assert xmg.grad == xpt.grad.item()
    print('Test passed!')


def test_more_ops():
    """Check a longer expression against PyTorch within a tolerance.

    Uses add, subtract, multiply, divide, power, negation and relu on two
    inputs. The final value and both input gradients must match PyTorch to
    within ``tol``.
    """

    # --- graph built with the custom Value class ---
    a = Value(-4.0)
    b = Value(2.0)
    c = a + b
    d = a * b + b**3
    # The torch half below spells these augmented assignments out
    # as `x = x + ...`.
    c += c + 1
    c += 1 + c + (-a)
    d += d * 2 + (b + a).relu()
    d += 3 * d + (b - a).relu()
    e = c - d
    f = e**2
    g = f / 2.0
    g += 10.0 / f
    g.backward()
    amg, bmg, gmg = a, b, g

    # --- identical graph built with PyTorch ---
    a = torch.Tensor([-4.0]).double()
    b = torch.Tensor([2.0]).double()
    a.requires_grad = True
    b.requires_grad = True
    c = a + b
    d = a * b + b**3
    c = c + c + 1
    c = c + 1 + c + (-a)
    d = d + d * 2 + (b + a).relu()
    d = d + 3 * d + (b - a).relu()
    e = c - d
    f = e**2
    g = f / 2.0
    g = g + 10.0 / f
    g.backward()
    apt, bpt, gpt = a, b, g

    # Absolute tolerance for the comparisons below.
    tol = 1e-6
    # forward pass went well
    assert abs(gmg.data - gpt.data.item()) < tol
    # backward pass went well
    # print(amg, apt.data, apt.grad)
    # print(bmg, bpt.data, bpt.grad)
    assert abs(amg.grad - apt.grad.item()) < tol
    assert abs(bmg.grad - bpt.grad.item()) < tol
    print('Test passed!')

In [12]:
# Run the basic forward/backward comparison against PyTorch.
test_sanity_check()

Test passed!


In [13]:
# Run the extended comparison covering more operators.
test_more_ops()

Test passed!


# Обучение на основе собственной библиотеки

### Многослойный перцептрон на основе класса Value

In [14]:
import random

class Neuron:
    """Single neuron: weighted sum of the inputs plus a bias.

    Args:
        nin: number of inputs.
        activation: optional callable applied to the linear output.
    """

    def __init__(self, nin, activation=None):
        self.nin = nin
        self.activation = activation
        # Weights are drawn first and the bias last, each uniformly
        # from [0, 1].
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(0, 1)))
        self.w = weights
        self.b = Value(random.uniform(0, 1))

    def __call__(self, xs):
        """Return the neuron output for ``xs``, an indexable of Values."""
        # The whole affine map is computed on raw numbers and recorded as
        # one graph node, not one node per multiply/add.
        acc = self.b.data
        for idx in range(self.nin):
            acc += self.w[idx].data * xs[idx].data
        node = Value(acc, _children=(*self.w, *xs, self.b))

        def propagate():
            for weight, x in zip(self.w, xs):
                weight.grad += node.grad * x.data
                x.grad += node.grad * weight.data
            self.b.grad += node.grad

        node._backward = propagate
        node.grad_fn = '<LinearBackward>'
        if self.activation:
            return self.activation(node)
        return node

    def params(self):
        """Return ``(weights, bias)``."""
        return self.w, self.b

    def __repr__(self):
        if self.activation:
            return f'{self.activation}Neuron({self.nin})'
        return f'Neuron({self.nin})'

class Layer:
    """Fully connected layer made of ``nout`` neurons with ``nin`` inputs each."""

    def __init__(self, nin, nout, activation=None):
        self.nin, self.nout = nin, nout
        self.activation = activation
        neurons = []
        for _ in range(nout):
            neurons.append(Neuron(nin, activation))
        self.neurons = neurons

    def __call__(self, x):
        """Feed ``x`` to every neuron and collect the outputs in a list."""
        outputs = []
        for unit in self.neurons:
            outputs.append(unit(x))
        return outputs

    def params(self):
        """Return a new list holding the layer's neurons."""
        return list(self.neurons)

    def __repr__(self):
        suffix = ''
        if self.activation:
            suffix = f', activation=<{self.activation}>'
        return f'Linear({self.nin}, {self.nout}{suffix})'

In [15]:
class Sequential:
    """Container that applies its layers one after another."""

    def __init__(self, *layers):
        self.layers = [*layers]

    def __call__(self, xs):
        """Run ``xs`` through every layer.

        Plain numbers in ``xs`` are wrapped in ``Value``. A single-element
        result is unwrapped; otherwise the list of outputs is returned.
        """
        signal = []
        for x in xs:
            signal.append(x if isinstance(x, Value) else Value(x))
        for layer in self.layers:
            signal = layer(signal)
        if len(signal) == 1:
            return signal[0]
        return signal

    def params(self):
        """Return a new list holding the model's layers."""
        return list(self.layers)

    def add(self, layer):
        """Append ``layer`` to the end of the model."""
        self.layers.append(layer)

    def __repr__(self):
        lines = [f'  {layer}\n' for layer in self.layers]
        body = ''.join(lines)
        return f'Sequential(\n{body})'

## Обучение многослойного перцептрона

Сам перцептрон

In [16]:
# 3 -> 5 -> 5 -> 1 perceptron; the output layer has no activation.
model = Sequential(
    Layer(3, 5, Sigmoid),
    Layer(5, 5, ReLU),
    Layer(5, 1),
)
print(model)
# NOTE: model.params() returns the layers, so this prints the layer count.
layer_count = len(model.params())
print("Number of parameters", layer_count)

Sequential(
  Linear(3, 5, activation=<Sigmoid>)
  Linear(5, 5, activation=<ReLU>)
  Linear(5, 1)
)
Number of parameters 3


Набор данных

In [17]:
# Four training samples with three features each.
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]
# Desired targets (y = -0.148x1 + 0.074x2 - 1.037x3 + 0.037).
ys = [1.0, -1.0, -1.0, 1.0]

In [18]:
%%time
# Per-sample gradient descent: one parameter update after every sample.
learning_rate = 0.01

for k in range(1000):
    for i, sample in enumerate(xs):
        # forward
        y = model(sample)

        # squared error of this single sample
        loss = (ys[i] - y) ** 2

        # backward: clear stale gradients, then backpropagate
        loss.zero_grad()
        loss.backward()

        # gradient-descent step
        loss.update(learning_rate)

    # Report every 100th epoch; the value is the loss of the last sample.
    if (k + 1) % 100 == 0:
        print(f"Epoch: {k}, loss: {loss.data:.4f}")

Epoch: 99, loss: 0.0231
Epoch: 199, loss: 0.0009
Epoch: 299, loss: 0.0002
Epoch: 399, loss: 0.0000
Epoch: 499, loss: 0.0000
Epoch: 599, loss: 0.0000
Epoch: 699, loss: 0.0000
Epoch: 799, loss: 0.0000
Epoch: 899, loss: 0.0000
Epoch: 999, loss: 0.0000
CPU times: user 1.21 s, sys: 10.2 ms, total: 1.22 s
Wall time: 1.25 s


In [19]:
# Show the model output next to the desired target for every sample.
for i, sample in enumerate(xs):
    prediction = model(sample)
    print(round(prediction.data, 4), ys[i])

1.0001 1.0
-1.0 -1.0
-1.0 -1.0
0.9999 1.0


# Домашнее задание

**Домашнее задание 1.** Доделать практику. Оформить код в три отдельных модуля `autograd`, `nn`, `train`

**Домашнее задание 2 (Опционально).** Создать свою функцию softmax, наследуемую от `torch.autograd.Function` и имплементировать forward и backward проход. Сравнить со стандартной функцией в Pytorch. 
[Создание функций](https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html) [Софтмакс](https://congyuzhou.medium.com/softmax-3408fb42d55a)

In [30]:
# Reference result: softmax followed by a summed squared error, in PyTorch.
values = torch.Tensor([[1., 2., 3.]])
values.requires_grad = True

y_true = torch.Tensor([.3, .3, .3])

softmax = torch.nn.Softmax(dim=1)
y_pred = softmax(values)
loss = (y_true - y_pred) ** 2
total_loss = loss.sum()
total_loss.backward()

# Bare expression so the notebook displays the tuple.
values.data, values.grad, loss, y_true

(tensor([[1., 2., 3.]]),
 tensor([[-0.0757, -0.1301,  0.2058]]),
 tensor([[0.0441, 0.0031, 0.1334]], grad_fn=<PowBackward0>),
 tensor([0.3000, 0.3000, 0.3000]))

**Домашнее задание 3 (Опционально).** Добавить функцию софтмакс в собственную библиотеку автоматического дифференцирования. Сравнить с пунктом 2

Создадим класс Sum для нахождения градиента сразу для нескольких значений

In [22]:
class Sum:
    """Sum of any number of Values, recorded as a single graph node."""

    def __call__(self, inputs):
        """Return a Value holding the sum of ``inputs``.

        Args:
            inputs: iterable of Value objects (list, tuple or generator).
        """
        # Materialise once: the operands are needed for the forward sum,
        # for the child set and again in the backward pass. A generator
        # would be exhausted after the first of these.
        terms = tuple(inputs)
        out = Value(sum(term.data for term in terms), _children=terms)

        def _backward():
            # d(sum)/d(term) = 1, so each operand gets the upstream gradient.
            for term in terms:
                term.grad += out.grad
        out._backward = _backward
        out.grad_fn = '<AddBackward>'
        return out

# Rebind the name to a shared instance so that Sum([...]) performs the sum.
Sum = Sum()

Создадим собственную версию Softmax

In [28]:
class Softmax:
    """Softmax over a sequence of Values, built from exp, Sum and division."""

    def __call__(self, inputs):
        """Return a list of Values holding softmax(inputs).

        Args:
            inputs: iterable of Value objects; an empty input yields [].
        """
        inputs = list(inputs)
        if not inputs:
            return []
        # Subtract the largest logit before exponentiating. Softmax is
        # invariant to a constant shift, so outputs and gradients are
        # unchanged, but exp() can no longer overflow on large inputs.
        shift = max(item.data for item in inputs)
        exps = [(item - shift).exp() for item in inputs]
        total = Sum(exps)
        return [e / total for e in exps]

    def __repr__(self):
        return '<Softmax>'

# Rebind the name to a shared instance so that Softmax(values) applies it.
Softmax = Softmax()

In [32]:
# The same softmax + squared-error computation with the custom library.
values = [Value(x) for x in (1., 2., 3.)]

y_true = [.3, .3, .3]

y_pred = Softmax(values)
loss = [(target - pred) ** 2 for target, pred in zip(y_true, y_pred)]
total_loss = Sum(loss)
total_loss.backward()

print(values, loss, y_true, sep='\n')

[Value(data=1.0, grad=-0.0757), Value(data=2.0, grad=-0.1301), Value(data=3.0, grad=0.2058)]
[Value(data=0.0441, grad=1, grad_fn=<PowBackward>), Value(data=0.0031, grad=1, grad_fn=<PowBackward>), Value(data=0.1334, grad=1, grad_fn=<PowBackward>)]
[0.3, 0.3, 0.3]


Как видим, результат получился одинаковый!

**Домашнее задание 4 (Опционально).** Добавить визуализацию обучения. Позже мы разберём эту тему подробнее.

https://docs.wandb.ai/guides/integrations/pytorch

https://docs.wandb.ai/ref/python/watch  

https://docs.wandb.ai/guides/track/jupyter

In [None]:
!pip install wandb

In [None]:
!wandb login

In [None]:
import wandb
# Start a Weights & Biases run in the "polynom_learning_" project; the
# returned handle is kept so the run can be finished later.
run = wandb.init(project="polynom_learning_")

In [None]:
# Finish the run started by wandb.init above.
run.finish()