In [126]:
from IPython.display import display, HTML
# Inject a CSS rule so the notebook's .container element spans the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [127]:
%matplotlib inline
import numpy as np
from tqdm import trange
# Print floating point values in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [128]:
import torch
import math
import numpy as np

# Binary operators
## Dot product

Parece que todo es 

```python
a_grad = c_grad @ b.T 
b_grad = a.T @ c_grad
```

In [4]:
import numpy as np

def backprop_torch(a, b, const=1):
    """Print the gradients PyTorch autograd computes for ``c = a @ b``.

    The quantity differentiated is ``c.sum() * const``, so the gradient
    flowing into ``c`` is ``const`` in every position.

    Args:
        a: left operand; converted with ``torch.tensor(..., requires_grad=True)``.
        b: right operand; converted the same way.
        const: scalar factor applied to ``c.sum()`` before ``backward()``.

    Returns:
        None. The gradients of ``a``, ``b`` and ``c`` are printed as NumPy arrays.
    """
    lhs = torch.tensor(a, requires_grad=True)
    rhs = torch.tensor(b, requires_grad=True)
    product = torch.matmul(lhs, rhs)
    # ``product`` is not a leaf tensor, so its gradient is only kept on request.
    product.retain_grad()
    loss = product.sum() * const
    loss.backward()

    for label, tensor in (
        ('a_grad_torch:\n', lhs),
        ('b_grad_torch:\n', rhs),
        ('c_grad_torch:\n', product),
    ):
        print(label, tensor.grad.detach().numpy(), end='\n'*2)

def backprop(a, b, const=1):
    """Print hand-derived gradients for ``c = a @ b`` using NumPy only.

    The upstream gradient is ``const`` in every position of ``c`` (the
    gradient of ``c.sum() * const``). It is pushed back to the operands with
    ``a_grad = c_grad @ b.T`` and ``b_grad = a.T @ c_grad``.

    Args:
        a: left operand, a NumPy array.
        b: right operand, a NumPy array compatible with ``a @ b``.
        const: scalar value used for every entry of the upstream gradient.

    Returns:
        None. ``a_grad``, ``b_grad`` and ``c_grad`` are printed.
    """
    forward = a @ b
    # Only the shape/dtype of the forward result is needed for the upstream gradient.
    upstream = np.ones_like(forward) * const

    gradients = (
        ('a_grad:\n', upstream @ b.transpose()),
        ('b_grad:\n', a.transpose() @ upstream),
        ('c_grad:\n', upstream),
    )
    for label, value in gradients:
        print(label, value, end='\n'*2)

### Scalar matrix

In [5]:
# A 1x1 "scalar" matrix times a 1x4 row vector, with the loss scaled by 3.3.
a = np.full((1, 1), 2.)
b = np.ones((1, 4))

# Compare the autograd result with the hand-written formulas.
backprop_torch(a, b, const=3.3)
backprop(a, b, const=3.3)

a_grad_torch:
 [[13.2]]

b_grad_torch:
 [[6.6 6.6 6.6 6.6]]

c_grad_torch:
 [[3.3 3.3 3.3 3.3]]

a_grad:
 [[13.2]]

b_grad:
 [[6.6 6.6 6.6 6.6]]

c_grad:
 [[3.3 3.3 3.3 3.3]]



### Matrix vector

In [6]:
# A 2x2 matrix times a 2x1 column vector, using the default const of 1.
a = np.full((2, 2), 2.)
b = np.ones((2, 1))

# Compare the autograd result with the hand-written formulas.
backprop_torch(a, b)
backprop(a, b)

a_grad_torch:
 [[1. 1.]
 [1. 1.]]

b_grad_torch:
 [[4.]
 [4.]]

c_grad_torch:
 [[1.]
 [1.]]

a_grad:
 [[1. 1.]
 [1. 1.]]

b_grad:
 [[4.]
 [4.]]

c_grad:
 [[1.]
 [1.]]



### Vector matrix

In [8]:
# Swap the roles of the operands: the transposed b is now the left operand
# and a is the right operand.
backprop_torch(b.T, a)
backprop(b.T, a)

a_grad_torch:
 [[4. 4.]]

b_grad_torch:
 [[1. 1.]
 [1. 1.]]

c_grad_torch:
 [[1. 1.]]

a_grad:
 [[4. 4.]]

b_grad:
 [[1. 1.]
 [1. 1.]]

c_grad:
 [[1. 1.]]



### Matrix Matrix

In [9]:
# Use the same array for both operands; each function still reports the left
# and right gradients separately.
backprop_torch(a, a)
backprop(a, a)

a_grad_torch:
 [[4. 4.]
 [4. 4.]]

b_grad_torch:
 [[4. 4.]
 [4. 4.]]

c_grad_torch:
 [[1. 1.]
 [1. 1.]]

a_grad:
 [[4. 4.]
 [4. 4.]]

b_grad:
 [[4. 4.]
 [4. 4.]]

c_grad:
 [[1. 1.]
 [1. 1.]]



## Hadamard product

Como multiplicación de escalares.

In [18]:
# Elementwise (Hadamard) product: differentiate c.sum() with respect to a and b.
a = np.array([[4., 5.], [1., 3.], [9., 0.]])
b = np.full_like(a, 2.)

a_tensor = torch.tensor(a).requires_grad_()
b_tensor = torch.tensor(b).requires_grad_()
c_tensor = torch.mul(a_tensor, b_tensor)
# c_tensor is an intermediate result, so its gradient is only kept on request.
c_tensor.retain_grad()
torch.sum(c_tensor).backward()

In [17]:
# Show the gradients autograd stored for both operands and for the retained product.
# NOTE(review): the output recorded under this cell (6s, 3*a, 3s) looks like it
# came from an earlier run that scaled the loss by 3. With
# c_tensor.sum().backward() as currently written, the expected values would be
# b, a and ones respectively — re-run this cell to confirm.
print('a_grad_tensor:\n', a_tensor.grad, end='\n'*2)
print('b_grad_tensor:\n', b_tensor.grad, end='\n'*2)
print('c_grad_tensor:\n', c_tensor.grad, end='\n'*2)

a_grad_tensor:
 tensor([[6., 6.],
        [6., 6.],
        [6., 6.]], dtype=torch.float64)

b_grad_tensor:
 tensor([[12., 15.],
        [ 3.,  9.],
        [27.,  0.]], dtype=torch.float64)

c_grad_tensor:
 tensor([[3., 3.],
        [3., 3.],
        [3., 3.]], dtype=torch.float64)



## Add

Parece que vuelve a ser Hadamard product, cosa que makes sense.

$$
\frac{\partial sum}{\partial a} = \frac{\partial sum}{\partial sum}  
                                  \frac{\partial sum}{\partial c} 
                                  \frac{\partial c}{\partial a}\\
\frac{\partial sum}{\partial a} = 1
                                  \frac{\partial sum}{\partial c} 
                                  \frac{\partial c}{\partial a}\\
\frac{\partial c_{1,1}}{\partial a_{1,1}} = \frac{\partial (a_{1,1} + b_{1,1})}{\partial a_{1,1}} = 1
$$

In [27]:
# Elementwise addition: differentiate 3 * c.sum() with respect to a and b.
a = np.array([[4., 5.], [1., 3.], [9., 0.]])
b = np.full_like(a, 2.)

a_tensor = torch.tensor(a).requires_grad_()
b_tensor = torch.tensor(b).requires_grad_()
c_tensor = torch.add(a_tensor, b_tensor)
# c_tensor is an intermediate result, so its gradient is only kept on request.
c_tensor.retain_grad()
scaled_total = torch.sum(c_tensor) * 3
scaled_total.backward()

In [29]:
# Show the gradients for the addition: both operands and the retained sum
# receive the factor 3 that was applied to the loss.
print('a_grad_tensor:\n', a_tensor.grad, end='\n'*2)
print('b_grad_tensor:\n', b_tensor.grad, end='\n'*2)
print('c_grad_tensor:\n', c_tensor.grad, end='\n'*2)
# Last expression of the cell: display the forward result a + b itself.
c_tensor

a_grad_tensor:
 tensor([[3., 3.],
        [3., 3.],
        [3., 3.]], dtype=torch.float64)

b_grad_tensor:
 tensor([[3., 3.],
        [3., 3.],
        [3., 3.]], dtype=torch.float64)

c_grad_tensor:
 tensor([[3., 3.],
        [3., 3.],
        [3., 3.]], dtype=torch.float64)



tensor([[ 6.,  7.],
        [ 3.,  5.],
        [11.,  2.]], dtype=torch.float64, grad_fn=<AddBackward0>)

# Reduction operators

## Sum

The derivative of the sum of any Tensor is just ones_like(tensor), but what happens
when it's combined with scalar multiplication on another Tensor of a different size?

In [134]:
# Inputs for the sum experiment: a is constant 2s, b counts 1..6 row by row.
a = np.full((3, 2), 2.)
b = np.arange(1., 7.).reshape(3, 2)

a_tensor = torch.tensor(a).requires_grad_()
# Note the naming: the tensor built from b is called c_tensor, because
# b_tensor is reserved for the reduced value computed in the next cell.
c_tensor = torch.tensor(b).requires_grad_()

In [135]:
# Reduce a_tensor to a scalar and keep that intermediate's gradient.
b_tensor = torch.sum(a_tensor)
b_tensor.retain_grad()

# Multiply the scalar against the 3x2 c_tensor, then reduce to a scalar again.
d_tensor = torch.sum(b_tensor * c_tensor)
d_tensor.retain_grad()


In [136]:
# Backpropagate from the scalar d_tensor through the product and the sum.
d_tensor.backward()

In [138]:
# Gradient that reached the scalar b_tensor (available because of retain_grad()).
b_tensor.grad

tensor(21., dtype=torch.float64)

In [39]:
# d = sum(b * c), so the gradient reaching the scalar b should equal the sum of
# c_tensor's entries.
c_tensor.sum() == b_tensor.grad

tensor(True)

## Max

In [132]:
# Inputs for the max experiment: every entry of a ties for the maximum.
a = np.full((3, 2), 2.)
b = np.arange(1., 7.).reshape(3, 2)

a_tensor = torch.tensor(a).requires_grad_()
# Note the naming: the tensor built from b is called c_tensor, because
# b_tensor is reserved for the reduced value computed in the next cell.
c_tensor = torch.tensor(b).requires_grad_()

In [120]:
# Reduce a_tensor to its maximum (a scalar) and keep that intermediate's gradient.
b_tensor = torch.max(a_tensor)
b_tensor.retain_grad()

# Multiply the scalar against the 3x2 c_tensor, then reduce to a scalar again.
d_tensor = torch.sum(b_tensor * c_tensor)
d_tensor.retain_grad()


In [121]:
# Backpropagate from the scalar d_tensor through the product and the max.
d_tensor.backward()

In [133]:
# Display the input; all six entries are equal, so they all tie for the maximum.
a_tensor

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]], dtype=torch.float64, requires_grad=True)

In [130]:
# Autograd accumulates into .grad across backward() calls. Drop whatever an
# earlier backward pass left on a_tensor, so the gradient inspected next comes
# from max() alone and does not depend on which cells ran before this one.
a_tensor.grad = None
# max() already returns a 0-dim tensor, so it can be differentiated directly.
a_tensor.max().backward()


In [131]:
# Gradient stored on a_tensor after the backward pass above.
a_tensor.grad

tensor([[0.1667, 0.1667],
        [0.1667, 0.1667],
        [0.1667, 0.1667]], dtype=torch.float64)