In [1]:
import torch

# For the overall model

In [53]:
# Seed the global RNG first so the three random tensors are reproducible.
# torch.manual_seed returns the default torch.Generator (not the integer seed).
seed = torch.manual_seed(42)

# Draw order (X, then W, then y) determines the values, so keep it fixed.
n_records, n_features = 4, 3
X = torch.rand(n_records, n_features).requires_grad_()  # inputs, one row per record
W = torch.rand(n_features, 1).requires_grad_()          # weights whose gradient we inspect
y = torch.rand(n_records, 1)                            # targets, no gradient tracking

In [54]:
# Forward pass for every record at once: one prediction per row of X.
h = torch.matmul(X, W)
h

tensor([[1.5702],
        [1.5010],
        [1.3708],
        [1.0863]], grad_fn=<MmBackward0>)

In [55]:
# Signed per-record residual (target minus prediction); not reduced to a scalar yet.
loss = torch.sub(y, h)
loss

tensor([[-1.1408],
        [-0.6156],
        [-0.7969],
        [-0.8197]], grad_fn=<SubBackward0>)

In [56]:
# Collapse the per-record values into a single scalar by averaging them.
loss = torch.mean(loss)

In [57]:
# Show the reduced (averaged) loss.
loss

tensor(-0.8433, grad_fn=<MeanBackward0>)

In [58]:
# backward() has not been called yet, so no gradient is stored on W (nothing is displayed).
W.grad

In [59]:
# Backpropagate from the averaged loss; this fills in .grad on the leaf tensors that require grad.
loss.backward()

In [60]:
# Gradient of the averaged loss with respect to W, produced by the backward() call.
W.grad

tensor([[-0.5578],
        [-0.7584],
        [-0.6295]])

The gradient above **is averaged** over the four records! Let's prove it below by computing each record's gradient separately.

# One by one

In [61]:
# Re-seed and rebuild the tensors in the same draw order (X, W, y) so the values
# are reproducible and W is a fresh leaf tensor with no gradient stored on it.
seed = torch.manual_seed(42)

n_records, n_features = 4, 3
X = torch.rand(n_records, n_features).requires_grad_()  # inputs, one row per record
W = torch.rand(n_features, 1).requires_grad_()          # weights whose gradient we inspect
y = torch.rand(n_records, 1)                            # targets, no gradient tracking

In [62]:
# Indexing with a list keeps the row dimension, so the result is still 2-D.
first_record = X[[0]]
h_first = first_record @ W
h_first

tensor([[1.5702]], grad_fn=<MmBackward0>)

Note that this value is the same as the first row of `h` computed for the whole batch. Of course, it has to be.

In [63]:
# Residual for the first record only (target minus prediction).
loss_first = torch.sub(y[0], h_first)
loss_first

tensor([[-1.1408]], grad_fn=<SubBackward0>)

In [64]:
# Reduce the single-element tensor to a 0-d scalar, then backpropagate into W.grad.
loss_first.sum().backward()

In [65]:
# Gradient for the first record: keep a reference to it before W.grad is reset
# for the next record.
grad_first = W.grad
grad_first

tensor([[-0.8823],
        [-0.9150],
        [-0.3829]])

In [66]:
# Let's do the same for the other records.

def _single_record_grad(index):
    """Return (prediction, loss, dLoss/dW) for the record at row ``index``.

    Uses the notebook-level tensors ``X``, ``W`` and ``y``. As a side effect,
    ``W.grad`` is left holding the gradient for this record.
    """
    # backward() accumulates into W.grad, so drop the previous gradient first.
    # Assigning None (rather than zeroing in place) makes backward() allocate a
    # fresh tensor, which keeps gradients captured for earlier records intact.
    W.grad = None
    h_record = X[[index]] @ W
    loss_record = y[index] - h_record
    loss_record.sum().backward()
    return h_record, loss_record, W.grad


# NOTE: the "forth" spelling is kept because later cells refer to `grad_forth`.
h_second, loss_second, grad_second = _single_record_grad(1)
h_third, loss_third, grad_third = _single_record_grad(2)
h_forth, loss_forth, grad_forth = _single_record_grad(3)

grad_first, grad_second, grad_third, grad_forth

(tensor([[-0.8823],
         [-0.9150],
         [-0.3829]]),
 tensor([[-0.9593],
         [-0.3904],
         [-0.6009]]),
 tensor([[-0.2566],
         [-0.7936],
         [-0.9408]]),
 tensor([[-0.1332],
         [-0.9346],
         [-0.5936]]))

> Now... let's average them.

In [67]:
# Plain sum of the four per-record gradients -- not divided by the record count yet.
grad_sum = grad_first + grad_second + grad_third + grad_forth
grad_sum

tensor([[-2.2313],
        [-3.0337],
        [-2.5181]])

In [69]:
# Average of the per-record gradients -- this is the one that matches the
# gradient backward() produced for the mean loss over the whole batch.
n_records = X.shape[0]  # number of rows in X, instead of a hard-coded 4
grad_mean = (grad_first + grad_second + grad_third + grad_forth) / n_records

# Check the match numerically instead of by eye: recompute the whole-batch
# gradient with torch.autograd.grad, which returns it without writing to W.grad.
(grad_batch,) = torch.autograd.grad((y - X @ W).mean(), W)
assert torch.allclose(grad_mean, grad_batch), (
    "averaged per-record gradients differ from the whole-batch gradient"
)

grad_mean

tensor([[-0.5578],
        [-0.7584],
        [-0.6295]])

# Cool!