- Autograd is a core PyTorch library for automatic differentiation.
- In the forward phase, the autograd tape records every operation it executes.
- During the backward phase, the autograd tape replays those operations in reverse order.
- Each operation will be replaced by its gradient function.
- Then, the gradients are chained using the chain rule.
- The chain rule is used to compute the gradient of the loss function with respect to the parameters of the model.
- The gradient of the loss function with respect to the parameters of the model is used to update the parameters of the model using an optimization algorithm.
- The autograd tape is a powerful tool that allows us to compute the gradients of complex functions with respect to their parameters.

In [33]:
import torch

# Example - 1 (Scalar Tensor)

In [34]:
# requires_grad=True asks autograd to record operations on x so dy/dx can be computed
x = torch.tensor(3.0, requires_grad=True)
y = x.pow(2)

In [35]:
x

tensor(3., requires_grad=True)

In [36]:
y

tensor(9., grad_fn=<PowBackward0>)

In [37]:
y.backward() # compute the derivative of y w.r.t x

In [38]:
x.grad

tensor(6.)

# Example - 2 (Scalar Tensor)

In [39]:
import math

In [40]:
# Compose two operations: z = sin(x^2), so by the chain rule dz/dx = cos(x^2) * 2x
x = torch.tensor(3.0, requires_grad=True)
y = x.pow(2)
z = y.sin()

In [41]:
# Show the leaf tensor and both intermediate results, one per line
print(x, y, z, sep="\n")

tensor(3., requires_grad=True)
tensor(9., grad_fn=<PowBackward0>)
tensor(0.4121, grad_fn=<SinBackward0>)


In [42]:
z.backward()

In [43]:
x.grad

tensor(-5.4668)

In [44]:
# Trainable parameters: requires_grad=True makes autograd accumulate
# d(loss)/d(param) into .grad when backward() is called
w = torch.tensor(1.0, requires_grad=True)  # weight
b = torch.tensor(0.0, requires_grad=True)  # bias

# One training example; the data itself does not need gradients
x = torch.tensor(6.7)  # input feature
y = torch.tensor(0)    # output label (binary)

In [45]:
# Binary Cross-Entropy Loss for scalar
def binary_cross_entropy_loss(prediction, target, epsilon=1e-8):
    """Binary cross-entropy between a predicted probability and a 0/1 target.

    Args:
        prediction: Tensor of predicted probabilities (e.g. a sigmoid output).
        target: Tensor holding the label(s), 0 or 1.
        epsilon: Requested clamp margin that keeps the logs away from log(0).
            It is raised to the machine epsilon of ``prediction``'s dtype when
            the requested value is too small to be representable.

    Returns:
        Tensor holding ``-(t * log(p) + (1 - t) * log(1 - p))``.
    """
    # In float32, 1 - 1e-8 rounds to exactly 1.0, so the upper clamp would be a
    # no-op and a saturated prediction of 1.0 would still give log(0) = -inf.
    dtype = prediction.dtype if prediction.is_floating_point() else torch.get_default_dtype()
    epsilon = max(epsilon, torch.finfo(dtype).eps)
    prediction = torch.clamp(prediction, epsilon, 1 - epsilon)
    return -(target * torch.log(prediction) + (1 - target) * torch.log(1 - prediction))

In [46]:
# Linear combination of the input with the weight and bias (the pre-activation)
z = w*x + b
z

tensor(6.7000, grad_fn=<AddBackward0>)

In [47]:
# Squash z into (0, 1) with the sigmoid so it can be read as a probability
y_pred = torch.sigmoid(z)  # prediction
y_pred

tensor(0.9988, grad_fn=<SigmoidBackward0>)

In [48]:
# With target y = 0 only the (1 - target) term is non-zero, so loss = -log(1 - y_pred)
loss = binary_cross_entropy_loss(y_pred, y)
loss

tensor(6.7012, grad_fn=<NegBackward0>)

In [49]:
loss.backward()

In [50]:
# For sigmoid + binary cross-entropy, d(loss)/dz = y_pred - y, so
# d(loss)/dw = (y_pred - y) * x and d(loss)/db = (y_pred - y)
print(w.grad)
print(b.grad)

tensor(6.6918)
tensor(0.9988)


# Example - 3 (Vector Tensor)

In [51]:
x = torch.tensor([1.0,2.0,3.0], requires_grad=True)
x

tensor([1., 2., 3.], requires_grad=True)

In [52]:
# y is the mean of the squared entries, so dy/dx_i = 2 * x_i / 3 for this 3-element x
y = x.pow(2).mean()

In [53]:
y.backward()

In [54]:
x.grad

tensor([0.6667, 1.3333, 2.0000])

In [55]:
# Clearing the gradient because the backward() function accumulates the gradients
x.grad.zero_()

tensor([0., 0., 0.])

In [56]:
# Start again from a tensor that tracks gradients. The cells that follow show two ways
# to disable tracking (x.requires_grad_(False) and torch.no_grad()), which speeds up
# computation and reduces memory usage when you are sure you will not call Tensor.backward().
x = torch.tensor(3.0, requires_grad=True)
x

tensor(3., requires_grad=True)

In [57]:
y = x**2
y

tensor(9., grad_fn=<PowBackward0>)

In [58]:
y.backward()

In [59]:
x.grad

tensor(6.)

In [60]:
# Turn off gradient tracking on x in place; results computed from x afterwards get no grad_fn
x.requires_grad_(False)
x

tensor(3.)

In [61]:
y = x**2
y

tensor(9.)

In [62]:
# y was computed from an x with requires_grad=False, so it has no grad_fn and
# backward() raises. Catch and print the error so "Run All" continues past this cell.
try:
    y.backward()
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
# Operations inside torch.no_grad() are not recorded, so y gets no grad_fn.
# NOTE: x already has requires_grad=False from the requires_grad_(False) cell above;
# call x.requires_grad_(True) first to see no_grad() act on a tracked tensor.
with torch.no_grad():
    y = x**2
# Display y as the cell's last expression; a bare `y` inside the with-block shows nothing
y

In [None]:
# y was computed inside torch.no_grad(), so no graph was recorded and backward()
# raises. Catch and print the error so "Run All" does not stop here.
try:
    y.backward()
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn