Why do we need Autograd?

1. Find the derivative of y with respect to x for the expression $y = x^2$.

In [1]:
def dy_dx(x):
  """Return the analytic derivative of y = x**2 at x, i.e. 2x."""
  slope = 2 * x
  return slope

In [2]:
# Evaluate the hand-written derivative at x = 2; the notebook displays the returned value.
dy_dx(2)

4

2. Find dz/dx for $y = x^2$ and $z = \sin(y)$.

In [14]:
# We can solve that by using chain rule.
import math
def dz_dx(x):
  """Return dz/dx for z = sin(y) with y = x**2, obtained via the chain rule."""
  inner_slope = 2 * x               # dy/dx
  outer_slope = math.cos(x**2)      # dz/dy, evaluated at y = x**2
  return inner_slope * outer_slope

In [15]:
# Evaluate the chain-rule derivative at x = 3.
dz_dx(3)

-5.466781571308061

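3. Let's make it more difficult. Find du/dx where $y = x^2$, $z = \sin(y)$, and $u = e^z$. Here we have to differentiate three times and combine the results with the chain rule: $\frac{du}{dx} = \frac{du}{dz} \cdot \frac{dz}{dy} \cdot \frac{dy}{dx}$. Deriving this by hand gets tedious quickly, which is why we let autograd do it.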

In [5]:
# using PyTorch

import torch

In [8]:
# Create a scalar leaf tensor that autograd should track, then square it.
x = torch.tensor(3.0).requires_grad_()
y = x.pow(2)

# y remembers the operation that produced it (grad_fn=<PowBackward0>);
# autograd uses that record later to compute the gradient.
print(x, y, sep="\n")

tensor(3., requires_grad=True)
tensor(9., grad_fn=<PowBackward0>)


In [9]:
# Run backpropagation starting from y; autograd stores dy/dx on x as x.grad.
y.backward()

In [10]:
# Inspect the gradient that backward() stored on x (dy/dx = 2x, evaluated at x = 3).
x.grad

tensor(6.)

Next problem: find dz/dx for $y = x^2$ and $z = \sin(y)$, this time using autograd.

In [11]:
# Rebuild the graph from a fresh leaf: x -> y = x^2 -> z = sin(y).
x = torch.tensor(3.0).requires_grad_()
y = x.pow(2)
z = y.sin()

# Each intermediate result records the operation that created it.
print(z, y, sep="\n")

tensor(0.4121, grad_fn=<SinBackward0>)
tensor(9., grad_fn=<PowBackward0>)


In [12]:
# Backpropagate from z through the sin and pow operations down to x.
z.backward()

In [13]:
# dz/dx at x = 3; compare with the hand-derived dz_dx(3) above.
x.grad

tensor(-5.4668)

Next, we want to find the gradients of the loss for a simple one-node neural network that uses a sigmoid activation function.
--> We train on a single data point only, to show how autograd can be used in a simple NN.  

In [16]:
import torch

# Inputs: a single training example.
x = torch.tensor(6.7)   # Input feature
y = torch.tensor(0.0)   # Output label, kept as a float so it shares y_pred's dtype inside the loss

# Initial guess of the weight and bias.
# requires_grad is left off here because the gradients are derived by hand.
w = torch.tensor(1.0)
b = torch.tensor(0.0)

In [18]:
# Binary Cross-Entropy loss for scalar. --> Calculating the loss.
def binary_cross_entropy_loss(y_pred, y_true, epsilon=1e-8):
  """Binary cross-entropy loss between predicted probabilities and labels.

  Computes -(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred)) elementwise.

  Args:
    y_pred: Tensor of predicted probabilities.
    y_true: Tensor of target labels, same shape as (or broadcastable to) y_pred.
    epsilon: Requested distance to keep y_pred away from 0 and 1 so that
      neither logarithm is evaluated at 0. It is widened automatically when
      the dtype of y_pred cannot represent it.

  Returns:
    Tensor containing the loss.
  """
  # In float32, 1 - 1e-8 rounds to exactly 1.0, so clamping to it would still
  # allow log(1 - y_pred) == log(0) = -inf (and 0 * -inf = nan for label 1).
  # Use margins the dtype can actually represent: at least machine epsilon
  # below 1, and at least the smallest normal number above 0.
  dtype = y_pred.dtype if y_pred.is_floating_point() else torch.get_default_dtype()
  limits = torch.finfo(dtype)
  lower = max(epsilon, limits.tiny)
  upper = 1 - max(epsilon, limits.eps)
  y_pred = torch.clamp(y_pred, lower, upper)
  return -(y_true * torch.log(y_pred) + (1-y_true) * torch.log(1-y_pred))

In [20]:
# Forward pass
z = w * x + b                # pre-activation: weighted input plus bias
y_pred = z.sigmoid()         # squash z into a predicted probability

# Score the prediction against the label with binary cross-entropy.
loss = binary_cross_entropy_loss(y_pred=y_pred, y_true=y)
print(loss)

tensor(6.7012)


In [21]:
# Backprop by hand: build the gradient one chain-rule factor at a time.

# dL/dy_pred: derivative of the BCE loss w.r.t. the predicted probability.
dloss_dy_pred = (y_pred - y)/(y_pred * (1 - y_pred))

# dy_pred/dz: derivative of the sigmoid, written in terms of its own output.
dy_pred_dz = y_pred * (1 - y_pred)

# dz/dw and dz/db: z = w*x + b is linear in both parameters.
dz_dw, dz_db = x, 1

# The first two factors are shared by both parameter gradients.
dloss_dz = dloss_dy_pred * dy_pred_dz
dL_dw = dloss_dz * dz_dw
dL_db = dloss_dz * dz_db

In [23]:
# Show the hand-derived gradients so they can be compared with autograd's values.
print(f"Manual Gradient of loss dL_dw: {dL_dw}")
print(f"dL_db: {dL_db}")

Manual Gradient of loss dL_dw: 6.691762447357178
dL_db: 0.998770534992218


Doing the same thing using autograd.

In [24]:
# Same single data point as in the manual example.
x, y = torch.tensor(6.7), torch.tensor(0.0)

# Mark the parameters as leaves that autograd has to track.
w = torch.tensor(1.0).requires_grad_()
b = torch.tensor(0.0).requires_grad_()

In [25]:
# Same forward pass as before; because w and b require grad, every result
# below carries a grad_fn describing how it was computed.
z = w * x + b
y_pred = z.sigmoid()
loss = binary_cross_entropy_loss(y_pred=y_pred, y_true=y)
print(z, y_pred, loss)

tensor(6.7000, grad_fn=<AddBackward0>) tensor(0.9988, grad_fn=<SigmoidBackward0>) tensor(6.7012, grad_fn=<NegBackward0>)


In [26]:
# Let autograd compute dloss/dw and dloss/db; the results land in w.grad and b.grad.
loss.backward()

In [27]:
# These should agree with the manually derived dL_dw and dL_db up to float rounding.
print(f"Autograd Gradient of loss dL_dw: {w.grad}")
print(f"dL_db: {b.grad}")

Autograd Gradient of loss dL_dw: 6.6917619705200195
dL_db: 0.9987704753875732


Autograd with vectors.

In [37]:
# A vector-valued leaf: every element will receive its own gradient entry.
x = torch.tensor([1.0, 2.0, 3.0]).requires_grad_()

y = x.pow(2)
print(y)

# backward() without arguments only works on a scalar, so reduce the
# squared vector to its mean before differentiating.
y = y.mean()
print(y)

tensor([1., 4., 9.], grad_fn=<PowBackward0>)
tensor(4.6667, grad_fn=<MeanBackward0>)


In [38]:
# Backpropagate from the scalar mean to every element of x.
y.backward()

In [39]:
# d(mean(x**2))/dx_i = 2 * x_i / 3 for the three-element vector.
x.grad

tensor([0.6667, 1.3333, 2.0000])

# Clearing Gradient
--> Before we compute gradients again, we should clear the gradient stored by the previous backward pass.

In [74]:
# Fresh leaf tensor for the gradient-accumulation demo.
x = torch.tensor(2.0, requires_grad=True)

In [75]:
# Forward pass: y = x^2, so dy/dx = 2x.
y = x**2

In [76]:
# First backward pass: x.grad goes from None to dy/dx.
y.backward()

In [77]:
# After a single backward pass this is 2 * 2.0 = 4.
x.grad

tensor(4.)

Problem: PyTorch's autograd engine accumulates gradients by default.

In [78]:
# Each iteration builds a fresh graph and calls backward(). The new dy/dx (4)
# is added to whatever x.grad already holds instead of replacing it, so the
# printed value grows by 4 on every pass.
for i in range(5):
  y = x**2
  y.backward()
  print(x.grad)

tensor(8.)
tensor(12.)
tensor(16.)
tensor(20.)
tensor(24.)


To solve that problem we have to clear the gradient manually. You have to do this before every backward pass (i.e. on every training step), not just once per epoch.

In [79]:
# Reset the accumulated gradient to zero in place (the trailing underscore marks an in-place op).
x.grad.zero_()

tensor(0.)

# Disable gradient tracking.
During prediction we don't do backprop; we only do the forward pass. In that case we can disable gradient tracking. You can do that in three ways:
1. requires_grad_(False)
2. detach()
3. torch.no_grad()

In [81]:
# Start with a tensor that autograd tracks.
x = torch.tensor(2.0, requires_grad=True)
print(x)

tensor(2., requires_grad=True)


In [82]:
# Option 1: switch tracking off in place on the tensor itself.
x.requires_grad_(False)
print(x)

tensor(2.)


In [83]:
# Re-create a tracked tensor for the detach() demo.
x = torch.tensor(2.0, requires_grad=True)
print(x)

tensor(2., requires_grad=True)


In [84]:
# Option 2: detach() returns a new tensor with the same value that is not
# tracked by autograd; x itself is left unchanged.
z = x.detach()
print(z)

tensor(2.)


In [87]:
# x is still attached to the graph, so gradients flow back to it.
x.grad = None                # drop any gradient left by an earlier run, so re-running this cell does not accumulate
y = x ** 2
y.backward()
print(x.grad)                # dy/dx = 2x at x = 2

# z was detached from the computational graph, so nothing built from it requires grad.
y = z ** 2
# y.backward()               # Would raise a RuntimeError, because z is detached from the computational graph.
# print(z.grad)

tensor(4.)


In [88]:
x = torch.tensor(2.0).requires_grad_()

# Outside no_grad the operation is recorded, so the result carries a grad_fn.
y = x.pow(2)
print(y)

# Option 3: inside no_grad nothing is recorded, so the same expression
# yields a plain tensor without a grad_fn.
with torch.no_grad():
  y = x.pow(2)
  print(y)

tensor(4., grad_fn=<PowBackward0>)
tensor(4.)
