In [1]:
import torch

# Tracking Computations

In [2]:
# Leaf tensors for the autograd demo. requires_grad defaults to False, so it is
# switched on explicitly to make PyTorch record the operations applied to a and b.
a = torch.tensor([2.0, 3.0], requires_grad=True)
b = torch.tensor([6.0, 4.0], requires_grad=True)
print(f"Tensor a: {a}", f"Tensor b: {b}", sep="\n")

Tensor a: tensor([2., 3.], requires_grad=True)
Tensor b: tensor([6., 4.], requires_grad=True)


In [3]:
# Perform an operation involving these tensors
# Let Q = 3a^3 - b^2
# PyTorch builds a computation graph behind the scenes

In [4]:
# Term 1 of Q: cube a element-wise, then scale the result by 3 to get 3a^3.
a_cubed = a ** 3
term1 = a_cubed * 3

In [5]:
# Term 2 of Q: element-wise square of b.
term2 = b ** 2

In [6]:
# Combine both terms element-wise into Q = 3a^3 - b^2.
Q = torch.sub(term1, term2)
print(f"Q = 3a^3 - b^2:\n{Q}")

Q = 3a^3 - b^2:
tensor([-12.,  65.], grad_fn=<SubBackward0>)


# Computing Gradients with backward()

In [7]:
# Q has more than one element, so it is not a scalar. backward() only creates
# the upstream gradient implicitly (as 1.0) when it is called on a scalar, e.g.
# a loss value. For a non-scalar Q we must pass a `gradient` tensor with the
# same shape as Q.
# A tensor of ones weights every element of Q equally, which yields the same
# gradients as reducing first with Q.sum().backward().
external_grad = torch.ones_like(Q)  # matches Q's shape, dtype and device
Q.backward(gradient=external_grad)

# Check Gradients

In [8]:
# Gradients are now populated in the .grad attribute of the leaf tensors (a and b).
# a.grad holds dQ/da and b.grad holds dQ/db.

In [9]:
# Check the gradient of Q with respect to a: analytically dQ/da = 9a^2.
# Expected at a = [2, 3]: 9 * [2^2, 3^2] = 9 * [4, 9] = [36, 81]
print(f"Gradients for a (dQ/da = 9a^2): {a.grad}")

Gradients for a (dQ/da = 9a^2): tensor([36., 81.])


In [10]:
# Check the gradient of Q with respect to b: analytically dQ/db = -2b.
# Expected at b = [6, 4]: -2 * [6, 4] = [-12, -8]
print(f"Gradients for b (dQ/db = -2b): {b.grad}")

Gradients for b (dQ/db = -2b): tensor([-12.,  -8.])


In [11]:
# --- Important Notes about .backward() ---
# 1. Gradients are accumulated: If you call backward() multiple times, gradients add up.
#    You usually need to zero out gradients before each training iteration using optimizer.zero_grad().
# 2. Only leaf nodes get gradients: By default, only tensors created directly by the user
#    with requires_grad=True will have their .grad populated. Intermediate tensors (like 'term1') won't.
# 3. backward() on non-scalar output: If you call backward() on a tensor with more than one element (like Q),
#    you need to provide a 'gradient' argument of the same shape, representing the gradient
#    of the final scalar loss with respect to that tensor (often just torch.ones_like(Q) or as shown above).
#    If the output *is* scalar (like Q.mean()), you can just call .backward() without arguments.

# Example with scalar output

In [12]:
# Scalar example: z = x^2 * y + x * y, built from two 0-dim leaf tensors.
x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)
z = x.pow(2).mul(y) + x.mul(y)
print(f"x: {x}, y: {y}", f"z = x**2 * y + x*y: {z}", sep="\n")

x: 2.0, y: 3.0
z = x**2 * y + x*y: 18.0


In [13]:
# Calculate gradients dz/dx and dz/dy.
# z is a scalar, so backward() needs no explicit `gradient` argument.
z.backward()

In [14]:
# Analytic check for z = x^2 * y + x * y at (x, y) = (2, 3):
#   dz/dx = 2xy + y = 2*2*3 + 3 = 15
#   dz/dy = x^2 + x = 2^2 + 2 = 6
print(f"Gradient dz/dx: {x.grad}", f"Gradient dz/dy: {y.grad}", sep="\n")

Gradient dz/dx: 15.0
Gradient dz/dy: 6.0


In [15]:
# Reset the stored gradients in place so that a later backward() call does not
# add on top of the values computed above.
for _leaf in (x, y):
    _leaf.grad.zero_()
print(f"\nGradients after zeroing: x.grad={x.grad}, y.grad={y.grad}")


Gradients after zeroing: x.grad=0.0, y.grad=0.0


# Excluding Blocks from Tracking: torch.no_grad()

In [16]:
# Q was computed from a and b, which both require grad, so Q requires grad too.
print(f"Q requires grad: {Q.requires_grad}")

Q requires grad: True


In [17]:
# Recompute the same expression with autograd recording switched off.
with torch.no_grad():
    # Nothing computed inside this context is tracked, so no graph is built.
    Q_no_grad = 3 * a.pow(3) - b.pow(2)
print(f"Q_no_grad requires grad: {Q_no_grad.requires_grad}")  # False

Q_no_grad requires grad: False


In [18]:
# Outside the no_grad context, tracking resumes for inputs that require grad.
Q_again = torch.sub(3 * a.pow(3), b.pow(2))
print(f"Q_again requires grad: {Q_again.requires_grad}")  # True

Q_again requires grad: True
