# A Gentle Introduction to torch.autograd


## Usage in PyTorch

In [2]:
import torch
from torchvision.models import resnet18, ResNet18_Weights
# Load a ResNet-18 with its default pretrained weights (downloaded on first use).
model = resnet18(weights=ResNet18_Weights.DEFAULT)

# One random input tensor of shape (1, 3, 64, 64) and a random target of
# shape (1, 1000); both are filled with uniform samples from [0, 1).
data = torch.rand((1, 3, 64, 64))
labels = torch.rand((1, 1000))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /Users/andrewreusche/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44.7M/44.7M [00:01<00:00, 44.8MB/s]


In [5]:
# Run the input through the model; the result is kept in `prediction`
# so the next cell can build a loss from it.
prediction = model(data) # forward pass

In [6]:
# Reduce the element-wise difference between prediction and target to a
# single scalar, then backpropagate from that scalar (backward pass).
loss = torch.sum(prediction - labels)
loss.backward()

In [7]:
# SGD optimizer over every model parameter: learning rate 0.01, momentum 0.9.
# NOTE(review): a later cell runs `from torch import nn, optim`, which rebinds
# the name `optim` to the torch.optim module; re-running cells out of order
# can therefore break `optim.step()`.
optim = torch.optim.SGD(params=model.parameters(), lr=0.01, momentum=0.9)

In [8]:
# Ask the optimizer to perform a single update step on the parameters it
# was constructed with.
optim.step() # gradient descent

## Differentiation in Autograd

In [9]:
"""
Let’s take a look at how autograd collects gradients. We create two tensors a and b with requires_grad=True.
This signals to autograd that every operation on them should be tracked.
"""

'\nLet’s take a look at how autograd collects gradients. We create two tensors a and b with requires_grad=True.\nThis signals to autograd that every operation on them should be tracked.\n'

In [10]:
import torch

# Two leaf tensors that autograd will track (requires_grad=True).
a = torch.tensor([2.0, 3.0], requires_grad=True)
b = torch.tensor([6.0, 4.0], requires_grad=True)

In [13]:
# Create another tensor Q from a and b, element-wise: Q = 3*a^3 - b^2.
Q = 3 * a.pow(3) - b.pow(2)

In [15]:
#When we call .backward() on Q, autograd calculates these gradients and stores them in the respective tensors’ .grad attribute.

In [16]:
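# Q is a vector, so .backward() must be given a `gradient` argument of the same shape as Q (here all ones, i.e. dQ/dQ).
# Equivalently, we can also aggregate Q into a scalar and call backward implicitly, like Q.sum().backward().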

In [17]:
# Q has two elements, so backward() is given an explicit gradient of the
# same length (all ones) to backpropagate from.
external_grad = torch.tensor([1.0, 1.0])
Q.backward(external_grad)

In [18]:
# Gradients are now deposited in a.grad and b.grad

In [19]:
# check if collected gradients are correct
print(9*a**2 == a.grad)
print(-2*b == b.grad)

tensor([True, True])
tensor([True, True])


## Exclusion from the DAG


In [22]:
# x and y are plain tensors; only z is created with requires_grad=True.
x = torch.rand((5, 5))
y = torch.rand((5, 5))
z = torch.rand(5, 5, requires_grad=True)

# A result requires gradients as soon as one of its inputs does:
# `a` is built from x and y only, `b` also involves z.
a = torch.add(x, y)
print(f"Does `a` require gradients?: {a.requires_grad}")
b = torch.add(x, z)
print(f"Does `b` require gradients?: {b.requires_grad}")

Does `a` require gradients?: False
Does `b` require gradients?: True


In [24]:
# In a neural network, parameters that don’t compute gradients are usually called frozen parameters.

In [25]:
from torch import nn, optim

# Start again from a freshly loaded pretrained ResNet-18.
model = resnet18(weights=ResNet18_Weights.DEFAULT)

# Freeze the whole network by switching off gradient tracking on every
# parameter, one at a time.
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)

In [26]:
# Replace the final fully connected layer with a new 10-output classifier.
# The input width is read from the layer being replaced instead of being
# hard-coded (it is 512 for ResNet-18), so the cell stays correct if the
# backbone is swapped for one with a different feature size. The new layer's
# parameters are created fresh, so they are not affected by the freezing loop.
model.fc = nn.Linear(model.fc.in_features, 10)

In [27]:
# Optimize only the classifier: all parameters are handed to SGD, but the
# ones frozen earlier have requires_grad=False, so only the replaced `fc`
# layer is expected to receive gradients and be updated.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01, momentum=0.9)