# Gradient : the rate of change of a function with respect to its parameters or variables. In the context of machine learning, the gradient specifically refers to the vector of partial derivatives of a function with respect to each of its parameters.

In [1]:
import torch

In [2]:
# Tensor of 3 random values; requires_grad=True tells autograd to record the
# operations applied to it so gradients can be computed later.
x = torch.randn(3, requires_grad=True)
x

tensor([ 1.3148,  0.8830, -0.2900], requires_grad=True)

In [3]:
# PyTorch builds a computational graph as operations run: each operation is a
# node with inputs and an output.
# The forward pass computes the output.
# Because x was created with requires_grad=True, PyTorch attaches a gradient
# function (grad_fn) to the result of every operation on it.
# result -> grad_fn (AddBackward for an addition) -> used by the backward pass
# Backpropagation walks these grad_fn nodes to compute the gradients.
x+2

tensor([3.3148, 2.8830, 1.7100], grad_fn=<AddBackward0>)

In [4]:
x*2*2

tensor([ 5.2592,  3.5319, -1.1600], grad_fn=<MulBackward0>)

In [5]:
x.mean()

tensor(0.6359, grad_fn=<MeanBackward0>)

In [7]:
x.grad

In [12]:
# backward() without arguments needs a scalar, so y is reduced with mean() first.
# For y = mean(4 * x) over 3 elements, each element of x receives a gradient of 4/3.
# NOTE(review): gradients accumulate in x.grad across backward() calls; the
# 2.6667 (= 2 * 4/3) displayed in the next cell suggests this cell was run
# twice -- confirm.
y = (x * 2 * 2).mean()
y.backward()

In [13]:
x.grad

tensor([2.6667, 2.6667, 2.6667])

In [15]:
# If y is not a scalar (mean() was not applied), y keeps the same shape as x.
y = x*2*2
y.shape

torch.Size([3])

In [16]:
# Random tensor with 3 elements (same shape as y); passed to y.backward() below.
vector = torch.randn(3)

In [18]:
# y.backward() with no argument only works for scalar outputs, and y is not a
# scalar here, so that call is left commented out:
#y.backward()
# Passing a tensor with the same shape as y makes backward() compute the
# vector-Jacobian product, which is added to x.grad.
y.backward(vector) # vector-Jacobian product

In [19]:
x.grad

tensor([7.4539, 9.6260, 1.1981])

In [21]:
# Stop autograd from tracking a tensor when gradients are not needed,
# e.g. while adjusting the weights during training.
# Three ways to do it: requires_grad_(False), detach(), and torch.no_grad().
x.requires_grad_(False) # in-place: the trailing underscore means x itself is modified
x

tensor([ 1.3148,  0.8830, -0.2900])

In [22]:
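# Other ways to stop gradient tracking:
# x_no_grad = x.detach()   -> returns a new tensor that is not tracked by autograd
# with torch.no_grad():    -> operations inside the block are not tracked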

In [27]:
# Demonstrates gradient accumulation: backward() adds into weights.grad rather
# than overwriting it, so the stored gradient grows by 3 on every pass.
weights = torch.ones(4, requires_grad=True)
for epoch in range(3):
    model_output = torch.sum(weights * 3)
    model_output.backward()
    print(weights.grad)

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])


In [29]:
# Gradients are summed across backward() calls, so they must be cleared
# after each pass; otherwise every iteration adds to the previous result.
weights = torch.ones(4, requires_grad=True)
for epoch in range(3):
    model_output = torch.sum(weights * 3)
    model_output.backward()
    print(weights.grad)
    # Reset in place so the next backward() starts from zero.
    weights.grad.zero_()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


In [None]:
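# example: a built-in optimizer provides the same gradient reset
# optimizer = torch.optim.SGD([weights], lr=0.01)  # params must be an iterable of tensors, not a bare tensor
# optimizer.step()       # update the weights using their .grad
# optimizer.zero_grad()  # clear the gradients before the next backward pass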