# Demo: Introducing Autograd

In [2]:
# Let's first understand how tensors can be set up to track history using the Autograd package.

import torch

In [3]:
# Let's instantiate a couple of tensors;

# 2 x 3 float tensor.
# torch.tensor() is the documented factory for building a tensor from existing data
# (torch.Tensor(...) is the legacy constructor). Float literals keep the dtype at
# float32, which requires_grad_() needs later: only floating-point tensors can
# require gradients.
tensor1 = torch.tensor([[1., 2., 3.],
                        [4., 5., 6.]])

display(tensor1.shape)

tensor1

torch.Size([2, 3])

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [5]:
# Another 2 x 3 float tensor, built with the torch.tensor() factory
# (the documented way to create a tensor from existing data).
# Float literals keep the dtype at float32, matching tensor1.
tensor2 = torch.tensor([[7., 8., 9.],
                        [10., 11., 12.]])

tensor2

tensor([[ 7.,  8.,  9.],
        [10., 11., 12.]])

In [8]:
# Every tensor created in PyTorch has a "requires_grad" property. When its value is true,
# PyTorch tracks the computations applied to this tensor in the forward phase (when we use the
# computation graph to make predictions) and calculates gradients for this tensor in the backward phase.
# The gradients are derivatives of a scalar, such as the loss, with respect to the tensor.

# When you instantiate a tensor, the default value for requires_grad is false.
# Checking the property on tensor1 and tensor2 shows that it is false for both.

display(tensor1.requires_grad)
display(tensor2.requires_grad)

False

False

In [13]:
# If you want to enable history tracking for a particular tensor so that gradients are calculated
# with respect to that tensor, enable its requires_grad flag by calling the requires_grad_() method.
# Calling requires_grad_() on a tensor updates the requires_grad property of that tensor in place.

tensor1.requires_grad_()

# now the property is true
display(tensor1.requires_grad)


# The tensor's repr confirms the in-place update: it now shows requires_grad=True.
tensor1

True

tensor([[1., 2., 3.],
        [4., 5., 6.]], requires_grad=True)

In [14]:
# If you check the requires_grad property for tensor2, you'll find that it's still false.

tensor2.requires_grad

False

In [16]:
# The gradients calculated using automatic differentiation with respect to a tensor are stored
# in the grad matrix (the .grad attribute) associated with that tensor.

# No gradients are available yet: tensor1 has not been used in any computation,
# so no forward or backward pass has been made.
print(tensor1.grad)

None


In [17]:
# You can see that there are no gradients available yet.
# We have set up a tensor, but we haven't used it within a computation graph.
# We haven't performed a forward or a backward pass.
# This tensor hasn't been used in a computation.
# There is nothing to calculate gradients with respect to.

display(tensor1.grad)

None

In [18]:
# The computation graph within PyTorch is made up of tensors and functions.
# Together these make up our directed acyclic computation graph.

# Directed acyclic computation graphs are often used in deep learning frameworks such as TensorFlow and PyTorch
# to represent the flow of data through the layers of a neural network.
# This representation helps perform automatic differentiation, allowing the efficient computation of gradients
# for optimization.

# You can think of tensors as the nodes in this graph and functions as the transformations performed along the edges.
# A tensor that is the result of an operation references the function that created it through its grad_fn attribute.
# tensor1 is something that we created directly, not the result of an operation, so its grad function is None.

print(tensor1.grad_fn)

None


In [20]:
display(tensor1)
display(tensor2)

# Let's now use these tensors to perform a simple computation
# that will set up our computation graph.
# Output_tensor is equal to tensor1 multiplied by tensor2.

output_tensor = tensor1 * tensor2

output_tensor

tensor([[1., 2., 3.],
        [4., 5., 6.]], requires_grad=True)

tensor([[ 7.,  8.,  9.],
        [10., 11., 12.]])

tensor([[ 7., 16., 27.],
        [40., 55., 72.]], grad_fn=<MulBackward0>)

In [22]:
# When you create a tensor using an operation,
# the requires_grad property for the resulting tensor is based on the
# input tensors that we use to create this output tensor.

# Requires_grad is true because tensor1 had requires_grad set to true and
# tensor1 was used to create this output tensor.
display(output_tensor.requires_grad)

True

In [23]:
# The output tensor also has the grad property, and it is None: we haven't made any backward pass.
# NOTE: output_tensor is the result of an operation (a non-leaf tensor). By default PyTorch only
# populates .grad for leaf tensors such as tensor1, and reading .grad on a non-leaf tensor may
# emit a UserWarning. Call output_tensor.retain_grad() before backward() to keep its gradient.

print(output_tensor.grad)

None


  return self._grad


In [31]:
# But you'll find that this output tensor will have a grad function, i.e. grad_fn.
# Because we used a specific multiplication operation to create this output tensor,
# you can see that MulBackward0 is the grad function associated with this tensor.

# In PyTorch, any resulting tensor has reference to the function (operation) that created it.
print(output_tensor.grad_fn)

<MulBackward0 object at 0x00000244A8584748>


In [32]:
# User-created tensors, such as tensor1 and tensor2, are not the result of
# any operation, so they have no corresponding function.
# tensor1 has no grad function, and neither does tensor2.

display(tensor1.grad_fn)
display(tensor2.grad_fn)

None

None

In [36]:
# We'll now create another output tensor by performing a slightly different operation, (tensor1 multiplied by tensor2).mean().
# The grad function of the output tensor will reference the last function used to create that tensor, MeanBackward.

# mean() function will find mean() of all elements in the resulting tensor.
# Even though there were two functions involved, the multiplication and then the mean calculation,
# the grad function references the last function, that is the mean.

output_tensor = (tensor1 * tensor2).mean()
display(output_tensor)
print(output_tensor.grad_fn)

tensor(36.1667, grad_fn=<MeanBackward0>)

<MeanBackward0 object at 0x00000244AA32F470>


In [39]:
# So tensor1 has been a part of multiple computation graphs where we calculated the output tensor,
# but no gradient is associated yet with tensor1.
# This is because we haven't yet performed a backward pass used to calculate gradients.
print(tensor1.grad)

# Gradient calculation, that is our vector of partial derivatives,
# will be calculated only when we call the backward() function on an output.
# So output_tensor.backward() will begin the backward pass through our
# computation graph and now we'll have gradients for tensor1.
output_tensor.backward()

None


In [42]:
# Now we have the gradients for tensor1
print(tensor1.grad)

# nothing for tensor2, because its requires_grad property is set to false
print(tensor2.grad)

# output_tensor is a non-leaf tensor (it was produced by an operation),
# so no gradient is stored on it either
print(output_tensor.grad)

tensor([[1.1667, 1.3333, 1.5000],
        [1.6667, 1.8333, 2.0000]])
None
None


  return self._grad


In [45]:
# These gradients are the partial derivatives of output_tensor with respect to each value in tensor1.
# Because output_tensor = (tensor1 * tensor2).mean() averages over 6 elements, each entry is the
# matching tensor2 entry divided by 6 (for example 7 / 6 = 1.1667).
# There is one partial derivative for every value within tensor1, so
# the shape of the gradient exactly matches the shape of the tensor itself.
print(tensor1.grad)
print(tensor1.grad.shape, tensor1.shape)


# Remember that tensor2 was also part of the computation,
# but because requires_grad was set to false,
# no gradients were calculated with respect to tensor2.
print(tensor2.grad)


# There are no gradients stored on the output tensor either: it is the value
# being differentiated, and as a non-leaf tensor (the result of an operation)
# it does not keep a .grad unless retain_grad() is called on it before backward().
print(output_tensor.grad)

tensor([[1.1667, 1.3333, 1.5000],
        [1.6667, 1.8333, 2.0000]])
torch.Size([2, 3]) torch.Size([2, 3])
None
None


In [50]:
# Once again, we'll calculate a new tensor using tensor1 and you'll see
# how the requires_grad property is propagated from input
# tensors to the result tensors.

# The requires_grad property is propagated from input tensors to the resulting tensors.
# You can see that new tensor has requires_grad set to true.
new_tensor = tensor1 * 3
print(new_tensor.requires_grad)

True


In [51]:
# And the gradient function, i.e. grad_fn associated with this new tensor is MulBackward,
# the multiplication operation that created this tensor.
print(new_tensor.grad_fn)

new_tensor

<MulBackward0 object at 0x00000244AA32F160>


tensor([[ 3.,  6.,  9.],
        [12., 15., 18.]], grad_fn=<MulBackward0>)

In [54]:
# If you have tensors with requires_grad set to true, which means tracking history is enabled,
# if you want to stop the Autograd package from tracking history on these tensors,
# it's possible to do that using torch.no_grad.

# Here we have a with block with torch.no_grad() and any computation performed within 
# this with block will have tracking history turned off.

with torch.no_grad():
    # Here is a new tensor created from tensor1 which has requires_grad set to true.
    new_tensor = tensor1 * 3
    print('new_tensor = ', new_tensor)
    # The original tensor, tensor1, has requires_grad set to true, there are no surprises here.
    # For tensor2, it is false, because requires_grad is not enabled when the tensor is created.
    print("requires_grad for tensor1 = ", tensor1.requires_grad) # it will be true, since it is explicitly set before with block
    print("requires_grad for tensor2 = ", tensor2.requires_grad) # it is false, because it was false before with block
    # but the new tensor that we created has requires_grad set to false.
    # So requires_grad was not propagated to this new tensor because it was created within a torch.no_grad block.
    print("requires_grad for new_tensor = ", new_tensor.requires_grad)

new_tensor =  tensor([[ 3.,  6.,  9.],
        [12., 15., 18.]])
requires_grad for tensor1 =  True
requires_grad for tensor2 =  False
requires_grad for new_tensor =  False


# Demo: Working with Gradients

In [61]:
# Let's take a look at a few Python functions and how we can use decorators
# to determine whether gradients should be enabled or not.
# Here is a function called calculate, which takes in a tensor t,
# it multiplies t by 2 and returns a result.

def calculate(t):
    """Return ``t`` multiplied by 2.

    No gradient context is set here, so when ``t`` is a tensor with
    requires_grad=True the result keeps history tracking enabled.
    """
    doubled = t * 2
    return doubled


# Now let's take a look at another function here,
# calculate_with_no_grad, which has the decorator @torch.no_grad() applied.
# You can see that calculate_with_no_grad performs the same
# multiplication as calculate, but gradients will not be enabled and
# history tracking will not be turned on,
# even if the tensor's requires_grad property is set to true.

@torch.no_grad()
def calculate_with_no_grad(t):
    """Return ``t`` multiplied by 2 with history tracking switched off.

    The torch.no_grad() decorator disables gradient tracking for the duration
    of the call, so the result has requires_grad=False and no grad_fn even
    when ``t`` itself has requires_grad=True.
    """
    doubled = t * 2
    return doubled



In [62]:
# Original tensor1 with requires_grad property set to true
print(tensor1)


# Let's first invoke the calculate function and pass in tensor1,
# which has requires_grad set to true.
# The result_tensor will also have gradients enabled and its
# grad function will be MulBackward because it was created
# using a multiplication operation.
result_tensor = calculate(tensor1)

result_tensor

tensor([[1., 2., 3.],
        [4., 5., 6.]], requires_grad=True)


tensor([[ 2.,  4.,  6.],
        [ 8., 10., 12.]], grad_fn=<MulBackward0>)

In [63]:
# If you check requires_grad for the result_tensor, you'll see that it's true.
result_tensor.requires_grad

True

In [65]:
# Now let's calculate the result_tensor once again.
# This time we'll call it result_tensor_no_grad,
# but we'll invoke the calculate_with_no_grad function to do this.
# This is the function with a decorator @ torch.no_grad,
# which means gradient tracking will be turned off.
# If you take a look at the result_tensor_no_grad,
# you'll find that because history tracking was turned off,
# it does not keep track of the function that created this
# tensor and requires_grad is set to false,
# even though the input tensor, tensor1, had requires_grad true.

result_tensor_no_grad = calculate_with_no_grad(tensor1)

display(result_tensor_no_grad)

result_tensor_no_grad.requires_grad

tensor([[ 2.,  4.,  6.],
        [ 8., 10., 12.]])

False

In [71]:
# tensor1 originally has requires_grad property set to true
print(tensor1.requires_grad)

# Just as torch.no_grad() turns history tracking off, torch.enable_grad()
# turns it back on explicitly, even inside a torch.no_grad() block.


with torch.no_grad():
    # This computation, tensor1 multiplied by 3, runs directly inside the outer
    # torch.no_grad() block and is stored in new_tensor_no_grad.
    new_tensor_no_grad = tensor1 * 3
    print('new_tensor_no_grad = ', new_tensor_no_grad)
    
    # Gradient tracking is turned off for this outer block in general,
    # but we want to enable gradients for one computation within it.
    # You can do that using "with torch.enable_grad()" inside the outer no_grad block.
    with torch.enable_grad():
        # Here we compute tensor1 multiplied by 3 and store the result in new_tensor_grad.
        new_tensor_grad = tensor1 * 3
        print('new_tensor_grad = ', new_tensor_grad)

        
# This is how you can nest an inner block where you want
# history tracking to be enabled and gradients to be calculated
# within an outer torch.no_grad block.

True
new_tensor_no_grad =  tensor([[ 3.,  6.,  9.],
        [12., 15., 18.]])
new_tensor_grad =  tensor([[ 3.,  6.,  9.],
        [12., 15., 18.]], grad_fn=<MulBackward0>)


In [72]:
# If you take a look at the result,
# you can see that the tensor that was created within the
# torch.no_grad outer block does not have grad function and
# its requires_grad will be false.
display(new_tensor_no_grad)
display(new_tensor_no_grad.requires_grad)

# The tensor that was created within the inner block with torch.enable_grad has
# a grad function and its requires_grad will be set to true.
display(new_tensor_grad)
display(new_tensor_grad.requires_grad)

tensor([[ 3.,  6.,  9.],
        [12., 15., 18.]])

False

tensor([[ 3.,  6.,  9.],
        [12., 15., 18.]], grad_fn=<MulBackward0>)

True

In [83]:
# You can also specify the value for requires_grad when you instantiate a tensor.
# Here tensor_one is created with torch.tensor and requires_grad set to true.
tensor_one = torch.tensor([[1.0, 2.0],
                          [3.0, 4.0]], requires_grad=True)

tensor_one

tensor([[1., 2.],
        [3., 4.]], requires_grad=True)

In [84]:
# We'll create tensor_two with torch.tensor() but without passing requires_grad,
# so requires_grad defaults to false.
# torch.tensor() is the documented factory for building a tensor from existing data
# (torch.Tensor(...) is the legacy constructor). Float literals keep the dtype at
# float32, which the requires_grad_() call needs: only floating-point tensors can
# require gradients.
tensor_two = torch.tensor([[5., 6.],
                           [7., 8.]])

tensor_two

tensor([[5., 6.],
        [7., 8.]])

In [85]:
# tensor_one already has history tracking and gradient calculation enabled.
print(tensor_one.requires_grad)

# Let's do the same for tensor_two by calling requires_grad_(),
# which will update the requires_grad flag in place.

tensor_two.requires_grad_()

True


tensor([[5., 6.],
        [7., 8.]], requires_grad=True)

In [86]:
# We'll now perform a simple computation which sets up the
# forward pass of our computation graph, tensor_one + tensor_two, 
# the whole thing will calculate the mean and store in final_tensor.

final_tensor = (tensor_one + tensor_two).mean()
final_tensor

tensor(9., grad_fn=<MeanBackward0>)

In [87]:
# You can see that the final_tensor has the grad function MeanBackward associated.
# The requires_grad property for the final tensor is true because
# this is propagated from the input tensors.
display(final_tensor.grad_fn)
display(final_tensor.requires_grad)

# If any of the input tensors have requires_grad set to true,
# final_tensor will also have its requires_grad property set to true.

<MeanBackward0 at 0x244aa33a048>

True

In [88]:
# So far we've only performed the forward pass on our computation graph.
# There are no gradients (i.e. no grad matrix) associated with tensor_one or with tensor_two yet.

print(tensor_one.grad)
print(tensor_two.grad)

None
None


In [89]:
# We have history tracking for all of the tensors in this computation graph.
# Calling final_tensor.backward() is what calculates the gradients;
# afterwards tensor_one has corresponding gradients, as does tensor_two.

final_tensor.backward()

# These are the gradients of final_tensor with respect to tensor_one and tensor_two.
# final_tensor is the mean over 4 elements, so every partial derivative is 1/4 = 0.25.
print(tensor_one.grad)
print(tensor_two.grad)

tensor([[0.2500, 0.2500],
        [0.2500, 0.2500]])
tensor([[0.2500, 0.2500],
        [0.2500, 0.2500]])


In [90]:
# Tensors involved in a computation are part of a larger computation graph.
# If you want a tensor which is detached from the current computation graph,
# you can call the detach() function on a tensor that will return a new
# tensor detached from the current computation graph,
# and this new tensor will always have requires_grad set to false.

# detach() returns a new tensor detached from the current computation graph - will always have requires_grad=False
detached_tensor = tensor_one.detach()
detached_tensor

tensor([[1., 2.],
        [3., 4.]])

In [94]:
# Remember that the original tensor, tensor_one, has requires_grad set to true.
display(tensor_one)

# The detached_tensor has requires_grad set to false.
# It's part of no computation graph.
display(detached_tensor)
display(detached_tensor.requires_grad)

tensor([[1., 2.],
        [3., 4.]], requires_grad=True)

tensor([[1., 2.],
        [3., 4.]])

False

In [97]:
# We'll now use the original tensor, tensor_one,
# and the detached_tensor in a computation.
# We call the mean() function on their sum and store the result in mean_tensor.
# The operation that creates mean_tensor constitutes our forward pass.
mean_tensor = (tensor_one + detached_tensor).mean()

# mean_tensor has MeanBackward as its grad function, i.e. the last operation that created the tensor
display(mean_tensor)

# it also has requires_grad set to true, propagated from tensor_one, which has requires_grad true.
display(mean_tensor.requires_grad)

tensor(5., grad_fn=<MeanBackward0>)

True

In [98]:
# With the forward pass complete,
# we'll call mean_tensor.backward() to calculate gradients.
mean_tensor.backward()

# Gradients have been calculated for tensor_one,
# but no gradients have been calculated for the detached_tensor
# because it has requires_grad set to false.
# NOTE: backward() accumulates into .grad rather than overwriting it. This pass
# contributes 1/4 = 0.25 per element (mean over 4 elements), which is added to the
# 0.25 already stored by the earlier final_tensor.backward() call, hence the 0.5 shown.
# Reset with tensor_one.grad = None (or tensor_one.grad.zero_()) before calling
# backward() to see only this pass's gradient.
display(tensor_one.grad)
display(detached_tensor.grad)

tensor([[0.5000, 0.5000],
        [0.5000, 0.5000]])

None