In [1]:
import torch

# Single-layer model: logits z = x·w + b, scored against the target y with
# binary cross-entropy computed directly on the logits.
x = torch.ones(5)   # input tensor
y = torch.zeros(3)  # expected output

# Trainable parameters: requires_grad=True asks autograd to record every
# operation that involves them. `w` is sampled before `b`.
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)

z = x @ w + b  # (5,) @ (5, 3) + (3,) -> logits of shape (3,)
loss = torch.nn.functional.binary_cross_entropy_with_logits(input=z, target=y)

In [2]:
# Same constructor call that built the input `x`: a length-5 vector of ones.
torch.ones((5,))

tensor([1., 1., 1., 1., 1.])

In [3]:
# NOTE: this draws a *fresh* random (5, 3) tensor. It has the same shape and
# requires_grad flag as `w`, but the values displayed are not those of `w`.
torch.randn((5, 3), requires_grad=True)

tensor([[-1.8347,  1.1429, -1.1189],
        [-0.6423, -0.3072,  0.0492],
        [ 0.0896,  0.1494, -0.6353],
        [-0.7330,  0.7993, -1.0520],
        [-0.2979, -0.2766, -0.7528]], requires_grad=True)

In [4]:
# NOTE: this draws a *fresh* random length-3 tensor. It has the same shape and
# requires_grad flag as `b`, but the values displayed are not those of `b`.
torch.randn((3,), requires_grad=True)

tensor([ 1.5023, -1.7972,  0.2184], requires_grad=True)

In [5]:
# Recompute the logits from the existing `x`, `w` and `b`; the result carries
# a grad_fn because `w` and `b` require gradients.
x @ w + b

tensor([2.0899, 1.4277, 1.4672], grad_fn=<AddBackward0>)

In [6]:
# `.grad_fn` references the backward function object of the operation that
# produced each tensor.
print(
    f"Gradient function for z = {z.grad_fn}",
    f"Gradient function for loss = {loss.grad_fn}",
    sep="\n",
)

Gradient function for z = <AddBackward0 object at 0x103ad5ca0>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x103ad5490>


## computing gradients

In [7]:
# Back-propagate from the scalar loss, then show the gradients that were
# stored on the two parameter tensors (`w` first, then `b`).
loss.backward()
for leaf in (w, b):
    print(leaf.grad)

tensor([[0.2966, 0.2688, 0.2709],
        [0.2966, 0.2688, 0.2709],
        [0.2966, 0.2688, 0.2709],
        [0.2966, 0.2688, 0.2709],
        [0.2966, 0.2688, 0.2709]])
tensor([0.2966, 0.2688, 0.2709])


## disabling gradient tracking

In [8]:
# Default behaviour: the result of an operation on tensors that require
# gradients is itself tracked.
z = x @ w + b
print(z.requires_grad)

# The same computation inside a no_grad() block yields an untracked result.
with torch.no_grad():
    z = x @ w + b
print(z.requires_grad)

True
False


In [9]:
# Alternative to torch.no_grad(): compute the tracked result as usual, then
# call detach() to obtain a tensor that does not require gradients.
z = x @ w + b
z_det = z.detach()
print(z_det.requires_grad)

False


In [11]:
'''
reasons for this:

mark parameters in your neural network as frozen parameters

speed up computations when you are only doing forward pass, because computations on tensors that 
do not track gradients would be more efficient 
'''



'\nreasons for this:\n\nmark parameters in your neural network as frozen parameters\n\nspeed up computations when you are only doing forward pass, because computations on tensors that \ndo not track gradients would be more efficient \n'

## more on computational graphs 

## optional reading: tensor gradients and jacobian products

In [12]:
# Demonstrates that backward() *accumulates* into `.grad`: calling it twice
# doubles the stored gradient unless `.grad` is zeroed in between.
inp = torch.eye(4, 5, requires_grad=True)
out = (inp + 1).pow(2).t()

# `out` is not a scalar, so backward() needs an explicit upstream gradient of
# the same shape; all-ones is used for every call below.
upstream = torch.ones_like(out)

# retain_graph=True keeps the graph alive so backward() can be called again.
out.backward(gradient=upstream, retain_graph=True)
print(f"First call\n{inp.grad}")

out.backward(gradient=upstream, retain_graph=True)
print(f"\nSecond call\n{inp.grad}")

# Reset the accumulated gradient in place before the third pass.
inp.grad.zero_()
out.backward(gradient=upstream, retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")

First call
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])

Second call
tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.]])

Call after zeroing gradients
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])
