# Exercise Sheet 1b



### Understand Computation Graphs and Backpropagation

- Compute the gradients along the given network manually.
- Compare your results to the gradients computed by PyTorch's autograd.

In [1]:
import numpy as np
import torch
import torch.nn.functional as F

In [2]:
# Two-layer, bias-free network: Linear(4 -> 2) -> ReLU -> Linear(2 -> 1) -> Sigmoid.
simple_net = torch.nn.Sequential(
    torch.nn.Linear(4, 2, bias=False),
    torch.nn.ReLU(),
    torch.nn.Linear(2, 1, bias=False),
    torch.nn.Sigmoid()
)

# Load the provided initial weights. map_location='cpu' keeps the load working on
# machines without a GPU even if the checkpoint was saved from a CUDA device.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
initial_state = torch.load('initial_model.pth', map_location='cpu')
simple_net.load_state_dict(initial_state)

# Binary cross-entropy, evaluated on the sigmoid output of the network.
criterion = torch.nn.BCELoss()

In [3]:
# Single example: one row of four input features and a positive (1.0) target.
in_features = torch.tensor([[-2.0, -0.3, 1.7, 0.2]])
target = torch.tensor([[1.0]])

# backward() adds into the existing .grad buffers, so clear them first; otherwise
# re-running this cell would double the gradients that the later cells compare against.
simple_net.zero_grad()

prediction = simple_net(in_features)

loss = criterion(prediction, target)
loss.backward()

In [4]:
# Autograd gradient of the loss w.r.t. the first Linear layer's weights (filled in by loss.backward()).
simple_net[0].weight.grad

tensor([[-0.5130, -0.0770,  0.4361,  0.0513],
        [ 0.4947,  0.0742, -0.4205, -0.0495]])

In [5]:
# Autograd gradient of the loss w.r.t. the second Linear layer's weights (filled in by loss.backward()).
simple_net[2].weight.grad

tensor([[-0.0709, -0.2826]])

# Forward Pass

In [6]:

# Weight matrices of the two Linear layers (index 0 and index 2 of the Sequential).
w1, w2 = simple_net[0].weight, simple_net[2].weight
print(f"W1 Matrix :{w1}\n")
print(f"W2 Matrix :{w2}\n")



W1 Matrix :Parameter containing:
tensor([[ 0.1433, -0.2529,  0.2200,  0.0073],
        [-0.3766,  0.0989, -0.0612,  0.1814]], requires_grad=True)

W2 Matrix :Parameter containing:
tensor([[-0.5954,  0.5741]], requires_grad=True)



In [7]:
# Layer 1 pre-activation: W1 multiplied with the input laid out as a column vector.
w1_values = w1.detach().cpu()
input_column = in_features.T
first_out = np.dot(w1_values, input_column)
print(f"Output of first hidden layer: \n{first_out}\n")

Output of first hidden layer: 
[[0.16463013]
 [0.6558626 ]]



In [8]:
# ReLU applied to the first-layer pre-activations.
# The NumPy result is wrapped in a tensor because F.relu operates on tensors.
relu_out_first = F.relu(torch.tensor(first_out))
print("Output after RELU: \n {}\n".format(relu_out_first))

Output after RELU: 
 tensor([[0.1646],
        [0.6559]])



In [9]:
# Layer 2 pre-activation: W2 multiplied with the ReLU output of layer 1.
w2_values = w2.detach().cpu()
second_out = np.dot(w2_values, relu_out_first)
print(f"Output of output layer: \n{second_out}\n")

Output of output layer: 
[[0.27853638]]



In [10]:
# Sigmoid applied to the output-layer pre-activation; this is the network's prediction.
# torch.sigmoid is the supported spelling (F.sigmoid is its legacy alias). The tensor is
# built from a NumPy array and carries no autograd history, so no detach() is needed.
sigmoid_out = torch.sigmoid(torch.tensor(second_out)).numpy()
print("Output after Sigmoid: \n {}\n".format(sigmoid_out))

Output after Sigmoid: 
 [[0.56918734]]





In [11]:
# Binary cross-entropy for a target of y = 1 reduces to -log(prediction).
log_prediction = np.log(sigmoid_out)
forward_loss = -log_prediction
print(forward_loss)

[[0.56354564]]


# BACK PROPAGATION

#### derivative of BCE when y = 1:   $  -1/x $
#### derivative of sigmoid:   $ e^{-x} / (1 + e^{-x})^{2} $
#### derivative of ReLU:         1 when x>0 else 0  
#### derivative of (wX) w.r.t `X` is `w` and w.r.t `w` is `X`

In [12]:
# Seed of the backward pass: d(loss)/d(loss) = 1.
grad_loss = 1

# Chain rule: d(loss)/d(sigmoid_out) = d(loss)/d(loss) * d(-log(sigmoid_out))/d(sigmoid_out),
# where the local derivative of -log(p) is -1/p.
negative_reciprocal = -(1 / sigmoid_out)
grad_loss_wrt_sig = negative_reciprocal * grad_loss
print(f"Gradient of loss W.R.T output of sigmoid : {grad_loss_wrt_sig.item()}")

Gradient of loss W.R.T output of sigmoid : -1.7568907737731934


In [13]:
# Local sigmoid derivative e^-z / (1 + e^-z)^2 evaluated at the output-layer pre-activation,
# multiplied by the upstream gradient (chain rule).
exp_neg_logit = np.exp(-second_out)
sigmoid_slope = exp_neg_logit / (1 + exp_neg_logit) ** 2
grad_sigmoid_wrt_w2_2_relu = sigmoid_slope * grad_loss_wrt_sig
print(f"Gradient of sigmoid activation w.r.t input : {grad_sigmoid_wrt_w2_2_relu}")

Gradient of sigmoid activation w.r.t input : [[-0.43081263]]


### Gradients of second layer w2

In [14]:
# For the product w·X the derivative w.r.t. w is X. Here X is the ReLU output, so the
# second-layer weight gradient is the ReLU output scaled by the upstream gradient.
grad_wX_wrt_w_2 = relu_out_first * grad_sigmoid_wrt_w2_2_relu
manual_w2_grad_flat = grad_wX_wrt_w_2.reshape(-1)
autograd_w2_grad = simple_net[2].weight.grad
print(f"Gradiends of second layer weights: {manual_w2_grad_flat}")
print(f"Pytorch gradients : {autograd_w2_grad}")

Gradiends of second layer weights: tensor([-0.0709, -0.2826])
Pytorch gradients : tensor([[-0.0709, -0.2826]])


In [15]:
# For the product w·X the derivative w.r.t. X is w. Here w is the second-layer weight
# matrix and X is the ReLU output, so this is the gradient flowing back into the ReLU.
# .cpu() matches how the forward-pass cells read the weights and keeps the NumPy
# conversion valid if the parameters do not live on the CPU.
grad_wX_wrt_X2 = w2.detach().cpu().numpy() * grad_sigmoid_wrt_w2_2_relu
print("Gradiends of wX w.r.t X2:", grad_wX_wrt_X2.reshape(-1))

Gradiends of wX w.r.t X2: [ 0.2565156  -0.24734934]


In [16]:
# The ReLU derivative is
#   1 for x > 0
#   0 for x <= 0
# so the upstream gradient passes through only where the pre-activation was positive.
# The mask is derived from first_out (the ReLU input) rather than hard-coded to 1, so the
# step stays correct if a pre-activation is not positive. first_out is a column; it is
# laid out as a row so it lines up element-wise with grad_wX_wrt_X2.
relu_mask = (first_out.reshape(1, -1) > 0).astype(grad_wX_wrt_X2.dtype)
grad_relu = relu_mask * grad_wX_wrt_X2


### Gradients First layer W1

In [17]:
# First-layer weight gradient: each hidden unit's upstream gradient (one per row after the
# reshape to a column) scales the input features, since d(w·X)/dw = X.
hidden_grad_column = grad_relu.reshape(2, 1)
grad_w1 = in_features * hidden_grad_column
autograd_w1_grad = simple_net[0].weight.grad
print(f"Manually computed gradients W1: \n {grad_w1}")
print(f"\nPytorch gradients W1: \n {autograd_w1_grad}")

Manually computed gradients W1: 
 tensor([[-0.5130, -0.0770,  0.4361,  0.0513],
        [ 0.4947,  0.0742, -0.4205, -0.0495]])

Pytorch gradients W1: 
 tensor([[-0.5130, -0.0770,  0.4361,  0.0513],
        [ 0.4947,  0.0742, -0.4205, -0.0495]])


# Conclusion
##### Gradients computed from pytorch backprop and manual backprop are the same