In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [67]:
class Linear(nn.Module):
    """Stack of ``nn.Linear`` layers with tracing hooks on one of them.

    Consecutive entries of ``feature_list`` give the ``(in_features,
    out_features)`` pair of each layer. Every weight is set to 1 and every
    bias to 0 so activations and gradients can be checked by hand.

    The layer at index ``HOOKED_LAYER_INDEX`` (when the model is deep enough
    to have one) receives a forward pre-hook, a forward hook and a backward
    hook; each hook prints what it is given.

    Args:
        feature_list: sequence of layer widths, e.g. ``[1, 2, 4]`` builds
            ``Linear(1, 2)`` followed by ``Linear(2, 4)``.
    """

    # Index (within ``self.total``) of the layer that receives the demo hooks.
    HOOKED_LAYER_INDEX = 1

    def __init__(self,feature_list):
        super(Linear,self).__init__()
        self.feature_list = feature_list
        self.layers = []

        # Hook handles. They stay None when there is no layer at
        # HOOKED_LAYER_INDEX, so remove_hook() is always safe to call.
        self.h0 = None  # forward hook
        self.h1 = None  # forward pre-hook
        self.h2 = None  # backward hook

        # Define Layers
        for i in range(len(feature_list)-1):
            self.layers.append(nn.Linear(self.feature_list[i],self.feature_list[i+1]))
        # ModuleList registers the layers as sub-modules of this model.
        self.total = nn.ModuleList(self.layers)

        # Initialize weights to 1 and biases to 0 in order to check gradients easily.
        for idx,m in enumerate(self.total):
            if isinstance(m, nn.Linear):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

            if idx == self.HOOKED_LAYER_INDEX:
                self.h0 = m.register_forward_hook(self.forward_hook)
                self.h1 = m.register_forward_pre_hook(self.forward_pre_hook)
                # NOTE: newer PyTorch releases deprecate register_backward_hook
                # in favour of register_full_backward_hook, whose grad_input
                # has a different layout. Kept as-is so the printed values
                # match the output recorded in this notebook.
                self.h2 = m.register_backward_hook(self.backward_hook)

    def forward_hook(self,*args):
        """Forward hook, called as ``hook(module, input, output)``.

        Prints the type of each argument. The second argument is a tuple
        because the positional inputs of the module are passed packed
        together, while the output is passed as returned by ``forward``.
        """
        print("\n This is Forward Hook \n")
        for arg in args:
            print(type(arg))

    def backward_hook(self,*args):
        """Backward hook, called as ``hook(module, grad_input, grad_output)``.

        Prints ``grad_input``, then ``grad_output``, then the type of each
        argument.
        """
        grad_input, grad_output = args[1], args[2]
        print("\n This is Backward Hook \n")
        print(grad_input)
        print(grad_output)
        for arg in args:
            print(type(arg))

    def forward_pre_hook(self,*args):
        """Forward pre-hook, called as ``hook(module, input)`` before ``forward``.

        Prints the type of each argument.
        """
        print("\n This is Forward Pre Hook \n")
        for arg in args:
            print(type(arg))

    def remove_hook(self):
        """Detach every registered hook.

        Safe to call when no hook was registered (model with fewer than two
        layers) and safe to call more than once.
        """
        for attr in ("h0", "h1", "h2"):
            handle = getattr(self, attr, None)
            if handle is not None:
                handle.remove()
                setattr(self, attr, None)

    def forward(self,x):
        """Feed ``x`` through every layer in order and return the result."""
        out = x
        for layer in self.total:
            out = layer(out)
        return out

In [68]:
class LinearNaive(nn.Module):
    """Plain stack of ``nn.Linear`` layers without any hooks.

    Consecutive entries of ``feature_list`` give the ``(in_features,
    out_features)`` pair of each layer. Every weight is set to 2 and every
    bias to 0, so outputs can be computed by hand.
    """

    def __init__(self,feature_list):
        super(LinearNaive,self).__init__()
        self.feature_list = feature_list

        # One Linear layer per pair of neighbouring widths.
        widths = self.feature_list
        self.layers = [
            nn.Linear(n_in, n_out)
            for n_in, n_out in zip(widths[:-1], widths[1:])
        ]
        # ModuleList registers the layers as sub-modules of this model.
        self.total = nn.ModuleList(self.layers)

        # Constant initialisation: weights 2, biases 0.
        for layer in self.total:
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.constant_(layer.weight, 2)
            nn.init.constant_(layer.bias, 0)

    def forward(self,x):
        """Feed ``x`` through every layer in order and return the result."""
        result = x
        for layer in self.total:
            result = layer(result)
        return result

In [69]:
class VerboseExecution(nn.Module):
    """Wrap ``model`` and print the output shape of its direct children.

    A forward hook is registered on every direct child of ``model``; each time
    such a child is called, a line ``"<child name>: <output shape>"`` is
    printed. Only direct children are hooked, and a hook fires only when the
    child itself is called. A container that the parent merely iterates over
    (for example an ``nn.ModuleList``) therefore never prints anything.

    Args:
        model: the module to wrap. Its children are tagged with a
            ``__name__`` attribute holding their registered name.
    """

    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model
        # Keep the handles so the hooks can be detached again; otherwise they
        # would stay on `model` for its whole lifetime.
        self._hook_handles = []

        # Register a hook for each layer
        for name, layer in self.model.named_children():
            layer.__name__ = name
            self._hook_handles.append(
                layer.register_forward_hook(self._print_output_shape)
            )

    @staticmethod
    def _print_output_shape(layer, _inputs, output):
        """Forward hook: print the child's name and the shape of its output."""
        shape = getattr(output, "shape", None)
        if shape is None:
            # Outputs such as tuples have no shape; report their type instead
            # of failing the forward pass.
            print(f"{layer.__name__}: {type(output).__name__}")
        else:
            print(f"{layer.__name__}: {shape}")

    def remove_hooks(self):
        """Detach all hooks registered by this wrapper; safe to call twice."""
        while self._hook_handles:
            self._hook_handles.pop().remove()

    def forward(self, x):
        """Run the wrapped model on ``x`` and return its output."""
        return self.model(x)

In [70]:
# Layer widths: consecutive pairs define the Linear layers, here 1 -> 2 -> 4.
feature_list = [1,2,4]
model = LinearNaive(feature_list)
# Wrapping registers a shape-printing forward hook on each direct child of `model`.
# NOTE(review): verbose_model is never called in the visible cells, and `model`
# is rebound to a different network in the next cell.
verbose_model = VerboseExecution(model)

In [71]:
# Same layer widths (1 -> 2 -> 4); `model` now refers to the hooked Linear
# network, whose second layer prints from its forward and backward hooks.
feature_list = [1,2,4]
model = Linear(feature_list)

In [72]:
# Batch of two one-feature samples. requires_grad is set directly on the tensor;
# the Variable wrapper is deprecated and no longer needed for autograd.
x = torch.ones(2, 1, requires_grad=True)
# The forward pass triggers the pre-hook and forward hook on the second layer.
out = model(x)
# Reduce to a scalar so that backward() can be called without arguments.
out = torch.sum(out)


 This is Forward Pre Hook 

<class 'torch.nn.modules.linear.Linear'>
<class 'tuple'>

 This is Forward Hook 

<class 'torch.nn.modules.linear.Linear'>
<class 'tuple'>
<class 'torch.Tensor'>


In [73]:
# Backpropagating from the summed output fires the backward hook registered on
# the second layer; it prints grad_input, then grad_output, then the type of
# each hook argument.
out.backward()


 This is Backward Hook 

(tensor([2., 2., 2., 2.]), tensor([[4., 4.],
        [4., 4.]]), tensor([[2., 2., 2., 2.],
        [2., 2., 2., 2.]]))
(tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.]]),)
<class 'torch.nn.modules.linear.Linear'>
<class 'tuple'>
<class 'tuple'>


In [74]:
# Baseline: gradients of c = mean(2 * a) without any hook.
a = torch.ones(5, requires_grad=True)

b = 2*a

b.retain_grad()   # b is a non-leaf tensor, so its grad would be discarded otherwise.

c = b.mean()

c.backward()

print(a.grad, b.grad)

# Redo the experiment with a hook on b. The hook only prints the gradient that
# reaches b and returns None, so the gradients themselves are left unchanged.
a = torch.ones(5, requires_grad=True)

b = 2*a

b.retain_grad()

b.register_hook(lambda grad: print(grad))

b.mean().backward()

print(a.grad, b.grad)

tensor([0.4000, 0.4000, 0.4000, 0.4000, 0.4000]) tensor([0.2000, 0.2000, 0.2000, 0.2000, 0.2000])
tensor([0.2000, 0.2000, 0.2000, 0.2000, 0.2000])
tensor([0.4000, 0.4000, 0.4000, 0.4000, 0.4000]) tensor([0.2000, 0.2000, 0.2000, 0.2000, 0.2000])


In [32]:
# Inspect b from the hook experiment: it was computed as 2*a, so it is a
# non-leaf tensor and its repr carries a grad_fn.
# NOTE(review): this cell's execution count (32) is lower than the previous
# cell's (74), so the notebook was not executed top to bottom.
b

tensor([2., 2., 2., 2., 2.], grad_fn=<MulBackward0>)

In [45]:
# Leaf tensor tracked by autograd (this rebinds the earlier `x`).
x = torch.ones(2, 2, requires_grad=True)
print(x)
# y is the result of an operation on x, so it is a non-leaf tensor with a grad_fn.
y = x + 2
# Keep y.grad after backward so it can be inspected in a later cell.
y.retain_grad()
print(y)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)


In [46]:
# Element-wise z = 3 * y**2.
z = y * y * 3

In [47]:
# Scalar result: mean of z over its 4 elements (this rebinds the earlier `out`).
out = z.mean()
# d(out)/dy = 6 * y / 4, which is 4.5 for y = 3.
out.backward()

In [48]:
# d(out)/dx; it equals d(out)/dy because y = x + 2 has derivative 1 w.r.t. x.
x.grad

tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])

In [49]:
# Gradient of the non-leaf tensor y; populated because retain_grad() was called
# on y before backward().
y.grad

tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])

In [78]:
# Leaf tensor of four zeros tracked by autograd.
c_ = torch.zeros(4, requires_grad=True)
# The displayed result carries a grad_fn, showing that sum() is recorded in the graph.
c_.sum()

tensor(0., grad_fn=<SumBackward0>)

In [79]:
# d(sum)/d(c_) is 1 for every element.
# NOTE: gradients accumulate in c_.grad, so re-running this cell adds another 1
# to each entry instead of reproducing the output shown.
c_.sum().backward()
c_.grad

tensor([1., 1., 1., 1.])

In [80]:
# Linear layer with 1 input feature and 2 output features, left at its default
# initialization.
# NOTE(review): no seed is set in the visible cells, so the values shown in the
# next cell will presumably differ between runs.
l = nn.Linear(1, 2)

In [81]:
# Apply the layer to a single sample with one feature; the result has shape (1, 2).
l(torch.ones(1, 1))

tensor([[-0.8580, -0.8244]], grad_fn=<AddmmBackward>)