In [None]:
import torch
import torch.nn as nn

In [6]:
class ExampleDeepNeuralNetwork(nn.Module):
    """Stack of ``Linear -> ReLU`` layers with optional shortcut connections.

    Args:
        layer_sizes: Iterable of feature widths. Layer ``i`` is
            ``nn.Linear(layer_sizes[i], layer_sizes[i + 1])`` followed by
            ``nn.ReLU()``, so ``n`` sizes produce ``n - 1`` layers
            (e.g. ``[3, 3, 3, 3, 3, 1]`` produces five layers).
        use_shortcut: When true, a layer's input is added to its output
            whenever the two tensors have the same shape.

    Raises:
        ValueError: If ``layer_sizes`` contains fewer than two entries.
    """

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        sizes = list(layer_sizes)
        if len(sizes) < 2:
            raise ValueError(
                "layer_sizes must contain at least two entries "
                f"(input and output width), got {len(sizes)}"
            )
        self.use_shortcut = use_shortcut
        # One Linear+ReLU block per consecutive (in, out) pair. The blocks are
        # created front to back, so for a fixed seed the initial weights match
        # what an explicitly written-out list of the same layers would get.
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(in_features, out_features), nn.ReLU())
            for in_features, out_features in zip(sizes[:-1], sizes[1:])
        ])

    def forward(self, x):
        """Apply every layer in order, adding shortcuts where shapes allow."""
        for layer in self.layers:
            # Compute the output of the current layer
            layer_output = layer(x)
            # A shortcut is only possible when input and output shapes match;
            # otherwise the element-wise sum is undefined and the layer output
            # is passed on unchanged.
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x


In [None]:
# Seed the RNG before the model is built so the initial weights are reproducible.
torch.manual_seed(123)

# Feature widths between layers: four 3 -> 3 layers followed by a final 3 -> 1 layer.
layer_sizes = [3, 3, 3, 3, 3, 1]

# A single example with three features (batch of size 1).
sample_input = torch.tensor([[1.0, 0.0, -1.0]])

# Baseline network: shortcut connections disabled.
model_without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=False)

In [9]:
def print_gradients(model, x):
    """Run one forward/backward pass and print mean absolute weight gradients.

    The loss is the mean squared error between ``model(x)`` and an all-zero
    target of the same shape, i.e. the mean of ``output ** 2``.

    Args:
        model: An ``nn.Module`` whose parameters should be inspected.
        x: Input tensor passed to ``model``.

    Side effects:
        Clears and then repopulates ``.grad`` on the model's parameters, and
        prints one line per parameter whose name contains ``'weight'``.
    """
    # backward() accumulates into .grad, so clear anything left over from a
    # previous call; otherwise re-running this on the same model would print
    # the sum of several passes instead of the gradient of this one.
    model.zero_grad()

    # Forward pass
    output = model(x)
    # All-zero target matching the output's shape, dtype and device
    target = torch.zeros_like(output)

    # Calculate loss based on how close the target and output are
    loss_fn = nn.MSELoss()
    loss = loss_fn(output, target)

    # Backward pass to calculate the gradients
    loss.backward()

    for name, param in model.named_parameters():
        # Parameters that did not take part in the forward pass have no gradient.
        if 'weight' in name and param.grad is not None:
            # Mean of the absolute gradient over all entries of the weight matrix
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [11]:
# Without shortcuts the mean absolute weight gradient shrinks from the last layer
# (layers.4) towards the first (layers.0) -- the vanishing-gradient pattern.
print_gradients(model=model_without_shortcut, x=sample_input)

layers.0.0.weight has gradient mean of 0.0013751330552622676
layers.1.0.weight has gradient mean of 0.0038167957682162523
layers.2.0.weight has gradient mean of 0.0076410952024161816
layers.3.0.weight has gradient mean of 0.007722062990069389
layers.4.0.weight has gradient mean of 0.04962990805506706


In [None]:
# Re-seed with the same value used for the baseline so both models are built
# from the same RNG state; only the shortcut setting differs.
torch.manual_seed(123)
model_with_shortcut = ExampleDeepNeuralNetwork(layer_sizes=layer_sizes, use_shortcut=True)

# With shortcut connections the early layers receive far larger gradients than in
# the baseline (layers.0: ~0.56 here vs ~0.0014 without shortcuts), so the gradient
# no longer vanishes towards the input. The values still vary from layer to layer.
# NOTE(review): shortcuts are also often credited with a smoother loss landscape
# and faster convergence; this cell does not demonstrate that.
print_gradients(model=model_with_shortcut, x=sample_input)

layers.0.0.weight has gradient mean of 0.5557742714881897
layers.1.0.weight has gradient mean of 0.09135337918996811
layers.2.0.weight has gradient mean of 0.7913904190063477
layers.3.0.weight has gradient mean of 0.21711303293704987
layers.4.0.weight has gradient mean of 3.140749216079712
