In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

### Model WITHOUT Shortcut Connections
We'll pass the input through 5 layers without any shortcuts.

In [11]:
# Seed the RNG so the randomly initialised layer weights are reproducible
torch.manual_seed(123)

# One sample with three features
x = torch.tensor([[1.0, 0.0, -1.0]], requires_grad=True)

# Four 3->3 hidden layers followed by a 3->1 output layer.
# The layers are constructed in order (layer1 first, layer5 last), so the
# seeded RNG hands out the same initial weights on every run.
layer1, layer2, layer3, layer4 = (nn.Linear(3, 3) for _ in range(4))
layer5 = nn.Linear(3, 1)

# Plain feed-forward stack: every layer only sees the previous activation
z1 = F.gelu(layer1(x))
z2 = F.gelu(layer2(z1))
z3 = F.gelu(layer3(z2))
z4 = F.gelu(layer4(z3))
output = layer5(z4)

# Mean-squared error against a zero target
target = torch.tensor([[0.0]])
loss = F.mse_loss(output, target)

# Back-propagate to populate .grad on every layer's parameters
loss.backward()

# Report the mean absolute weight gradient of each layer
print(" Without Shortcut Connections:")
for idx, layer in enumerate((layer1, layer2, layer3, layer4, layer5), start=1):
    print(f"Layer {idx} Gradient:", layer.weight.grad.abs().mean().item())

 Without Shortcut Connections:
Layer 1 Gradient: 0.0006289227749221027
Layer 2 Gradient: 0.00037446373607963324
Layer 3 Gradient: 0.0022297531832009554
Layer 4 Gradient: 0.004360882565379143
Layer 5 Gradient: 0.01574201136827469


### Model WITH Shortcut Connections
Now we add the original input `x` to each layer's output before passing it on to the next layer.

In [13]:
# Start from the same seed and input so the layers get the same initial weights
torch.manual_seed(123)
x = torch.tensor([[1.0, 0.0, -1.0]], requires_grad=True)

# Rebuild the five layers: four 3->3 hidden layers, then a 3->1 output layer.
# Construction order (layer1 first, layer5 last) fixes which seeded weights
# each layer receives.
layer1, layer2, layer3, layer4 = (nn.Linear(3, 3) for _ in range(4))
layer5 = nn.Linear(3, 1)

# Forward pass with shortcuts.
# Note: the shortcut used here re-adds the *original* input x (not the
# preceding activation) to what every layer after the first receives.
z1 = F.gelu(layer1(x))
z2 = F.gelu(layer2(z1 + x))
z3 = F.gelu(layer3(z2 + x))
z4 = F.gelu(layer4(z3 + x))
output = layer5(z4 + x)

# Mean-squared error against a zero target, then back-propagate
target = torch.tensor([[0.0]])
loss = F.mse_loss(output, target)
loss.backward()

# Report the mean absolute weight gradient of each layer
print("\n With Shortcut Connections:")
for idx, layer in enumerate((layer1, layer2, layer3, layer4, layer5), start=1):
    print(f"Layer {idx} Gradient:", layer.weight.grad.abs().mean().item())


 With Shortcut Connections:
Layer 1 Gradient: 0.0018841872224584222
Layer 2 Gradient: 0.007543565705418587
Layer 3 Gradient: 0.04298516735434532
Layer 4 Gradient: 0.19880297780036926
Layer 5 Gradient: 0.4630465507507324
