1. Xavier Initialization (Glorot Initialization)
Good for tanh/sigmoid activations.

In [2]:
import torch.nn as nn
import torch.nn.init as init

# Fully connected layer mapping 128 input features to 64 output features.
layer = nn.Linear(in_features=128, out_features=64)

# The two Xavier variants are alternatives. Both run here, and each call
# overwrites layer.weight in place, so only the normal draw remains afterwards
# (it is also the value displayed as the cell output).
init.xavier_uniform_(layer.weight)  # Glorot initialization, uniform distribution
init.xavier_normal_(layer.weight)   # Glorot initialization, normal distribution


Parameter containing:
tensor([[-0.0755, -0.0385,  0.0434,  ...,  0.0142, -0.0712, -0.0349],
        [ 0.0915, -0.1302,  0.1380,  ...,  0.0677,  0.0548, -0.0623],
        [-0.0066, -0.2320, -0.0840,  ...,  0.0669,  0.0897, -0.0522],
        ...,
        [ 0.0126, -0.1058, -0.0694,  ...,  0.0047,  0.0243,  0.0197],
        [ 0.0546,  0.1408,  0.0137,  ..., -0.1051, -0.1119,  0.1344],
        [ 0.0376, -0.0061,  0.2077,  ...,  0.1835, -0.1105,  0.1316]],
       requires_grad=True)

2. Kaiming Initialization (He Initialization)
Best for ReLU or variants.

In [3]:
# He (Kaiming) initialization scaled for ReLU. The two variants are
# alternatives; both run here and each overwrites layer.weight in place, so the
# normal draw is the one that remains and is displayed.
# mode='fan_in' is the library default, written out for clarity.
init.kaiming_uniform_(layer.weight, mode='fan_in', nonlinearity='relu')  # uniform distribution
init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')   # normal distribution


Parameter containing:
tensor([[ 0.1459, -0.0754, -0.1616,  ..., -0.0316,  0.0688, -0.3076],
        [-0.0180,  0.0043, -0.1444,  ..., -0.1103, -0.0545, -0.0979],
        [ 0.0445, -0.0357,  0.0143,  ...,  0.0735,  0.0677,  0.1482],
        ...,
        [-0.0153, -0.0061, -0.0661,  ...,  0.1265,  0.0752,  0.0819],
        [-0.1057,  0.0172,  0.0212,  ...,  0.1956,  0.2271, -0.0449],
        [ 0.0494,  0.1408,  0.0971,  ..., -0.0348,  0.1239,  0.1736]],
       requires_grad=True)

3. Orthogonal Initialization
Helps preserve the norm of the gradient as it propagates through the layer.

In [4]:
# Overwrite the weight of `layer` in place with a (semi-)orthogonal matrix.
# gain=1 is the library default scaling factor, written out for clarity.
init.orthogonal_(layer.weight, gain=1)


Parameter containing:
tensor([[-0.0744, -0.0734, -0.0643,  ...,  0.0008,  0.0341,  0.0635],
        [ 0.0405,  0.0727,  0.0324,  ..., -0.0294,  0.1316,  0.0155],
        [ 0.0679, -0.0225, -0.0199,  ..., -0.0729,  0.0108,  0.0054],
        ...,
        [ 0.1460,  0.1216, -0.0405,  ...,  0.0825, -0.0375,  0.0170],
        [-0.0579, -0.1148,  0.0424,  ...,  0.0026,  0.0403, -0.0817],
        [-0.1264,  0.0951,  0.0693,  ...,  0.0887, -0.1839, -0.0168]],
       requires_grad=True)

4. Manual Initialization
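Custom initialization using any values or method you want. The constant fill below is only an illustration: giving every weight the same value makes all units in the layer compute the same output.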

In [10]:
import torch.nn as nn
import torch

# Fresh fully connected layer: 128 input features, 64 output features.
layer = nn.Linear(in_features=128, out_features=64)

# Write the constants directly into the parameters, with gradient tracking
# switched off for the duration of the in-place writes.
with torch.no_grad():
    layer.bias.fill_(0)       # every bias becomes 0
    layer.weight.fill_(0.01)  # every weight becomes 0.01


Apply Initialization to a Whole Model

In [11]:
def init_weights(m):
    """Initialize a single module; meant to be passed to ``Module.apply``.

    ``nn.Linear`` modules receive Xavier-uniform weights and, when they have a
    bias, a bias of all zeros. Any other module type is left unchanged.
    """
    if not isinstance(m, nn.Linear):
        return

    init.xavier_uniform_(m.weight)
    if m.bias is None:
        return
    init.zeros_(m.bias)

# Small MLP: 128 -> 64 -> 10 with a ReLU in between. apply() calls
# init_weights on every submodule (and on the container itself) and returns the
# module, so it can be chained onto the constructor. The bare name on the last
# line displays the module summary as the cell output.
model = nn.Sequential(
    nn.Linear(in_features=128, out_features=64),
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=10),
).apply(init_weights)
model


Sequential(
  (0): Linear(in_features=128, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=10, bias=True)
)