# 模型构造

In [1]:
import torch
from torch import nn
from torch.nn import functional as F

In [3]:
# Two-layer perceptron built from stock layers: 20 -> 256 -> ReLU -> 10.
net = nn.Sequential(
    nn.Linear(20, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
)

# Random mini-batch: 2 samples with 20 features each.
X = torch.rand(2, 20)
net(X)

tensor([[-0.2655, -0.0332, -0.0731,  0.1181,  0.0733, -0.0084,  0.1133, -0.2533,
         -0.1110,  0.1474],
        [-0.2063, -0.0937,  0.0255, -0.0024,  0.0608, -0.0189,  0.1814, -0.2989,
         -0.0773,  0.2345]], grad_fn=<AddmmBackward0>)

In [4]:
class MLP(nn.Module):
    """Multilayer perceptron with one hidden layer: 20 -> 256 -> ReLU -> 10."""

    def __init__(self):
        super().__init__()
        # Layers assigned as attributes show up as named children of the module.
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)

    def forward(self, X):
        """Return the output-layer result for input batch ``X``."""
        activated = F.relu(self.hidden(X))
        return self.out(activated)

In [5]:
# Instantiate the custom MLP and run a forward pass on X.
net = MLP()

net(X)

tensor([[-0.0811,  0.0843, -0.1519,  0.2166, -0.0242,  0.2877, -0.2354, -0.0334,
          0.3355,  0.1949],
        [ 0.0034,  0.0195, -0.1883,  0.1457,  0.0379,  0.2902, -0.2298, -0.0074,
          0.3057,  0.0699]], grad_fn=<AddmmBackward0>)

In [10]:
class MLP_ReLU(nn.Module):
    """Perceptron 20 -> 256 -> 10 that keeps its activation as an instance attribute."""

    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        # A plain function, not an nn.Module: it is stored but adds no parameters.
        self.ReLU = nn.functional.relu
        self.out = nn.Linear(256, 10)

    def forward(self, X):
        """Apply the hidden layer, the stored activation, then the output layer."""
        pre_activation = self.hidden(X)
        post_activation = self.ReLU(pre_activation)
        return self.out(post_activation)

In [11]:
# Instantiate the attribute-activation variant and run a forward pass on X.
net = MLP_ReLU()
net(X)

tensor([[-0.2821, -0.0666, -0.0563,  0.0231, -0.1628, -0.0693, -0.1017,  0.1201,
          0.1300, -0.0406],
        [-0.2046, -0.1166, -0.1416, -0.0146, -0.1771, -0.0081, -0.1152,  0.1522,
          0.1687, -0.0658]], grad_fn=<AddmmBackward0>)

In [14]:
class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    Args:
        *args: Modules to chain. They are applied in the order given, each
            receiving the previous one's output.
    """

    def __init__(self, *args):
        super().__init__()
        for idx, block in enumerate(args):
            # Child names must be strings: nn.Module builds dotted parameter
            # names ("0.weight", ...) from them, so a non-string key breaks
            # named_parameters() / state_dict(). Indexing by position also
            # keeps a module that is passed twice as two separate entries.
            self._modules[str(idx)] = block

    def forward(self, X):
        """Feed ``X`` through every registered block in insertion order."""
        for block in self._modules.values():
            X = block(X)
        return X

In [15]:
# Use MySequential with the same layer list that was given to nn.Sequential earlier.
net = MySequential(nn.Linear(20,256), nn.ReLU(), nn.Linear(256,10))
net(X)

tensor([[-0.0019,  0.0711, -0.1424,  0.1452,  0.0695, -0.1450,  0.2005, -0.2235,
          0.0609, -0.0329],
        [-0.0232,  0.0253, -0.0816,  0.1468,  0.1091, -0.1776,  0.1961, -0.1557,
          0.0574, -0.0283]], grad_fn=<AddmmBackward0>)

# 参数管理

In [16]:
# Small 4 -> 8 -> ReLU -> 1 network used for the parameter-access examples.
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)
# Mini-batch of 2 samples with 4 features each.
X = torch.rand(2, 4)
net(X)

tensor([[-0.0563],
        [ 0.0176]], grad_fn=<AddmmBackward0>)

In [17]:
# Parameters of the last layer (index 2) as an ordered name -> tensor mapping.
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.1365, -0.2541, -0.2487, -0.2194,  0.0376, -0.0563,  0.2135,  0.1687]])), ('bias', tensor([0.0247]))])


In [18]:
# The bias is a Parameter object; `.data` exposes the underlying tensor values.
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.0247], requires_grad=True)
tensor([0.0247])


In [19]:
# No backward pass has been run on this net, so the gradient is still unset.
# `is None` is the idiomatic identity test and never dispatches to tensor `==`.
net[2].weight.grad is None

True

In [21]:
# (name, shape) pairs for the first layer only ...
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
# ... and for the whole network, where names are prefixed with the layer index.
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [22]:
# The same bias can be looked up in the network-wide state dict by its dotted name.
net.state_dict()['2.bias'].data

tensor([0.0247])

In [24]:
def block1():
    """Build a fresh 4 -> 8 -> 4 block with a ReLU after each linear layer."""
    layers = [nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()]
    return nn.Sequential(*layers)

def block2():
    """Stack four separate ``block1()`` instances, named 'block 0' .. 'block 3'."""
    stacked = nn.Sequential()
    for idx in range(4):
        # Each call to block1() creates new layers, so nothing is shared.
        stacked.add_module(f'block {idx}', block1())
    return stacked

# Nest the four-block stack and append a final 4 -> 1 linear layer.
rgnet = nn.Sequential(block2(), nn.Linear(4,1))
rgnet(X)

tensor([[-0.3345],
        [-0.3345]], grad_fn=<AddmmBackward0>)

In [25]:
# Printing a module shows its nested structure together with each child's name.
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


## 内置初始化

In [27]:
def init_normal(m):
    """Initialise an ``nn.Linear`` in place: weights ~ N(0, 0.01^2), bias = 0.

    Meant to be passed to ``nn.Module.apply``. Modules whose exact type is not
    ``nn.Linear`` are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.normal_(m.weight, mean=0, std=0.01)
    nn.init.zeros_(m.bias)
        
# apply() calls init_normal on every sub-module of net (and on net itself).
net.apply(init_normal)
# Inspect the first weight row and the first bias entry of layer 0.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([ 0.0178, -0.0004,  0.0004, -0.0190]), tensor(0.))

In [29]:
def init_constant(m):
    """Set every weight of an ``nn.Linear`` to 1 and its bias to 0, in place.

    Meant to be passed to ``nn.Module.apply``. Modules whose exact type is not
    ``nn.Linear`` are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 1)
    nn.init.zeros_(m.bias)

# Re-initialise every linear layer of net with constant weights.
net.apply(init_constant)
# Inspect the first weight row and the first bias entry of layer 0.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [33]:
def xavier(m):
    """Apply Xavier-uniform initialisation to an ``nn.Linear``'s weight, in place.

    The bias is not modified. Modules whose exact type is not ``nn.Linear``
    are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)

def init_42(m):
    """Fill an ``nn.Linear``'s weight with the constant 42, in place.

    The bias is not modified. Modules whose exact type is not ``nn.Linear``
    are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 42)

# apply() can target a single sub-module: Xavier for layer 0, constant 42 for layer 2.
net[0].apply(xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.3483, -0.2022, -0.0720, -0.5812])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [34]:
# One Linear instance is placed at two positions (indices 2 and 4) of the network.
shared = nn.Linear(8,8)
net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), shared, nn.ReLU(), shared, nn.ReLU(), nn.Linear(8,1))

net(X)
# Both positions hold the same module object, so their weights compare equal ...
print(net[2].weight.data[0] == net[4].weight.data[0])
# ... and a write through net[2] is visible through net[4] as well.
net[2].weight.data[0,0] = 100
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


# 自定义层

In [35]:
class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the mean of its input."""

    def __init__(self):
        super().__init__()

    def forward(self, X):
        """Return ``X`` shifted by the mean taken over all of its elements."""
        return X - X.mean()

layer = CenteredLayer()
# torch.tensor with an explicit dtype replaces the legacy torch.FloatTensor
# constructor; the values and the float32 dtype are the same.
layer(torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32))

tensor([-2., -1.,  0.,  1.,  2.])

# 读写文件

In [36]:
# Round-trip a single tensor through the file 'x-file'.
x = torch.arange(4)
torch.save(x, 'x-file')

x2 = torch.load('x-file')
x2

tensor([0, 1, 2, 3])

In [39]:
# A list of tensors can be saved to one file and unpacked again after loading.
y = torch.zeros(4)
torch.save([x, y], 'x-files')

x2,y2 = torch.load('x-files')
x2,y2

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [42]:
# A dict that maps strings to tensors is saved and restored the same way.
mydict = {'x': x, 'y':y}
torch.save(mydict,'mydict')
mydict2 = torch.load('mydict')
mydict2

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

In [43]:
# Create a model and keep its output Y on X for comparison after reloading.
net = MLP()
X = torch.randn(size=(2,20))
Y = net(X)

In [44]:
# Persist only the parameters (the state dict), not the model object itself.
torch.save(net.state_dict(), 'mlp.params')

In [46]:
# Build a new MLP, then overwrite its parameters with the saved ones.
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
# Switch to evaluation mode; eval() returns the module, which the notebook echoes.
clone.eval()

MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (out): Linear(in_features=256, out_features=10, bias=True)
)

In [47]:
# Same parameters and same input, so the outputs should match element-wise.
Y_clone = clone(X)
Y_clone == Y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

In [50]:
# Shell escape: show the NVIDIA driver version and GPU status of this machine.
!nvidia-smi

Fri Aug  8 15:37:35 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.57                 Driver Version: 576.57         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   52C    P8              7W /   25W |    1892MiB /   8188MiB |     30%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                