## dl 工具
本节介绍 PyTorch 框架的原理，以及日常使用指南。

In [2]:
import torch
from torch import nn
from torch.nn import functional as F

In [3]:
net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
x = torch.rand(2,20)
net(x).shape

torch.Size([2, 10])

## what a dl module should contain
1. Ingest input data as arguments to its forward propagation method.

2. Generate an output by having the forward propagation method return a value. Note that the output may have a different shape from the input. For example, the first fully connected layer in our model above ingests an input of arbitrary dimension but returns an output of dimension 256.

3. Calculate the gradient of its output with respect to its input, which can be accessed via its backpropagation method. Typically this happens automatically.

4. Store and provide access to those parameters necessary for executing the forward propagation computation.

5. Initialize model parameters as needed.

In [4]:
# example use of module 

class MLP(nn.Module):
    """Multilayer perceptron: a 256-unit hidden layer followed by a 10-unit output layer."""

    def __init__(self):
        # nn.Module has to be initialized first so that the layers assigned
        # below are registered as sub-modules.
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.out = nn.LazyLinear(10)

    def forward(self, X):
        """Compute the model output for X: hidden layer -> ReLU -> output layer."""
        activations = F.relu(self.hidden(X))
        return self.out(activations)

In [5]:
class MySequential(nn.Module):
    """Minimal re-implementation of nn.Sequential.

    Example usage:
        net = MySequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
    """

    def __init__(self, *args):
        super().__init__()
        # Register every module under its position so that its parameters are
        # tracked; registration order defines the execution order.
        for idx, module in enumerate(args):
            self.add_module(str(idx), module)

    def forward(self, X):
        """Feed X through the registered modules in insertion order."""
        # Iterate over `_modules` instead of `children()`: `children()` yields
        # each distinct module only once, so a layer passed several times
        # (weight sharing) would otherwise be applied just once.
        for module in self._modules.values():
            if module is None:  # `children()` skipped empty slots as well
                continue
            X = module(X)
        return X

In [6]:
net = MySequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
X = torch.rand(2, 20)
net(X).shape

torch.Size([2, 10])

In [7]:
net = nn.Sequential(nn.LazyLinear(8),
                    nn.ReLU(),
                    nn.LazyLinear(1))

X = torch.rand(size=(2, 4))
net(X).shape

torch.Size([2, 1])

In [13]:
net.state_dict()

OrderedDict([('0.weight',
              tensor([[ 0.2220, -0.3163,  0.2834, -0.4338],
                      [-0.3197,  0.0201,  0.4770, -0.2166],
                      [ 0.0747, -0.2863, -0.4664, -0.3698],
                      [-0.2241,  0.1159,  0.1058,  0.4025],
                      [-0.4208,  0.1172, -0.1733, -0.0737],
                      [-0.1261,  0.1330,  0.4491, -0.4001],
                      [-0.0286, -0.2975,  0.2509, -0.4897],
                      [ 0.0894,  0.0715, -0.1141,  0.1767]])),
             ('0.bias',
              tensor([ 0.1737,  0.2992, -0.4590,  0.4695, -0.2361, -0.2528,  0.0049, -0.0899])),
             ('2.weight',
              tensor([[-0.0223,  0.2326,  0.1453,  0.3076,  0.0728,  0.3464,  0.0303, -0.0071]])),
             ('2.bias', tensor([0.0976]))])

In [None]:
# Parameters are complex objects, containing values, gradients, and additional information
type(net[2].bias), type(net[2].bias.data), net[2].bias.data

(torch.nn.parameter.Parameter, torch.Tensor, tensor([0.0976]))

In [18]:
# The gradient stays unset until a backward pass has run; compare against
# None by identity rather than with `==`.
net[2].weight.grad is None

True

In [20]:
[(name, param.shape) for name, param in net.named_parameters()]

[('0.weight', torch.Size([8, 4])),
 ('0.bias', torch.Size([8])),
 ('2.weight', torch.Size([1, 8])),
 ('2.bias', torch.Size([1]))]

In [None]:
# init weights
def init_normal(module):
    """Initialize an nn.Linear in place: weights ~ N(0, 0.01^2), biases = 0.

    Modules of any other type are left untouched, so the function can be
    handed directly to ``net.apply``.
    """
    # Exact type match on purpose: subclasses of nn.Linear are skipped.
    if type(module) is not nn.Linear:
        return
    nn.init.normal_(module.weight, mean=0, std=0.01)
    nn.init.zeros_(module.bias)
# net.apply(f) -> apply f to every module inside net (and to net itself)
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([ 0.0195, -0.0017, -0.0038,  0.0162]), tensor(0.))

In [25]:
def init_constant(module):
    """Set every weight of an nn.Linear to 1 and every bias to 0, in place.

    Modules whose type is not exactly nn.Linear are ignored.
    """
    if type(module) is not nn.Linear:
        return
    weight, bias = module.weight, module.bias
    nn.init.constant_(weight, 1)
    nn.init.zeros_(bias)

net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [26]:
def init_xavier(module):
    """Apply Xavier-uniform initialization to the weights of an nn.Linear.

    The bias is not touched; modules of any other type are ignored.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.xavier_uniform_(module.weight)

def init_42(module):
    """Fill the weights of an nn.Linear with the constant 42.

    The bias is not touched; modules of any other type are ignored.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.constant_(module.weight, 42)

net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.3016,  0.4335,  0.0767,  0.5660])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [27]:
# Customize
def my_init(module):
    """Custom initializer for nn.Linear layers.

    Prints the name and shape of the layer's first parameter, draws the
    weights from U(-10, 10) and then zeroes every weight whose absolute
    value is below 5. Modules of any other type are ignored.
    """
    if type(module) is not nn.Linear:
        return
    first_name, first_param = list(module.named_parameters())[0]
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(module.weight, -10, 10)
    # Multiplying by the boolean mask keeps large entries and zeroes the rest;
    # the in-place product writes through to the parameter's storage.
    weights = module.weight.data
    weights *= weights.abs() >= 5

net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-7.9879, -5.3836, -5.7137, -8.5929],
        [ 0.0000,  0.0000, -0.0000, -9.1433]], grad_fn=<SliceBackward0>)

In [28]:
from torch.nn import functional as F
from d2l import torch as d2l

In [29]:
# customized layer 
class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the mean of all elements from its input."""

    def __init__(self):
        super().__init__()

    def forward(self, X):
        """Return X shifted by its overall (scalar) mean."""
        offset = X.mean()
        return X - offset

In [31]:
layer = CenteredLayer()
layer(torch.tensor([1.0, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [33]:
net = nn.Sequential(nn.LazyLinear(128), CenteredLayer())

In [45]:
Y = net(torch.rand(4, 8))
Y.mean()

tensor(-2.7940e-09, grad_fn=<MeanBackward0>)

In [47]:
class MyLinear(nn.Module):
    """Fully connected layer with a built-in ReLU: relu(X @ weight + bias).

    Args:
        in_dim: number of input features (size of the last axis of X).
        out_dim: number of output features.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.weight = nn.Parameter(torch.rand(in_dim, out_dim))
        # One bias per output unit. Sizing it by in_dim made the addition in
        # forward() fail whenever in_dim != out_dim.
        self.bias = nn.Parameter(torch.rand(out_dim,))

    def forward(self, X):
        """Return relu(X @ weight + bias); the last axis of X must have size in_dim."""
        output = X.matmul(self.weight) + self.bias
        return F.relu(output)

In [None]:
Layer = MyLinear(5, 3)
# 可以看见 linear 里面的 weight 是 nn.Parameter
Layer.weight

Parameter containing:
tensor([[0.4087, 0.1449, 0.2614],
        [0.1314, 0.1076, 0.1351],
        [0.1481, 0.8742, 0.1078],
        [0.0568, 0.4710, 0.4945],
        [0.9152, 0.2011, 0.9766]], requires_grad=True)

# IO

In [51]:
x = torch.arange(4)
torch.save(x, 'x-file')

In [53]:
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all this file holds, instead of executing arbitrary pickled code
# (and it silences the FutureWarning emitted by a bare torch.load).
x2 = torch.load('x-file', weights_only=True)
x2

  x2 = torch.load('x-file')


tensor([0, 1, 2, 3])

In [54]:
# Save a list of tensors and read it back.
y = torch.zeros(4)
torch.save([x, y], 'x-files')
# weights_only=True restricts unpickling to tensors and plain containers
# (a list of tensors here) instead of executing arbitrary pickled code.
x2, y2 = torch.load('x-files', weights_only=True)
(x2, y2)

  x2, y2 = torch.load('x-files')


(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [55]:
class MLP(nn.Module):
    """Multilayer perceptron with a 256-unit hidden layer and a 10-unit output layer."""

    def __init__(self):
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.output = nn.LazyLinear(10)

    def forward(self, x):
        """Compute the output for x: hidden layer -> ReLU -> output layer."""
        hidden_activations = F.relu(self.hidden(x))
        return self.output(hidden_activations)

net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)

In [56]:
torch.save(net.state_dict(), 'mlp.params')

In [57]:
# Rebuild the architecture, then restore the saved parameters into it.
clone = MLP()
# A state dict holds only tensors, so weights_only=True is sufficient and
# avoids executing arbitrary pickled code from the file.
clone.load_state_dict(torch.load('mlp.params', weights_only=True))
clone.eval()

  clone.load_state_dict(torch.load('mlp.params'))


MLP(
  (hidden): LazyLinear(in_features=0, out_features=256, bias=True)
  (output): LazyLinear(in_features=0, out_features=10, bias=True)
)

# GPU

In [59]:
# torch.device('cpu') and torch.device('cuda')
torch.device('cuda')

device(type='cuda')

In [61]:
def cpu():  #@save
    """Return the torch.device that represents the CPU."""
    cpu_device = torch.device('cpu')
    return cpu_device

def gpu(i=0):  #@save
    """Return the torch.device for the i-th CUDA GPU (GPU 0 by default).

    Only the device handle is built; whether that GPU actually exists is
    not checked here.
    """
    device_name = 'cuda:' + str(i)
    return torch.device(device_name)

cpu(), gpu(), gpu(1)

(device(type='cpu'),
 device(type='cuda', index=0),
 device(type='cuda', index=1))

In [63]:
def num_gpus():  #@save
    """Return how many CUDA GPUs PyTorch reports as available."""
    gpu_count = torch.cuda.device_count()
    return gpu_count

num_gpus()

1

In [65]:
def try_gpu(i=0):  #@save
    """Return gpu(i) when GPU index i is available, otherwise cpu()."""
    # Index i exists only if at least i + 1 GPUs are present.
    has_gpu_i = i + 1 <= num_gpus()
    return gpu(i) if has_gpu_i else cpu()

def try_all_gpus():  #@save
    """Return all available GPUs, or [cpu(),] if no GPU exists."""
    devices = [gpu(i) for i in range(num_gpus())]
    # Fall back to the CPU so that callers always receive at least one
    # device, as the docstring promises; previously an empty list was
    # returned on machines without a GPU.
    return devices if devices else [cpu()]

try_gpu(), try_gpu(10), try_all_gpus()

(device(type='cuda', index=0),
 device(type='cpu'),
 [device(type='cuda', index=0)])

In [None]:
# 默认是在cpu上
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

In [None]:
# 通过 device 参数直接在 GPU 上创建张量（没有可用 GPU 时 try_gpu() 会回退到 CPU）
X = torch.ones(2, 3, device=try_gpu())
X

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')

In [69]:
!nvidia-smi

Thu Mar 13 01:32:26 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.46                 Driver Version: 546.80       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...    On  | 00000000:01:00.0  On |                  N/A |
| N/A   47C    P8               4W /  95W |   1096MiB /  8188MiB |     11%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [76]:
# model on gpu
net = nn.Sequential(nn.LazyLinear(1))
net = net.to(device=try_gpu())
net[0].weight.data.device

device(type='cuda', index=0)

In [74]:
net(X)

tensor([[-0.1876],
        [-0.1876]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
@d2l.add_to_class(d2l.Trainer)  #@save
def __init__(self, max_epochs, num_gpus=0, gradient_clip_val=0):
    """Set up the trainer and select the GPUs it may use.

    Args:
        max_epochs: not used directly in this body.
        num_gpus: upper bound on the number of GPUs to use; the default 0
            leaves ``self.gpus`` empty.
        gradient_clip_val: not used directly in this body.
    """
    # NOTE(review): presumably this records the constructor arguments as
    # attributes (prepare_model reads self.max_epochs, which is never
    # assigned explicitly here) -- confirm against the d2l implementation.
    self.save_hyperparameters()
    # The `num_gpus` argument shadows any module-level helper of the same
    # name, so the number of available GPUs is queried via d2l.num_gpus().
    # Never select more GPUs than are available.
    self.gpus = [d2l.gpu(i) for i in range(min(num_gpus, d2l.num_gpus()))]

@d2l.add_to_class(d2l.Trainer)  #@save
def prepare_batch(self, batch):
    """Move every element of `batch` to the first selected GPU, if any.

    Returns the batch object unchanged when no GPU was selected; otherwise
    a new list holding the moved elements.
    """
    if not self.gpus:
        return batch
    target = self.gpus[0]
    return [item.to(target) for item in batch]

@d2l.add_to_class(d2l.Trainer)  #@save
def prepare_model(self, model):
    """Link `model` with this trainer and move it to the first selected GPU, if any."""
    model.trainer = self
    # The x-range of the model's board is set to [0, max_epochs].
    model.board.xlim = [0, self.max_epochs]
    use_gpu = bool(self.gpus)
    if use_gpu:
        model.to(self.gpus[0])
    self.model = model