##### 单精度训练示例

In [2]:
#pytorch版本
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD

# Toy data: ten batches of shape (10, 1, 1). `[t] * 10` repeats one shared
# tensor object, which is fine here because the batches are never modified.
inputs = [torch.zeros(10, 1, 1)] * 10
targets = [torch.ones(10, 1, 1)] * 10
model = nn.Linear(1, 1)
optimizer = SGD(model.parameters(), lr=0.01)

# Start from clean gradients.
optimizer.zero_grad()

# Training loop. The loop variables are deliberately not named `input`:
# at notebook top level that would rebind the builtin `input()` for the
# rest of the kernel session.
for batch_input, batch_target in zip(inputs, targets):
    pred = model(batch_input)
    loss = F.l1_loss(pred, batch_target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

In [3]:
#mmengine版本
from mmengine.optim import OptimWrapper

inputs = [torch.zeros(10, 1, 1)] * 10
targets = [torch.ones(10, 1, 1)] * 10
model = nn.Linear(1, 1)
optimizer = SGD(model.parameters(), lr=0.01)
# Build the optimizer wrapper around the plain PyTorch optimizer.
optimizer_wrapper = OptimWrapper(optimizer=optimizer)

# `update_params(loss)` replaces the explicit backward / step / zero_grad
# calls of the PyTorch version. The loop variables avoid the name `input`
# so the builtin `input()` is not shadowed in the notebook namespace.
for batch_input, batch_target in zip(inputs, targets):
    pred = model(batch_input)
    loss = F.l1_loss(pred, batch_target)
    optimizer_wrapper.update_params(loss)

##### 混合精度训练

In [4]:
#pytorch版本
from torch.cuda.amp import autocast

inputs = [torch.zeros(10, 1, 1)] * 10
targets = [torch.ones(10, 1, 1)] * 10
model = nn.Linear(1, 1).cuda()
optimizer = SGD(model.parameters(), lr=0.01)
# Loss scaler: without it, small fp16 gradients produced under autocast can
# underflow to zero. It scales the loss before backward and unscales the
# gradients before the optimizer step.
scaler = torch.cuda.amp.GradScaler()
# Start from clean gradients.
optimizer.zero_grad()

# The loop variables avoid the name `input` so the builtin `input()` is not
# shadowed in the notebook namespace.
for batch_input, batch_target in zip(inputs, targets):
    # Run the forward pass AND the loss computation under autocast, as the
    # PyTorch AMP recipe recommends.
    with autocast():
        pred = model(batch_input.cuda())
        loss = F.l1_loss(pred, batch_target.cuda())
    # Backward on the scaled loss, outside the autocast region.
    scaler.scale(loss).backward()
    # step() unscales the gradients and skips the update if they contain
    # inf/NaN; update() then adjusts the scale factor for the next iteration.
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()

In [5]:
#mmengine版本
from mmengine.optim import AmpOptimWrapper

inputs = [torch.zeros(10, 1, 1)] * 10
targets = [torch.ones(10, 1, 1)] * 10
model = nn.Linear(1, 1).cuda()
optimizer = SGD(model.parameters(), lr=0.01)
# Build the mixed-precision optimizer wrapper.
optimizer_wrapper = AmpOptimWrapper(optimizer=optimizer)

# The loop variables avoid the name `input` so the builtin `input()` is not
# shadowed in the notebook namespace.
for batch_input, batch_target in zip(inputs, targets):
    # The forward pass runs inside the wrapper's context manager, which takes
    # the place of the explicit `autocast()` block of the PyTorch version.
    with optimizer_wrapper.optim_context(model):
        pred = model(batch_input.cuda())
    loss = F.l1_loss(pred, batch_target.cuda())
    optimizer_wrapper.update_params(loss)


##### 混合精度和梯度累加

In [6]:
# PyTorch version
# Gradient accumulation: run backward on every batch, but only step the
# optimizer and clear the gradients once every `accumulation_steps` batches.
# Reuses `inputs`, `targets`, `model` and `optimizer` from the previous cells.
accumulation_steps = 2
# Loss scaler so that small fp16 gradients produced under autocast do not
# underflow to zero.
scaler = torch.cuda.amp.GradScaler()
# Make sure no stale gradients from an earlier cell leak into the first window.
optimizer.zero_grad()

# The loop variables avoid the name `input` so the builtin `input()` is not
# shadowed in the notebook namespace.
for idx, (batch_input, batch_target) in enumerate(zip(inputs, targets)):
    with autocast():
        pred = model(batch_input.cuda())
        loss = F.l1_loss(pred, batch_target.cuda())
    # Divide by the window size so the accumulated gradient is the average
    # over the window rather than the sum.
    scaler.scale(loss / accumulation_steps).backward()
    # `(idx + 1)` so the update happens after every full window of batches.
    # `idx % 2 == 0` would update after the very first batch alone and never
    # apply the gradient of the final batch.
    if (idx + 1) % accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
# NOTE: len(inputs) is a multiple of accumulation_steps here; a trailing
# partial window would need one extra step after the loop.

In [7]:
# MMEngine version
# `accumulative_counts=2` asks the wrapper to accumulate gradients over two
# calls to `update_params` before updating the parameters.
optimizer_wrapper = AmpOptimWrapper(optimizer=optimizer, accumulative_counts=2)

# The loop variables avoid the name `input` so the builtin `input()` is not
# shadowed in the notebook namespace. `idx` is kept only to mirror the
# PyTorch version; the wrapper does its own counting.
for idx, (batch_input, batch_target) in enumerate(zip(inputs, targets)):
    with optimizer_wrapper.optim_context(model):
        pred = model(batch_input.cuda())
    loss = F.l1_loss(pred, batch_target.cuda())
    optimizer_wrapper.update_params(loss)


MMEngine 的优化器封装同样实现了 `backward`、`step`、`zero_grad` 等接口，用户可以像使用普通 PyTorch 优化器一样手动调用它们。

In [8]:
# PyTorch-style training loop driven through the MMEngine optimizer wrapper.
accumulation_steps = 2
# Clear gradients once before the loop. Calling zero_grad() at the top of
# every iteration would wipe the gradients that are being accumulated.
optimizer_wrapper.zero_grad()

# The loop variables avoid the name `input` so the builtin `input()` is not
# shadowed in the notebook namespace.
for idx, (batch_input, batch_target) in enumerate(zip(inputs, targets)):
    with optimizer_wrapper.optim_context(model):
        pred = model(batch_input.cuda())
    loss = F.l1_loss(pred, batch_target.cuda())
    # Key difference from plain PyTorch: call the wrapper's backward() instead
    # of loss.backward(). The loss is divided by the window size so the
    # accumulated gradient is the average over the window.
    optimizer_wrapper.backward(loss / accumulation_steps)
    # `(idx + 1)` so the update happens after every full window of batches;
    # `idx % 2 == 0` would update after the first batch alone and never apply
    # the gradient of the final batch.
    if (idx + 1) % accumulation_steps == 0:
        optimizer_wrapper.step()
        optimizer_wrapper.zero_grad()

##### 梯度裁剪

In [9]:
# Two alternative configurations; the second assignment replaces the first.
# Clip by gradient norm (torch.nn.utils.clip_grad_norm_); this is the default
# clip type when `type` is omitted.
optimizer_wrapper = AmpOptimWrapper(optimizer=optimizer, clip_grad=dict(max_norm=1.))
# Clip by gradient value (torch.nn.utils.clip_grad_value_). This must be
# selected explicitly with type='value'; otherwise `clip_value` would be
# passed to the norm-clipping function, which does not accept it.
optimizer_wrapper = AmpOptimWrapper(optimizer=optimizer, clip_grad=dict(type='value', clip_value=0.2))

##### 获取学习率和动量

In [1]:
from mmengine.optim import OptimWrapper
import torch.nn as nn
from torch.optim import SGD

model = nn.Linear(10, 10)
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer_wrapper = OptimWrapper(optimizer=optimizer)

# Hyper-parameters as reported by the wrapper (dicts of per-group lists) ...
for reported in (optimizer_wrapper.get_lr(), optimizer_wrapper.get_momentum()):
    print(reported)

# ... and the same values read directly from the optimizer's first param group.
first_group = optimizer.param_groups[0]
for key in ('lr', 'momentum'):
    print(first_group[key])

  from .autonotebook import tqdm as notebook_tqdm


{'lr': [0.01]}
{'momentum': [0.9]}
0.01
0.9


##### 导出/加载优化器状态字典

In [2]:
import torch.nn as nn
from torch.optim import SGD
from mmengine.optim import OptimWrapper,AmpOptimWrapper

model = nn.Linear(10, 10)
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)

# Both wrappers are built around the same underlying optimizer.
optimizer_wrapper = OptimWrapper(optimizer=optimizer)
amp_optimizer_wrapper = AmpOptimWrapper(optimizer=optimizer)

# Export the state dicts and show them.
optim_state_dict = optimizer_wrapper.state_dict()
amp_state_dict = amp_optimizer_wrapper.state_dict()
for exported in (optim_state_dict, amp_state_dict):
    print(exported)
print('*' * 20)

# Load the exported state dicts into freshly constructed wrappers.
optim_wrapper_new = OptimWrapper(optimizer=optimizer)
amp_optimizer_wrapper_new = AmpOptimWrapper(optimizer=optimizer)
optim_wrapper_new.load_state_dict(optim_state_dict)
amp_optimizer_wrapper_new.load_state_dict(amp_state_dict)

{'state': {}, 'param_groups': [{'lr': 0.01, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'foreach': None, 'params': [0, 1]}]}
{'state': {}, 'param_groups': [{'lr': 0.01, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'foreach': None, 'params': [0, 1]}], 'loss_scaler': {'scale': 65536.0, 'growth_factor': 2.0, 'backoff_factor': 0.5, 'growth_interval': 2000, '_growth_tracker': 0}}
********************


优化器封装字典（`OptimWrapperDict`）：它没有实现 `update_params` 等方法，不能直接用于训练中更新参数；但提供了 `get_lr`、`get_momentum` 等查询方法，返回结果的键会带上各优化器封装的名称前缀（例如 `gen.lr`）。

In [4]:
from torch.optim import SGD
import torch.nn as nn
from mmengine.optim import OptimWrapper,OptimWrapperDict

# Two independent models, e.g. the generator and discriminator of a GAN.
gen = nn.Linear(10, 10)
disc = nn.Linear(10, 10)

# One optimizer per model. The discriminator's optimizer is named
# `optim_disc` to match `optim_gen` (it was previously called `disc_gen`,
# which wrongly suggested a generator).
optim_gen = SGD(gen.parameters(), lr=0.01, momentum=0.9)
optim_disc = SGD(disc.parameters(), lr=0.01, momentum=0.9)

# Wrap each optimizer, then group the wrappers under the keys 'gen' / 'disc'.
optim_wrapper_gen = OptimWrapper(optimizer=optim_gen)
optim_wrapper_disc = OptimWrapper(optimizer=optim_disc)
optim_wrapper_dict = OptimWrapperDict(gen=optim_wrapper_gen, disc=optim_wrapper_disc)

# The printed keys are prefixed with the wrapper name, e.g. 'gen.lr'.
print(optim_wrapper_dict.get_lr())
print(optim_wrapper_dict.get_momentum())

{'gen.lr': [0.01], 'disc.lr': [0.01]}
{'gen.momentum': [0.9], 'disc.momentum': [0.9]}


##### 执行器中配置优化器封装

In [None]:
# Config-style (plain dict) description of an SGD optimizer.
optimizer = {'type': 'SGD', 'lr': 0.01, 'momentum': 0.9}
# Optimizer-wrapper config that nests the optimizer config defined above.
optim_wrapper = {'type': 'OptimWrapper', 'optimizer': optimizer}