In [8]:
from transformers import AutoModel
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live
from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live

# Load the pretrained checkpoint whose memory footprint we want to estimate.
model = AutoModel.from_pretrained("t5-3b")

# Run DeepSpeed's model-state memory estimators (ZeRO stage 3 first, then
# stage 2) for a single node with a single GPU. Each estimator prints its own
# report; a dashed separator line is printed after each one.
for estimate_fn in (
    estimate_zero3_model_states_mem_needs_all_live,
    estimate_zero2_model_states_mem_needs_all_live,
):
    estimate_fn(model, num_gpus_per_node=1, num_nodes=1)
    print("--------------------------------")

# Hand-computed per-GPU memory figures for the loaded model.
# NOTE(review): the three cases below appear to mirror the "per GPU" column of
# the ZeRO-3 estimate printed by DeepSpeed (no offload / param + optimizer
# offload / optimizer-only offload) -- confirm against the DeepSpeed source.

total_gpus = 1

# Shared parameters are counted only once: parameters that report the same
# data_ptr() land on the same dictionary key, so later ones overwrite earlier
# ones instead of being added again.
unique_param_sizes = {}
for param in model.parameters():
    unique_param_sizes[param.data_ptr()] = param.numel()
total_params = sum(unique_param_sizes.values())

# Largest number of parameters held directly by any single module
# (recurse=False excludes parameters of child modules). This assumes no
# parameter is shared within a single layer. default=0 keeps the original
# starting value of 0.
largest_layer_params = max(
    (
        sum(p.numel() for p in module.parameters(recurse=False))
        for module in model.modules()
    ),
    default=0,
)

# 4 bytes per parameter of the largest layer (presumably fp32 -- confirm).
largest_layer_memory = 4 * largest_layer_params

# Per-GPU byte counts: the largest-layer buffer plus 18, 0 or 2 bytes per
# parameter, with the per-parameter part divided across the GPUs.
case2 = largest_layer_memory
case3 = largest_layer_memory + int(2 * total_params / total_gpus)
case1 = largest_layer_memory + int(18 * total_params / total_gpus)

# Report parameter counts in millions and memory in MB (bytes >> 20).
print(f"total params:         {total_params/1e6:6.2f}M")
print(f"largest layer params: {largest_layer_params/1e6:6.2f}M")
print(f"largest layer memory: {largest_layer_memory>>20:6}MB")
print(f"case1 gpu memory: {(case1)>>20:6}MB")
print(f"case2 gpu memory: {(case2)>>20:6}MB")
print(f"case3 gpu memory: {(case3)>>20:6}MB")


Estimated memory needed for params, optim states and gradients for a:
HW: Setup with 1 node, 1 GPU per node.
SW: Model with 2851M total params, 32M largest layer params.
  per CPU  |  per GPU |   Options
   71.71GB |   0.12GB | offload_param=OffloadDeviceEnum.cpu, offload_optimizer=OffloadDeviceEnum.cpu, zero_init=1
   71.71GB |   0.12GB | offload_param=OffloadDeviceEnum.cpu, offload_optimizer=OffloadDeviceEnum.cpu, zero_init=0
   63.74GB |   5.43GB | offload_param=none, offload_optimizer=OffloadDeviceEnum.cpu, zero_init=1
   63.74GB |   5.43GB | offload_param=none, offload_optimizer=OffloadDeviceEnum.cpu, zero_init=0
    0.18GB |  47.93GB | offload_param=none, offload_optimizer=none, zero_init=1
   15.93GB |  47.93GB | offload_param=none, offload_optimizer=none, zero_init=0
--------------------------------
Estimated memory needed for params, optim states and gradients for a:
HW: Setup with 1 node, 1 GPU per node.
SW: Model with 2851M total params.
  per CPU  |  per GPU |   Options
   