In [1]:
import os
import copy
import json
from transformers import AutoModelForCausalLM, AutoConfig
from safetensors.torch import save_file

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the published configs of the two reference models; their dimensions are
# the inputs to the size extrapolation performed later in this notebook.
llama_8b_config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_70b_config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-70B-Instruct")

# nemotron_config = AutoConfig.from_pretrained("nvidia/Nemotron-4-340B-Instruct")

In [3]:
llama_8b_config

LlamaConfig {
  "_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 2.0,
    "type": "dynamic"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.4",
  "use_cache": true,
  "vocab_size": 128256
}

In [4]:
# Nemotron 340B dimensions, noted for comparison with the estimates below.
hidden_size = 18_432
ffn_hidden_size = 73_728
num_layers = 96
num_attention_heads = 96

In [5]:
def estimate_param(dim70b, dim8b, nearest_multiple=None, mult_factor=1, model_size=420):
    """Extrapolate a config dimension from the 8B and 70B values to a larger model.

    The estimate is
    ``dim70b * (model_size / 70) * (dim70b / dim8b) * (8 / 70) * mult_factor``.

    Args:
        dim70b: Value of the dimension in the 70B config.
        dim8b: Value of the same dimension in the 8B config.
        nearest_multiple: If given, the estimate is snapped to the nearest
            multiple of this value (using Python's ``round``).
        mult_factor: Extra scale applied to the estimate before rounding.
        model_size: Target model size, on the same scale as the 8 / 70 constants.

    Returns:
        The estimate rounded to a multiple of ``nearest_multiple`` when one is
        given; otherwise the raw, unrounded estimate as a float.
    """
    x = dim70b * (model_size / 70) * (dim70b / dim8b) * (8 / 70)
    x *= mult_factor
    if nearest_multiple is not None:
        return nearest_multiple * round(x / nearest_multiple)
    # Previously the function fell off the end here and silently returned None.
    return x

### Config A

- deep

1 LAYER TRAINING: ~16GB

In [7]:
# Config A: copy the 70B config, then replace each width-related field with its
# extrapolated value, rounded to the given multiple.
llama_400b_config = copy.deepcopy(llama_70b_config)
for field, multiple in (
    ("hidden_size", 128),
    ("intermediate_size", 128),
    ("num_attention_heads", 8),
    ("num_key_value_heads", 8),
):
    setattr(
        llama_400b_config,
        field,
        estimate_param(getattr(llama_70b_config, field), getattr(llama_8b_config, field), multiple),
    )
llama_400b_config.hidden_size, llama_400b_config.intermediate_size, llama_400b_config.num_attention_heads, llama_400b_config.num_key_value_heads

(11264, 39296, 88, 8)

In [8]:
llama_400b_config.hidden_size

11264

In [12]:
# Config A dimensions produced by the estimate, kept here for reference:
#   hidden_size         = 11264
#   intermediate_size   = 39296
#   num_attention_heads = 88
#   num_key_value_heads = 8
tuple(
    getattr(llama_400b_config, field)
    for field in ("hidden_size", "intermediate_size", "num_attention_heads", "num_key_value_heads")
)

(11264, 39296, 88, 8)

In [13]:
# Build the model with a single decoder layer; the full layer count is estimated
# in a later cell from this one layer's parameter count.
llama_400b_config.num_hidden_layers = 1

In [16]:
# Instantiate the architecture from the derived config; no pretrained weights are loaded.
# NOTE(review): no torch_dtype is passed here; confirm the dtype the weights are
# created in, since the size printouts in this notebook multiply parameter counts by 2.
model = AutoModelForCausalLM.from_config(llama_400b_config)

In [17]:
# Name -> tensor mapping of the model's state, passed to save_file in a later cell.
weights = model.state_dict()

In [18]:
# Destination directory for the Config A artifacts; created if it does not exist yet.
# NOTE(review): absolute, machine-specific path; consider making the root configurable.
output_dir = "/workspace/models/meta-llama/Meta-Llama-3-400B-Instruct-A"
os.makedirs(output_dir, exist_ok=True)

In [19]:
# Write the state dict to a single safetensors file inside output_dir.
save_file(weights, os.path.join(output_dir, "model_state_dict.safetensors"))

In [24]:
# Write the config as JSON to config.json inside output_dir.
config_path = os.path.join(output_dir, "config.json")
with open(config_path, "w") as config_file:
    config_file.write(json.dumps(llama_400b_config.to_dict()))

In [56]:
# model.model.layers[0]
# Count the parameters of the single decoder layer (embeddings and LM head are not included).
total_params = sum(p.numel() for p in model.model.layers[0].parameters())
# Parameter count times 2, in units of 1e9 (presumably GB at 2 bytes per parameter; confirm dtype).
print(2 * total_params / 1e9)
# Whole number of such layers that fit into 420 (billion) parameters; 420 presumably
# mirrors the default model_size of estimate_param.
num_estimated_hidden_layers = int(420 / (total_params / 1e9))
num_estimated_hidden_layers

3.209474048


261

In [58]:
# Rough memory estimate (in units of 1e9 bytes) for one decoder layer.
tot_mem = 0
for name, param in model.model.layers[0].named_parameters():
    if "proj" not in name:
        # Non-projection weights (the layernorms) are costed at 2 bytes per element.
        layer_mem = 2 * param.numel() / 1e9
        print(name, layer_mem)
        tot_mem += layer_mem
        continue

    # Projection weights: a quarter of the 2-byte cost, plus 2 bytes per 128 elements
    # (presumably 4-bit quantized weights plus per-block quantization stats; confirm).
    quant_mem = 2 * param.numel() / 4 / 1e9
    quant_stat_mem = 2 * param.numel() / 128 / 1e9
    print(name, quant_mem, quant_stat_mem)
    tot_mem += quant_mem + quant_stat_mem

    # TODO: add loraA, loraB, magnitude params

self_attn.q_proj.weight 0.063438848 0.001982464
self_attn.k_proj.weight 0.005767168 0.000180224
self_attn.v_proj.weight 0.005767168 0.000180224
self_attn.o_proj.weight 0.063438848 0.001982464
mlp.gate_proj.weight 0.221315072 0.006916096
mlp.up_proj.weight 0.221315072 0.006916096
mlp.down_proj.weight 0.221315072 0.006916096
input_layernorm.weight 2.2528e-05
post_attention_layernorm.weight 2.2528e-05


### Config B

- mid

1 LAYER TRAINING: ~25GB

In [6]:
# Config B: copy the 70B config, then replace each width-related field with its
# extrapolated value scaled by 1.5 and rounded to the given multiple.
llama_400b_config = copy.deepcopy(llama_70b_config)
for field, multiple in (
    ("hidden_size", 128),
    ("intermediate_size", 128),
    ("num_attention_heads", 8),
    ("num_key_value_heads", 8),
):
    setattr(
        llama_400b_config,
        field,
        estimate_param(
            getattr(llama_70b_config, field),
            getattr(llama_8b_config, field),
            mult_factor=1.5,
            nearest_multiple=multiple,
        ),
    )
llama_400b_config.hidden_size, llama_400b_config.intermediate_size, llama_400b_config.num_attention_heads, llama_400b_config.num_key_value_heads

(16896, 59008, 128, 8)

In [7]:
# Config B dimensions produced by the estimate, kept here for reference:
#   hidden_size         = 16896
#   intermediate_size   = 59008
#   num_attention_heads = 128
#   num_key_value_heads = 8
tuple(
    getattr(llama_400b_config, field)
    for field in ("hidden_size", "intermediate_size", "num_attention_heads", "num_key_value_heads")
)

(16896, 59008, 128, 8)

In [8]:
# Build the model with a single decoder layer; the full layer count is estimated
# in a later cell from this one layer's parameter count.
llama_400b_config.num_hidden_layers = 1

In [9]:
# Instantiate the architecture from the derived config (no pretrained weights are
# loaded) and move it to the first CUDA device.
# NOTE(review): no torch_dtype is passed here; confirm the dtype the weights are
# created in, since the size printouts in this notebook multiply parameter counts by 2.
model = AutoModelForCausalLM.from_config(llama_400b_config).to("cuda:0")

In [11]:
# Name -> tensor mapping of the model's state, passed to save_file in a later cell.
weights = model.state_dict()

In [13]:
# Destination directory for the Config B artifacts; created if it does not exist yet.
# NOTE(review): absolute, machine-specific path; consider making the root configurable.
output_dir = "/workspace/models/meta-llama/Meta-Llama-3-400B-Instruct-B"
os.makedirs(output_dir, exist_ok=True)

In [14]:
# Write the state dict to a single safetensors file inside output_dir.
save_file(weights, os.path.join(output_dir, "model_state_dict.safetensors"))

In [15]:
# Write the config as JSON to config.json inside output_dir.
config_path = os.path.join(output_dir, "config.json")
with open(config_path, "w") as config_file:
    config_file.write(json.dumps(llama_400b_config.to_dict()))

In [16]:
# model.model.layers[0]
# Count the parameters of the single decoder layer (embeddings and LM head are not included).
total_params = sum(p.numel() for p in model.model.layers[0].parameters())
# Parameter count times 2, in units of 1e9 (presumably GB at 2 bytes per parameter; confirm dtype).
print(2 * total_params / 1e9)
# Whole number of such layers that fit into 420 (billion) parameters; 420 presumably
# mirrors the default model_size of estimate_param.
num_estimated_hidden_layers = int(420 / (total_params / 1e9))
num_estimated_hidden_layers

7.19533056


116

In [17]:
# Rough memory estimate (in units of 1e9 bytes) for one decoder layer.
tot_mem = 0
for name, param in model.model.layers[0].named_parameters():
    if "proj" not in name:
        # Non-projection weights (the layernorms) are costed at 2 bytes per element.
        layer_mem = 2 * param.numel() / 1e9
        print(name, layer_mem)
        tot_mem += layer_mem
        continue

    # Projection weights: a quarter of the 2-byte cost, plus 2 bytes per 128 elements
    # (presumably 4-bit quantized weights plus per-block quantization stats; confirm).
    quant_mem = 2 * param.numel() / 4 / 1e9
    quant_stat_mem = 2 * param.numel() / 128 / 1e9
    print(name, quant_mem, quant_stat_mem)
    tot_mem += quant_mem + quant_stat_mem

    # TODO: add loraA, loraB, magnitude params

self_attn.q_proj.weight 0.142737408 0.004460544
self_attn.k_proj.weight 0.008921088 0.000278784
self_attn.v_proj.weight 0.008921088 0.000278784
self_attn.o_proj.weight 0.142737408 0.004460544
mlp.gate_proj.weight 0.498499584 0.015578112
mlp.up_proj.weight 0.498499584 0.015578112
mlp.down_proj.weight 0.498499584 0.015578112
input_layernorm.weight 3.3792e-05
post_attention_layernorm.weight 3.3792e-05


In [18]:
tot_mem

1.85509632

### Config C

- wide

In [6]:
# Config C: copy the 70B config, then replace each width-related field with its
# extrapolated value scaled by 2 and rounded to the given multiple.
llama_400b_config = copy.deepcopy(llama_70b_config)
for field, multiple in (
    ("hidden_size", 128),
    ("intermediate_size", 128),
    ("num_attention_heads", 8),
    ("num_key_value_heads", 8),
):
    setattr(
        llama_400b_config,
        field,
        estimate_param(
            getattr(llama_70b_config, field),
            getattr(llama_8b_config, field),
            mult_factor=2,
            nearest_multiple=multiple,
        ),
    )
llama_400b_config.hidden_size, llama_400b_config.intermediate_size, llama_400b_config.num_attention_heads, llama_400b_config.num_key_value_heads

(22528, 78592, 176, 8)

In [8]:
# Config C dimensions produced by the estimate, kept here for reference:
#   hidden_size         = 22528
#   intermediate_size   = 78592
#   num_attention_heads = 176
#   num_key_value_heads = 8
tuple(
    getattr(llama_400b_config, field)
    for field in ("hidden_size", "intermediate_size", "num_attention_heads", "num_key_value_heads")
)

(22528, 78592, 176, 8)

In [9]:
# Build the model with a single decoder layer; the full layer count is estimated
# in a later cell from this one layer's parameter count.
llama_400b_config.num_hidden_layers = 1

In [10]:
# Instantiate the architecture from the derived config (no pretrained weights are
# loaded) and move it to the first CUDA device.
# NOTE(review): no torch_dtype is passed here; confirm the dtype the weights are
# created in, since the size printouts in this notebook multiply parameter counts by 2.
model = AutoModelForCausalLM.from_config(llama_400b_config).to("cuda:0")

: 

In [None]:
# Name -> tensor mapping of the model's state, passed to save_file in a later cell.
weights = model.state_dict()

In [None]:
# Destination directory for the Config C artifacts; created if it does not exist yet.
# NOTE(review): absolute, machine-specific path; consider making the root configurable.
output_dir = "/workspace/models/meta-llama/Meta-Llama-3-400B-Instruct-C"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Write the state dict to a single safetensors file inside output_dir.
save_file(weights, os.path.join(output_dir, "model_state_dict.safetensors"))

In [None]:
# Write the config as JSON to config.json inside output_dir.
config_path = os.path.join(output_dir, "config.json")
with open(config_path, "w") as config_file:
    config_file.write(json.dumps(llama_400b_config.to_dict()))

In [None]:
# model.model.layers[0]
# Count the parameters of the single decoder layer (embeddings and LM head are not included).
total_params = sum(p.numel() for p in model.model.layers[0].parameters())
# Parameter count times 2, in units of 1e9 (presumably GB at 2 bytes per parameter; confirm dtype).
print(2 * total_params / 1e9)
# Whole number of such layers that fit into 420 (billion) parameters; 420 presumably
# mirrors the default model_size of estimate_param.
num_estimated_hidden_layers = int(420 / (total_params / 1e9))
num_estimated_hidden_layers

7.19533056


116

In [None]:
# Rough memory estimate (in units of 1e9 bytes) for one decoder layer.
tot_mem = 0
for name, param in model.model.layers[0].named_parameters():
    if "proj" not in name:
        # Non-projection weights are costed at 2 bytes per element.
        layer_mem = 2 * param.numel() / 1e9
        print(name, layer_mem)
        tot_mem += layer_mem
        continue

    # Projection weights: a quarter of the 2-byte cost, plus 2 bytes per 128 elements
    # (presumably 4-bit quantized weights plus per-block quantization stats; confirm).
    quant_mem = 2 * param.numel() / 4 / 1e9
    quant_stat_mem = 2 * param.numel() / 128 / 1e9
    print(name, quant_mem, quant_stat_mem)
    tot_mem += quant_mem + quant_stat_mem

    # TODO: add loraA, loraB, magnitude params

self_attn.q_proj.weight 0.142737408 0.004460544
self_attn.k_proj.weight 0.008921088 0.000278784
self_attn.v_proj.weight 0.008921088 0.000278784
self_attn.o_proj.weight 0.142737408 0.004460544
mlp.gate_proj.weight 0.498499584 0.015578112
mlp.up_proj.weight 0.498499584 0.015578112
mlp.down_proj.weight 0.498499584 0.015578112
input_layernorm.weight 3.3792e-05
post_attention_layernorm.weight 3.3792e-05


In [None]:
tot_mem

1.85509632