In [16]:
import copy
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

In [7]:
llama_8b_config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_70b_config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-70B-Instruct")

# nemotron_config = AutoConfig.from_pretrained("nvidia/Nemotron-4-340B-Instruct")

In [3]:
llama_8b_config

LlamaConfig {
  "_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 2.0,
    "type": "dynamic"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.4",
  "use_cache": true,
  "vocab_size": 128256
}

In [5]:
(llama_70b_config.hidden_size / llama_8b_config.hidden_size,

llama_70b_config.intermediate_size / llama_8b_config.intermediate_size,

llama_70b_config.num_hidden_layers / llama_8b_config.num_hidden_layers,

llama_70b_config.num_attention_heads / llama_8b_config.num_attention_heads,

llama_70b_config.num_key_value_heads / llama_8b_config.num_key_value_heads)

(2.0, 2.0, 2.5, 2.0, 1.0)

In [None]:
# nemotron 340B
hidden_size = 18432
ffn_hidden_size = 73728
num_layers = 96
num_attention_heads = 96

In [67]:
def estimate_param(dim70b, dim8b, mult_factor=1, nearest_multiple=None, model_size=420):
	x = dim70b * (model_size / 70) * (dim70b / dim8b) * (8 / 70)
	x *= mult_factor
	if nearest_multiple is not None:
		return nearest_multiple * round(x / nearest_multiple)

In [54]:
llama_400b_config = copy.deepcopy(llama_70b_config)
llama_400b_config.hidden_size = estimate_param(llama_70b_config.hidden_size, llama_8b_config.hidden_size, 128)
llama_400b_config.intermediate_size = estimate_param(llama_70b_config.intermediate_size, llama_8b_config.intermediate_size, 128)
llama_400b_config.num_attention_heads = estimate_param(llama_70b_config.num_attention_heads, llama_8b_config.num_attention_heads, 8)
llama_400b_config.num_key_value_heads = estimate_param(llama_70b_config.num_key_value_heads, llama_8b_config.num_key_value_heads, 8)
# llama_400b_config.num_hidden_layers = (llama_70b_config.num_hidden_layers / llama_8b_config.num_hidden_layers) 

In [75]:
# hidden_size = 11264
# intermediate_size = 39296
# num_attention_heads = 88
# num_key_value_heads = 8

llama_400b_config.hidden_size, llama_400b_config.intermediate_size, llama_400b_config.num_attention_heads, llama_400b_config.num_key_value_heads

(11264, 39296, 88, 8)

In [76]:
# find num layers
llama_400b_config.num_hidden_layers = 1

In [None]:
model = AutoModelForCausalLM.from_config(llama_400b_config)

In [79]:
# model.model.layers[0]
# count total params
total_params = sum(p.numel() for p in model.model.layers[0].parameters())
print(total_params/1e9)
num_estimated_hidden_layers = int(420 / (total_params / 1e9))
num_estimated_hidden_layers

1.604737024


261