# LLM Calculator

Use can use this calculator for the following class of models:
- [DeepSeek V3-like models](#scrollTo=bdfe6522-3073-4ba8-8521-cb53f94ae663)
- [Mixtral-like models](#scrollTo=-L-QRkgDDy82)
- [Dense models](#scrollTo=kOhEU-XtD1gc)

## DeepSeek V3-like models

Enter your component values here:

In [1]:
batch_per_device = 8
num_dense_layers = 3
num_moe_layers = 58
num_activations = 2
max_target_length = 8192
vocab_size = 129280
emb_dim = 7168
mlp_dim = 18432
num_query_heads = 128
num_kv_heads = 128
q_lora_rank = 1536
kv_lora_rank = 512
qk_nope_head_dim = 128
qk_rope_head_dim = 64
v_head_dim = 128
moe_mlp_dim = 2048
shared_experts = 1
num_experts = 256
num_experts_per_tok = 8

Params:

In [2]:
attention_qkv = (num_dense_layers+num_moe_layers)*(emb_dim*q_lora_rank+q_lora_rank*num_query_heads*(qk_nope_head_dim+qk_rope_head_dim)+emb_dim*(kv_lora_rank+qk_rope_head_dim)+kv_lora_rank*num_query_heads*(qk_nope_head_dim+v_head_dim))
attention_projection = (num_dense_layers+num_moe_layers)*(emb_dim*num_query_heads*v_head_dim)
dense_mlp = num_dense_layers*emb_dim*mlp_dim*(num_activations+1)
moe_gate_shared = num_moe_layers*(emb_dim*num_experts+shared_experts*(num_activations+1)*(emb_dim*moe_mlp_dim))
moe_routed = num_moe_layers*((num_activations+1)*(emb_dim*moe_mlp_dim))
vocab_embedding = 2*emb_dim*vocab_size
total_params = (attention_qkv+attention_projection)+dense_mlp+moe_routed+num_experts*moe_routed+vocab_embedding
active_params = (attention_qkv+attention_projection)+dense_mlp+(moe_gate_shared+num_experts_per_tok*moe_routed)+vocab_embedding

In [3]:
import io
import pandas as pd

params = pd.read_csv(io.StringIO(f'''
Name, Value, Notes
Attention (QKV),{attention_qkv:.2e},MLA
Attention (Projection),{attention_projection:.2e},
Dense MLP,{dense_mlp:.2e},dense layers
MoE gate & shared,{moe_gate_shared:.2e},gate + shared experts
MoE routed,{moe_routed:.2e},routed experts
Vocab embedding,{vocab_embedding:.2e}, input & output embedding
Total Params,{total_params:.2e},Attention + gates & shared + experts * routed + vocab embedding
Active Params,{active_params:.2e},Attention + gates & shared + experts per token * routed + vocab embedding
'''))
params.fillna('', inplace=True)

In [4]:
params

Unnamed: 0,Name,Value,Notes
0,Attention (QKV),4250000000.0,MLA
1,Attention (Projection),7160000000.0,
2,Dense MLP,1190000000.0,dense layers
3,MoE gate & shared,2660000000.0,gate + shared experts
4,MoE routed,2550000000.0,routed experts
5,Vocab embedding,1850000000.0,input & output embedding
6,Total Params,671000000000.0,Attention + gates & shared + experts * routed ...
7,Active Params,37600000000.0,Attention + gates & shared + experts per token...


FLOPs/device:

In [5]:
one_mla_qkv = 2*batch_per_device*max_target_length*(emb_dim*q_lora_rank+q_lora_rank*num_query_heads*(qk_nope_head_dim+qk_rope_head_dim))+2*batch_per_device*max_target_length*(emb_dim*(kv_lora_rank+qk_rope_head_dim)+kv_lora_rank*num_query_heads*(qk_nope_head_dim+v_head_dim))
one_mla_attention = 2*batch_per_device*max_target_length*max_target_length*num_query_heads*(qk_nope_head_dim+qk_rope_head_dim+v_head_dim)
one_mla_projection = 2*batch_per_device*max_target_length*emb_dim*num_query_heads*v_head_dim
total_attention = (num_dense_layers+num_moe_layers)*(one_mla_qkv+one_mla_attention+one_mla_projection)
one_dense_layer = 2*batch_per_device*max_target_length*mlp_dim*emb_dim*num_activations+2*batch_per_device*max_target_length*mlp_dim*emb_dim
one_moe_layer = 2*batch_per_device*max_target_length*emb_dim*num_experts+(shared_experts+num_experts_per_tok)*(2*batch_per_device*max_target_length*moe_mlp_dim*emb_dim*num_activations+2*batch_per_device*max_target_length*moe_mlp_dim*emb_dim)
total_mlp = num_dense_layers*one_dense_layer+num_moe_layers*one_moe_layer
vocab_embedding_flops = 2*batch_per_device*max_target_length*emb_dim*vocab_size
total_forward = vocab_embedding_flops+total_attention+total_mlp
total_backward = 2*total_forward
total = total_forward+total_backward
total_tflops = total/(10**12)

In [6]:
flops = pd.read_csv(io.StringIO(f'''
Name, Value, Notes
One MLA QKV,{one_mla_qkv:.2e},
One MLA Attention,{one_mla_attention:.2e},
One MLA Projection,{one_mla_projection:.2e},
Total Attention,{total_attention:.2e},
One Dense Layer,{one_dense_layer:.2e},
One MoE Layer,{one_moe_layer:.2e},gate + shared + routed
Total MLP,{total_mlp:.2e},dense_layers + moe_layers
Vocab embedding,{vocab_embedding:.2e},
Total forward,{total_forward:.2e},embedding + attention + feedforward
Total backward,{total_backward:.2e},2x forward flops
Total,{total:.2e},
Total (TFLOPs),{total_tflops:.2e},
'''))
flops.fillna('', inplace=True)

In [7]:
flops

Unnamed: 0,Name,Value,Notes
0,One MLA QKV,9130000000000.0,
1,One MLA Attention,44000000000000.0,
2,One MLA Projection,15400000000000.0,
3,Total Attention,4180000000000000.0,
4,One Dense Layer,52000000000000.0,
5,One MoE Layer,52200000000000.0,gate + shared + routed
6,Total MLP,3180000000000000.0,dense_layers + moe_layers
7,Vocab embedding,1850000000.0,
8,Total forward,7480000000000000.0,embedding + attention + feedforward
9,Total backward,1.5e+16,2x forward flops


## Mixtral-like models

Enter your component values here:

In [8]:
batch_per_device = 1
num_dense_layers = 5
max_target_length = 4096
vocab_size = 32000
emb_dim = 4096
mlp_dim = 14336
head_dim = 128
num_query_heads = 32
num_kv_heads = 8
num_experts = 8
num_experts_per_tok = 2

Params:

In [9]:
attention_qkvo = num_dense_layers*(2*emb_dim*num_query_heads*head_dim + 2*emb_dim*num_kv_heads*head_dim)
mlp = num_dense_layers*(emb_dim*num_experts+3*emb_dim*mlp_dim)
vocab_embedding = 2*emb_dim*vocab_size
one_expert = attention_qkvo+mlp+vocab_embedding
total_params = attention_qkvo+vocab_embedding+num_experts*mlp
active_params = attention_qkvo+vocab_embedding+num_experts_per_tok*mlp

In [10]:
import io
import pandas as pd

params = pd.read_csv(io.StringIO(f'''
Name, Value, Notes
Attention (QKVO * layers),{attention_qkvo:.2e},"QO - 2DNH, KV - 2DKH"
MLP (matmuls * layers),{mlp:.2e},gates + 3 DF
Vocab embedding,{vocab_embedding:.2e},2 DV ( input & output embedding)
One expert,{one_expert:.2e},Attention + MLP + Vocab embedding
Total Params,{total_params:.2e},Attention + experts * MLP + Vocab embedding
Active Params,{active_params:.2e},Attention + experts per token * MLP + Vocab embedding
'''))
params.fillna('', inplace=True)

In [11]:
params

Unnamed: 0,Name,Value,Notes
0,Attention (QKVO * layers),210000000.0,"QO - 2DNH, KV - 2DKH"
1,MLP (matmuls * layers),881000000.0,gates + 3 DF
2,Vocab embedding,262000000.0,2 DV ( input & output embedding)
3,One expert,1350000000.0,Attention + MLP + Vocab embedding
4,Total Params,7520000000.0,Attention + experts * MLP + Vocab embedding
5,Active Params,2230000000.0,Attention + experts per token * MLP + Vocab em...


FLOPs:

In [12]:
forward_attention_weights = num_dense_layers*(2*2*batch_per_device*max_target_length*emb_dim*num_query_heads*head_dim+2*2*batch_per_device*max_target_length*emb_dim*num_kv_heads*head_dim)
forward_attention_o_weights = num_dense_layers*2*2*batch_per_device*max_target_length*max_target_length*num_query_heads*head_dim
forward_mlp = num_dense_layers*(2*batch_per_device*max_target_length*emb_dim*num_experts+3*2*batch_per_device*max_target_length*emb_dim*mlp_dim*num_experts_per_tok)
forward_vocab_embedding = 2*batch_per_device*max_target_length*emb_dim*vocab_size
total_forward = forward_attention_weights+forward_attention_o_weights+forward_mlp+forward_vocab_embedding
total_backward = 2*total_forward
total = total_forward+total_backward

In [13]:
flops =  pd.read_csv(io.StringIO(f'''
Name, Value, Notes
Forward Attention / weights,{forward_attention_weights:.2e},QKVO matmul with weights (4BSDNH + 4BSDKH)
Forward Attention /o weights,{forward_attention_o_weights:.2e},2 Dot product in attention (softmax ignored)
Forward MLP,{forward_mlp:.2e},gate + 3 * 2BSDF
Forward Vocab embedding,{forward_vocab_embedding:.2e},2BSDV
Total forward,{total_forward:.2e},
Total backward,{total_backward:.2e},
Total,{total:.2e},
'''))
flops.fillna('', inplace=True)

In [14]:
flops

Unnamed: 0,Name,Value,Notes
0,Forward Attention / weights,1720000000000.0,QKVO matmul with weights (4BSDNH + 4BSDKH)
1,Forward Attention /o weights,1370000000000.0,2 Dot product in attention (softmax ignored)
2,Forward MLP,14400000000000.0,gate + 3 * 2BSDF
3,Forward Vocab embedding,1070000000000.0,2BSDV
4,Total forward,18600000000000.0,
5,Total backward,37200000000000.0,
6,Total,55800000000000.0,


## Dense models

Enter your component values here:

In [15]:
batch_per_device = 1
num_dense_layers = 32
max_target_length = 4096
vocab_size = 32000
emb_dim = 4096
mlp_dim = 14336
head_dim = 128
num_query_heads = 32
num_kv_heads = 8

Params:

In [16]:
attention_qkvo=num_dense_layers*(2*emb_dim*num_query_heads*head_dim + 2*emb_dim*num_kv_heads*head_dim)
mlp=num_dense_layers*(3*emb_dim*mlp_dim)
vocab_embedding=2*emb_dim*vocab_size
total_params=attention_qkvo+vocab_embedding+mlp

In [17]:
import io
import pandas as pd

params = pd.read_csv(io.StringIO(f'''
Name, Value, Notes
Attention (QKVO * layers),{attention_qkvo:.2e},"QO - 2DNH, KV - 2DKH"
MLP (matmuls * layers),{mlp:.2e},3 DF
Vocab embedding,{vocab_embedding:.2e},2 DV ( input & output embedding)
Total Params,{total_params:.2e},Attention + MLP + Vocab embedding
'''))
params.fillna('', inplace=True)

In [18]:
params

Unnamed: 0,Name,Value,Notes
0,Attention (QKVO * layers),1340000000.0,"QO - 2DNH, KV - 2DKH"
1,MLP (matmuls * layers),5640000000.0,3 DF
2,Vocab embedding,262000000.0,2 DV ( input & output embedding)
3,Total Params,7240000000.0,Attention + MLP + Vocab embedding


FLOPs:

In [19]:
forward_attention_weights=num_dense_layers*(2*2*batch_per_device*max_target_length*emb_dim*num_query_heads*head_dim+2*2*batch_per_device*max_target_length*emb_dim*num_kv_heads*head_dim)
forward_attention_o_weights=num_dense_layers*2*2*batch_per_device*max_target_length*max_target_length*num_query_heads*head_dim
forward_mlp=num_dense_layers*(3*2*batch_per_device*max_target_length*emb_dim*mlp_dim)
forward_vocab_embedding=2*batch_per_device*max_target_length*emb_dim*vocab_size
total_forward=forward_attention_weights+forward_attention_o_weights+forward_mlp+forward_vocab_embedding
total_backward=2*total_forward
total=total_forward+total_backward

In [20]:
flops = pd.read_csv(io.StringIO(f'''
Name, Value, Notes
Forward Attention / weights,{forward_attention_weights:.2e},QKVO matmul with weights (4BSDNH + 4BSDKH)
Forward Attention /o weights,{forward_attention_o_weights:.2e},2 Dot product in attention (softmax ignored)
Forward MLP,{forward_mlp:.2e},3 * 2BSDF
Forward Vocab embedding,{forward_vocab_embedding:.2e},2BSDV
Total forward,{total_forward:.2e},
Total backward,{total_backward:.2e},
Total,{total:.2e},
'''))
flops.fillna('', inplace=True)

In [21]:
flops

Unnamed: 0,Name,Value,Notes
0,Forward Attention / weights,11000000000000.0,QKVO matmul with weights (4BSDNH + 4BSDKH)
1,Forward Attention /o weights,8800000000000.0,2 Dot product in attention (softmax ignored)
2,Forward MLP,46200000000000.0,3 * 2BSDF
3,Forward Vocab embedding,1070000000000.0,2BSDV
4,Total forward,67000000000000.0,
5,Total backward,134000000000000.0,
6,Total,201000000000000.0,
