In [47]:
# calculate model size

# Dummy Model configurations for Llama 3.2 1B
LLAMA32_CONFIG = {
    "vocab_size": 128_256,
    "emb_dim": 2048,
    "n_heads": 32,
    "n_layers": 16,
    "hidden_dim": 8192,
    "n_kv_groups": 8,
}


def calculate_size(LLAMA32_CONFIG, return_it=False):
    # Extract parameters
    vocab_size = LLAMA32_CONFIG["vocab_size"]
    emb_dim = LLAMA32_CONFIG["emb_dim"]
    n_heads = LLAMA32_CONFIG["n_heads"]
    n_layers = LLAMA32_CONFIG["n_layers"]
    hidden_dim = LLAMA32_CONFIG["hidden_dim"]
    n_kv_groups = LLAMA32_CONFIG["n_kv_groups"]

    # Embedding Layer
    embedding_size = vocab_size * emb_dim

    # Attention Mechanism (per layer)
    # QKV projections
    qkv_size = 3 * (emb_dim * emb_dim)
    # Output projection
    output_proj_size = emb_dim * emb_dim
    # Adjustments for key-value groups
    qkv_size_grouped = 2 * (emb_dim * emb_dim // n_kv_groups)  # key and value projections
    attention_size = qkv_size + qkv_size_grouped + output_proj_size

    # Feedforward Layer (per layer)
    feedforward_size = (emb_dim * hidden_dim) + (hidden_dim * emb_dim)

    # Layer Normalization (per layer)
    layer_norm_size = 2 * emb_dim

    # Total per-layer size
    layer_size = attention_size + feedforward_size + layer_norm_size

    # Stacked layers
    total_layer_size = layer_size * n_layers

    # Output Projection
    output_projection_size = emb_dim * vocab_size

    # Total Model Size
    total_model_size = embedding_size + total_layer_size + output_projection_size
    
    if return_it:return total_model_size
    print(total_model_size)

calculate_size(LLAMA32_CONFIG)

1347485696


In [2]:
import torch

# Llama 3.2 3B
LLAMA32_CONFIG = {
    "vocab_size": 128_256,      # Vocabulary size
    "context_length": 131_072,  # Context length
    "emb_dim": 3072,            # Embedding dimension
    "n_heads": 24,              # Number of attention heads
    "n_layers": 28,             # Number of layers
    "hidden_dim": 8192,         # Size of the intermediate dimension in FeedForward
    "n_kv_groups": 8,           # Key-Value groups for grouped-query attention
    "rope_base": 500_000.0,     # The base in RoPE's "theta"
    "dtype": torch.bfloat16,    # Lower-precision dtype to reduce memory usage
    "rope_freq": {              # RoPE frequency scaling
        "factor": 32.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_context_length": 8192,
    }
}

calculate_size(LLAMA32_CONFIG)

3320487936


In [9]:

# Llama 3.2 1B
LLAMA32_CONFIG = {
    "vocab_size": 128_256,      # Vocabulary size
    "context_length": 131_072,  # Context length
    "emb_dim": 2048,            # Embedding dimension
    "n_heads": 32,              # Number of attention heads
    "n_layers": 16,             # Number of layers
    "hidden_dim": 8192,         # Size of the intermediate dimension in FeedForward
    "n_kv_groups": 8,           # Key-Value groups for grouped-query attention
    "rope_base": 500_000.0,     # The base in RoPE's "theta"
    "dtype": torch.bfloat16,    # Lower-precision dtype to reduce memory usage
    "rope_freq": {              # RoPE frequency scaling
        "factor": 32.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_context_length": 8192,
    }
}
calculate_size(LLAMA32_CONFIG)

1347485696


### Custom smaller models

In [50]:
def check_config(LLAMA32_CONFIG):
    '''
    emb_dim//n_heads must be even
    emb_dim must be divisible by num_kv_groups
    '''
    print(f'these values should be True')
    print(LLAMA32_CONFIG["emb_dim"] % LLAMA32_CONFIG["n_heads"] == 0)
    print((LLAMA32_CONFIG["emb_dim"] // LLAMA32_CONFIG["n_kv_groups"])%2 == 0)
    




# Llama 3.2 200M
LLAMA32_CONFIG = {
    "vocab_size": 50_000,       # 128_256 reduced vocabulary size
    "context_length": 2048,     # 131_072 reduced Context length (unrelated to model size)
    "emb_dim": 1024,            # 2048 reduced Embedding dimension
    "n_heads": 16,              # 32 reduced Number of attention heads
    "n_layers": 8,             # 16 reduced Number of layers
    "hidden_dim": 4096,         # 8192 Size of the intermediate dimension in FeedForward
    "n_kv_groups": 8,           # 8 Key-Value groups for grouped-query attention
    "rope_base": 500_000.0,     # 500_000 The base in RoPE's "theta"
    "dtype": torch.bfloat16,    # Lower-precision dtype to reduce memory usage
    "rope_freq": {              # RoPE frequency scaling
        "factor": 32.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_context_length": 8192,
    }
}

print(f' model size: {calculate_size(LLAMA32_CONFIG, return_it=True)}')
check_config(LLAMA32_CONFIG)

 model size: 205176832
these values should be True
True
True


### Debug mode

In [6]:
# use this one (not above)?
import torch

'''
emb_dim//num_heads must be even
emb_dim must be divisible by num_kv_groups
'''

# Debug mode
LLAMA32_CONFIG = {
    # d_out = emb_dim
    # Embedding dimension <d_out // num_heads> must be even
    "vocab_size": 128_256,      # Vocabulary size
    "context_length": 10,  # Context length
    # d_in=d_out=emb_dim,
    # d_out must be divisible by num_heads
    "emb_dim": 8,            # Embedding dimension
    # (num_heads must be divisible by num_kv_groups)
    "n_heads": 4,              # Number of attention heads
    "n_layers": 2,             # Number of layers
    "hidden_dim": 16,         # Size of the intermediate dimension in FeedForward
    "n_kv_groups": 2,           # Key-Value groups for grouped-query attention
    "rope_base": 500_000.0,     # The base in RoPE's "theta"
    "dtype": torch.bfloat16,    # Lower-precision dtype to reduce memory usage
    "rope_freq": {              # RoPE frequency scaling
        "factor": 32.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_context_length": 8192,
    }
}
calculate_size(LLAMA32_CONFIG)

2053280


# combining

In [52]:
# Llama 3.2 1B

calculate_size(LLAMA32_CONFIG)

205176832


In [None]:
if args.debug:
    LLAMA_CONFIG = {
        "vocab_size": 50_000,        # Small vocab size for quick embedding testing
        "context_length": 8,      # Very short context length
        "emb_dim": 16,            # Minimal embedding dimension
        "n_heads": 2,             # Minimal number of attention heads
        "n_layers": 2,            # Minimal number of transformer layers
        "hidden_dim": 64,         # Scaled-down feedforward dimension
        "n_kv_groups": 1,         # Simplified attention grouping
        "drop_rate": 0.0,         # Dropout deactivated for deterministic debugging
        "qkv_bias": False         # Simplified attention mechanism
    }

else:
    # Llama 3.2 200M
    LLAMA32_CONFIG = {
        "vocab_size": 50_000,       # 128_256 reduced vocabulary size
        "context_length": 2048,     # 131_072 reduced Context length (unrelated to model size)
        "emb_dim": 1024,            # 2048 reduced Embedding dimension
        "n_heads": 16,              # 32 reduced Number of attention heads
        "n_layers": 8,             # 16 reduced Number of layers
        "hidden_dim": 4096,         # 8192 Size of the intermediate dimension in FeedForward
        "n_kv_groups": 8,           # 8 Key-Value groups for grouped-query attention
        "rope_base": 500_000.0,     # 500_000 The base in RoPE's "theta"
        "dtype": torch.bfloat16,    # Lower-precision dtype to reduce memory usage
        "rope_freq": {              # RoPE frequency scaling
            "factor": 32.0,
            "low_freq_factor": 1.0,
            "high_freq_factor": 4.0,
            "original_context_length": 8192,
        }
    }

calculate_size(LLAMA124M_CONFIG)
