## Exploring Llama2 Architecture
Layers, parameters, input and output sizes.

In [1]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import accelerate
#import bitsandbytes  # Works with CUDA
import numpy as np
from tqdm import tqdm
import pandas as pd
import time

# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU.
# To run on a Mac, use the MPS variant instead:
# device = torch.device("mps") if torch.backends.mps.is_built() else torch.device("cpu")
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [2]:
# Get model and tokenizer
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Llama 2 is a gated repository, so an access token is needed to download it.
# Read it from the environment; when HF_API_KEY is unset this yields None, which lets
# the Hub client fall back to the token cached by `huggingface-cli login` instead of
# failing here with a bare KeyError.
access_token = os.environ.get("HF_API_KEY")

# Quantization: https://huggingface.co/docs/transformers/v4.33.2/en/main_classes/quantization
# 4-bit NF4 settings, defined for reference only: the config is intentionally NOT passed
# to from_pretrained below, so the model is loaded unquantized.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Without using bnb_config; device_map="auto" leaves weight placement to accelerate.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    token=access_token,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    token=access_token,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



## Configuration / Architecture
The model is loaded without quantization, because quantization can change the reported number of (trainable) parameters.

In [3]:
# Summarise the loaded model: parameter counts first, then the main config hyperparameters,
# and finally the full module tree.
parameter_counts = (
    model.num_parameters(),
    model.num_parameters(exclude_embeddings=True),
    model.num_parameters(only_trainable=True),
)
print("Number of parameters (all, excluding embeddings, trainable): ({}, {}, {})".format(*parameter_counts))

# (label, config attribute) pairs, printed in this order.
config_fields = [
    ("Number of attention heads: ", "num_attention_heads"),
    ("Number of hidden layers: ", "num_hidden_layers"),
    ("Number of key_value heads (Grouped Query Attention): ", "num_key_value_heads"),
    ("Hidden size: ", "hidden_size"),
    ("Hidden activation: ", "hidden_act"),
    ("Intermediate size: ", "intermediate_size"),
    ("Maximum sequence length (context): ", "max_position_embeddings"),
    ("Vocabulary size: ", "vocab_size"),
]
for label, attribute in config_fields:
    print(label, getattr(model.config, attribute))

print("\nModel:\n", model)

Number of parameters (all, excluding embeddings, trainable): (6738415616, 6607343616, 6738415616)
Number of attention heads:  32
Number of hidden layers:  32
Number of key_value heads (Grouped Query Attention):  32
Hidden size:  4096
Hidden activation:  silu
Intermediate size:  11008
Maximum sequence length (context):  4096
Vocabulary size:  32000

Model:
 LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bi

## Parameters layer by layer
1. Input and output sizes per layer.
2. Number of parameters per layer (shape of weights).

In [4]:
# Collect every module in the hierarchy (the top-level model and each nested submodule),
# then print each one's repr followed by a separator line.
layers = list(model.modules())

separator = "-----------------------------------------------"
for layer in layers:
    print(layer, separator, sep="\n")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_

In [5]:
# List every named parameter of the model together with the shape of its tensor.
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.shape)

model.embed_tokens.weight torch.Size([32000, 4096])
model.layers.0.self_attn.q_proj.weight torch.Size([4096, 4096])
model.layers.0.self_attn.k_proj.weight torch.Size([4096, 4096])
model.layers.0.self_attn.v_proj.weight torch.Size([4096, 4096])
model.layers.0.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.0.mlp.gate_proj.weight torch.Size([11008, 4096])
model.layers.0.mlp.up_proj.weight torch.Size([11008, 4096])
model.layers.0.mlp.down_proj.weight torch.Size([4096, 11008])
model.layers.0.input_layernorm.weight torch.Size([4096])
model.layers.0.post_attention_layernorm.weight torch.Size([4096])
model.layers.1.self_attn.q_proj.weight torch.Size([4096, 4096])
model.layers.1.self_attn.k_proj.weight torch.Size([4096, 4096])
model.layers.1.self_attn.v_proj.weight torch.Size([4096, 4096])
model.layers.1.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.1.mlp.gate_proj.weight torch.Size([11008, 4096])
model.layers.1.mlp.up_proj.weight torch.Size([11008, 4096])
model.l

## Initializing model from different configurations
Explore how changing the model's configuration (e.g. `hidden_size`, `num_hidden_layers`) affects the number of weights in each layer.

In [6]:
from transformers import LlamaModel, LlamaConfig

# Build a deliberately small Llama-style configuration (a single decoder layer and
# hidden_size=2048) so the effect of each setting on the weight shapes is easy to see.
# Constraint: hidden_size must be divisible by num_attention_heads.
configuration = LlamaConfig(
    vocab_size=32000,
    hidden_size=2048,
    intermediate_size=11008,
    num_hidden_layers=1,
    num_attention_heads=32,
)

# Instantiate a model from that configuration.
# NOTE: this rebinds `model`, so the previously loaded checkpoint is no longer
# reachable through that name.
model = LlamaModel(configuration)

# Look at the architecture: parameter counts, then config hyperparameters, then the module tree.
parameter_counts = (
    model.num_parameters(),
    model.num_parameters(exclude_embeddings=True),
    model.num_parameters(only_trainable=True),
)
print("Number of parameters (all, excluding embeddings, trainable): ({}, {}, {})".format(*parameter_counts))

# (label, config attribute) pairs, printed in this order.
config_fields = [
    ("Number of attention heads: ", "num_attention_heads"),
    ("Number of hidden layers: ", "num_hidden_layers"),
    ("Number of key_value heads (Grouped Query Attention): ", "num_key_value_heads"),
    ("Hidden size: ", "hidden_size"),
    ("Hidden activation: ", "hidden_act"),
    ("Intermediate size: ", "intermediate_size"),
    ("Maximum sequence length (context): ", "max_position_embeddings"),
    ("Vocabulary size: ", "vocab_size"),
]
for label, attribute in config_fields:
    print(label, getattr(model.config, attribute))

print("\nModel:\n", model)

Number of parameters (all, excluding embeddings, trainable): (149952512, 84416512, 149952512)
Number of attention heads:  32
Number of hidden layers:  1
Number of key_value heads (Grouped Query Attention):  32
Hidden size:  2048
Hidden activation:  silu
Intermediate size:  11008
Maximum sequence length (context):  2048
Vocabulary size:  32000

Model:
 LlamaModel(
  (embed_tokens): Embedding(32000, 2048)
  (layers): ModuleList(
    (0): LlamaDecoderLayer(
      (self_attn): LlamaAttention(
        (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)
        (up_proj): Linear(in_features=2048, out_features

In [7]:
# List every named parameter of the current `model` together with the shape of its tensor.
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.shape)

embed_tokens.weight torch.Size([32000, 2048])
layers.0.self_attn.q_proj.weight torch.Size([2048, 2048])
layers.0.self_attn.k_proj.weight torch.Size([2048, 2048])
layers.0.self_attn.v_proj.weight torch.Size([2048, 2048])
layers.0.self_attn.o_proj.weight torch.Size([2048, 2048])
layers.0.mlp.gate_proj.weight torch.Size([11008, 2048])
layers.0.mlp.up_proj.weight torch.Size([11008, 2048])
layers.0.mlp.down_proj.weight torch.Size([2048, 11008])
layers.0.input_layernorm.weight torch.Size([2048])
layers.0.post_attention_layernorm.weight torch.Size([2048])
norm.weight torch.Size([2048])
