In [1]:
!pip install transformers torch



In [2]:
from transformers import GPT2Model
# Build the GPT2Model from the pretrained 'gpt2' checkpoint; every later cell
# inspects this single object.
CHECKPOINT = 'gpt2'
model = GPT2Model.from_pretrained(CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def count_params(model, is_human: bool = False):
    """Count the trainable parameters of ``model``.

    Only parameters whose ``requires_grad`` flag is truthy contribute to the
    total; each one adds its ``numel()``.

    Args:
        model: Object exposing ``parameters()``, an iterable of items that
            provide ``requires_grad`` and ``numel()``.
        is_human: When True, return the count in millions as a string with
            two decimals and an ``M`` suffix instead of the raw integer.

    Returns:
        The raw count (``int``) by default, or a string such as ``"1.00M"``
        when ``is_human`` is True.
    """
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    if not is_human:
        return total
    return f"{total / 1e6:.2f}M"

# Show the module tree first, then the trainable-parameter total in millions.
print(model)
total_human = count_params(model, is_human=True)
print("Total # of params:", total_human)

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
Total # of params: 124.44M


In [4]:
# Embedding tables: one E-wide row per vocabulary entry (wte) and one per
# position (wpe). The expected counts below include no bias term.
V: int = model.config.vocab_size
E: int = model.config.n_embd
P: int = model.config.n_positions
expected_wte: int = V * E
expected_wpe: int = P * E
# Reach the submodules through their public attribute names rather than the
# private `_modules` registry.
true_wte = count_params(model.wte)
true_wpe = count_params(model.wpe)
print(f"wte | Expected: {expected_wte}")
print(f"wte | True:     {true_wte}")
print(f"wpe | Expected: {expected_wpe}")
print(f"wpe | True:     {true_wpe}")
# Fail loudly on a fresh run if a formula ever drifts from the model.
assert true_wte == expected_wte, "wte parameter count does not match V * E"
assert true_wpe == expected_wpe, "wpe parameter count does not match P * E"

wte | Expected: 38597376
wte | True:     38597376
wpe | Expected: 786432
wpe | True:     786432


In [5]:
# The first LayerNorm of a block is counted as a weight vector plus a bias
# vector, each of length E.
expected_ln_1: int = 2 * E
# Public attribute access instead of the private `_modules` registry.
true_ln_1 = count_params(model.h[0].ln_1)
print(f"ln_1 | Expected: {expected_ln_1}")
print(f"ln_1 | True:     {true_ln_1}")
assert true_ln_1 == expected_ln_1, "ln_1 parameter count does not match 2 * E"

ln_1 | Expected: 1536
ln_1 | True:     1536


In [6]:
# Attention submodule of the first block, reached through public attributes
# instead of the private `_modules` registry.
block0_attn = model.h[0].attn

# c_attn maps E -> 3 * E features and c_proj maps E -> E; each is counted as a
# weight matrix plus one bias per output feature.
expected_c_attn: int = E * (3 * E) + (3 * E)
# Named after its parent so it cannot be confused with the MLP's c_proj,
# which has a different shape.
expected_attn_c_proj: int = E * E + E
# Dropout layers are expected to hold no parameters.
expected_attn_dropout: int = 0
expected_resid_dropout: int = 0
expected_attn: int = (
    expected_c_attn
    + expected_attn_c_proj
    + expected_attn_dropout
    + expected_resid_dropout
)

# (label, expected count, module) rows, printed in the same order as before.
attn_rows = [
    ("c_attn", expected_c_attn, block0_attn.c_attn),
    ("c_proj", expected_attn_c_proj, block0_attn.c_proj),
    ("attn_dropout", expected_attn_dropout, block0_attn.attn_dropout),
    ("resid_dropout", expected_resid_dropout, block0_attn.resid_dropout),
    ("attn", expected_attn, block0_attn),
]
attn_mismatches = []
for label, want, module in attn_rows:
    got = count_params(module)
    print(f"{label} | Expected: {want}")
    print(f"{label} | True:     {got}")
    if got != want:
        attn_mismatches.append(label)
# Checked after the loop so every row is still printed when one is wrong.
assert not attn_mismatches, f"attention parameter-count mismatch in: {attn_mismatches}"

c_attn | Expected: 1771776
c_attn | True:     1771776
c_proj | Expected: 590592
c_proj | True:     590592
attn_dropout | Expected: 0
attn_dropout | True:     0
resid_dropout | Expected: 0
resid_dropout | True:     0
attn | Expected: 2362368
attn | True:     2362368


In [7]:
# The second LayerNorm of a block is counted as a weight vector plus a bias
# vector, each of length E.
expected_ln_2: int = 2 * E
# Public attribute access instead of the private `_modules` registry.
true_ln_2 = count_params(model.h[0].ln_2)
print(f"ln_2 | Expected: {expected_ln_2}")
print(f"ln_2 | True:     {true_ln_2}")
assert true_ln_2 == expected_ln_2, "ln_2 parameter count does not match 2 * E"

ln_2 | Expected: 1536
ln_2 | True:     1536


In [8]:
# MLP submodule of the first block, reached through public attributes instead
# of the private `_modules` registry.
block0_mlp = model.h[0].mlp

# Hidden width of the MLP: use the config's `n_inner` when it is set and fall
# back to 4 * E otherwise (the value this notebook originally hard-coded).
n_inner = getattr(model.config, "n_inner", None)
H: int = n_inner if n_inner is not None else 4 * E

# c_fc maps E -> H and c_proj maps H -> E; each is counted as a weight matrix
# plus one bias per output feature.
expected_c_fc: int = E * H + H
# Named after its parent so it cannot be confused with the attention c_proj,
# which has a different shape.
expected_mlp_c_proj: int = H * E + E
# The activation and dropout layers are expected to hold no parameters.
expected_act: int = 0
expected_dropout: int = 0
expected_mlp: int = expected_c_fc + expected_mlp_c_proj + expected_act + expected_dropout

# (label, expected count, module) rows, printed in the same order as before.
mlp_rows = [
    ("c_fc", expected_c_fc, block0_mlp.c_fc),
    ("c_proj", expected_mlp_c_proj, block0_mlp.c_proj),
    ("act", expected_act, block0_mlp.act),
    ("dropout", expected_dropout, block0_mlp.dropout),
    ("mlp", expected_mlp, block0_mlp),
]
mlp_mismatches = []
for label, want, module in mlp_rows:
    got = count_params(module)
    print(f"{label} | Expected: {want}")
    print(f"{label} | True:     {got}")
    if got != want:
        mlp_mismatches.append(label)
# Checked after the loop so every row is still printed when one is wrong.
assert not mlp_mismatches, f"MLP parameter-count mismatch in: {mlp_mismatches}"

c_fc | Expected: 2362368
c_fc | True:     2362368
c_proj | Expected: 2360064
c_proj | True:     2360064
act | Expected: 0
act | True:     0
dropout | Expected: 0
dropout | True:     0
mlp | Expected: 4722432
mlp | True:     4722432


In [9]:
# The final LayerNorm is counted as a weight vector plus a bias vector, each
# of length E.
expected_ln_f: int = 2 * E
# Public attribute access instead of the private `_modules` registry.
true_ln_f = count_params(model.ln_f)
print(f"ln_f | Expected: {expected_ln_f}")
print(f"ln_f | True:     {true_ln_f}")
assert true_ln_f == expected_ln_f, "ln_f parameter count does not match 2 * E"

ln_f | Expected: 1536
ln_f | True:     1536


In [10]:
L: int = model.config.n_layer
# Closed form for the whole model, assuming the MLP hidden width is 4 * E:
#   embeddings : wte + wpe                              = E * (V + P)
#   per block  : ln_1 + attn + ln_2 + mlp
#              = 2E + (4E^2 + 4E) + 2E + (8E^2 + 5E)    = 12E^2 + 13E
#   final norm : ln_f                                   = 2E
expected_embeddings: int = E * (V + P)
expected_block: int = 12 * E * E + 13 * E
expected_final_norm: int = 2 * E
expected_gpt2: int = expected_embeddings + L * expected_block + expected_final_norm
true_gpt2 = count_params(model)
print(f"gpt2 | Expected: {expected_gpt2}")
print(f"gpt2 | True:     {true_gpt2}")
# Fail loudly on a fresh run if the closed form ever drifts from the model.
assert true_gpt2 == expected_gpt2, "total parameter count does not match the closed form"

gpt2 | Expected: 124439808
gpt2 | True:     124439808
