In [1]:
# Model configs
# Hyper-parameters of the four GPT-2 sizes. All of them share the same
# vocabulary size and context window, and use a feed-forward width of
# 4 * d_model.
GPT2_small = dict(
    num_layers=12,
    d_model=768,
    num_heads=12,
    d_ff=3072,  # 4 * 768
    vocab_size=50257,
    context_length=1024,
)

GPT2_medium = dict(
    num_layers=24,
    d_model=1024,
    num_heads=16,
    d_ff=4096,  # 4 * 1024
    vocab_size=50257,
    context_length=1024,
)

GPT2_large = dict(
    num_layers=36,
    d_model=1280,
    num_heads=20,
    d_ff=5120,  # 4 * 1280
    vocab_size=50257,
    context_length=1024,
)

GPT2_XL = dict(
    num_layers=48,
    d_model=1600,
    num_heads=25,
    d_ff=6400,  # 4 * 1600
    vocab_size=50257,
    context_length=1024,
)

# Lookup table: model name -> config dict (the values are the objects above,
# not copies).
model_configs = dict(
    gpt2_small=GPT2_small,
    gpt2_medium=GPT2_medium,
    gpt2_large=GPT2_large,
    gpt2_xl=GPT2_XL,
)

## 3. Transformer LM resource accounting

In [15]:
# GPT2-XL config
# Module-level hyper-parameters used by the accounting cells below.
vocab_size, context_length = 50257, 1024
num_layers, d_model, num_heads = 48, 1600, 25
d_ff = 4 * d_model  # 6400

### a) memory accounting

In [None]:
# Calculate manually
# Count the trainable parameters of the model by hand from the global
# hyper-parameters (vocab_size, d_model, num_heads, d_ff, num_layers).

# Each RMSNorm contributes one weight per model dimension.
n_rms_norm = d_model

# Token embedding
n_token_embedding = vocab_size * d_model

# MHA
# Q/K/V map d_model -> num_heads * (d_model // num_heads); the output
# projection maps back to d_model.
n_wq = d_model * num_heads * (d_model // num_heads)
n_wk = d_model * num_heads * (d_model // num_heads)
n_wv = d_model * num_heads * (d_model // num_heads)
n_wo = num_heads * (d_model // num_heads) * d_model
# The fourth term must be the output projection n_wo (not n_wq a second
# time). Both have the same size, so the printed total is unaffected.
n_mha = n_wq + n_wk + n_wv + n_wo

# FFN
# Three weight matrices between d_model and d_ff.
n_w1 = d_model * d_ff
n_w3 = d_model * d_ff
n_w2 = d_ff * d_model
n_ffn = n_w1 + n_w2 + n_w3

# trf_block
# Two RMSNorms plus attention plus feed-forward per block.
n_trf_block = n_rms_norm * 2 + n_mha + n_ffn

# Output
n_out_proj = d_model * vocab_size

# Embedding + all blocks + one extra RMSNorm + output projection.
# The embedding and the output projection are counted separately.
total = n_token_embedding + n_trf_block * num_layers + n_rms_norm + n_out_proj
print(f"{total:,}")

2,127,057,600


In [None]:
# Calculate with functions
from cs336_basics.assignment_utils import get_model_parameter
from cs336_basics.model import *

# Build the model from the GPT2-XL globals defined earlier in the notebook.
# All arguments are passed positionally, so their order must match the
# TransformerLM constructor, whose signature is not visible here.
lm = TransformerLM(
    vocab_size,
    d_model,
    num_heads,
    d_ff,
    context_length,
    10000,  # NOTE(review): presumably the RoPE theta / base frequency — confirm against TransformerLM
    num_layers
)

# Cross-check of the manual parameter count using the project helper.
# NOTE(review): `num_paraters` looks like a typo for `num_parameters`; it is
# left unchanged here because other cells may reference the name.
num_paraters = get_model_parameter(lm)
print(f"{num_paraters:,}")

In [None]:
# Calculate memory
from cs336_basics.assignment_utils import get_model_size

# Report the memory footprint of the model built above. Judging by the
# recorded output, the helper prints parameter, parameter-plus-gradient and
# buffer counts together with their sizes in GB; its implementation is not
# visible here.
get_model_size(lm)

Number of parameters: 2,127,057,600.
Size of parameters: 7.92 GB.
Number of parameters in training (with gradients): 4,254,115,200.
Size of parameters in training (with gradients): 15.85 GB.
Number of buffers: 6,291,456.
Size of buffers: 0.02 GB.


### b) Compute FLOPs

In [4]:
# MHA
# Forward-pass matmul cost of one attention layer for a single sequence of
# `context_length` tokens; each term has the form 2 * m * n * p.
FLOPs = {
    # Linear Projection (one of Q / K / V; multiplied by 3 later on)
    "mha_linear": 2 * context_length * d_model * d_model,
    # Compute attention: scores, then the weighted sum of values
    "mha_attn_weight": 2 * context_length * context_length * d_model,
    "mha_context_vector": 2 * context_length * context_length * d_model,
    # Output projection
    "mha_output_linear": 2 * context_length * d_model * d_model,
}

In [5]:
# FFN
# The three feed-forward matmuls all cost 2 * context_length * d_model * d_ff.
# Keys are added in the order linear1, linear3, linear2.
FLOPs.update({
    "ffn_linear1": 2 * context_length * d_model * d_ff,
    "ffn_linear3": 2 * context_length * d_model * d_ff,
    "ffn_linear2": 2 * context_length * d_ff * d_model,
})

In [6]:
# Transformer_block
# Cost of one block: three Q/K/V projections, the two attention matmuls, the
# output projection, and three equally sized feed-forward matmuls.
qkv = 3
ffn_linears = 3
FLOPs["transformer_block"] = sum((
    qkv * FLOPs["mha_linear"],
    FLOPs["mha_attn_weight"],
    FLOPs["mha_context_vector"],
    FLOPs["mha_output_linear"],
    ffn_linears * FLOPs["ffn_linear1"],
))

In [7]:
# Final linear projection
# Projection from d_model to vocab_size for every position in the sequence.
FLOPs["final_linear"] = 2 * context_length * vocab_size * d_model

In [8]:
# Total FLOPs in the model
# All transformer blocks plus the final projection, shown both exactly and in
# scientific notation.
total_FLOPs = FLOPs["final_linear"] + FLOPs["transformer_block"] * num_layers
print(
    f"total FLOPs: {total_FLOPs:,}",
    f"approximate total FLOPs: {total_FLOPs:.2e}",
    sep="\n",
)

total FLOPs: 4,513,336,524,800
approximate total FLOPs: 4.51e+12


In [13]:
# from thop import profile

# def get_model_FLOPs(model: nn.Module, token_ids: Int[Tensor, "... seq_len"]):
#     macs, params = profile(model, (token_ids,))
#     flops = 2 * macs

#     print(f"Total FLOPs of the model and input: {flops:,}")
#     print(f"Approximated total FLOPs of the model and input: {flops:.2e}")

# token_ids = torch.randint(low=0, high=vocab_size, size=(1, context_length,))
# get_model_FLOPs(lm, token_ids)

### c) most FLOPs

In [None]:
# Rank every entry of FLOPs from cheapest to most expensive; ties keep their
# insertion order. The aggregate "transformer_block" entry is ranked too.
FLOPs_sorted = list(FLOPs.items())
FLOPs_sorted.sort(key=lambda entry: entry[1])
FLOPs_sorted

[('mha_attn_weight', 3355443200),
 ('mha_context_vector', 3355443200),
 ('mha_linear', 5242880000),
 ('mha_output_linear', 5242880000),
 ('ffn_linear1', 20971520000),
 ('ffn_linear3', 20971520000),
 ('ffn_linear2', 20971520000),
 ('transformer_block', 90596966400),
 ('final_linear', 164682137600)]

### d) FLOPs proportion varying with model size

In [1]:
def compute_flops_proportion(model_config: dict, context_length: int):
    '''
    Print how the forward-pass matmul FLOPs split across model components.

    Breakdown component: ((MHA, FFN) -> transformer_block) * num_layers -> trf_blocks, final linear

    Every matmul is counted as 2 * m * n * p FLOPs. The MHA, FFN and
    transformer_block percentages describe a SINGLE layer relative to the
    whole model; only the "all trf_blocks" line is scaled by num_layers.

    Args:
        model_config: dict providing "d_model", "d_ff", "num_layers" and
            "vocab_size".
        context_length: sequence length used for the estimate.

    Returns:
        None. Five percentage lines are written to stdout.
    '''
    d_model = model_config["d_model"]
    d_ff = model_config["d_ff"]

    # MHA: four d_model x d_model projections (Q, K, V, output) and two
    # matmuls that are quadratic in the sequence length.
    projection = 2 * context_length * d_model * d_model
    attention = 2 * context_length * context_length * d_model
    flops_mha = 4 * projection + 2 * attention

    # FFN: three matmuls of identical cost.
    flops_ffn = 3 * (2 * context_length * d_model * d_ff)

    # Transformer block(s)
    flops_block = flops_mha + flops_ffn
    flops_all_blocks = model_config["num_layers"] * flops_block

    # Final linear projection
    flops_final_linear = 2 * context_length * d_model * model_config["vocab_size"]

    total_flops = flops_all_blocks + flops_final_linear

    report = (
        ("MHA", flops_mha),
        ("FFN", flops_ffn),
        ("transformer_block", flops_block),
        ("all trf_blocks", flops_all_blocks),
        ("all final linear projection", flops_final_linear),
    )
    for label, flops in report:
        print(f"FLOPs proportion of {label} is {(flops / total_flops) * 100:.2f}%")

def compute_models_flops(model_configs: dict, context_length: int = 1024):
    '''
    Print the FLOPs breakdown of every model in ``model_configs``.

    Args:
        model_configs: mapping of model name -> config dict accepted by
            compute_flops_proportion.
        context_length: sequence length used for every model. Defaults to
            1024, the value that used to be hard-coded here, so existing
            single-argument calls behave exactly as before.

    Returns:
        None. A header line per model followed by that model's breakdown is
        written to stdout.
    '''
    for model_size, cfg in model_configs.items():
        print(f"---------------{model_size}--------------")
        compute_flops_proportion(cfg, context_length)

In [None]:
# Print the per-component FLOPs shares for every GPT-2 size registered in
# model_configs, all at a 1,024-token context.
compute_models_flops(model_configs)

---------------gpt2_small--------------
FLOPs proportion of MHA is 2.30%
FLOPs proportion of FFN is 4.15%
FLOPs proportion of transformer_block is 6.45%
FLOPs proportion of all trf_blocks is 77.39%
FLOPs proportion of all final linear projection is 22.61%
---------------gpt2_medium--------------
FLOPs proportion of MHA is 1.25%
FLOPs proportion of FFN is 2.49%
FLOPs proportion of transformer_block is 3.74%
FLOPs proportion of all trf_blocks is 89.80%
FLOPs proportion of all final linear projection is 10.20%
---------------gpt2_large--------------
FLOPs proportion of MHA is 0.83%
FLOPs proportion of FFN is 1.78%
FLOPs proportion of transformer_block is 2.62%
FLOPs proportion of all trf_blocks is 94.16%
FLOPs proportion of all final linear projection is 5.84%
---------------gpt2_xl--------------
FLOPs proportion of MHA is 0.61%
FLOPs proportion of FFN is 1.39%
FLOPs proportion of transformer_block is 2.01%
FLOPs proportion of all trf_blocks is 96.35%
FLOPs proportion of all final linear projection is 3.65%

### e) Increase context_length for FLOPs

In [25]:
# Compare the GPT2-XL breakdown at a 1,024-token context with a 16x longer
# 16,384-token context. The two attention matmuls grow with
# context_length ** 2 while every other term grows linearly, so the MHA share
# rises and the FFN / final-projection shares fall as the context grows.
compute_flops_proportion(GPT2_XL, context_length=1024)
print("--------------------------")
compute_flops_proportion(GPT2_XL, context_length=16384)

FLOPs proportion of MHA is 0.61%
FLOPs proportion of FFN is 1.39%
FLOPs proportion of transformer_block is 2.01%
FLOPs proportion of all trf_blocks is 96.35%
FLOPs proportion of all final linear projection is 3.65%
--------------------------
FLOPs proportion of MHA is 1.37%
FLOPs proportion of FFN is 0.67%
FLOPs proportion of transformer_block is 2.05%
FLOPs proportion of all trf_blocks is 98.24%
FLOPs proportion of all final linear projection is 1.76%


## 4. Training resource accounting

In [2]:
from cs336_basics.model import *

In [None]:
# Development aid: re-execute cs336_basics.assignment_utils so edits made to
# the module on disk are picked up without restarting the kernel. Names that
# were bound earlier with `from ... import ...` are not refreshed by the
# reload and must be imported again afterwards.
import importlib
import cs336_basics.assignment_utils
importlib.reload(cs336_basics.assignment_utils)

<module 'cs336_basics.utils' from 'e:\\LLM\\CS336\\assignment1-basics\\cs336_basics\\utils.py'>

### a) Peak memory

In [None]:
from cs336_basics.assignment_utils import calculate_model_peak_memory

# Estimate the peak training memory for GPT2-XL.
# NOTE(review): the positional arguments look like (config, optimizer name,
# batch size, parameter dtype) — confirm against the helper's signature.
# NOTE(review): `torch` is not imported explicitly in the cells visible here;
# it is presumably exposed by `from cs336_basics.model import *` — confirm.
calculate_model_peak_memory(GPT2_XL, "adamw", 4, torch.float32)


Size of parameters is 7.92 GB
Size of activations is 57.83 GB
Size of grads is 7.92 GB
Size of optimizer state is 15.85 GB
Estimated peak memory of the model is 89.53 GB
14.458137512207031
31.69562816619873


### b) Estimate max batch_size

In [None]:
from cs336_basics.assignment_utils import calculate_max_batch

# Memory budget, in GB, handed to the helper.
memory_size_gb = 80
# NOTE(review): presumably returns the largest batch size whose estimated
# peak memory fits within the budget — confirm against the helper.
max_batch_size = calculate_max_batch(
    GPT2_XL,
    memory_size_gb
)
print(max_batch_size)

Size of parameters is 7.92 GB
Size of activations is 14.46 GB
Size of grads is 7.92 GB
Size of optimizer state is 15.85 GB
Estimated peak memory of the model is 46.15 GB
3


### c) FLOPs running optimizer

In [None]:
from cs336_basics.assignment_utils import calculate_optimizer_flops_step

# FLOPs spent by a single optimizer update for GPT2-XL, printed in scientific
# notation. The helper's per-parameter cost model is not visible here.
opt_flops_per_step = calculate_optimizer_flops_step(GPT2_XL)
print(f"FLOPs of running one step of the AdamW optimizer is {opt_flops_per_step:.2e}")

FLOPs of running one step of the AdamW optimizer is 2.98e+10


### d) Training time estimation

In [None]:
from cs336_basics.assignment_utils import estimate_train_days

# Estimate the wall-clock training time for GPT2-XL.
# NOTE(review): `hardware_flops_tera` is presumably the device's peak
# throughput in TFLOP/s and `mfu` the fraction of that peak actually
# achieved — confirm the units against the helper.
estimate_train_days(
    GPT2_XL,
    hardware_flops_tera=19.5,
    mfu=0.5,
    train_steps=400000,
    batch_size=1024
)

Estimated training days is 6583.57055222792 days.
