# Real Model Scaling

## 405B dense model

1. 405B, $2^{14}$ GPUs, TP8, PP16, DP128, seq $2^{13}$, 50 days
2. 8x 405B, $2^{18}$ GPUs, 200 days, TP16, CP2, PP64, DP128, seq $2^{12}$
3. 8x 405B, $2^{18}$ GPUs, 200 days, TP32, PP32, DP256, seq $2^{12}$
4. 64x 405B, $2^{23}$ GPUs, 400 days, TP32, CP16, PP256, DP64, seq $2^{12}$
5. 64x 405B, $2^{23}$ GPUs, 400 days, TP64, CP8, PP128, DP128, seq $2^{12}$



# 3.2T

In [8]:
from pipeline import *
from simulator import *
from copy import deepcopy
import threading
from copy import deepcopy
from concurrent.futures import ThreadPoolExecutor, as_completed
# Sweep inter-DC latency over 5 ms and 50 ms
# and inter-DC bandwidth over 2^[0,2,4,6,8,10,12] GB/s

# Baseline two-datacenter configuration used by the sweep below.
# tp/cp/pp/dp match scenario 2 of the list at the top of this document
# (TP16, CP2, PP64, DP128, seq 2^12).
_GIB = 1024**3  # bytes per GiB, used for memory and bandwidth figures

sim = SimConfig(
    # Model and batch shape.
    model_cfg=CUSTOM_SIZE_TO_CONFIG[3200],
    seq_len=1 << 12,
    mbs=1,
    # Parallelism degrees.
    tp=16,
    pp=64,
    dp=128,
    cp=2,
    # Pipeline schedule.
    num_mb_per_pp_stage=2,
    num_chunks=2,
    # Per-GPU resources: about 96 GB of memory, 350e12 average FLOP/s.
    gpu_mem_bytes=96 * _GIB,
    gpu_avg_perf_flops=350 * 10**12,
    # Datacenter topology; the two DC_comm_* values are placeholders that
    # run_simulation overrides for every sweep point.
    num_DC=2,
    intra_DC_bandwidth=200 * _GIB,
    DC_comm_latency=0.01,
    DC_comm_bandwidth=32 * _GIB,
)

# result_unit_throughput, result_mem = defaultdict(dict), defaultdict(dict)
# for latency in [5, 50]:
#     for bandwidth in [2**0, 2**2, 2**4, 2**6, 2**8, 2**10, 2**12]:
#         new_sim = deepcopy(sim)
#         sim.DC_comm_latency = latency / 1000
#         sim.DC_comm_bandwidth = bandwidth
#         simulate = Simulator(sim)
#         utp, mem = simulate.benchmark_cross_DC_PP()
#         result_unit_throughput[(latency, bandwidth)] = utp
#         result_mem[(latency,bandwidth)] = mem


def run_simulation(latency, bandwidth, sim):
    """Simulate one (latency, bandwidth) point of the cross-DC sweep.

    The base config is deep-copied so the caller's ``sim`` is never mutated
    and concurrent runs do not share state.

    Args:
        latency: Inter-DC latency in milliseconds (stored as ``latency / 1000``).
        bandwidth: Inter-DC bandwidth in GB/s (stored as ``bandwidth * 1024**3``).
        sim: Base ``SimConfig`` to copy and override.

    Returns:
        ``(latency, bandwidth, utp, mem)`` where ``utp`` and ``mem`` are the
        two values returned by ``Simulator.benchmark_cross_DC_PP()``. The
        inputs are echoed back so the caller can key its results.
    """
    point_cfg = deepcopy(sim)
    point_cfg.DC_comm_latency = latency / 1000
    point_cfg.DC_comm_bandwidth = bandwidth * 1024**3

    unit_throughput, peak_mem = Simulator(point_cfg).benchmark_cross_DC_PP()
    return latency, bandwidth, unit_throughput, peak_mem

def parallel_simulation(
    sim,
    latencies=(5, 50),
    bandwidths=(2**0, 2**2, 2**4, 2**6, 2**8, 2**10, 2**12),
):
    """Run ``run_simulation`` over a latency x bandwidth grid on a thread pool.

    Args:
        sim: Base ``SimConfig``; each grid point works on its own deep copy
            (made inside ``run_simulation``).
        latencies: Inter-DC latencies to sweep, in milliseconds.
        bandwidths: Inter-DC bandwidths to sweep, in GB/s.
            Both defaults reproduce the original hard-coded grid.

    Returns:
        ``(result_unit_throughput, result_mem)``: two dicts keyed by
        ``(latency, bandwidth)``. Keys are inserted in grid order (latency
        outer, bandwidth inner), so displaying the dicts is deterministic.

    Raises:
        Any exception raised by a simulation is re-raised here by
        ``Future.result()``.
    """
    # Materialise once so a one-shot iterable can be reused for every latency.
    bandwidths = tuple(bandwidths)

    result_unit_throughput = {}
    result_mem = {}

    # NOTE(review): threads only run simulations in parallel if the simulator
    # releases the GIL; confirm before relying on max_workers for speed-up.
    with ThreadPoolExecutor(max_workers=64) as executor:
        futures = [
            executor.submit(run_simulation, latency, bandwidth, sim)
            for latency in latencies
            for bandwidth in bandwidths
        ]

        # Collect in submission order rather than completion order so the
        # result dicts have a stable key order from run to run.
        for future in futures:
            latency, bandwidth, utp, mem = future.result()
            result_unit_throughput[(latency, bandwidth)] = utp
            result_mem[(latency, bandwidth)] = mem

    return result_unit_throughput, result_mem

# Run the latency x bandwidth sweep on the 2-DC base config; both results are
# dicts keyed by (latency, bandwidth).
result_unit_throughput, result_mem = parallel_simulation(sim)


In [None]:
# Display the throughput results of the sweep, keyed by (latency, bandwidth).
result_unit_throughput

In [None]:
from collections import defaultdict
from matplotlib import pyplot as plt
# Plot the 2-DC sweep: one figure per inter-DC latency, with throughput on the
# left axis and peak memory on the right, one curve per key of the per-point
# result (looked up as `pp`).
keys = list(result_unit_throughput.keys())
pps = result_unit_throughput[keys[0]].keys()

# Bandwidth grid of the sweep, in GB/s: 2^0, 2^2, ..., 2^12.
bw_values = [2**exp for exp in range(0, 13, 2)]

for latency in [5, 50]:
    # Gather one curve per pp across the bandwidth grid.
    data_utp = defaultdict(list)
    data_mem = defaultdict(list)
    for pp in pps:
        data_utp[pp] = [result_unit_throughput[(latency, bw)][pp] for bw in bw_values]
        data_mem[pp] = [result_mem[(latency, bw)][pp] for bw in bw_values]

    fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    utp_ax, mem_ax = ax

    for pp in pps:
        utp_ax.plot(bw_values, data_utp[pp], label=f'{pp}')
        # Memory is reported in bytes; convert to GB for the plot.
        mem_in_gb = [num_bytes / 1024**3 for num_bytes in data_mem[pp]]
        mem_ax.plot(bw_values, mem_in_gb, label=f'{pp}')

    # Reference line at the 96 GB per-GPU memory budget.
    mem_ax.axhline(y=96, color='r', linestyle='--', label='GH200 Memory')

    # Throughput panel: log-log axes.
    utp_ax.set_xlabel('Inter-DC Bandwidth (GB/s)')
    utp_ax.set_ylabel('Tokens per GPU per Second')
    utp_ax.set_xscale('log', base=2)
    utp_ax.set_yscale('log', base=2)

    # Memory panel: log x-axis only, y stays linear.
    mem_ax.set_xlabel('Inter-DC Bandwidth (GB/s)')
    mem_ax.set_ylabel('Peak Memory (GB)')
    mem_ax.set_xscale('log', base=2)

    fig.set_tight_layout(True)
    utp_ax.legend()
    mem_ax.legend()
    fig.suptitle(f'3.2T dense model, 2DC, Inter-DC Latency: {latency}ms')
    plt.show()
   
    

In [14]:

from pipeline import *
from simulator import *
from copy import deepcopy
import threading
from copy import deepcopy
import math
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
from matplotlib import pyplot as plt
# single DC
def test_bench_1DC(bandwidths=(2**7, 2**8, 2**9, 2**10, 2**11, 2**12)):
    """Sweep intra-DC bandwidth for the single-datacenter configuration.

    Builds the ``CUSTOM_SIZE_TO_CONFIG[3200]`` config with ``num_DC=1`` and
    simulates it once per bandwidth on a thread pool.

    Args:
        bandwidths: Intra-DC bandwidths to simulate, in GB/s (each is scaled
            by ``1024**3`` before being stored on the config). The default
            reproduces the original hard-coded 2^7..2^12 grid.

    Returns:
        ``(result_unit_throughput, result_mem)``: two dicts keyed by
        bandwidth, holding the two values returned by
        ``Simulator.benchmark_cross_DC_PP()``. Keys are inserted in the order
        of ``bandwidths``, so the dicts display deterministically.

    Raises:
        Any exception raised by a simulation is re-raised here by
        ``Future.result()``.
    """
    sim = SimConfig(
        model_cfg=CUSTOM_SIZE_TO_CONFIG[3200],
        seq_len=2**12,
        mbs=1,
        tp=16,
        pp=64,
        dp=128,
        cp=2,
        num_mb_per_pp_stage=2,
        num_chunks=2,
        gpu_mem_bytes=96 * 1024**3,  # around 96GB
        gpu_avg_perf_flops=350 * 10**12,
        num_DC=1,
        # The three values below are placeholders; run_simulation overrides
        # them for every sweep point.
        intra_DC_bandwidth=200 * 1024**3,
        DC_comm_latency=0.01,
        DC_comm_bandwidth=32 * 1024**3,
    )

    def run_simulation(bandwidth, sim):
        """Simulate one intra-DC bandwidth on a private copy of ``sim``."""
        new_sim = deepcopy(sim)
        # Make the cross-DC link free (zero latency, infinite bandwidth) so
        # only the swept intra-DC bandwidth varies between points.
        new_sim.DC_comm_latency = 0
        new_sim.DC_comm_bandwidth = math.inf
        new_sim.intra_DC_bandwidth = bandwidth * 1024**3
        utp, mem = Simulator(new_sim).benchmark_cross_DC_PP()
        return bandwidth, utp, mem

    result_unit_throughput = {}
    result_mem = {}

    # NOTE(review): threads only run simulations in parallel if the simulator
    # releases the GIL; confirm before relying on max_workers for speed-up.
    with ThreadPoolExecutor(max_workers=64) as executor:
        futures = [
            executor.submit(run_simulation, bandwidth, sim)
            for bandwidth in bandwidths
        ]

        # Collect in submission order rather than completion order so the
        # result dicts have a stable key order from run to run.
        for future in futures:
            bandwidth, utp, mem = future.result()
            result_unit_throughput[bandwidth] = utp
            result_mem[bandwidth] = mem

    return result_unit_throughput, result_mem


# Run the single-DC sweep; out_1DC is the (throughput, memory) pair of dicts
# keyed by intra-DC bandwidth.
out_1DC = test_bench_1DC()

In [None]:
# Display the throughput results of the single-DC sweep, keyed by bandwidth.
out_1DC[0]

In [None]:
def plot_1DC(result_unit_throughput, result_mem):
    """Plot throughput and peak memory against intra-DC bandwidth.

    Draws a two-panel figure: throughput (log-log) on the left and peak
    memory in GB (log x-axis) on the right, with one curve per key of the
    per-bandwidth results and a reference line at 96 GB.

    Args:
        result_unit_throughput: Dict mapping bandwidth (GB/s) to a mapping of
            per-curve throughput values.
        result_mem: Dict with the same keys, mapping to per-curve peak memory
            in bytes.

    Raises:
        ValueError: If ``result_unit_throughput`` is empty.
    """
    if not result_unit_throughput:
        raise ValueError("result_unit_throughput is empty; nothing to plot")

    # Take the x-axis from the results rather than a hard-coded grid, so the
    # plot keeps working when the sweep is run with different bandwidths.
    bandwidths = sorted(result_unit_throughput)
    # Curve labels come from the first stored sweep point.
    pps = list(next(iter(result_unit_throughput.values())))

    # plot subfigure
    fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    for pp in pps:
        utp_curve = [result_unit_throughput[bw][pp] for bw in bandwidths]
        # Memory is reported in bytes; convert to GB for the plot.
        mem_curve_gb = [result_mem[bw][pp] / 1024**3 for bw in bandwidths]
        ax[0].plot(bandwidths, utp_curve, label=f'{pp}')
        ax[1].plot(bandwidths, mem_curve_gb, label=f'{pp}')

    # Reference line at the 96 GB per-GPU memory budget.
    ax[1].axhline(y=96, color='r', linestyle='--', label='GH200 Memory')
    ax[0].set_xlabel('Intra-DC Bandwidth (GB/s)')
    ax[0].set_ylabel('Tokens per GPU per Second')
    ax[0].set_xscale('log', base=2)
    ax[0].set_yscale('log', base=2)
    ax[1].set_xlabel('Intra-DC Bandwidth (GB/s)')
    ax[1].set_ylabel('Peak Memory (GB)')
    ax[1].set_xscale('log', base=2)  # y-axis intentionally left linear

    ax[0].legend()
    ax[1].legend()
    fig.suptitle('3.2T dense model, 1DC')
    plt.show()

# Plot the single-DC sweep; out_1DC unpacks to (throughput, memory).
plot_1DC(*out_1DC)