## Strong scaling
The MILP solver times out after 200 s.


## Weak scaling


In [None]:
from auto_schedule import Simulator, AutoScheduleStore, AutoScheduleResult, get_best_BFSPP_schedule
import matplotlib.pyplot as plt

def is_milp_optimal(sim: AutoScheduleResult):
    """Tell whether a schedule result reports ``lp_status == 1``.

    Args:
        sim: A stored schedule result, or a falsy value (e.g. ``None``) when
            no result is available.

    Returns:
        ``True`` only when ``sim`` is truthy and its ``lp_status`` is set and
        equal to 1; ``False`` otherwise.
    """
    # NOTE(review): status 1 presumably is the solver's "optimal" code — confirm.
    return bool(sim and sim.lp_status is not None and sim.lp_status == 1)

def plot_weak_scaling_runtime(n_DC, num_chunk):
    """Plot weak-scaling throughput and normalized runtime of stored schedules.

    Layer count and pipeline-parallel degree grow together (32/2, 64/4,
    128/8), so every configuration keeps 16 layers per pipeline stage. For
    each configuration the best BFS-PP schedule is evaluated and the schedule
    result is looked up in ``AutoScheduleStore`` with
    ``compute_if_not_exist=False`` (nothing is solved here).

    The figure has two panels:
      * left: tokens/sec/GPU for the optimal bound, the stored schedule and
        the BFS baseline; a red "x" marks stored results for which
        ``is_milp_optimal`` is false;
      * right: stored runtime divided by the optimal runtime.

    Configurations with ``pp % n_DC != 0`` or ``num_layers % (pp * 2) != 0``
    are skipped for the stored schedule and show up as gaps.

    Args:
        n_DC: Number of datacenters, forwarded as ``num_DC``.
        num_chunk: Number of chunks; 1 reads the unidirectional ("ud") store,
            any other value reads the wave store.
    """
    seq_len = 8192
    tp = 8
    num_mb_per_pp_stage = 4
    dp = 64
    store = AutoScheduleStore("405_ud_store.pkl", "405_wave_store.pkl")
    n_layer_l = [32, 64, 128]
    n_pp_l = [2, 4, 8]

    # Settings shared by the BFS baseline and the simulator; hoisted so the
    # two calls cannot drift apart.
    shared_kwargs = dict(
        llama_model_size=405,
        seq_len=seq_len,
        mbs=1,
        tp=tp,
        dp=dp,
        gpu_mem_bytes=88 * 1024**3,  # 88 GiB, roughly 94.5 GB
        gpu_avg_perf_flops=350 * 10**12,
        num_DC=n_DC,
        DC_comm_latency=0.01,
        DC_comm_bandwidth=32 * 10**9,
    )

    runtime = []
    optimal_runtime = []
    tokens_per_iter_per_gpu = []
    bfs_tokens_per_sec_per_gpu = []
    is_milp_optimal_l = []

    for num_layers, pp in zip(n_layer_l, n_pp_l):
        bfs_sim = get_best_BFSPP_schedule(
            pp=pp, num_layers=num_layers, **shared_kwargs
        )
        bfs_tokens_per_sec_per_gpu.append(
            bfs_sim.get_tokens_per_second_per_device()
        )

        # Unsupported stage/DC or layer/stage split: keep list positions
        # aligned with n_layer_l by recording placeholders.
        if pp % n_DC != 0 or num_layers % (pp * 2) != 0:
            runtime.append(None)
            optimal_runtime.append(None)
            tokens_per_iter_per_gpu.append(None)
            is_milp_optimal_l.append(None)
            continue

        sim = Simulator(
            pp=pp,
            num_layers=num_layers,
            num_mb_per_pp_stage=num_mb_per_pp_stage,
            num_chunks=num_chunk,
            **shared_kwargs,
        )
        tokens_per_iter_per_gpu.append(
            sim.get_tokens_per_iteration() / sim.pp / sim.dp / sim.tp
        )

        if num_chunk == 1:
            lookup = store.get_ud_schedule_result
        else:
            lookup = store.get_wave_schedule_result
        sim_res = lookup(sim.get_system_config(), compute_if_not_exist=False)

        if not sim_res or sim_res.objective_value is None:
            # No stored result (or no objective value) for this configuration.
            runtime.append(None)
            optimal_runtime.append(sim.get_optimal_runtime())
            is_milp_optimal_l.append(False)
        else:
            runtime.append(sim_res.objective_value / sim.time_scale_factor)
            optimal_runtime.append(sim.get_optimal_runtime())
            is_milp_optimal_l.append(is_milp_optimal(sim_res))

    normalized_runtime = [
        rt / opt if rt is not None else None
        for rt, opt in zip(runtime, optimal_runtime)
    ]
    ones = [1] * len(n_layer_l)
    tok_per_sec_per_gpu = [
        tok / rt if rt is not None else None
        for tok, rt in zip(tokens_per_iter_per_gpu, runtime)
    ]
    opt_tok_per_sec_per_gpu = [
        tok / opt if opt is not None else None
        for tok, opt in zip(tokens_per_iter_per_gpu, optimal_runtime)
    ]

    schedule = "wave" if num_chunk > 1 else "ud"
    # Legend text follows the schedule actually plotted instead of always
    # claiming "Unidirectional".
    schedule_label = "Wave" if num_chunk > 1 else "Unidirectional"

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes[0].plot(n_layer_l, opt_tok_per_sec_per_gpu, label="Optimal", marker='o')
    axes[0].plot(n_layer_l, tok_per_sec_per_gpu, label=schedule_label, marker='^')
    axes[0].plot(n_layer_l, bfs_tokens_per_sec_per_gpu, label="BFS", marker='s')
    for layer, tok, optimal in zip(n_layer_l, tok_per_sec_per_gpu, is_milp_optimal_l):
        # Only flag points that exist; skipped or missing entries have no
        # throughput value to mark.
        if tok is not None and not optimal:
            axes[0].scatter(layer, tok, color='r', marker='x', s=100)

    axes[1].plot(n_layer_l, ones, label="Optimal", marker='o')
    axes[1].plot(n_layer_l, normalized_runtime, label=schedule_label, marker='^')

    axes[0].set_xticks(n_layer_l)
    axes[1].set_xticks(n_layer_l)
    axes[0].set_xlabel("Number of layers")
    axes[0].set_ylabel("Tokens/sec/GPU")
    axes[1].set_xlabel("Number of layers")
    # runtime / optimal_runtime is a ratio, so the axis carries no unit.
    axes[1].set_ylabel("Normalized Runtime")
    fig.suptitle(f'Weak scaling: l/pp = 16, {schedule} schedule, {n_DC} DCs')
    axes[0].legend()
    axes[1].legend()
    plt.show()

# Render the weak-scaling figures: single-chunk runs first, then two-chunk
# runs, each for 2 and 4 datacenters.
for _n_dc, _n_chunk in ((2, 1), (4, 1), (2, 2), (4, 2)):
    plot_weak_scaling_runtime(_n_dc, _n_chunk)
        


## Bandwidth Scaling

In [None]:


def plot_bw_scaling_runtime(n_DC, num_chunk):
    """Plot throughput and normalized runtime against inter-DC bandwidth.

    The model/parallelism setup is fixed (128 layers, pp=4, tp=8, dp=64) and
    the inter-datacenter bandwidth is swept over 1..128 (multiplied by 1e9
    before being passed as ``DC_comm_bandwidth``). For each bandwidth the best
    BFS-PP schedule is evaluated and the schedule result is looked up in
    ``AutoScheduleStore`` with ``compute_if_not_exist=False`` (nothing is
    solved here).

    The figure has two panels, both with a base-2 logarithmic x axis:
      * left: tokens/sec/GPU for the optimal bound, the stored schedule and
        the BFS baseline; a red "x" marks stored results for which
        ``is_milp_optimal`` is false;
      * right: stored runtime divided by the optimal runtime.

    If ``pp % n_DC != 0`` or ``num_layers % (pp * 2) != 0`` the stored
    schedule is skipped for every bandwidth and only the BFS curve is drawn.

    Args:
        n_DC: Number of datacenters, forwarded as ``num_DC``.
        num_chunk: Number of chunks; 1 reads the unidirectional store, any
            other value reads the wave store.
    """
    seq_len = 8192
    tp = 8
    num_layers = 128
    num_mb_per_pp_stage = 4
    dp = 64
    pp = 4
    store = AutoScheduleStore("405_ud_store.pkl", "405_wave_store.pkl")
    DC_bw_l = [1, 2, 4, 8, 16, 32, 64, 128]

    # Settings shared by the BFS baseline and the simulator; only the
    # bandwidth changes between iterations.
    shared_kwargs = dict(
        llama_model_size=405,
        seq_len=seq_len,
        mbs=1,
        tp=tp,
        pp=pp,
        dp=dp,
        gpu_mem_bytes=88 * 1024**3,  # 88 GiB, roughly 94.5 GB
        gpu_avg_perf_flops=350 * 10**12,
        num_DC=n_DC,
        DC_comm_latency=0.01,
        num_layers=num_layers,
    )
    # The validity check does not depend on bandwidth, so evaluate it once.
    config_supported = pp % n_DC == 0 and num_layers % (pp * 2) == 0

    runtime = []
    optimal_runtime = []
    tokens_per_iter_per_gpu = []
    bfs_tokens_per_sec_per_gpu = []
    is_milp_optimal_l = []

    for DC_bw in DC_bw_l:
        DC_comm_bandwidth = DC_bw * 10**9

        bfs_sim = get_best_BFSPP_schedule(
            DC_comm_bandwidth=DC_comm_bandwidth, **shared_kwargs
        )
        bfs_tokens_per_sec_per_gpu.append(
            bfs_sim.get_tokens_per_second_per_device()
        )

        # Keep list positions aligned with DC_bw_l by recording placeholders.
        if not config_supported:
            runtime.append(None)
            optimal_runtime.append(None)
            tokens_per_iter_per_gpu.append(None)
            is_milp_optimal_l.append(None)
            continue

        sim = Simulator(
            num_mb_per_pp_stage=num_mb_per_pp_stage,
            num_chunks=num_chunk,
            DC_comm_bandwidth=DC_comm_bandwidth,
            **shared_kwargs,
        )
        tokens_per_iter_per_gpu.append(
            sim.get_tokens_per_iteration() / sim.pp / sim.dp / sim.tp
        )

        if num_chunk == 1:
            lookup = store.get_ud_schedule_result
        else:
            lookup = store.get_wave_schedule_result
        sim_res = lookup(sim.get_system_config(), compute_if_not_exist=False)

        if not sim_res or sim_res.objective_value is None:
            # No stored result (or no objective value) for this bandwidth.
            runtime.append(None)
            optimal_runtime.append(sim.get_optimal_runtime())
            is_milp_optimal_l.append(False)
        else:
            runtime.append(sim_res.objective_value / sim.time_scale_factor)
            optimal_runtime.append(sim.get_optimal_runtime())
            is_milp_optimal_l.append(is_milp_optimal(sim_res))

    normalized_runtime = [
        rt / opt if rt is not None else None
        for rt, opt in zip(runtime, optimal_runtime)
    ]
    ones = [1] * len(DC_bw_l)
    tok_per_sec_per_gpu = [
        tok / rt if rt is not None else None
        for tok, rt in zip(tokens_per_iter_per_gpu, runtime)
    ]
    opt_tok_per_sec_per_gpu = [
        tok / opt if opt is not None else None
        for tok, opt in zip(tokens_per_iter_per_gpu, optimal_runtime)
    ]

    schedule = "Wave" if num_chunk > 1 else "Unidirectional"
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes[0].plot(DC_bw_l, opt_tok_per_sec_per_gpu, label="Optimal", marker='o')
    axes[0].plot(DC_bw_l, tok_per_sec_per_gpu, label=schedule, marker='^')
    axes[0].plot(DC_bw_l, bfs_tokens_per_sec_per_gpu, label="BFS", marker='s')
    for bw, tok, optimal in zip(DC_bw_l, tok_per_sec_per_gpu, is_milp_optimal_l):
        # Only flag points that exist; skipped or missing entries have no
        # throughput value to mark.
        if tok is not None and not optimal:
            axes[0].scatter(bw, tok, color='r', marker='x', s=100)

    axes[1].plot(DC_bw_l, ones, label="Optimal", marker='o')
    # Use the same schedule name as the left panel instead of a fixed
    # "Unidirectional" label.
    axes[1].plot(DC_bw_l, normalized_runtime, label=schedule, marker='^')

    axes[0].set_xticks(DC_bw_l)
    axes[0].set_xlabel("Inter DC Bandwidth (GB/s)")
    axes[0].set_xscale('log', base=2)
    axes[1].set_xticks(DC_bw_l)
    axes[1].set_xlabel("Inter DC Bandwidth (GB/s)")
    axes[1].set_xscale('log', base=2)

    axes[0].set_ylabel("Tokens/sec/GPU")
    # runtime / optimal_runtime is a ratio, so the axis carries no unit.
    axes[1].set_ylabel("Normalized Runtime")
    fig.suptitle(f'Bandwidth scaling: {schedule} schedule, {n_DC} DCs')
    axes[0].legend()
    axes[1].legend()
    plt.show()

# Render the bandwidth-scaling figures: single-chunk runs first, then
# two-chunk runs, each for 2 and 4 datacenters.
for _n_dc, _n_chunk in ((2, 1), (4, 1), (2, 2), (4, 2)):
    plot_bw_scaling_runtime(_n_dc, _n_chunk)