# Load scaling efficiency data profiled on NVIDIA H100

In [21]:
#  Diffusion–request scheduler MILP (two-level objective)
# import Pkg
# Pkg.add("JSON3")
using JSON3
using JuMP
using Gurobi    # swap in Gurobi, CPLEX, ... if available
using Printf

# --------------------------- constants ----------------------
Δ = 0.8                      # slot length in seconds
GPU_COUNTS = [1, 2, 4, 8]    # admissible p
G = 8                        # total GPUs
β = 2                        # slack: may miss β extra jobs in stage-2
seq_len_to_img_size = Dict(256=>(256, 256), 1024=>(512, 512), 2048=>(512, 1024), 4096=>(1024, 1024), 8192=>(1024, 2048), 16384=>(2048, 2048))
# ---------------------- helper functions --------------------
ceil_slots(t) = ceil(Int, t/Δ)
floor_slots(t) = floor(Int, t/Δ)

profile_file = "e2e_scaling_efficiency_bs1.json"
samples = JSON3.read(open(profile_file, "r"))  # JSON3.Array of Objects

# Initialize the raw timing table: profiling[s][p] = best (min) avg_e2e_time
profiling = Dict{Int, Dict{Int, Float64}}()

for sample in samples
    s = sample["seq_len"]                # e.g. 256, 1024, …
    p = sample["mlp_world_size"]         # e.g. 1, 2, 4, 8
    t = sample["avg_e2e_time"]           # e.g. 0.394

    # only consider shapes we care about
    if !(s in keys(seq_len_to_img_size))
        continue
    end

    # ensure the inner dict exists
    if !haskey(profiling, s)
        profiling[s] = Dict{Int,Float64}()
    end

    # record the minimum observed time for (s,p)
    old = get(profiling[s], p, Inf)
    profiling[s][p] = min(old, t)
end

# ——— 2. Quantise into integer slots ———
w = Dict{Int, Dict{Int, Int}}()   # w[s][p] = runtime in slots
r = Dict{Int, Int}()              # r[s]      = single-GPU slots

for (s, inner) in profiling
    ws = Dict{Int, Int}()
    for (p, avg_sec) in inner
        ws[p] = ceil_slots(avg_sec)   # ⌈avg_sec / Δ⌉
    end
    w[s] = ws
    r[s] = ws[1]
end

# Service Level Objectives(SLO)
# deadlines from problem statement (seconds → slots)
SLO = Dict( 256=>2.5, 1024=>3.0, 2048=>4.0, 4096=>7.5, 8192=>12.0, 16384=>30.0 )
d  = Dict(s => ceil_slots(SLO[s]) for s in keys(SLO))


# ——— 3. Build the η‐table ———
sizes = sort(collect(keys(w)))  # e.g. [256,1024,…]
η = Dict{Int, Dict{Int, Float64}}()
for s in sizes
    inner = Dict{Int,Float64}()
    for p in GPU_COUNTS
        if haskey(w[s], p)
            # effective work per slot is r_s / w_{s,p}
            inner[p] = r[s] / w[s][p]
        end
    end
    η[s] = inner
end

# ——— 4. Print penalties ———
println("Scaling penalties (as a percentage of theoretical perfect scaling)")
for s in sizes
    img_shape = seq_len_to_img_size[s]
    for p in GPU_COUNTS
        if haskey(w[s], p)
            # penalty = 1 - (η[s,p] / p)
            penalty = max(0, 1 - (η[s][p] / p))
            pct = round(100*penalty; digits=1)
            println("image shape $(img_shape), p=$p: penalty = $pct%")
        end
    end
end


Scaling penalties (as a percentage of theoretical perfect scaling)
image shape (256, 256), p=1: penalty = 0.0%
image shape (256, 256), p=2: penalty = 66.7%
image shape (256, 256), p=4: penalty = 83.3%
image shape (256, 256), p=8: penalty = 91.7%
image shape (512, 512), p=1: penalty = 0.0%
image shape (512, 512), p=2: penalty = 58.3%
image shape (512, 512), p=4: penalty = 79.2%
image shape (512, 512), p=8: penalty = 89.6%
image shape (512, 1024), p=1: penalty = 0.0%
image shape (512, 1024), p=2: penalty = 33.3%
image shape (512, 1024), p=4: penalty = 66.7%
image shape (512, 1024), p=8: penalty = 83.3%
image shape (1024, 1024), p=1: penalty = 0.0%
image shape (1024, 1024), p=2: penalty = 20.0%
image shape (1024, 1024), p=4: penalty = 42.9%
image shape (1024, 1024), p=8: penalty = 66.7%
image shape (1024, 2048), p=1: penalty = 0.0%
image shape (1024, 2048), p=2: penalty = 13.2%
image shape (1024, 2048), p=4: penalty = 25.0%
image shape (1024, 2048), p=8: penalty = 48.4%
image shape (2048,

# Load request arrival trace

In [2]:
trace_file = "trace_readable_data_parallel_rate_1.2_cv_0.7_dist_short_seqlen_high.json"
trace_json = JSON3.read(open(trace_file, "r"))

# expand jobs (batch size bs) into individual requests
struct Job
    id::Int
    size::Int
    a::Int       # arrival slot
    d::Int       # deadline slot
    R::Int       # required work
end

jobs = Job[]
job_id = 1
for req in trace_json
    if !haskey(req, "bs")
        continue
    end
    bs   = req["bs"]             # batch size
    size = Int(req["seq_len"])
    a    = floor_slots(req["start_time"])
    for _ = 1:bs
        push!(jobs, Job(job_id, size, a, d[size] + a, r[size]))
        job_id += 1
    end
end
J = 1:length(jobs)
T_max = maximum(j.d for j in jobs)
P     = [p for p in GPU_COUNTS if p ≤ G]

println("Trace has $(length(jobs)) requests.")
println("Max runtime: $T_max slots (=$(@sprintf("%.1f",T_max*Δ)) s)")

Trace has 118 requests.
Max runtime: 61 slots (=48.8 s)


# Build and run the ILP model   

In [15]:

function print_job_completion()
    for j in J
        job = jobs[j]
        s    = job.size
        R    = job.R
        shape = seq_len_to_img_size[s]

        # accumulate effective progress until we hit R
        prog = 0.0
        ct   = nothing
        for t in 0:T_max
            for p in P
                prog += η[s][p] * value(x[j,p,t])
            end
            if prog + 1e-9 >= R
                ct = t
                break
            end
        end

        if ct === nothing
            @printf(" • Job %2d (shape %4dx%-4d) DID NOT COMPLETE (prog=%.2f/%.0f)\n",
                    j, shape[1], shape[2], prog, R)
        else
            @printf(" • Job %2d (shape %4dx%-4d) completed at slot %3d (%.2f s)  deadline=%3d  met=%s\n",
                    j, shape[1], shape[2], ct, ct*Δ, job.d, value(f[j])>0.5 ? "yes" : "no")
        end
    end
end


# helper: latest completion slot among finished jobs
function latest_completion_slot(J, jobs_sub, P, x, η, f)
    latest = 0
    for j in J
        # skip unfinished jobs
        if value(f[j]) < 0.5         
            continue
        end
        job = jobs_sub[j]
        prog = 0.0
        for t in job.a:job.d
            for p in P
                prog += η[job.size][p] * value(x[j,p,t])
            end
            if prog + 1e-9 ≥ job.R
                latest = max(latest, t)
                break
            end
        end
    end
    return latest        # slot index
end


function silent_env()  
    redirect_stdout(devnull) do  
        return Gurobi.Env()  
    end  
end  
const gurobi_env = silent_env()  
optimizer_with_env = () -> Gurobi.Optimizer(gurobi_env)

function build_model(jobs::Vector{Job}, num_jobs::Int=length(jobs))
    # slice off the prefix
    jobs_sub = jobs[1:num_jobs]
    J = 1:length(jobs_sub)
    T_max = maximum(job.d for job in jobs_sub)
    P = [p for p in GPU_COUNTS if p ≤ G]

    model = Model(optimizer_with_env)
    set_silent(model)

    # Vars
    @variable(model, x[j in J, p in P, t in 0:T_max], Bin)  # C1, C2, C4 refer to these
    @variable(model, f[j in J], Bin)                       # one var per job

    # (C1) at most one mode per job‐slot
    @constraint(model, [j in J, t in 0:T_max],
        sum(x[j,p,t] for p in P) ≤ 1)

    # (C2) GPU capacity
    @constraint(model, [t in 0:T_max],
        sum(p * x[j,p,t] for j in J, p in P) ≤ G)

    # (C3) no execution before arrival
    for (j, job) in enumerate(jobs_sub)  # j is index into jobs_sub
        for p in P, t in 0:min(job.a-1, T_max)
            fix(x[j,p,t], 0.0; force=true)
        end
    end

    # (C4) progress requirement
    for j in J
        job = jobs_sub[j]
        @constraint(model,
            sum(η[job.size][p] * x[j, p, t] for p in P, t in job.a:job.d)
          ≥ job.R * f[j])
    end

    return model, x, f, jobs_sub, J, P, T_max
end

using Dates  # for high-resolution timing

function solve_with_timelimit!(model; timelimit_s=60)
    # set Gurobi’s time limit (seconds)
    set_optimizer_attribute(model, "TimeLimit", timelimit_s)

    # record start as Float64 seconds
    t0 = time()                                       # :contentReference[oaicite:0]{index=0}
    optimize!(model)
    return time() - t0                               # Float64, in seconds
end


function run_schedule!(
    model, x, f, jobs_sub, J, P, T_max;
    stage1::Bool=false,
    stage2::Bool=false,
    timelimit_s::Real=60,
    verbose::Bool=true
)

    if stage1
        @objective(model, Max, sum(f[j] for j in J))
        solve_time1 = solve_with_timelimit!(model; timelimit_s=timelimit_s)

        Fstar = round(Int, objective_value(model))
        println("Stage 1: met $Fstar / $(length(jobs_sub))  (solver time $(round(solve_time1; digits=2)) s)")

        # compute makespan in slots & seconds
        ms1_slots = latest_completion_slot(J, jobs_sub, P, x, η, f)
        ms1_s     = ms1_slots * Δ
        println("Stage 1 makespan: $(round(ms1_s; digits=2)) s")

        # throughput = #jobs / makespan
        tp1 = Fstar / ms1_s
        println("Stage 1 throughput: $(round(tp1; digits=2)) req/s")
    end

    if stage2
        @assert Fstar !== nothing "run stage1 first"
        @constraint(model, sum(f[j] for j in J) ≥ Fstar - β)
        # Minimize GPU slots used
        # @objective(model, Min, sum(p * x[j,p,t] for j in J, p in P, t in 0:T_max))

        # Or minimize makespan
        @variable(model, T_fin >= 0, Int)
        # T_fin is the last slot used
        @constraint(model, [j in J, p in P, t in 0:T_max],
            T_fin ≥ t * x[j,p,t])
    
        @objective(model, Min, T_fin)
        
        solve_time2 = solve_with_timelimit!(model; timelimit_s=timelimit_s)
        met2       = round(Int, sum(value(f[j]) for j in J))
        println("Stage 2: met $met2 / $(length(jobs_sub))  (solver time $(round(solve_time2; digits=2)) s)")

        # compute makespan again
        ms2_slots = latest_completion_slot(J, jobs_sub, P, x, η, f)
        ms2_s     = ms2_slots * Δ
        println("Stage 2 makespan: $(round(ms2_s; digits=2)) s")

        # throughput
        tp2 = met2 / ms2_s
        println("Stage 2 throughput: $(round(tp2; digits=2)) req/s")
    end

    if verbose  
        print_job_completion()
end

return nothing
end






run_schedule! (generic function with 1 method)

# Stage 1 optimization  

In [17]:
model, x, f, jobs_sub, J, P, T = build_model(jobs)
run_schedule!(model, x, f, jobs_sub, J, P, T; stage1=true, stage2=false)

Stage 1: met 72 / 118  (solver time 4.52 s)
Stage 1 makespan: 44.8 s
Stage 1 throughput: 1.61 req/s
 • Job  1 (shape  512x512 ) completed at slot   4 (3.20 s)  deadline=  4  met=yes
 • Job  2 (shape  512x512 ) completed at slot   4 (3.20 s)  deadline=  4  met=yes
 • Job  3 (shape  512x512 ) completed at slot   4 (3.20 s)  deadline=  4  met=yes
 • Job  4 (shape  512x512 ) completed at slot   4 (3.20 s)  deadline=  4  met=yes
 • Job  5 (shape 1024x1024) DID NOT COMPLETE (prog=0.00/16)
 • Job  6 (shape 1024x1024) DID NOT COMPLETE (prog=3.89/16)
 • Job  7 (shape  256x256 ) completed at slot   5 (4.00 s)  deadline=  5  met=yes
 • Job  8 (shape  256x256 ) completed at slot   6 (4.80 s)  deadline=  6  met=yes
 • Job  9 (shape  256x256 ) completed at slot   6 (4.80 s)  deadline=  6  met=yes
 • Job 10 (shape  256x256 ) completed at slot   6 (4.80 s)  deadline=  6  met=yes
 • Job 11 (shape  256x256 ) completed at slot   6 (4.80 s)  deadline=  6  met=yes
 • Job 12 (shape 1024x1024) DID NOT COMPLE

# Stage 2 optimization

In [22]:
model, x, f, jobs_sub, J, P, T = build_model(jobs)
run_schedule!(model, x, f, jobs_sub, J, P, T; stage1=true, stage2=true)

Stage 1: met 72 / 118  (solver time 4.92 s)
Stage 1 makespan: 44.8 s
Stage 1 throughput: 1.61 req/s
Stage 2: met 70 / 118  (solver time 34.71 s)
Stage 2 makespan: 43.2 s
Stage 2 throughput: 1.62 req/s
 • Job  1 (shape  512x512 ) completed at slot   4 (3.20 s)  deadline=  4  met=yes
 • Job  2 (shape  512x512 ) completed at slot   4 (3.20 s)  deadline=  4  met=yes
 • Job  3 (shape  512x512 ) completed at slot   4 (3.20 s)  deadline=  4  met=yes
 • Job  4 (shape  512x512 ) completed at slot   4 (3.20 s)  deadline=  4  met=yes
 • Job  5 (shape 1024x1024) DID NOT COMPLETE (prog=0.00/16)
 • Job  6 (shape 1024x1024) DID NOT COMPLETE (prog=3.29/16)
 • Job  7 (shape  256x256 ) completed at slot   5 (4.00 s)  deadline=  5  met=yes
 • Job  8 (shape  256x256 ) completed at slot   6 (4.80 s)  deadline=  6  met=yes
 • Job  9 (shape  256x256 ) completed at slot   6 (4.80 s)  deadline=  6  met=yes
 • Job 10 (shape  256x256 ) completed at slot   6 (4.80 s)  deadline=  6  met=yes
 • Job 11 (shape  256x2

# Show the runtime tradeoff with varying slot size

In [6]:
using CSV, DataFrames

results = DataFrame(slot_size=Float64[], solve_time=Float64[])

for Δ in 0.2:0.2:1.0
    println("------------------------------------------------")
    println("Slot size Δ = $Δ")

    model, x, f, jobs_sub, J, P, T = build_model(jobs)

    @objective(model, Max, sum(f[j] for j in J))

    t = solve_with_timelimit!(model; timelimit_s=60)

    push!(results, (Δ, t))
    println(" Solve time = $(round(t; digits=2)) s")
end

CSV.write("timings.csv", results)



------------------------------------------------
Slot size Δ = 0.2
 Solve time = 4.53 s
------------------------------------------------
Slot size Δ = 0.4
 Solve time = 4.63 s
------------------------------------------------
Slot size Δ = 0.6
 Solve time = 5.06 s
------------------------------------------------
Slot size Δ = 0.8
 Solve time = 4.93 s
------------------------------------------------
Slot size Δ = 1.0
 Solve time = 4.89 s


"timings.csv"

# Visualize the request placement for 8 requests

In [7]:
# after
model, x, f, jobs_sub, J, P, T_max = build_model(jobs, 8)
run_schedule!(model, x, f, jobs_sub, J, P, T_max; stage1=true, stage2=false)

rows = NamedTuple[]   # will hold (job,gpu,start_s,dur_s)

for t in 0:T_max
    # collect (j, p_jt) for all jobs active at slot t
    active = []
    for j in J
        # find p such that x[j,p,t] = 1
        pj = findfirst(p->value(x[j,p,t]) > 0, P)
        if pj !== nothing
            push!(active, (j, pj))
        end
    end

    # sort by job ID (optional, just to be deterministic)
    sort!(active, by = first)

    # maintain a free‐GPU queue
    freeGPUs = collect(0:G-1)

    for (j, pj) in active
        # take the first pj GPUs
        assigned = freeGPUs[1:pj]
        freeGPUs = freeGPUs[(pj+1):end]
        # record each 0.2s slice
        for g in assigned
            push!(rows, (job=j-1, gpu=g, start_s=t*Δ, dur_s=Δ))
        end
    end
end

# write out
CSV.write("schedule8.csv", rows)



Stage 1: met 8 / 8  (solver time 0.01 s)
Stage 1 makespan: 8.0 s
Stage 1 throughput: 1.0 req/s
 • Job  1 (shape  512x512 ) completed at slot   4 (3.20 s)  deadline=  4  met=yes
 • Job  2 (shape  512x512 ) completed at slot   4 (3.20 s)  deadline=  4  met=yes
 • Job  3 (shape  512x512 ) completed at slot   4 (3.20 s)  deadline=  4  met=yes
 • Job  4 (shape  512x512 ) completed at slot   4 (3.20 s)  deadline=  4  met=yes
 • Job  5 (shape 1024x1024) completed at slot  10 (8.00 s)  deadline= 10  met=yes
 • Job  6 (shape 1024x1024) completed at slot  10 (8.00 s)  deadline= 10  met=yes
 • Job  7 (shape  256x256 ) completed at slot   5 (4.00 s)  deadline=  5  met=yes
 • Job  8 (shape  256x256 ) completed at slot   6 (4.80 s)  deadline=  6  met=yes


"schedule8.csv"