In [40]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from functools import reduce

# Training Time

In [3]:
# Load the epoch-time log. The file is parsed without a header row, so the
# column labels are supplied explicitly.
epoch_time = pd.read_csv(
    '../../training_logs/epoch_time.csv',
    header=None,
    names=[
        'recorded_at',
        'job_id',
        'global_rank',
        'local_rank',
        'model_job_id',
        'epoch',
        'time',
    ],
)
# Preview the first five rows.
epoch_time.head(n=5)

Unnamed: 0,recorded_at,job_id,global_rank,local_rank,model_job_id,epoch,time
0,2025-05-18 13:49:43,single-v5vktkkcbg046c,0,0,single-v5vktkkcbg046c,0,140.659864
1,2025-05-18 13:52:11,single-v5vktkkcbg046c,0,0,single-v5vktkkcbg046c,1,141.699433
2,2025-05-18 13:54:38,single-v5vktkkcbg046c,0,0,single-v5vktkkcbg046c,2,141.926468
3,2025-05-18 13:57:06,single-v5vktkkcbg046c,0,0,single-v5vktkkcbg046c,3,142.066725
4,2025-05-18 13:59:34,single-v5vktkkcbg046c,0,0,single-v5vktkkcbg046c,4,141.929731


In [4]:
# Job ids whose mean `time` is compared below, grouped by id prefix.
jobs = [
    # single-*
    'single-v5vktkkcbg046c',
    'single-hmxvdzqqz1x9cd',
    'single-gg6r7plwk2b4vd',
    # ddp-*
    'ddp-mdgx1trqq0h4gd',
    'ddp-j0ldqsn0pdpwfc',
    'ddp-mxgcxhdj9zmkmd',
    'ddp-zjmd7l6nggc6rd',
    'ddp-nr0klkplw1m0f',
    'ddp-hp7bv729zlxnwc',
    # pp-*
    'pp-kw02jgxb3h7mwd',
    'pp-c3xpvdw643bsnd',
    'pp-pzg56qpv9dsjwd',
    'pp-qpznchjf4m6pxc',
    'pp-pxlrnqx606chrd',
    'pp-ssbwpk61xtdchc',
    'pp-ghp6hmmmz9x2zc',
    # ddpnpp-*
    'ddpnpp-dv2spt5t37x7ld',
    'ddpnpp-btq0f9zfqks6jc',
]

# Mean of `time` per job, restricted to the jobs listed above.
(
    epoch_time[epoch_time['job_id'].isin(jobs)]
    .groupby('job_id')[['time']]
    .mean()
)

Unnamed: 0_level_0,time
job_id,Unnamed: 1_level_1
ddp-hp7bv729zlxnwc,90.648669
ddp-j0ldqsn0pdpwfc,61.143508
ddp-mdgx1trqq0h4gd,74.566353
ddp-mxgcxhdj9zmkmd,97.14029
ddp-nr0klkplw1m0f,95.943574
ddp-zjmd7l6nggc6rd,102.201446
ddpnpp-btq0f9zfqks6jc,135.909647
ddpnpp-dv2spt5t37x7ld,142.115394
pp-c3xpvdw643bsnd,196.381244
pp-ghp6hmmmz9x2zc,115.502256


In [36]:
# Job ids of the runs compared in the metric summary, one list per strategy.
single_jobs = [
    "single-pxp7blkzmh72vd",
    "single-qpbm7wzb6ksttd",
    "single-gzw720trx2914c",
    "single-m2hd9x94sp1vf",
    "single-q7qw9n3pgkp5c",
    "single-jk9rtlhp0x6hhd",
    "single-zgk7ff93h7jwcd",
    "single-l47hdqp31rfq2c",
    "single-tjfx6bgwpm2x2c",
    "single-tp5p690xczdp6",
]
pp_jobs = [
    "pp-c3243sdg7gvxgd",
    "pp-bklxvlzbffjq4c",
    "pp-m7spzd56200jzc",
    "pp-mzch31r9fbgwtd",
    "pp-dd39w57gtmkc9",
    "pp-p32rg1r2qwf2k",
    "pp-k7r7c9535vs5pc",
    "pp-wrjmzq7w6d6h2c",
    "pp-psz9ssvx17vw7",
    "pp-ntk9p2jp27176c",
]
ddp_jobs = [
    "ddp-hk1jnplmhdhggd",
    "ddp-zw2kd0jbfzsq1",
    "ddp-z5fw246lk66bhc",
    "ddp-h4177jppc4kxnc",
    "ddp-qcf7563f7l0s",
    "ddp-rq2p99vvstb2bd",
    "ddp-b554snl19tvwz",
    "ddp-f7qx13pkb0ggwc",
    "ddp-wh16cmzkg9b5fc",
    "ddp-fx1dp5fjpr0z1c",
]
ddpnpp_jobs = [
    "ddpnpp-jr6hh5gl13mcw",
    "ddpnpp-j934625txq00ld",
    "ddpnpp-lsx95jv5r29hr",
    "ddpnpp-rqlgsqrrkzvm3",
    "ddpnpp-g5gzb6nww66ltd",
    "ddpnpp-r6cd7mdgsfgsd",
    "ddpnpp-cb1wrjt45jj0td",
    "ddpnpp-vrcbwq4pg4dqj",
    "ddpnpp-n7hm93bdfqmddd",
    "ddpnpp-lw4rrd7g71cvkd",
]

# Strategy label -> list of job ids run with that strategy.
jobs_by_strategy = {
    'single': single_jobs,
    'pp': pp_jobs,
    'ddp': ddp_jobs,
    'ddpnpp': ddpnpp_jobs,
}

# Metric names; each one is also the stem of a per-job CSV file.
metrics = [
    'loss',
    'train_accuracy',
    'val_loss',
    'val_accuracy',
    'weighted_f1',
]

In [41]:
# Epoch index whose rows are averaged.
# NOTE(review): presumably the last epoch of every run -- confirm.
FINAL_EPOCH = 29

# Leading columns of every per-job metric CSV; the metric value is the last column.
LOG_COLUMNS = ['recorded_at', 'job_id', 'global_rank', 'local_rank', 'model_job_id', 'epoch']


def load_epoch_metric(job_id, metric, epoch=FINAL_EPOCH):
    """Return the rows one job logged for one metric at a given epoch.

    Parameters
    ----------
    job_id : str
        Directory name under ``../../training_logs/by_job_id``.
    metric : str
        Stem of the CSV file to read; also used as the value column's name.
    epoch : int, default FINAL_EPOCH
        Only rows whose ``epoch`` column equals this value are kept.

    Returns
    -------
    pandas.DataFrame
        The matching rows (possibly none), with the original row index.
    """
    log = pd.read_csv(
        f"../../training_logs/by_job_id/{job_id}/{metric}.csv",
        names=[*LOG_COLUMNS, metric],
    )
    return log.loc[log['epoch'] == epoch]


# metric name -> frame with one row per strategy holding that metric's mean.
# A plain dict is used so that a mistyped key raises KeyError instead of
# silently producing an empty DataFrame.
df_by_metric = {}

for metric in metrics:
    # The loop variables are deliberately not called `jobs`, so the
    # notebook-level `jobs` list is not overwritten when this cell runs.
    merged_df = pd.concat(
        [
            load_epoch_metric(job_id, metric).assign(strategy=strategy)
            for strategy, strategy_jobs in jobs_by_strategy.items()
            for job_id in strategy_jobs
        ],
        ignore_index=True,
        axis=0,
    )
    # Jobs without any row at FINAL_EPOCH simply do not contribute to the mean.
    df_by_metric[metric] = merged_df.groupby('strategy')[metric].mean().reset_index()

# Join the per-metric means into one table: one row per strategy, one column
# per metric. The inner join keeps only strategies present for every metric.
all_metric_df = reduce(
    lambda left, right: pd.merge(left, right, on='strategy', how='inner'),
    df_by_metric.values(),
)
all_metric_df.head()


Unnamed: 0,strategy,loss,train_accuracy,val_loss,val_accuracy,weighted_f1
0,ddp,0.066394,0.977992,1.318804,0.77582,0.780778
1,ddpnpp,0.066784,0.977583,1.12866,0.777222,0.784455
2,pp,0.074408,0.975792,1.391337,0.737222,0.749175
3,single,0.05708,0.980194,1.449469,0.773806,0.781318


In [43]:
# Job ids whose communication-time rows are analysed below.
jobs = [
    "communicationtime-kk5nb4q5905j3c",
    "communicationtime-n9g5vfqvqxd2t",
    "communicationtime-j6lt2sghb1pq6",
    "communicationtime-b5tr929qvf6scc",
    "communicationtime-b40x143zw00c7c",
    "communicationtime-m9h951n1lnrj4c",
]

# Read the log (parsed without a header row) and keep only the rows that
# belong to the jobs listed above; the original row index is preserved.
communication_time = pd.read_csv(
    "../../training_logs/communication_time.csv",
    header=None,
    names=['job_id', 'iteration', 'elapsed_time'],
).loc[lambda frame: frame['job_id'].isin(jobs)]
communication_time.shape

(6000, 3)

In [44]:
# Preview the first five filtered rows.
communication_time.head(n=5)

Unnamed: 0,job_id,iteration,elapsed_time
1000,communicationtime-kk5nb4q5905j3c,0,297.702393
1001,communicationtime-kk5nb4q5905j3c,1,0.544768
1002,communicationtime-kk5nb4q5905j3c,2,0.5376
1003,communicationtime-kk5nb4q5905j3c,3,0.534528
1004,communicationtime-kk5nb4q5905j3c,4,0.530432


In [50]:
# Show the rows recorded at iteration 0 for every selected job.
communication_time.loc[communication_time['iteration'] == 0]

Unnamed: 0,job_id,iteration,elapsed_time
1000,communicationtime-kk5nb4q5905j3c,0,297.702393
2000,communicationtime-b5tr929qvf6scc,0,473.579529
5000,communicationtime-n9g5vfqvqxd2t,0,353.307648
6000,communicationtime-j6lt2sghb1pq6,0,204.596313
7000,communicationtime-b40x143zw00c7c,0,472.99054
8000,communicationtime-m9h951n1lnrj4c,0,292.983185


In [None]:
# Mean elapsed_time per job, leaving out iteration 0 (initialization).
(
    communication_time
    .loc[lambda frame: frame['iteration'] != 0]
    .groupby('job_id')['elapsed_time']
    .mean()
    .reset_index()
)

Unnamed: 0,job_id,elapsed_time
0,communicationtime-b40x143zw00c7c,18.907679
1,communicationtime-b5tr929qvf6scc,18.917247
2,communicationtime-j6lt2sghb1pq6,0.396791
3,communicationtime-kk5nb4q5905j3c,0.502281
4,communicationtime-m9h951n1lnrj4c,18.150475
5,communicationtime-n9g5vfqvqxd2t,0.774237
