# Extraction of DGEMM raw data and computation of regression coefficients

First, we download the HDF5 file (warning: this is a large file, about
12 GB).

In [1]:
!test -f /tmp/data.db || wget https://gitlab.in2p3.fr/cornebize/g5k_data_non_regression/-/raw/master/data.db? -O /tmp/data.db
!du -sh /tmp/data.db

--2020-06-30 10:46:50--  https://gitlab.in2p3.fr/cornebize/g5k_data_non_regression/-/raw/master/data.db?
Résolution de gitlab.in2p3.fr (gitlab.in2p3.fr)… 134.158.69.41
Connexion à gitlab.in2p3.fr (gitlab.in2p3.fr)|134.158.69.41|:443… connecté.
requête HTTP transmise, en attente de la réponse… 200 OK
Taille : 12542731120 (12G) [application/octet-stream]
Sauvegarde en : « /tmp/data.db »


2020-06-30 10:52:37 (34,5 MB/s) — « /tmp/data.db » sauvegardé [12542731120/12542731120]

12G	/tmp/data.db


According to the [changelog](https://gitlab.in2p3.fr/cornebize/g5k_data_non_regression/-/blob/master/exp_changelog.org):
- the cooling issue started on `2019-09-01`
- we changed the protocol on `2019-10-18`
- the cooling issue was fixed on `2019-11-27`

So, we will use the measurements made between these last two dates, i.e. with the new protocol and while the cooling issue was still present.

In [2]:
import pandas
import datetime

import cashew
# Print the cashew release number and git revision in use, so that the results
# below can be traced back to an exact version of the library.
print(cashew.__version__)
print(cashew.__git_version__)
from cashew import linear_regression as lr

0.0.0
f6e1abe82ebba1eec668652189985c631c10b5b5


In [3]:
def to_epoch(date_s, tz=None):
    """Convert a ``YYYY-MM-DD`` date string to an integer Unix timestamp.

    The timestamp corresponds to midnight (00:00:00) at the start of the
    given day.

    Parameters
    ----------
    date_s : str
        Date formatted as ``%Y-%m-%d``, e.g. ``"2019-10-18"``.
    tz : datetime.tzinfo, optional
        Timezone in which midnight of ``date_s`` is taken. When ``None``
        (the default) the date is interpreted in the local timezone of the
        machine running the code, so the result differs between machines
        configured with different timezones. Pass an explicit timezone
        (e.g. ``datetime.timezone.utc``) to get a machine-independent value.

    Returns
    -------
    int
        Seconds since the Unix epoch.

    Raises
    ------
    ValueError
        If ``date_s`` does not match the ``%Y-%m-%d`` format.
    """
    midnight = datetime.datetime.strptime(date_s, '%Y-%m-%d')
    if tz is not None:
        # Attach the requested timezone; a naive datetime would otherwise be
        # interpreted as local time by timestamp().
        midnight = midnight.replace(tzinfo=tz)
    return int(midnight.timestamp())

# Selection criteria handed to pandas through the `where` argument: rows of the
# dahu cluster whose start_time lies strictly between the protocol change
# (2019-10-18) and the cooling fix (2019-11-27).
# NOTE(review): `dahu` is not quoted in the last expression — presumably pandas
# falls back to treating an unresolved name as a string value; confirm, and
# avoid defining a Python variable called `dahu` before this cell.
conditions = [f'start_time > {to_epoch("2019-10-18")}',
              f'start_time < {to_epoch("2019-11-27")}',
              'cluster == dahu']
# Echo the generated expressions so the exact epoch bounds are visible.
print(conditions)
# Load the selected rows from the HDF5 file downloaded earlier.
df = pandas.read_hdf('/tmp/data.db', where=conditions)
# Number of selected rows, followed by a preview of the first ones.
print(len(df))
df.head()

['start_time > 1571349600', 'start_time < 1574809200', 'cluster == dahu']
12818400


Unnamed: 0,function,m,n,k,timestamp,duration,core,node,cluster,jobid,cpu,start_time,index,expfile_hash
0,dgemm,1591,5117,819,216.854232,0.50794,0,1,dahu,1889380,0,1571386167,0,8897548660748450ec25f28ed7edc8e8278e0f54119439...
1,dgemm,2873,3609,964,217.362197,0.740749,0,1,dahu,1889380,0,1571386167,1,8897548660748450ec25f28ed7edc8e8278e0f54119439...
2,dgemm,1263,2995,1321,218.102957,0.38351,0,1,dahu,1889380,0,1571386167,2,8897548660748450ec25f28ed7edc8e8278e0f54119439...
3,dgemm,3755,1896,1358,218.486476,0.729757,0,1,dahu,1889380,0,1571386167,3,8897548660748450ec25f28ed7edc8e8278e0f54119439...
4,dgemm,7954,2317,127,219.216242,0.178241,0,1,dahu,1889380,0,1571386167,4,8897548660748450ec25f28ed7edc8e8278e0f54119439...


In [4]:
def compute_lin_reg(df):
    """Fit ``duration`` against ``mnk`` and attach summary fields to the result.

    The input frame is copied first, so the caller's data is left untouched.
    The regression result returned by ``lr.compute_full_reg`` is extended with
    two entries:

    - ``avg_gflops``: ``2 * sum(mnk) / sum(duration) * 1e-9``
    - ``function``: the value returned by ``lr.get_unique`` for the
      ``function`` column

    NOTE(review): ``lr.compute_variable_products(..., 'mnk')`` presumably adds
    an ``mnk`` column holding the product m*n*k — confirm in cashew.
    """
    work = df.copy()
    lr.compute_variable_products(work, 'mnk')
    result = lr.compute_full_reg(work, 'duration', ['mnk'])
    # A matrix product counts 2*m*n*k floating-point operations.
    flop_count = (2 * work['mnk']).sum()
    elapsed = work['duration'].sum()
    result['avg_gflops'] = flop_count / elapsed * 1e-9
    result['function'] = lr.get_unique(work, 'function')
    return result

# Run compute_lin_reg through cashew's regression helper and gather the results
# in a DataFrame.
# NOTE(review): the result carries node, cpu and jobid columns, so
# lr.regression presumably applies the callback to one group of measurements
# at a time — confirm the grouping in cashew.
reg = pandas.DataFrame(lr.regression(df, compute_lin_reg))
# Number of regressions computed, followed by a preview of the first ones.
print(len(reg))
reg.head()

1470


Unnamed: 0,intercept,mnk,tvalue_mnk,intercept_residual,mnk_residual,tvalue_mnk_residual,avg_gflops,function,cluster,node,expfile_hash,cpu,jobid,start_time
0,2e-06,7.690653e-11,464.469828,4.358831e-07,2.025623e-12,13.720815,25.852715,dgemm,dahu,1,8897548660748450ec25f28ed7edc8e8278e0f54119439...,0,1889380,1571386167
1,3e-06,7.423239e-11,462.48546,6.070353e-07,2.233619e-12,15.765462,26.802925,dgemm,dahu,1,8897548660748450ec25f28ed7edc8e8278e0f54119439...,1,1889380,1571386167
2,3e-06,7.563839e-11,450.665959,3.608536e-07,2.025757e-12,13.636862,26.28409,dgemm,dahu,2,8897548660748450ec25f28ed7edc8e8278e0f54119439...,0,1889383,1571386167
3,3e-06,7.30066e-11,439.117718,4.691992e-07,1.98056e-12,13.547613,27.229567,dgemm,dahu,2,8897548660748450ec25f28ed7edc8e8278e0f54119439...,1,1889383,1571386167
4,3e-06,7.570254e-11,475.357107,4.046269e-07,2.168812e-12,14.964029,26.280493,dgemm,dahu,3,8897548660748450ec25f28ed7edc8e8278e0f54119439...,0,1889384,1571386080


In [5]:
# Save the regression table as CSV; index=False leaves the DataFrame index out
# of the file.
reg.to_csv('/tmp/dgemm_calibration_slownodes.csv', index=False)