In [1]:
import pandas as pd

In [2]:
# Load the two input tables from parquet files located in the working directory.
job_chars = pd.read_parquet('job_chars.parquet')
host_logs = pd.read_parquet('host_logs.parquet')

### Strip dataframe

In [3]:
# Number of rows in host_logs as loaded, before any filtering.
len(host_logs)

6475643

In [4]:
# Queue names to keep: host_logs rows whose QueueName is not in this list are filtered out.
batch_classes = ['clexpres', 'clmedium', 'cllong', 'clbigmem', 'clfo2', 'feque']

In [5]:
# Keep only the log rows that belong to one of the listed batch queues.
host_logs = host_logs.loc[host_logs['QueueName'].isin(batch_classes)]

In [6]:
# Drop the log rows that report no free CPU at all.
host_logs = host_logs.loc[host_logs['Free_CPUs'].gt(0)]

In [7]:
# convert date format: attach the Europe/Berlin time zone to the tz-naive 'Date' column.
# `assign` returns a new frame instead of writing a column into a frame that came out of
# boolean filtering (which can raise SettingWithCopyWarning). The guard keeps the cell
# re-runnable: tz_localize raises TypeError on a column that is already tz-aware.
if host_logs['Date'].dt.tz is None:
    host_logs = host_logs.assign(Date=host_logs['Date'].dt.tz_localize('Europe/Berlin'))

In [8]:
# Number of rows left in host_logs after the queue and free-CPU filters.
len(host_logs)

2823178

In [9]:
# Display the filtered host_logs frame.
host_logs

Unnamed: 0,Date,Free_CPUs,Free_Mem1,Used_CPUs,Cpu,Used_Mem1,ExecutionHost,QueueName,Free_Swap1,Used_Swap1,Load
111,2020-01-12 18:17:31+01:00,2,175.707031,30,30.6,15.003906,neshcl266,clmedium,,,
112,2020-01-12 18:17:31+01:00,3,111.296875,21,21.0,16.597656,neshcl203,clfo2,,,
113,2020-01-12 18:17:31+01:00,4,105.191406,20,20.0,22.703125,neshcl200,clfo2,,,
114,2020-01-12 18:17:31+01:00,4,109.203125,20,20.0,18.691406,neshcl201,clfo2,,,
115,2020-01-12 18:17:31+01:00,4,100.140625,20,20.0,27.753906,neshcl206,clfo2,,,
...,...,...,...,...,...,...,...,...,...,...,...
6475622,2020-10-07 16:10:08+02:00,15,116.773438,17,17.0,73.937500,neshcl379,cllong,15.257812,0.000000,17.0
6475623,2020-10-07 16:10:08+02:00,28,186.167969,4,4.0,4.542969,neshcl380,cllong,15.253906,0.003906,4.1
6475628,2020-10-07 16:10:08+02:00,31,81.968750,1,1.0,108.742188,neshcl385,cllong,15.242188,0.015625,1.0
6475630,2020-10-07 16:10:08+02:00,31,128.128906,1,1.0,62.582031,neshcl387,cllong,15.160156,0.097656,1.0


### Define calculation

In [10]:
# Positional row number in job_chars of the example job used in the cells below.
i=9

In [11]:
# Show every field of the example job (row i of job_chars).
job_chars.iloc[i]

Batch_class                         clmedium
Submission_date    2020-05-09 02:05:09+02:00
Waited                               163.817
Start_date         2020-05-09 04:48:58+02:00
Duration                              2010.4
Nodes                                      1
CPU                                  19.9475
Memory                                 15.41
delta_CPU                           0.344407
delta_Memory                            0.07
Name: 000545061a76c75a, dtype: object

In [12]:
# Boolean mask over host_logs: rows logged between the example job's submission and
# start (both bounds inclusive) whose queue name matches the job's batch class.
f1 = (
    host_logs['Date'].between(
        job_chars.iloc[i].Submission_date,
        job_chars.iloc[i].Start_date,
    )
    & host_logs['QueueName'].str.contains(job_chars.iloc[i].Batch_class)
)

In [13]:
# The example job's CPU value, then the Free_CPUs readings selected by the mask.
print(job_chars.iloc[i]['CPU'])
print(host_logs.loc[f1, 'Free_CPUs'])

19.94751513527545
2509630    12
2509632    32
2509634    32
2509636    32
2509644    12
           ..
2512708    12
2512713    22
2512714    32
2512715    12
2512716    12
Name: Free_CPUs, Length: 613, dtype: int64


In [14]:
# Mean ratio of free CPUs to the job's CPU value over the selected log rows.
host_logs.loc[f1, 'Free_CPUs'].div(job_chars.iloc[i]['CPU']).mean()

0.8816775209364305

In [15]:
# The example job's Memory value, then the Free_Mem1 readings selected by the mask.
print(job_chars.iloc[i]['Memory'])
print(host_logs.loc[f1, 'Free_Mem1'])

15.41
2509630    161.878906
2509632    185.335938
2509634    187.726562
2509636    187.660156
2509644    170.515625
              ...    
2512708    166.960938
2512713    160.667969
2512714    172.757812
2512715    170.570312
2512716    170.632812
Name: Free_Mem1, Length: 613, dtype: float64


In [16]:
# Mean ratio of free memory to the job's Memory value over the selected log rows.
host_logs.loc[f1, 'Free_Mem1'].div(job_chars.iloc[i]['Memory']).mean()

10.96920750241099

### Parallelize using Multiprocessing
Some background on multithreading vs. multiprocessing in Python: https://medium.com/@bfortuner/python-multithreading-vs-multiprocessing-73072ce5600b

In [17]:
import multiprocessing

In [18]:
def multiprocessing_worker(i):
    """Average free-resource ratios recorded while job `i` was waiting.

    Selects the `host_logs` rows whose `Date` lies between the job's
    `Submission_date` and `Start_date` (both inclusive) and whose `QueueName`
    matches the job's `Batch_class`, then averages `Free_CPUs / CPU` and
    `Free_Mem1 / Memory` over those rows.

    Reads the module-level `job_chars` and `host_logs` frames.

    Parameters
    ----------
    i : label
        Index label of the job in `job_chars`.

    Returns
    -------
    tuple
        `(i, mean_cpu_ratio, mean_mem_ratio)`; both means are NaN when no log
        row matches the job's time window and queue.
    """
    # Fetch the job row once instead of once per field.
    job = job_chars.loc[i]
    # NOTE(review): str.contains is a regex *substring* match, not equality.
    # Kept as-is to preserve results; confirm whether `==` is what is intended.
    f1 = (host_logs['Date'] >= job['Submission_date']) & \
        (host_logs['Date'] <= job['Start_date']) & \
        (host_logs['QueueName'].str.contains(job['Batch_class']))
    # Select rows and the two needed columns in a single step, so the full
    # filtered frame is not materialised once per column.
    matching = host_logs.loc[f1, ['Free_CPUs', 'Free_Mem1']]
    a = (matching['Free_CPUs'] / job['CPU']).mean()
    b = (matching['Free_Mem1'] / job['Memory']).mean()
    return i, a, b

In [19]:
# Number of rows in job_chars.
len(job_chars)

126027

In [20]:
# Count the rows of job_chars with a strictly positive 'Waited' value.
int((job_chars['Waited'] > 0.0).sum())

53366

In [21]:
# Index labels of the jobs with a strictly positive 'Waited' value. Selecting with the
# boolean mask keeps the index dtype and labels untouched, whereas
# Index.where(...).dropna() first fills every non-matching position with NaN.
indices = job_chars.index[(job_chars['Waited']>0.0).to_numpy()]
subset = 16*10
indices = indices[:subset] # for now, only do this on a subset

In [22]:
%%time
# Run the per-job calculation on 16 worker processes. `map` blocks until every result
# is in (returned in the order of `indices`), so leaving the `with` block, which
# terminates the pool, cannot drop work and no worker processes linger between runs.
with multiprocessing.Pool(processes=16) as pool:
    t = pool.map(multiprocessing_worker, indices)

CPU times: user 235 ms, sys: 277 ms, total: 512 ms
Wall time: 10.7 s


In [25]:
# Scale the 10.7 s wall time measured for `subset` jobs up to all jobs that waited.
print('duration estimate: ', int((job_chars['Waited'] > 0.0).sum())*(10.7/subset)/60, ' minutes')

duration estimate:  59.48085416666665  minutes


In [26]:
# Show the (index label, mean Free_CPUs/CPU, mean Free_Mem1/Memory) tuples returned by the workers.
t

[('000030ecd19c9c04', 216.16746085772635, 2400.171257456378),
 ('00020fcd04947084', nan, nan),
 ('00027e89035bec8d', 0.7158739219547768, 1.2684329640086998),
 ('0003a37a7a8562f1', nan, nan),
 ('00044e0a50d292e0', 0.5747807892775363, 64.79835739060137),
 ('000545061a76c75a', 0.8816775209364305, 10.96920750241099),
 ('0005bc60f93ebf7f', 0.45998184297725064, 78.09589527739438),
 ('0005f3be12f14ee2', 4474.247832738399, 3239.550852803003),
 ('00076c148e19768c', 0.6530874693624757, 11.295009132233936),
 ('000a8aec3a0f9051', 34.30119818789957, 40.1760859929078),
 ('000c3234e7b9c898', 0.5212648328848679, 80.57954642879412),
 ('000dd6b4587cd5fe', nan, nan),
 ('000fd86c40edf54e', 18.7130951933879, 213.74980089066347),
 ('00112d07bd1d1d9b', 0.6353050887924374, 78.0018188531773),
 ('0011ab30ec7dbac2', 21.572768256950695, 514.2942205752688),
 ('0013aaa63ec9ea37', nan, nan),
 ('0018c54ef12c5fa3', 11.013445261768018, 75.74941928762448),
 ('00191204988c0512', 0.5797077892159824, 82.43107086036743),
 (

### Python environment

In [50]:
!conda list --explicit

# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
@EXPLICIT
https://conda.anaconda.org/conda-forge/linux-64/git-lfs-2.11.0-0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/nomkl-1.0-h5ca1d4c_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2020.6.20-hecda079_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.35-h769bd43_9.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-9.3.0-he4bcb1c_17.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-9.3.0-h2ae2ef3_17.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/pandoc-2.11.0.2-hd18ef5c_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/poppler-data-0.4.9-1.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-9.3.0-he4bcb1c_17.tar.bz2
https://conda.anaconda.org/