In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the raw job log table from its parquet file into a DataFrame.
raw_logs_path = 'data/job_logs_raw.parquet'
job_logs_raw = pd.read_parquet(raw_logs_path)

In [3]:
# Show every log entry whose identifier contains the example job id.
# regex=False matches the id as a literal string instead of a regular
# expression; na=False treats missing identifiers as "no match" so the boolean
# mask never contains NaN (which would make the row selection raise).
job_logs_raw[job_logs_raw['Identifier'].str.contains('50f38293dabf53e9', regex=False, na=False)]

Unnamed: 0,Queue,Pri,STT,S,Memory,CPU,Elapse,R,H,M,Jobs,Identifier,Time,Filename
0,clbigmem,0,RUN,-,45.40,10000.86,560223.0,Y,Y,Y,1.0,50f38293dabf53e9,2020-02-06 19:53:11+01:00,../../NESH-monitoring/request_logs/cycle_2020-...
384,clbigmem,0,RUN,-,22.91,30086.57,560860.0,Y,Y,Y,1.0,50f38293dabf53e9,2020-02-06 20:03:48+01:00,../../NESH-monitoring/request_logs/cycle_2020-...
762,clbigmem,0,RUN,-,54.38,15956.50,561498.0,Y,Y,Y,1.0,50f38293dabf53e9,2020-02-06 20:14:26+01:00,../../NESH-monitoring/request_logs/cycle_2020-...
1135,clbigmem,0,RUN,-,66.66,36984.72,562141.0,Y,Y,Y,1.0,50f38293dabf53e9,2020-02-06 20:25:09+01:00,../../NESH-monitoring/request_logs/cycle_2020-...
1507,clbigmem,0,RUN,-,66.66,57028.56,562780.0,Y,Y,Y,1.0,50f38293dabf53e9,2020-02-06 20:35:48+01:00,../../NESH-monitoring/request_logs/cycle_2020-...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113131,clbigmem,0,RUN,-,66.32,174542.95,716959.0,Y,Y,Y,1.0,50f38293dabf53e9,2020-02-08 15:25:26+01:00,../../NESH-monitoring/request_logs/cycle_2020-...
113529,clbigmem,0,RUN,-,82.08,194503.49,717579.0,Y,Y,Y,1.0,50f38293dabf53e9,2020-02-08 15:35:47+01:00,../../NESH-monitoring/request_logs/cycle_2020-...
113925,clbigmem,0,RUN,-,44.89,16785.23,718199.0,Y,Y,Y,1.0,50f38293dabf53e9,2020-02-08 15:46:07+01:00,../../NESH-monitoring/request_logs/cycle_2020-...
114319,clbigmem,0,RUN,-,13.79,4875.27,718819.0,Y,Y,Y,1.0,50f38293dabf53e9,2020-02-08 15:56:28+01:00,../../NESH-monitoring/request_logs/cycle_2020-...


Sorting logs by identifier and log time.

In [4]:
# Index the logs by job identifier, then order the rows: identifiers in
# descending order and, within one identifier, ascending by log time.
job_logs_raw = (
    job_logs_raw
    .set_index('Identifier')
    .sort_values(by=['Identifier', 'Time'], ascending=[False, True])
)

In [5]:
# Keep only the two columns that are differenced in the next step.
cumulative_columns = ['CPU', 'Elapse']
subset = job_logs_raw[cumulative_columns]

In [6]:
subset

Unnamed: 0_level_0,CPU,Elapse
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
ffffddf606283b7b,0.00,0.0
ffffddf606283b7b,0.00,0.0
ffff662620b8e573,0.00,0.0
ffff662620b8e573,0.00,0.0
ffff662620b8e573,0.00,0.0
...,...,...
0000e2fb93bf367f,17965.79,13506.0
000030ecd19c9c04,0.00,0.0
000030ecd19c9c04,0.00,0.0
000030ecd19c9c04,0.00,0.0


Compute the difference in cumulative CPU time and elapsed time between consecutive log entries of each job. The first row of every job is dropped, because it has no preceding log entry of the same job to take a difference against.

In [7]:
# Difference of every row to the row directly above it. This is computed over
# the whole frame, so the first row of each job is differenced against the
# last row of the preceding job; those rows are removed below.
subset_diff = subset.diff()

# NOTE: despite its name, this mask is True for rows whose identifier equals
# the identifier of the preceding row, i.e. for every log entry that is NOT the
# first one of its job.
job_ids = subset_diff.reset_index()['Identifier']
first_logs_per_job = job_ids.eq(job_ids.shift()).values

# Keep only the rows that were differenced against a log of the same job.
subset_diff = subset_diff[first_logs_per_job]

Instantaneous CPU load is estimated as `instantaneous_CPU_load` = `CPU.diff()` / `Elapse.diff()`

In [97]:
# Ratio of the CPU-time increment to the elapsed-time increment between two
# consecutive logs of a job. `assign` builds a new frame instead of writing a
# column into the boolean-filtered frame, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
subset_diff = subset_diff.assign(CPU_Load=subset_diff['CPU'] / subset_diff['Elapse'])

# When the elapsed time did not advance, the division yields NaN (0/0) or
# +/-inf (x/0). Both are undefined loads; map them to 0 so that infinite values
# cannot leak into the per-job quantiles computed later.
instantaneous_CPU_load = (
    subset_diff[['CPU_Load']]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [98]:
instantaneous_CPU_load

Unnamed: 0_level_0,CPU_Load
Identifier,Unnamed: 1_level_1
ffffddf606283b7b,0.000000
ffff662620b8e573,0.000000
ffff662620b8e573,0.000000
ffff662620b8e573,0.000000
ffff662620b8e573,0.000000
...,...
0000e2fb93bf367f,32.884910
0000e2fb93bf367f,-125.883432
000030ecd19c9c04,0.000000
000030ecd19c9c04,0.000000


In [107]:
def quantile_diff(df, quantile = 0.05):
    '''
    Per-job spread of the CPU load between its outer quantiles.

    The frame is grouped by ``Identifier`` (its index is turned into a column
    via ``reset_index`` first). For every group the difference between the
    ``1 - quantile`` and the ``quantile`` quantile is computed. That spread is
    returned as is, and additionally divided by the group mean and by the
    group maximum.

    Note: the divisors are the signed mean and the signed maximum, not absolute
    values, and the ratios are not clipped. The results are therefore not
    confined to the range 0..1; groups containing negative loads can produce
    ratios above 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``CPU_Load`` column; ``Identifier`` must be available as
        a column after ``reset_index``.
    quantile : float, default 0.05
        Lower quantile; the upper quantile is ``1 - quantile``.

    Returns
    -------
    pandas.DataFrame
        One row per identifier with the columns ``interactivity score``
        (quantile spread), ``normalized mean`` (spread / group mean) and
        ``normalized max`` (spread / group maximum).
    '''
    per_job = df.reset_index().groupby(['Identifier'])

    upper = per_job.quantile(1 - quantile)
    lower = per_job.quantile(quantile)
    spread = upper - lower

    # Each summary shares the per-job index and gets its own column label.
    summaries = (
        spread.rename(columns={'CPU_Load': 'interactivity score'}),
        (spread / per_job.mean()).rename(columns={'CPU_Load': 'normalized mean'}),
        (spread / per_job.max()).rename(columns={'CPU_Load': 'normalized max'}),
    )

    # Join the summaries side by side on the shared identifier index.
    interactivity = summaries[0]
    for extra in summaries[1:]:
        interactivity = interactivity.merge(extra, left_index=True, right_index=True)
    return interactivity


In [108]:
# Per-job interactivity summary, using the function's default outer quantile.
interactivity = quantile_diff(df=instantaneous_CPU_load)

In [109]:
interactivity

Unnamed: 0_level_0,interactivity score,normalized mean,normalized max
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000030ecd19c9c04,0.151966,2.700000,0.900000
0000e2fb93bf367f,39.535012,117.285097,1.193402
00020fcd04947084,0.000000,0.000000,0.000000
0002286e1d7e428d,0.004353,0.004375,0.004365
0002441ec4986135,0.001644,1.256186,0.785502
...,...,...,...
fffb2d67011101c5,4.089659,22.448665,1.974427
fffd086716a4d46c,0.004373,0.004393,0.004382
ffff38341e8c6b3b,31.907856,1.648952,0.953827
ffff662620b8e573,18.321450,42.827236,1.109107


In [111]:
# Persist the per-job interactivity table as a gzip-compressed parquet file.
interactivity_path = 'data/interactivity.parquet'
interactivity.to_parquet(interactivity_path, compression='gzip')