In [1]:
import sys
import numpy as np
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
KB=1024
MB=KB*1024
GB=MB*1024
TB=GB*1024

In [3]:
df = pd.read_parquet('data/xcache_2022-06-01.parquet')  
df.drop(columns=['lfn','host'],inplace=True)
df.head()

Unnamed: 0,access,site,scope,fn,b_hit,b_miss,b_bypass,fsize,fill
0,2,AGLT2,mc15_13TeV,EVNT.27585588._000719.pool.root.1,364496131,33432150,0,398338556,1.0
1,2,AGLT2,mc15_13TeV,EVNT.27585588._001775.pool.root.1,363018480,34105107,0,397532163,1.0
2,2,AGLT2,mc15_13TeV,EVNT.27585588._001661.pool.root.1,363880281,33804761,0,398093435,1.0
3,8,BNL,data17_13TeV,DAOD_EXOT2.19869372._000158.pool.root.1,307294784,0,0,874949234,0.72139
4,21,BNL,user.pfalke,user.pfalke.27690994._001499.output.root,406370,4885409,0,2737217806,0.052768


In [4]:
def Acc(r):
    cols = ['Accesses','tot files','avg acc per file', 'fill']
    return pd.Series( (
            r.access.sum(), r.shape[0], r.access.sum()/r.shape[0], 
            r.fill.mean())
            , index=cols)

def perc_read(r):
    cols = ['accesses', 'read perc', 'tot hit [TB]', 'tot miss[TB]', 'tot bypass [TB]']
    return pd.Series( 
        ( r.shape[0],  
         ( r.b_hit.sum() + r.b_miss.sum() + r.b_bypass.sum() ) / r.fsize.sum(),
         r.b_hit.sum()/TB, 
         r.b_miss.sum()/TB, 
         r.b_bypass.sum()/TB 
        ) ,
        index=cols
    )


In [5]:
df['ext']=df.fn.str.split('.').str[-2]+'.'+df.fn.str.split('.').str[-1]

df['ext1'] = np.where(df['ext'].str.find('root')>-1, 'root', df['ext'] )
df['ext1'] = np.where(df['ext1'].str.endswith('.1'), 'root', df['ext1'] )

df.groupby('ext1').apply(perc_read).sort_values('accesses',ascending=False).head()

Unnamed: 0_level_0,accesses,read perc,tot hit [TB],tot miss[TB],tot bypass [TB]
ext1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
root,4367401.0,0.165634,967.5884,363.2974,20.897982
log.tgz,215860.0,0.999972,0.04661514,0.02319304,0.000726
lib.tgz,95882.0,1.191513,6.584221,0.1793005,0.016355
output.h5,1616.0,1.000794,0.4832305,0.1608961,0.0
tar.gz,903.0,0.943259,0.01421237,0.003880123,0.0
_0002.2,42.0,0.136825,0.01009133,0.01150994,0.0
_0001.2,38.0,0.093243,0.004109668,0.008622483,0.0
_0003.2,36.0,0.099918,0.00327836,0.01034709,0.0
_0001.data,19.0,1.103766,0.02021269,0.005146639,0.0
_0005.2,7.0,0.098527,0.0003519665,0.002360956,0.0


In [6]:
perc_read(df)

accesses           4.681871e+06
read perc          1.664243e-01
tot hit [TB]       9.747791e+02
tot miss[TB]       3.637137e+02
tot bypass [TB]    2.091506e+01
dtype: float64

prints percentage of file read in an average access.
Total data delivered on hit (in TB), on miss, and bypassed.

the same but grouped per scope

In [7]:
print(df.groupby('scope').apply(perc_read).sort_values('accesses',ascending=False).to_string())

                 accesses  read perc  tot hit [TB]  tot miss[TB]  tot bypass [TB]
scope                                                                            
mc16_13TeV      1621651.0   0.132937  3.992653e+02  2.307652e+02     1.014334e+01
data18_13TeV     736960.0   0.199064  1.479719e+02  4.630360e+01     4.806284e+00
panda            684600.0   1.033744  2.189840e+01  4.741724e+00     1.924114e-01
mc15_13TeV       439249.0   0.288118  1.615325e+00  1.240362e+00     0.000000e+00
data17_13TeV     297249.0   0.316342  1.000577e+02  2.588172e+01     3.206215e+00
data16_13TeV     289179.0   0.454897  1.202956e+02  2.346653e+01     1.943588e+00
user.stkazako    141570.0   0.000805  9.131096e-02  2.307229e-01     1.875329e-04
data15_13TeV      72095.0   0.220195  1.214677e+01  2.627009e+00     1.253473e-01
user.pfalke       51453.0   0.002287  8.529001e-02  4.942675e-01     0.000000e+00
phys-exotics      47502.0   0.005843  5.055730e-01  8.141562e-01     4.561911e-04
user.nbruscin   

and grouped by site

In [8]:
print(df.groupby('site').apply(perc_read).sort_values('accesses',ascending=False).to_string())

             accesses  read perc  tot hit [TB]  tot miss[TB]  tot bypass [TB]
site                                                                         
MWT2        1693568.0   0.094827    153.139817    119.388625         1.027513
AGLT2       1139053.0   0.090452    121.777575    100.863091         2.236732
NET2         677395.0   0.678729    512.762376     82.455900        17.650819
BNL          502404.0   0.145985     90.693096     23.366053         0.000000
LRZ-LMU      177954.0   0.148214     36.173795     10.493323         0.000000
SWT2         164956.0   0.089432     18.476801     19.300001         0.000000
Birmingham   163857.0   0.104666     11.053031      3.901255         0.000000
praguelcg2   162684.0   0.148729     30.702659      3.945403         0.000000


total number of accesses. number of unique files. average number of times files were accessed, and current file fill factor.

In [9]:
idx = df.groupby(['lfn'])['access'].transform(max) == df['access']
mdf = df[idx]
print(Acc(mdf))

KeyError: 'lfn'

In [None]:
print(mdf.groupby('scope').apply(Acc).sort_values('Accesses',ascending=False).to_string())

In [None]:
print(mdf.groupby('site').apply(Acc).sort_values('Accesses',ascending=False).to_string())

Frequently accessed files

In [None]:
print(mdf.groupby('ext1').apply(Acc).sort_values('Accesses',ascending=False).to_string())

In [None]:

# ldf.index = pd.to_datetime(ldf["time"])
# # ldf.plot(y=["loss"],figsize=(15,4), style='.')
# groups = ldf.groupby("host")
# for name, group in groups:
#     # plt.plot()
#     plt.plot(group["loss"], marker="o", linestyle="", label=name)
# plt.legend()
# fig = matplotlib.pyplot.gcf()
# fig.set_size_inches(18.5, 10.5)
# # fig.savefig('losses.png', dpi=100)