In [1]:
import sys
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Binary (power-of-two) size multipliers used to scale raw totals.
KB = 2 ** 10
MB = 2 ** 20
GB = 2 ** 30
TB = 2 ** 40

In [8]:
def Acc(r):
    """Summarise access statistics for the rows in ``r``.

    Parameters
    ----------
    r : pandas.DataFrame
        Must provide the columns ``access`` and ``fill``.

    Returns
    -------
    pandas.Series
        ``tot acc``          -- sum of the ``access`` column,
        ``tot files``        -- number of rows in ``r``,
        ``avg acc per file`` -- ``tot acc`` divided by ``tot files``,
        ``fill``             -- mean of the ``fill`` column.
    """
    total_access = r.access.sum()
    n_rows = r.shape[0]
    summary = {
        'tot acc': total_access,
        'tot files': n_rows,
        'avg acc per file': total_access / n_rows,
        'fill': r.fill.mean(),
    }
    return pd.Series(summary)

def perc_read(r):
    """Summarise how much data was delivered for the rows in ``r``.

    Parameters
    ----------
    r : pandas.DataFrame
        Must provide the columns ``b_hit``, ``b_miss``, ``b_bypass`` and
        ``fsize``.

    Returns
    -------
    pandas.Series
        ``read perc``  -- (sum of hit + miss + bypass) / sum of ``fsize``;
                          this is a plain ratio, it is not multiplied by 100,
        ``tot hit``    -- sum of ``b_hit`` divided by ``TB``,
        ``tot miss``   -- sum of ``b_miss`` divided by ``TB``,
        ``tot bypass`` -- sum of ``b_bypass`` divided by ``TB``.
    """
    hit_total = r.b_hit.sum()
    miss_total = r.b_miss.sum()
    bypass_total = r.b_bypass.sum()
    summary = {
        'read perc': (hit_total + miss_total + bypass_total) / r.fsize.sum(),
        'tot hit': hit_total / TB,
        'tot miss': miss_total / TB,
        'tot bypass': bypass_total / TB,
    }
    return pd.Series(summary)

Prints the fraction of a file read in an average access (the `read perc` column, a ratio rather than a percentage),
followed by the total data delivered on hit, on miss, and bypassed (all in TB).

In [9]:
# Load the data set from the parquet file (path is relative to the working
# directory) and display the overall read summary as the cell output.
xcache_file = 'xcache_2022_II.parquet'
df = pd.read_parquet(xcache_file)

perc_read(df)


read perc     1.317715e-01
tot hit       1.389763e+02
tot miss      6.407506e+01
tot bypass    6.980863e+12
dtype: float64

The same, but grouped by scope.

In [19]:
# Read summary per scope. Only the columns perc_read actually uses are handed
# to apply(): applying to the whole frame would include the grouping column,
# which recent pandas releases deprecate with a warning.
df.groupby('scope')[['b_hit', 'b_miss', 'b_bypass', 'fsize']].apply(perc_read)

Unnamed: 0_level_0,read perc,tot hit,tot miss,tot bypass
scope,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
data15_13TeV,0.076775,0.786591,0.487094,9527020000.0
data16_13TeV,0.112051,3.213048,2.410303,6574467000.0
data17_13TeV,0.136549,4.876177,7.329815,6543764000.0
data18_13TeV,0.20888,33.083039,12.254642,5498553000000.0
data18_hi,0.069106,0.0501,0.099764,0.0
group,0.590856,2.567431,0.134101,86388280.0
mc15_13TeV,0.117334,0.064471,0.039624,0.0
mc15_14TeV,0.069308,0.116411,0.024448,0.0
mc15_7TeV,0.429019,0.051214,0.003393,0.0
mc16_13TeV,0.123527,92.118961,40.057598,1302634000000.0


And the same, grouped by host.

In [20]:
# Read summary per host. Only the columns perc_read actually uses are handed
# to apply(): applying to the whole frame would include the grouping column,
# which recent pandas releases deprecate with a warning.
df.groupby('host')[['b_hit', 'b_miss', 'b_bypass', 'fsize']].apply(perc_read)

Unnamed: 0_level_0,read perc,tot hit,tot miss,tot bypass
host,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
atlas-slate01.bu.edu,0.191516,1.368643,2.262657,0.0
lcg-lrz-xcache0.grid.lrz.de,0.096913,6.301486,3.732986,0.0
lcg-lrz-xcache1.grid.lrz.de,0.117313,6.969367,2.638335,0.0
sl-uc-es1.slateci.io,0.124066,17.669664,17.638627,0.0
sl-uc-xcache1.slateci.io,0.144362,67.088645,14.876033,6980863000000.0
sl-um-es2.slateci.io,0.101653,2.872151,2.459839,0.0
sl-um-es5.slateci.io,0.092907,3.939675,1.808967,0.0
slate01.atlas-swt2.org,0.135309,7.954075,4.249229,0.0
xcache1.farm.particle.cz,0.138172,24.812627,14.408387,0.0


Total number of accesses, number of unique files, average number of times each file was accessed, and the mean current file fill factor.

In [16]:
# For every `lfn` keep only the row(s) whose `access` value equals the maximum
# seen for that `lfn`; rows that tie for the maximum are all retained.
# The string alias 'max' is passed instead of the builtin `max`: newer pandas
# releases warn when a builtin is given as the callable and will stop mapping
# it to the optimised group-wise maximum.
idx = df.groupby(['lfn'])['access'].transform('max') == df['access']
mdf = df[idx]
print(Acc(mdf))

tot acc             1.722156e+06
tot files           1.653600e+05
avg acc per file    1.041459e+01
fill                7.408854e-01
dtype: float64


In [17]:
# Access summary per scope. Only the columns Acc actually uses are handed to
# apply(): applying to the whole frame would include the grouping column,
# which recent pandas releases deprecate with a warning.
mdf.groupby('scope')[['access', 'fill']].apply(Acc)

Unnamed: 0_level_0,tot acc,tot files,avg acc per file,fill
scope,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
data15_13TeV,34077.0,3518.0,9.68647,0.725379
data16_13TeV,86743.0,9580.0,9.054593,0.779041
data17_13TeV,172826.0,18124.0,9.535754,0.781742
data18_13TeV,530791.0,39531.0,13.427209,0.782961
data18_hi,1034.0,277.0,3.732852,0.275915
group,2276.0,747.0,3.046854,0.436631
mc15_13TeV,8140.0,5693.0,1.429826,0.19519
mc15_14TeV,20655.0,3300.0,6.259091,0.354868
mc15_7TeV,9170.0,4516.0,2.030558,0.880685
mc16_13TeV,573610.0,60734.0,9.444627,0.719607


In [18]:
# Access summary per host. Only the columns Acc actually uses are handed to
# apply(): applying to the whole frame would include the grouping column,
# which recent pandas releases deprecate with a warning.
mdf.groupby('host')[['access', 'fill']].apply(Acc)

Unnamed: 0_level_0,tot acc,tot files,avg acc per file,fill
host,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
atlas-slate01.bu.edu,24958.0,13139.0,1.899536,0.905863
lcg-lrz-xcache0.grid.lrz.de,104610.0,10877.0,9.617542,0.653612
lcg-lrz-xcache1.grid.lrz.de,162262.0,11404.0,14.228516,0.679204
sl-uc-es1.slateci.io,128016.0,28176.0,4.543441,0.709392
sl-uc-xcache1.slateci.io,814087.0,49250.0,16.529685,0.74381
sl-um-es2.slateci.io,103472.0,4502.0,22.983563,0.789692
sl-um-es5.slateci.io,134891.0,6668.0,20.229604,0.782629
slate01.atlas-swt2.org,17512.0,4439.0,3.945033,0.782544
xcache1.farm.particle.cz,232348.0,36905.0,6.295841,0.728566


Frequently accessed files

In [30]:
# The 20 rows with scope 'user' that have the highest access counts.
mdf.loc[mdf.scope.eq('user')].sort_values(by='access', ascending=False).head(20)

Unnamed: 0,access,host,lfn,scope,fn,b_hit,b_miss,b_bypass,fsize,fill
945832,975,xcache1.farm.particle.cz,user/bngair/a9/a0/user.bngair.23953539._000038...,user,user.bngair.23953539._000038.output.root,54583641,0,0,4244582352,0.994442
945393,962,xcache1.farm.particle.cz,user/bngair/8d/56/user.bngair.23953539._000037...,user,user.bngair.23953539._000037.output.root,54463206,0,0,4233800774,0.99449
945967,960,xcache1.farm.particle.cz,user/bngair/44/44/user.bngair.23953539._000039...,user,user.bngair.23953539._000039.output.root,55232828,0,0,4292295824,0.992061
946178,622,xcache1.farm.particle.cz,user/bngair/97/1e/user.bngair.23953539._000040...,user,user.bngair.23953539._000040.output.root,54659560,0,0,4250508096,0.99445
283446,522,sl-um-es5.slateci.io,user/elham/df/38/user.elham.23942700._001819.o...,user,user.elham.23942700._001819.output.root,186689,0,0,25963035,1.0
283379,522,sl-um-es5.slateci.io,user/elham/d5/9a/user.elham.23942700._001813.o...,user,user.elham.23942700._001813.output.root,195260,0,0,27574335,1.0
283334,521,sl-um-es5.slateci.io,user/elham/eb/5e/user.elham.23942700._001807.o...,user,user.elham.23942700._001807.output.root,182506,0,0,25428633,1.0
283360,521,sl-um-es5.slateci.io,user/elham/27/57/user.elham.23942700._001812.o...,user,user.elham.23942700._001812.output.root,184245,0,0,25828357,1.0
283333,521,sl-um-es5.slateci.io,user/elham/c3/dd/user.elham.23942700._001805.o...,user,user.elham.23942700._001805.output.root,130727,0,0,17538475,1.0
283448,521,sl-um-es5.slateci.io,user/elham/50/2a/user.elham.23942700._001822.o...,user,user.elham.23942700._001822.output.root,133901,0,0,17955080,1.0


In [22]:

# ldf.index = pd.to_datetime(ldf["time"])
# # ldf.plot(y=["loss"],figsize=(15,4), style='.')
# groups = ldf.groupby("host")
# for name, group in groups:
#     # plt.plot()
#     plt.plot(group["loss"], marker="o", linestyle="", label=name)
# plt.legend()
# fig = matplotlib.pyplot.gcf()
# fig.set_size_inches(18.5, 10.5)
# # fig.savefig('losses.png', dpi=100)