In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import utilix
import numpy as np
from tqdm import tqdm
import os
from tqdm import tqdm
path_start = '/dali/lgrandi/rucio/xnt_'

In [None]:
import cutax
st = cutax.contexts.xenonnt_online()
runs = st.select_runs()
modes = set(runs["mode"])

In [None]:
# Run-number boundaries of the two periods that the plots below shade and
# label as 'SR0' and 'SR1' (presumably science runs 0 and 1 -- TODO confirm).
# sr1_right is additionally used as the length of the per-run `runs` table
# and as the default upper run ID in size_vs_runs.
sr0_right = 34731
sr0_left  = 17918
sr1_left  = 43039
sr1_right = 53000

In [None]:
modes_list = list(modes)

In [None]:
# Structured dtype of the per-run table: run number, run mode (arbitrary
# Python object) and one boolean column per run-database tag of interest.
# The builtin `bool` is used instead of `np.bool`, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
runs_dtype = np.dtype([('number', np.int32),
                       ('mode', 'O'),
                       ('bad', bool),
                       ('messy', bool),
                       ('hot_spot', bool),
                       ('ramp_down', bool),
                       ('ramp_up', bool),
                       ('pmt_trip', bool),
                       ('rn220_fast_alphas', bool),
                       ('after_rn220', bool),
                       ('abandon', bool),
                       ('RAD_commissioning', bool)])
runs = np.zeros(sr1_right, dtype = runs_dtype)

In [None]:
import pymongo
from utilix import xent_collection
coll = xent_collection()

# Fill the per-run table from the run database, one document per run number.
# The boolean tag columns are taken from the table's own dtype so the list of
# recognised tags lives in exactly one place.
tag_fields = frozenset(name for name in runs.dtype.names
                       if name not in ('number', 'mode'))

for i in tqdm(range(sr1_right)):
    runs[i]['number'] = i
    doc = coll.find_one({'number': i})
    if doc is None:
        # No document for this run number: keep the zero-initialised row
        # instead of crashing on doc['mode'].
        continue
    runs[i]['mode'] = doc.get('mode')
    # 'tags' may be absent (or empty) for untagged runs.
    for tag in doc.get('tags') or ():
        tag_name = tag.get('name') if isinstance(tag, dict) else None
        if tag_name in tag_fields:
            runs[i][tag_name] = True

In [None]:
# Load the 20 chunk files of the rucio rule dump and join them with a single
# concatenate (concatenating inside the loop re-copies the growing array on
# every iteration).
# NOTE: allow_pickle=True can execute arbitrary code while loading; these
# files are assumed to be trusted, project-internal dumps.
rules_chunks = [
    np.load('/project2/lgrandi/yuanlq/shared/dali_cleanup/rucio_20230628_all_rules%s.npy'%(i),
            allow_pickle=True)
    for i in tqdm(range(20))
]
rules_info = np.concatenate(rules_chunks)

In [None]:
rules_info = rules_info[rules_info['UC_OSG_USERDISK']]

In [None]:
runids = rules_info['runid'].astype(np.int32)

In [None]:
interested_dtypes = ['raw_records_mv', 'raw_records_nv', 'raw_records_he', 'raw_records', 'records']

In [None]:
# Total size per data type, printed in ascending order of size (GB -> TB via /1024).
all_dtypes = np.unique(rules_info['data_type'])
names = np.array(list(all_dtypes))
sizes_gb = np.array([rules_info[rules_info['data_type']==dt]['size_gb'].sum()
                     for dt in all_dtypes])
argsort = sizes_gb.argsort()
names_sort = names[argsort]
sizes_gb_sort = sizes_gb[argsort]
for dtype_name, dtype_size_gb in zip(names_sort, sizes_gb_sort):
    print(dtype_name, int(dtype_size_gb/1024), 'TB')

In [None]:
def size_vs_runs(rules, runid_min=0, runid_max=sr1_right, nbins=100):
    """Cumulative size in TB of `rules` as a function of run ID.

    The run-ID range [runid_min, runid_max] is split into `nbins` equal-width
    bins. Bins are half-open ``[left, right)`` except the last one, which also
    includes `runid_max`, so a run ID that falls exactly on an interior bin
    edge is counted once (previously both edges were inclusive and such runs
    were added to two neighbouring bins, inflating the cumulative total).

    Parameters
    ----------
    rules : structured array with 'runid' (castable to int) and 'size_gb' fields.
    runid_min, runid_max : lower / upper run ID of the binned range; rules
        outside this range are ignored.
    nbins : number of bins.

    Returns
    -------
    numpy.ndarray of length `nbins`; element b is the summed size (size_gb/1024)
    of all rules with run ID from `runid_min` up to the right edge of bin b.
    """
    runids = rules['runid'].astype(np.int32)
    bins_bound = np.linspace(runid_min, runid_max, nbins+1)
    sizes_tb = np.zeros(nbins)
    for b in range(nbins):
        selected_mask = runids >= bins_bound[b]
        if b == nbins - 1:
            # Last bin is closed on the right so runid_max itself is counted.
            selected_mask &= runids <= bins_bound[b+1]
        else:
            selected_mask &= runids < bins_bound[b+1]
        rules_selected = rules[selected_mask]
        if len(rules_selected):
            sizes_tb[b] = np.sum(rules_selected['size_gb']/1024)

    return np.cumsum(sizes_tb)

In [None]:
def filter_out_rad(rules, runs):
    """Return the entries of `rules` whose run is NOT tagged 'RAD_commissioning'.

    Parameters
    ----------
    rules : structured array with a 'runid' field castable to int.
    runs : structured array indexed by run number with a boolean
        'RAD_commissioning' field.

    Raises
    ------
    IndexError if a run ID is outside the range covered by `runs`.
    """
    run_numbers = rules['runid'].astype(np.int64)
    # Look the tag up for all rules at once; builtin bool replaces the
    # np.bool alias that was removed in NumPy 1.24.
    is_rad = np.asarray(runs['RAD_commissioning'][run_numbers], dtype=bool)
    return rules[~is_rad]

In [None]:
def find_with_tags(rules, runs, tags):
    """Return the entries of `rules` whose run carries at least one of `tags`.

    Also prints the number of selected rules.

    Parameters
    ----------
    rules : structured array with a 'runid' field castable to int.
    runs : structured array indexed by run number with one truthy/falsy
        field per tag name.
    tags : iterable of field names of `runs` to test.
    """
    run_numbers = rules['runid'].astype(np.int64)
    # builtin bool replaces the np.bool alias removed in NumPy 1.24
    tagged = np.zeros(len(rules), bool)
    for t in tags:
        tagged |= np.asarray(runs[t][run_numbers], dtype=bool)
    print(np.sum(tagged))
    return rules[tagged]

In [None]:
def find_with_mode(rules, runs, mode):
    """Return the entries of `rules` whose run was taken in run mode `mode`.

    Also prints the number of selected rules.

    Parameters
    ----------
    rules : structured array with a 'runid' field castable to int.
    runs : structured array indexed by run number with a 'mode' field.
    mode : value compared (==) against each run's 'mode'.
    """
    run_numbers = rules['runid'].astype(np.int64)
    # Elementwise comparison for all rules at once; builtin bool replaces the
    # np.bool alias removed in NumPy 1.24.
    is_mode = np.asarray(runs['mode'][run_numbers] == mode, dtype=bool)
    print(np.sum(is_mode))
    return rules[is_mode]

In [None]:
cum_sizes_tb_rr = size_vs_runs(rules_info[(rules_info['data_type']=='raw_records')])
cum_sizes_tb_rrhe = size_vs_runs(rules_info[rules_info['data_type']=='raw_records_he'])
cum_sizes_tb_rrmv = size_vs_runs(rules_info[rules_info['data_type']=='raw_records_mv'])
cum_sizes_tb_rrnv = size_vs_runs(rules_info[rules_info['data_type']=='raw_records_nv'])
cum_sizes_tb_pkl = size_vs_runs(rules_info[rules_info['data_type']=='peaklets'])
cum_sizes_tb_r = size_vs_runs(rules_info[rules_info['data_type']=='records'])

In [None]:
# size_vs_runs returns one cumulative value per run-ID bin (100 bins spanning
# 0..sr1_right); each value is the total up to that bin's RIGHT edge, so the
# curves are drawn against the right edges instead of 100 points starting at 0.
bin_right_edges = np.linspace(0, sr1_right, 100 + 1)[1:]

plt.figure(dpi=100)
for cum_sizes_tb, label in [(cum_sizes_tb_rr, 'raw_records'),
                            (cum_sizes_tb_rrhe, 'raw_records_he'),
                            (cum_sizes_tb_rrmv, 'raw_records_mv'),
                            (cum_sizes_tb_rrnv, 'raw_records_nv'),
                            (cum_sizes_tb_pkl, 'peaklets'),
                            (cum_sizes_tb_r, 'records')]:
    plt.plot(bin_right_edges, cum_sizes_tb, label=label)
plt.xlabel('RunID')
plt.title('OSG Top 6 Datatype')
plt.ylabel('Size [TB]')
# Shade the two run-number ranges labelled SR0 and SR1.
plt.axvspan(sr0_left, sr0_right, alpha=0.3, color='k', label='SR0')
plt.axvspan(sr1_left, sr1_right, alpha=0.3, color='r', label='SR1')
#plt.yscale('log')
plt.gca().set_ylim(bottom=1)
plt.legend()

In [None]:
rules_info_norad = filter_out_rad(rules=rules_info, runs=runs)

In [None]:
cum_sizes_tb_rr_norad = size_vs_runs(rules_info_norad[(rules_info_norad['data_type']=='raw_records')])
cum_sizes_tb_rrhe_norad = size_vs_runs(rules_info_norad[rules_info_norad['data_type']=='raw_records_he'])
cum_sizes_tb_rrmv_norad = size_vs_runs(rules_info_norad[rules_info_norad['data_type']=='raw_records_mv'])
cum_sizes_tb_rrnv_norad = size_vs_runs(rules_info_norad[rules_info_norad['data_type']=='raw_records_nv'])
cum_sizes_tb_pkl_norad = size_vs_runs(rules_info_norad[rules_info_norad['data_type']=='peaklets'])
cum_sizes_tb_r_norad = size_vs_runs(rules_info_norad[rules_info_norad['data_type']=='records'])

In [None]:
# size_vs_runs returns one cumulative value per run-ID bin (100 bins spanning
# 0..sr1_right); each value is the total up to that bin's RIGHT edge, so the
# curves are drawn against the right edges instead of 100 points starting at 0.
bin_right_edges = np.linspace(0, sr1_right, 100 + 1)[1:]

plt.figure(dpi=100)
for cum_sizes_tb, label in [(cum_sizes_tb_rr_norad, 'raw_records'),
                            (cum_sizes_tb_rrhe_norad, 'raw_records_he'),
                            (cum_sizes_tb_rrmv_norad, 'raw_records_mv'),
                            (cum_sizes_tb_rrnv_norad, 'raw_records_nv'),
                            (cum_sizes_tb_pkl_norad, 'peaklets'),
                            (cum_sizes_tb_r_norad, 'records')]:
    plt.plot(bin_right_edges, cum_sizes_tb, label=label)
plt.xlabel('RunID')
plt.title('OSG Top 6 Datatype (Excluded RAD runs)')
plt.ylabel('Size [TB]')
# Shade the two run-number ranges labelled SR0 and SR1.
plt.axvspan(sr0_left, sr0_right, alpha=0.3, color='k', label='SR0')
plt.axvspan(sr1_left, sr1_right, alpha=0.3, color='r', label='SR1')
#plt.yscale('log')
plt.gca().set_ylim(bottom=1)
plt.legend()

In [None]:
rules_info_lowq = find_with_tags(rules=rules_info, runs=runs, tags=['bad', 'messy', 'abandon'])

In [None]:
cum_sizes_tb_rr_lowq = size_vs_runs(rules_info_lowq[(rules_info_lowq['data_type']=='raw_records')])
cum_sizes_tb_rrhe_lowq = size_vs_runs(rules_info_lowq[rules_info_lowq['data_type']=='raw_records_he'])
cum_sizes_tb_rrmv_lowq = size_vs_runs(rules_info_lowq[rules_info_lowq['data_type']=='raw_records_mv'])
cum_sizes_tb_rrnv_lowq = size_vs_runs(rules_info_lowq[rules_info_lowq['data_type']=='raw_records_nv'])
cum_sizes_tb_r_lowq = size_vs_runs(rules_info_lowq[rules_info_lowq['data_type']=='records'])

In [None]:
# size_vs_runs returns one cumulative value per run-ID bin (100 bins spanning
# 0..sr1_right); each value is the total up to that bin's RIGHT edge, so the
# curves are drawn against the right edges instead of 100 points starting at 0.
bin_right_edges = np.linspace(0, sr1_right, 100 + 1)[1:]

plt.figure(dpi=100)
for cum_sizes_tb, label in [(cum_sizes_tb_rr_lowq, 'raw_records'),
                            (cum_sizes_tb_rrhe_lowq, 'raw_records_he'),
                            (cum_sizes_tb_rrmv_lowq, 'raw_records_mv'),
                            (cum_sizes_tb_rrnv_lowq, 'raw_records_nv'),
                            (cum_sizes_tb_r_lowq, 'records')]:
    plt.plot(bin_right_edges, cum_sizes_tb, label=label)
plt.xlabel('RunID')
plt.title('Bad or Messy or Abandon Tagged')
plt.ylabel('Size [TB]')
# Shade the two run-number ranges labelled SR0 and SR1.
plt.axvspan(sr0_left, sr0_right, alpha=0.3, color='k', label='SR0')
plt.axvspan(sr1_left, sr1_right, alpha=0.3, color='r', label='SR1')
#plt.yscale('log')
plt.gca().set_ylim(bottom=1)
plt.legend()

In [None]:
rules_info_lowq['size_gb'].sum()

In [None]:
np.save('delete_dcache_20230628/low_quality_tagged_raw_records.npy', rules_info_lowq)

In [None]:
cum_sizes_tb_rr_lowq[-1] + cum_sizes_tb_rrhe_lowq[-1] + cum_sizes_tb_rrmv_lowq[-1] + cum_sizes_tb_rrnv_lowq[-1] + cum_sizes_tb_r_lowq[-1]

### Pre-SR0

In [None]:
pre_sr0_rules_info = rules_info_norad[rules_info_norad['runid'].astype(int)<sr0_left]

In [None]:
rules_info_norad['size_gb'].sum()

In [None]:
pre_sr0_rules_info['size_gb'].sum()

In [None]:
np.save('delete_dcache_20230628/pre_sr0_rr_no_rad', pre_sr0_rules_info)

### All time

In [None]:
# Per-data-type totals [TB], accumulated over all run modes. The loop below
# works on rules_info_norad, i.e. rules of RAD_commissioning-tagged runs are
# already excluded.
total_rr = 0
total_rrhe = 0
total_rrmv = 0
total_rrnv = 0
total_r = 0

# Per-mode totals [TB] for each data type, one entry per element of
# modes_list (same order), truncated to whole TB by int() when appended.
size_rr = []
size_rrhe = []
size_rrmv = []
size_rrnv = []
size_r = []

for m in modes_list:
    # Rules belonging to runs taken in mode m, then one cumulative
    # size-vs-run-ID curve per data type (last element = total for the mode).
    rules_info_mode = find_with_mode(rules=rules_info_norad, runs=runs, mode=m)
    cum_sizes_tb_rr_mode = size_vs_runs(rules_info_mode[(rules_info_mode['data_type']=='raw_records')])
    cum_sizes_tb_rrhe_mode = size_vs_runs(rules_info_mode[rules_info_mode['data_type']=='raw_records_he'])
    cum_sizes_tb_rrmv_mode = size_vs_runs(rules_info_mode[rules_info_mode['data_type']=='raw_records_mv'])
    cum_sizes_tb_rrnv_mode = size_vs_runs(rules_info_mode[rules_info_mode['data_type']=='raw_records_nv'])
    cum_sizes_tb_r_mode = size_vs_runs(rules_info_mode[rules_info_mode['data_type']=='records'])

    # ybe_linked mode: persist its raw_records_nv rules; this file is loaded
    # again later in the notebook as ybe_rr_nv.
    if m=='ybe_linked':
        to_save = rules_info_mode[rules_info_mode['data_type']=='raw_records_nv']
        np.save('delete_dcache_20230628/ybe_rr_nv.npy',to_save)
    
    # NOTE(review): for all modes below, `to_save` is computed but never
    # written -- the corresponding np.save calls are commented out -- so these
    # selections currently have no effect on the output files.
    if m=='tpc_pmtap':
        to_save = rules_info_mode[(rules_info_mode['runid'].astype(int)>=sr0_left)&
                                  (rules_info_mode['runid'].astype(int)<=sr1_left)]
        #to_save = to_save[np.where(np.isin(to_save['runid'], luisa_ap_list, invert=True))]
        #np.save('delete_dcache_20230628/pre_sr1_tpc_pmtap_rr.npy',to_save)
    if m=='tpc_pmtgain':
        to_save = rules_info_mode[(rules_info_mode['runid'].astype(int)>=sr0_left)&
                                  (rules_info_mode['runid'].astype(int)<=sr1_left)]
        #np.save('delete_dcache_20230628/pre_sr1_tpc_pmtgain_rr.npy',to_save)
    if m=='tpc_kr83m':
        to_save = rules_info_mode[(rules_info_mode['runid'].astype(int)>=sr0_right)&
                                  (rules_info_mode['runid'].astype(int)<=sr1_left)]
        #np.save('delete_dcache_20230628/post_sr0_pre_sr1_tpc_kr83m_rr.npy',to_save)
    # NOTE(review): second tpc_kr83m branch; it overwrites the `to_save`
    # selected by the branch directly above (sr0_left..sr0_right instead of
    # sr0_right..sr1_left).
    if m=='tpc_kr83m':
        to_save = rules_info_mode[(rules_info_mode['runid'].astype(int)>=sr0_left)&
                                  (rules_info_mode['runid'].astype(int)<=sr0_right)]
        #np.save('delete_dcache_20230628/sr0_tpc_kr83m_rr.npy',to_save)
    if m=='tpc_radon':
        to_save = rules_info_mode[(rules_info_mode['runid'].astype(int)>=sr0_left)&
                                  (rules_info_mode['runid'].astype(int)<=sr0_right)]
        #np.save('delete_dcache_20230628/sr0_tpc_radon_rr.npy',to_save)
    if m=='tpc_radon_hev':
        to_save = rules_info_mode[(rules_info_mode['runid'].astype(int)>=sr0_left)&
                                  (rules_info_mode['runid'].astype(int)<=sr0_right)]
        #np.save('delete_dcache_20230628/sr0_tpc_radon_hev_rr.npy',to_save)
    
    # Last element of each cumulative curve is the mode's total for that data type.
    total_rr += cum_sizes_tb_rr_mode[-1]
    total_rrhe += cum_sizes_tb_rrhe_mode[-1]
    total_rrmv += cum_sizes_tb_rrmv_mode[-1]
    total_rrnv += cum_sizes_tb_rrnv_mode[-1]
    total_r += cum_sizes_tb_r_mode[-1]
    
    # int() truncates towards zero, so sub-TB contributions are recorded as 0.
    size_rr.append(int(cum_sizes_tb_rr_mode[-1]))
    size_rrhe.append(int(cum_sizes_tb_rrhe_mode[-1]))
    size_rrmv.append(int(cum_sizes_tb_rrmv_mode[-1]))
    size_rrnv.append(int(cum_sizes_tb_rrnv_mode[-1]))
    size_r.append(int(cum_sizes_tb_r_mode[-1]))
    
    # Only draw (and save) a figure for modes that hold any data at all.
    if (cum_sizes_tb_rr_mode[-1] + 
        cum_sizes_tb_rrhe_mode[-1] + 
        cum_sizes_tb_rrmv_mode[-1] + 
        cum_sizes_tb_rrnv_mode[-1] +
        cum_sizes_tb_r_mode[-1]) > 0:
    
        plt.figure(dpi=100)
        plt.plot(np.linspace(0, sr1_right, 100), cum_sizes_tb_rr_mode, label='raw_records')
        plt.plot(np.linspace(0, sr1_right, 100), cum_sizes_tb_rrhe_mode, label='raw_records_he')
        plt.plot(np.linspace(0, sr1_right, 100), cum_sizes_tb_rrmv_mode, label='raw_records_mv')
        plt.plot(np.linspace(0, sr1_right, 100), cum_sizes_tb_rrnv_mode, label='raw_records_nv')
        plt.plot(np.linspace(0, sr1_right, 100), cum_sizes_tb_r_mode, label='records')
        plt.xlabel('RunID')
        plt.title('Mode %s'%(m))
        plt.ylabel('Size [TB]')
        plt.axvspan(sr0_left, sr0_right, alpha=0.3, color='k', label='SR0')
        plt.axvspan(sr1_left, sr1_right, alpha=0.3, color='r', label='SR1')
        #plt.yscale('log')
        plt.gca().set_ylim(bottom=0)
        plt.legend()
        # Saved before plt.show() so the figure is still current when written.
        plt.savefig('osg_straxdata_mode_%s'%(m))
        plt.show()

# Convert the per-mode lists to arrays so they can be argsorted later.
size_rr = np.array(size_rr)  
size_rrhe = np.array(size_rrhe)  
size_rrmv = np.array(size_rrmv)  
size_rrnv = np.array(size_rrnv)  
size_r = np.array(size_r)  

## Size by Run Mode

In [None]:
# For each data type, list the run modes holding more than 1 TB in ascending
# order of size. The size arrays are aligned with modes_list element by element.
modes_list = np.array(modes_list)
for data_type_label, mode_sizes_tb in (('raw_records', size_rr),
                                       ('raw_records_he', size_rrhe),
                                       ('records', size_r),
                                       ('raw_records_nv', size_rrnv),
                                       ('raw_records_mv', size_rrmv)):
    print('\n %s' % data_type_label)
    ascending = mode_sizes_tb.argsort()
    for mode_name, mode_size_tb in zip(modes_list[ascending], mode_sizes_tb[ascending]):
        if mode_size_tb > 1:
            print(mode_name, mode_size_tb, 'TB')

## Size by Run

In [None]:
# Total size [TB] per period. The periods partition the run IDs: SR0 and SR1
# include their boundary runs, so a run equal to sr0_left, sr0_right or
# sr1_left is counted in exactly one bucket (with strict inequalities on both
# sides such runs were dropped from every bucket).
run_numbers = rules_info['runid'].astype(int)

presr0 = rules_info[run_numbers<sr0_left]['size_gb'].sum()/1024
print('Before SR0:', int(presr0), 'TB')
sr0 = rules_info[(run_numbers>=sr0_left)&(run_numbers<=sr0_right)]['size_gb'].sum()/1024
print('During SR0:', int(sr0), 'TB')
btwn = rules_info[(run_numbers>sr0_right)&(run_numbers<sr1_left)]['size_gb'].sum()/1024
print('Between SR0 and SR1:', int(btwn), 'TB')
# Everything from the first SR1 run onwards (no upper bound is applied).
sr1 = rules_info[run_numbers>=sr1_left]['size_gb'].sum()/1024
print('Since start of SR1:', int(sr1), 'TB')

# To delete

## Pre-SR0 non RAD

In [None]:
pre_sr0_rr_no_rad = np.load('delete_dcache_20230628/pre_sr0_rr_no_rad.npy', allow_pickle=True)

In [None]:
# Size [TB] of the pre-SR0 non-RAD selection loaded in the previous cell.
# (This cell used to read ybe_rr_nv, which is only defined further down, so it
# failed on a fresh kernel and reported the wrong selection otherwise.)
pre_sr0_rr_no_rad['size_gb'].sum()/1024

## Bad Quality

In [None]:
low_quality_tagged_rr = np.load('delete_dcache_20230628/low_quality_tagged_raw_records.npy', allow_pickle=True)

In [None]:
low_quality_tagged_rr['size_gb'].sum()/1024

## SR0 NV

In [None]:
sr0_rr_nv = rules_info[((rules_info['data_type']=="raw_records_nv")&
                        (rules_info['runid'].astype(np.int32)<=sr0_right))]

In [None]:
np.save('delete_dcache_20230628/sr0_rr_nv.npy', sr0_rr_nv)

In [None]:
sr0_rr_nv = np.load('delete_dcache_20230628/sr0_rr_nv.npy', allow_pickle=True)

In [None]:
sr0_rr_nv['size_gb'].sum()/1024

## SR0 MV

In [None]:
sr0_rr_mv = rules_info[((rules_info['data_type']=="raw_records_mv")&
                        (rules_info['runid'].astype(np.int32)<=sr0_right))]

In [None]:
np.save('delete_dcache_20230628/sr0_rr_mv.npy', sr0_rr_mv)

In [None]:
sr0_rr_mv = np.load('delete_dcache_20230628/sr0_rr_mv.npy', allow_pickle=True)

In [None]:
sr0_rr_mv['size_gb'].sum()/1024

## YBe NV

In [None]:
ybe_rr_nv = np.load('delete_dcache_20230628/ybe_rr_nv.npy', allow_pickle=True)

In [None]:
ybe_rr_nv['size_gb'].sum()/1024

## Overall

In [None]:
overall = np.concatenate((pre_sr0_rr_no_rad, low_quality_tagged_rr, sr0_rr_nv, sr0_rr_mv, ybe_rr_nv))

In [None]:
overall = np.unique(overall)

In [None]:
np.sum(overall['size_gb'])/1024

In [None]:
np.save('delete_dcache_20230628/osg_20230628.npy', overall)

# v11

In [None]:
import pickle
import admix
# Load the global_v11 reprocessing run lists. The loaded object is used below
# as jingqiang['runlists'][<run type>] -> iterable of run numbers.
# NOTE: pickle.load can execute arbitrary code while unpickling; this file is
# assumed to be a trusted, project-internal artifact.
with open('/project2/lgrandi/xenonnt/reprocessing_runlist/global_v11/runlists_reprocessing_global_v11.pickle', 'rb') as f:
    jingqiang = pickle.load(f)


In [None]:
jingqiang['runlists'].keys()

In [None]:
# Reload the complete (unfiltered) rucio rule dump from its 20 chunk files.
# The previous loop concatenated `rules_info` (already filtered to
# UC_OSG_USERDISK) with each new chunk, so all_rules ended up as the filtered
# rules plus the last chunk only; here every chunk is collected and joined once.
# NOTE: allow_pickle=True can execute arbitrary code while loading; these
# files are assumed to be trusted, project-internal dumps.
all_rules_chunks = [
    np.load('/project2/lgrandi/yuanlq/shared/dali_cleanup/rucio_20230628_all_rules%s.npy'%(i),
            allow_pickle=True)
    for i in tqdm(range(20))
]
all_rules = np.concatenate(all_rules_chunks)

In [None]:
from utilix import xent_collection
rundb = xent_collection()

In [None]:
all_rules = all_rules[(all_rules['data_type']=='raw_records')]

In [None]:
# Estimate the raw_records volume of every v11 run list.
# Per run the size is taken from the rucio rule dump when available, otherwise
# from the run database; runs found in neither are counted as missing.

# Index the rucio rules by run ID once (first rule per run wins, matching the
# former `all_rules[all_rules['runid']==runid][0]` lookup) instead of scanning
# the whole array for every run.
size_gb_by_runid = {}
for rule_runid, rule_size_gb in zip(all_rules['runid'].tolist(), all_rules['size_gb']):
    size_gb_by_runid.setdefault(rule_runid, rule_size_gb)

sizes_gb = []       # summed size [GB] per run list
missing_runs = []   # number of runs without any size information per run list
total_runs = []     # number of runs per run list
for runtype in jingqiang['runlists'].keys():
    missing_run = 0
    print(runtype)
    runlist = jingqiang['runlists'][runtype]
    total_runs.append(len(runlist))
    size_gb = 0
    for run in tqdm(runlist):
        runid = str(run).zfill(6)
        if runid in size_gb_by_runid:
            size_gb += size_gb_by_runid[runid]
            continue

        # Fallback: first raw_records entry with a usable size in the run DB.
        doc = rundb.find_one({"number": int(run)})
        size_mb = None
        for d in (doc or {}).get('data') or ():
            if d.get('type') == 'raw_records' and d.get('size_mb') is not None:
                size_mb = d['size_mb']
                break
        if size_mb is None:
            missing_run += 1
        else:
            # The run DB field is named size_mb while the accumulator is in GB;
            # 1024 matches the GB -> TB factor used throughout this notebook
            # (TODO confirm the run DB does not use decimal units).
            size_gb += size_mb / 1024
    missing_runs.append(missing_run)
    sizes_gb.append(size_gb)


In [None]:
sizes_gb = np.array(sizes_gb)
missing_runs = np.array(missing_runs)
total_runs = np.array(total_runs)

In [None]:
estimated_size_tb = sizes_gb/1024* total_runs/(total_runs-missing_runs)

In [None]:
estimated_size_tb.sum()

In [None]:
3/(0.75*(89+32)/500)