In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import utilix
import numpy as np
from tqdm import tqdm
import os
path_start = '/dali/lgrandi/rucio/xnt_'

In [None]:
# Run-number boundaries of the two science runs. They are used further down
# to select runs and to shade the SR0 / SR1 spans on the plots.
sr0_left, sr0_right = 17918, 34731
sr1_left, sr1_right = 43039, 53000

In [None]:
def find_with_mode(rules, runs, mode, runid_max=50126):
    """Return the rules whose run has the given mode.

    Parameters
    ----------
    rules : structured array with a 'runid' field convertible to int.
    runs : structured array indexed by run number, with a 'mode' field.
    mode : value compared (with ==) against runs['mode'].
    runid_max : rules with runid >= runid_max are never selected
        (default 50126, the cut-off that used to be hard-coded here).

    Returns
    -------
    The subset of `rules` that matched. The number of matches is printed.
    """
    runids = np.asarray(rules['runid']).astype(int)
    # np.bool was removed in NumPy 1.24; the builtin bool is the same dtype.
    is_mode = np.zeros(len(rules), dtype=bool)
    in_range = runids < runid_max
    # Look up the mode of every in-range run in one indexing operation.
    is_mode[in_range] = runs['mode'][runids[in_range]] == mode
    print(np.sum(is_mode))
    return rules[is_mode]

def size_vs_runs(rules, runid_min=0, runid_max=50200, nbins=100):
    """Cumulative size in TB of `rules` as a function of run number.

    The range [runid_min, runid_max] is split into `nbins` equal-width bins.
    Each rule's size_gb / 1024 is added to the bin containing its runid, and
    the running total over the bins is returned.

    Bins are half-open [lo, hi), except the last one which also includes
    runid_max, so a run sitting exactly on an interior bin edge is counted
    once. The previous inclusive-on-both-sides selection counted it twice.
    Rules outside [runid_min, runid_max] are ignored.

    Returns
    -------
    numpy array of length `nbins`; entry b is the total size of all rules up
    to the upper edge of bin b.
    """
    runids = np.asarray(rules['runid']).astype(np.int32)
    sizes_tb_per_rule = np.asarray(rules['size_gb']).astype(float) / 1024
    bins_bound = np.linspace(runid_min, runid_max, nbins + 1)
    sizes_tb, _ = np.histogram(runids, bins=bins_bound, weights=sizes_tb_per_rule)
    return np.cumsum(sizes_tb)

def find_with_tags(rules, runs, tags):
    """Return the rules whose run carries at least one of the given tags.

    Parameters
    ----------
    rules : structured array with a 'runid' field convertible to int.
    runs : structured array indexed by run number, with one truthy/falsy
        field per tag name.
    tags : iterable of field names of `runs` to test.

    Returns
    -------
    The subset of `rules` that matched. The number of matches is printed.
    """
    runids = np.asarray(rules['runid']).astype(int)
    # np.bool was removed in NumPy 1.24; the builtin bool is the same dtype.
    tagged = np.zeros(len(rules), dtype=bool)
    for t in tags:
        tagged |= runs[t][runids].astype(bool)
    print(np.sum(tagged))
    return rules[tagged]

def filter_out_rad(rules, runs):
    """Return the rules whose run is not flagged 'RAD_commissioning'.

    Parameters
    ----------
    rules : structured array with a 'runid' field convertible to int.
    runs : structured array indexed by run number, with a
        'RAD_commissioning' field.
    """
    runids = np.asarray(rules['runid']).astype(int)
    # The builtin bool replaces np.bool, which was removed in NumPy 1.24.
    is_rad = runs['RAD_commissioning'][runids].astype(bool)
    return rules[~is_rad]

In [None]:
import pymongo
from utilix import xent_collection
coll = xent_collection()

# Tags that get their own boolean column in `runs`.
tag_names = ('bad', 'messy', 'hot_spot', 'ramp_down', 'ramp_up', 'pmt_trip',
             'rn220_fast_alphas', 'after_rn220', 'abandon', 'RAD_commissioning')

# The builtin bool replaces np.bool, which was removed in NumPy 1.24.
runs_dtype = np.dtype([('number', np.int32), ('mode', 'O')]
                      + [(name, bool) for name in tag_names])

# One row per run number in [0, sr1_right); row i describes run i.
runs = np.zeros(sr1_right, dtype=runs_dtype)
runs['number'] = np.arange(sr1_right)

# Fetch all needed documents with one query instead of one find_one per run,
# and only transfer the fields that are read below.
# NOTE(review): assumes xent_collection() returns a pymongo Collection -- confirm.
cursor = coll.find({'number': {'$gte': 0, '$lt': sr1_right}},
                   projection={'number': 1, 'mode': 1, 'tags.name': 1})

# Run numbers with no document keep their zero-initialised row.
for doc in tqdm(cursor, total=sr1_right):
    i = doc['number']
    runs[i]['mode'] = doc['mode']
    # A missing or empty 'tags' entry simply means no tag columns are set.
    for t in doc.get('tags') or ():
        name = t.get('name')
        if name in tag_names:
            runs[i][name] = True

In [None]:
# Load the 20 rule dumps and join them with a single concatenate. Growing the
# array inside the loop re-copies everything loaded so far on each pass.
# NOTE(review): allow_pickle=True un-pickles arbitrary objects, so only load
# these files from a trusted location.
rules_info = np.concatenate([
    np.load('/project2/lgrandi/yuanlq/shared/dali_cleanup/rucio_20230628_all_rules%s.npy'%(i),
            allow_pickle=True)
    for i in range(20)
])

In [None]:
rules_info = rules_info[rules_info['UC_DALI_USERDISK']]

In [None]:
rules_info = rules_info[rules_info['runid'].astype(int)<sr1_right]
rules_info_norad = filter_out_rad(rules=rules_info, runs=runs)

In [None]:
size_all = rules_info['size_gb'].sum()
print('All rucio data on dali: %sTB'%(int(size_all/1024)))

In [None]:
size_peaklets = rules_info['size_gb'][rules_info['data_type']=='peaklets'].sum()
print('All peaklets on dali: %sTB'%(int(size_peaklets/1024)))

In [None]:
size_hitlets_nv = rules_info['size_gb'][rules_info['data_type']=='hitlets_nv'].sum()
print('All hitlets_nv on dali: %sTB'%(int(size_hitlets_nv/1024)))

In [None]:
size_merged_s2s = rules_info['size_gb'][rules_info['data_type']=='merged_s2s'].sum()
print('All merged_s2s on dali: %sTB'%(int(size_merged_s2s/1024)))

In [None]:
# Total size per data type, listed in ascending order of size. Only data
# types that reach at least 1 TB after truncation are printed.
all_dtypes = np.unique(rules_info['data_type'])
names = np.array([dt for dt in all_dtypes])
sizes_gb = np.array([rules_info[rules_info['data_type']==dt]['size_gb'].sum()
                     for dt in all_dtypes])
argsort = sizes_gb.argsort()
names_sort = names[argsort]
sizes_gb_sort = sizes_gb[argsort]
for name, size_gb in zip(names_sort, sizes_gb_sort):
    size_tb = int(size_gb/1024)
    if size_tb >= 1:
        print(name, size_tb, 'TB')

In [None]:
# Cumulative size versus run number for the three data types plotted below,
# computed on the rules without RAD_commissioning runs.
norad_dtype = rules_info_norad['data_type']
cum_sizes_tb_pkl = size_vs_runs(rules_info_norad[norad_dtype=='peaklets'])
cum_sizes_tb_hnv = size_vs_runs(rules_info_norad[norad_dtype=='hitlets_nv'])
cum_sizes_tb_ms2 = size_vs_runs(rules_info_norad[norad_dtype=='merged_s2s'])

In [None]:
# size_vs_runs was called with its defaults (runs 0..50200, 100 bins) and
# returns one cumulative value per bin: the total up to that bin's upper
# edge. Plot against those upper edges rather than 100 points spread over
# 0..50200, which shifts the curves relative to the run axis.
run_edges = np.linspace(0, 50200, 100 + 1)[1:]

plt.figure(dpi=100)
for cum_sizes_tb, label in [(cum_sizes_tb_pkl, 'peaklets'),
                            (cum_sizes_tb_hnv, 'hitlets_nv'),
                            (cum_sizes_tb_ms2, 'merged_s2s')]:
    plt.plot(run_edges, cum_sizes_tb, label=label)

plt.xlabel('RunID')
plt.title('DaLI Top 3 Datatype')
plt.ylabel('Size [TB]')
plt.axvspan(sr0_left, sr0_right, alpha=0.3, color='k', label='SR0')
plt.axvspan(sr1_left, sr1_right, alpha=0.3, color='r', label='SR1')
plt.legend()
plt.show()

In [None]:
import cutax
st = cutax.contexts.xenonnt_online()
all_runs = st.select_runs()
modes = set(all_runs["mode"])
# Set iteration order can change between kernel sessions. Sort so that the
# per-mode plots and printouts below come out in the same order on every
# run. key=str keeps the sort well-defined if a non-string entry shows up.
modes_list = sorted(modes, key=str)

In [None]:
total_pkl = 0

size_pkl = []

for m in modes_list:
    rules_info_mode = find_with_mode(rules=rules_info_norad, runs=runs, mode=m)
    cum_sizes_tb_pkl_mode = size_vs_runs(rules_info_mode[(rules_info_mode['data_type']=='peaklets')])
        
    total_pkl += cum_sizes_tb_pkl_mode[-1]
    
    size_pkl.append(int(cum_sizes_tb_pkl_mode[-1]))
    
    if cum_sizes_tb_pkl_mode[-1] > 0.2: 
    
        plt.figure(dpi=100)
        plt.plot(np.linspace(0, sr1_right, 100), cum_sizes_tb_pkl_mode, label='peaklets')
        plt.xlabel('RunID')
        plt.title('Mode %s'%(m))
        plt.ylabel('Size [TB]')
        plt.axvspan(sr0_left, sr0_right, alpha=0.3, color='k', label='SR0')
        plt.axvspan(sr1_left, sr1_right, alpha=0.3, color='r', label='SR1')
        #plt.yscale('log')
        #plt.gca().set_ylim(bottom=1)
        plt.legend()
        plt.savefig('osg_straxdata_mode_%s'%(m))
        plt.show()

size_pkl = np.array(size_pkl)   

modes_list = np.array(modes_list)

for i in range(len(modes_list)):
    if size_pkl[size_pkl.argsort()][i] > 0.2:
        print(modes_list[size_pkl.argsort()][i], size_pkl[size_pkl.argsort()][i], 'TB')

In [None]:
total_ms2 = 0

size_ms2 = []

for m in modes_list:
    rules_info_mode = find_with_mode(rules=rules_info_norad, runs=runs, mode=m)
    cum_sizes_tb_ms2_mode = size_vs_runs(rules_info_mode[(rules_info_mode['data_type']=='merged_s2s')])
        
    total_ms2 += cum_sizes_tb_ms2_mode[-1]
    
    size_ms2.append(int(cum_sizes_tb_ms2_mode[-1]))
    
    if cum_sizes_tb_ms2_mode[-1] > 0.2: 
    
        plt.figure(dpi=100)
        plt.plot(np.linspace(0, 50200, 100), cum_sizes_tb_ms2_mode, label='merged_s2s')
        plt.xlabel('RunID')
        plt.title('Mode %s'%(m))
        plt.ylabel('Size [TB]')
        plt.axvspan(sr0_left, sr0_right, alpha=0.3, color='k', label='SR0')
        plt.axvspan(sr1_left, sr1_right, alpha=0.3, color='r', label='SR1')
        #plt.yscale('log')
        #plt.gca().set_ylim(bottom=1)
        plt.legend()
        plt.savefig('osg_straxdata_mode_%s'%(m))
        plt.show()

size_ms2 = np.array(size_ms2)   

modes_list = np.array(modes_list)

for i in range(len(modes_list)):
    if size_ms2[size_ms2.argsort()][i] > 0.2:
        print(modes_list[size_ms2.argsort()][i], size_ms2[size_ms2.argsort()][i], 'TB')

In [None]:
total_rrnv = 0

size_rrnv = []

for m in modes_list:
    rules_info_mode = find_with_mode(rules=rules_info_norad, runs=runs, mode=m)
    cum_sizes_tb_rrnv_mode = size_vs_runs(rules_info_mode[(rules_info_mode['data_type']=='hitlets_nv')])
        
    total_rrnv += cum_sizes_tb_rrnv_mode[-1]
    
    size_rrnv.append(int(cum_sizes_tb_rrnv_mode[-1]))
    
    if cum_sizes_tb_rrnv_mode[-1] > 0.2: 
    
        plt.figure(dpi=100)
        plt.plot(np.linspace(0, 50200, 100), cum_sizes_tb_rrnv_mode, label='hitlets_nv')
        plt.xlabel('RunID')
        plt.title('Mode %s'%(m))
        plt.ylabel('Size [TB]')
        plt.axvspan(sr0_left, sr0_right, alpha=0.3, color='k', label='SR0')
        plt.axvspan(sr1_left, sr1_right, alpha=0.3, color='r', label='SR1')
        #plt.yscale('log')
        #plt.gca().set_ylim(bottom=1)
        plt.legend()
        plt.savefig('osg_straxdata_mode_%s'%(m))
        plt.show()

size_rrnv = np.array(size_rrnv)   

modes_list = np.array(modes_list)

for i in range(len(modes_list)):
    if size_rrnv[size_rrnv.argsort()][i] > 0.2:
        print(modes_list[size_rrnv.argsort()][i], size_rrnv[size_rrnv.argsort()][i], 'TB')

## Low quality

In [None]:
# Rules of runs tagged 'bad', 'messy' or 'abandon', restricted to the
# peaklets / merged_s2s / hitlets_nv data types. The last line displays the
# summed size_gb of the selection.
# NOTE(review): this selects from rules_info, not rules_info_norad, so
# RAD_commissioning runs are not excluded here -- confirm that is intended.
rules_info_lowq = find_with_tags(rules=rules_info, runs=runs, tags=['bad', 'messy', 'abandon'])
rules_info_lowq = rules_info_lowq[(rules_info_lowq['data_type']=='peaklets')|
                                    (rules_info_lowq['data_type']=='merged_s2s')|
                                    (rules_info_lowq['data_type']=='hitlets_nv')]
np.sum(rules_info_lowq['size_gb'])

In [None]:
# The path has no extension; np.save appends '.npy', giving peaks_lowq.npy.
np.save('/home/yuanlq/software/xeda/msc/dali/delete_dali_20230628/peaks_lowq', rules_info_lowq)

In [None]:
# NOTE(review): same expression as the earlier cum_sizes_tb_pkl assignment;
# the result is not used again in the cells visible here.
cum_sizes_tb_pkl = size_vs_runs(rules_info_norad[rules_info_norad['data_type']=='peaklets'])

## SR1 Kr83m

In [None]:
# Rules (RAD_commissioning runs already removed) whose run mode is
# 'tpc_kr83m'. find_with_mode prints the number of matching rules.
sr1_kr83m = find_with_mode(rules=rules_info_norad, runs=runs, mode='tpc_kr83m')

In [None]:
# Keep only runs after sr1_left.
# NOTE(review): the strict '>' drops run sr1_left itself -- confirm intended.
sr1_kr83m = sr1_kr83m[sr1_kr83m['runid'].astype(int)>sr1_left]

In [None]:
# Restrict to the peaklets and merged_s2s data types.
sr1_kr83m = sr1_kr83m[(sr1_kr83m['data_type']=='peaklets')|
                        (sr1_kr83m['data_type']=='merged_s2s')]

In [None]:
# Summed size_gb of the selection.
np.sum(sr1_kr83m['size_gb'])

In [None]:
# Save the selection; it is loaded again in the "To delete" section.
np.save('/home/yuanlq/software/xeda/msc/dali/delete_dali_20230628/peaks_sr1_kr83m.npy', sr1_kr83m)

## SR1 Rn220

In [None]:
# Rules of 'tpc_radon' runs after sr1_left, peaklets and merged_s2s only.
# NOTE(review): the strict '>' drops run sr1_left itself -- confirm intended.
sr1_radon = find_with_mode(rules=rules_info_norad, runs=runs, mode='tpc_radon')
sr1_radon = sr1_radon[sr1_radon['runid'].astype(int)>sr1_left]
sr1_radon = sr1_radon[(sr1_radon['data_type']=='peaklets')|
                        (sr1_radon['data_type']=='merged_s2s')]

In [None]:
# Summed size_gb of the 'tpc_radon' selection.
np.sum(sr1_radon['size_gb'])

In [None]:
# Same selection for the 'tpc_radon_hev' mode.
sr1_radon_hev = find_with_mode(rules=rules_info_norad, runs=runs, mode='tpc_radon_hev')
sr1_radon_hev = sr1_radon_hev[sr1_radon_hev['runid'].astype(int)>sr1_left]
sr1_radon_hev = sr1_radon_hev[(sr1_radon_hev['data_type']=='peaklets')|
                        (sr1_radon_hev['data_type']=='merged_s2s')]

In [None]:
# Summed size_gb of the 'tpc_radon_hev' selection.
np.sum(sr1_radon_hev['size_gb'])

In [None]:
# Both radon modes are combined into one array and saved; the file is loaded
# again in the "To delete" section.
sr1_rn220 = np.concatenate((sr1_radon_hev, sr1_radon))
np.save('/home/yuanlq/software/xeda/msc/dali/delete_dali_20230628/peaks_sr1_rn220.npy', sr1_rn220)

In [None]:
# Summed size_gb of the combined selection.
np.sum(sr1_rn220['size_gb'])

## SR1 YBe

In [None]:
# Rules of 'ybe_tpc' runs after sr1_left, restricted to peaklets, merged_s2s
# and hitlets_nv. The last line displays the summed size_gb.
# NOTE(review): the strict '>' drops run sr1_left itself -- confirm intended.
sr1_tpc_ybe = find_with_mode(rules=rules_info_norad, runs=runs, mode='ybe_tpc')
sr1_tpc_ybe = sr1_tpc_ybe[sr1_tpc_ybe['runid'].astype(int)>sr1_left]
sr1_tpc_ybe = sr1_tpc_ybe[(sr1_tpc_ybe['data_type']=='peaklets')|
                        (sr1_tpc_ybe['data_type']=='merged_s2s')|
                        (sr1_tpc_ybe['data_type']=='hitlets_nv')]
np.sum(sr1_tpc_ybe['size_gb'])

In [None]:
# Same selection for the 'ybe_linked' mode.
sr1_ybe_linked = find_with_mode(rules=rules_info_norad, runs=runs, mode='ybe_linked')
sr1_ybe_linked = sr1_ybe_linked[sr1_ybe_linked['runid'].astype(int)>sr1_left]
sr1_ybe_linked = sr1_ybe_linked[(sr1_ybe_linked['data_type']=='peaklets')|
                        (sr1_ybe_linked['data_type']=='merged_s2s')|
                        (sr1_ybe_linked['data_type']=='hitlets_nv')]
np.sum(sr1_ybe_linked['size_gb'])

In [None]:
# Both YBe modes are combined into one array and saved; the file is loaded
# again in the "To delete" section.
sr1_ybe = np.concatenate((sr1_ybe_linked, sr1_tpc_ybe))
np.save('/home/yuanlq/software/xeda/msc/dali/delete_dali_20230628/peaks_sr1_ybe.npy', sr1_ybe)

# To delete

In [None]:
# Reload the four selections saved by the sections above, so this part of
# the notebook can be run without recomputing them.
# NOTE(review): allow_pickle=True un-pickles arbitrary objects, so only load
# these files from a trusted location.
rules_info_lowq = np.load('/home/yuanlq/software/xeda/msc/dali/delete_dali_20230628/peaks_lowq.npy', allow_pickle=True)
sr1_ybe = np.load('/home/yuanlq/software/xeda/msc/dali/delete_dali_20230628/peaks_sr1_ybe.npy', allow_pickle=True)
sr1_rn220 = np.load('/home/yuanlq/software/xeda/msc/dali/delete_dali_20230628/peaks_sr1_rn220.npy', allow_pickle=True)
sr1_kr83m = np.load('/home/yuanlq/software/xeda/msc/dali/delete_dali_20230628/peaks_sr1_kr83m.npy', allow_pickle=True)

In [None]:
overall = np.concatenate((rules_info_lowq, sr1_ybe, sr1_rn220, sr1_kr83m))

In [None]:
overall['size_gb'].sum()/1024

In [None]:
np.save('/home/yuanlq/software/xeda/msc/dali/delete_dali_20230628/dali_20230628.npy', overall)

In [None]:
overall = np.unique(overall)

In [None]:
len(np.unique(overall['runid']))

In [None]:
len(np.unique(sr1_ybe['runid']))

In [None]:
len(np.unique(sr1_rn220['runid']))

In [None]:
len(np.unique(sr1_kr83m['runid']))

In [None]:
# The original indexed `overall` with its own run ids
# (overall[overall['runid']]), which treats run ids as row positions.
# NOTE(review): replaced by a plain distinct-run count; if a filtered count
# was intended, put the boolean mask here instead.
len(np.unique(overall['runid']))

In [None]:
# Distribution over run number of the distinct runs in the combined selection.
fig, ax = plt.subplots()
unique_runids = np.unique(overall['runid']).astype(int)
ax.hist(unique_runids, bins=100)
ax.set_xlabel('runid')
ax.set_ylabel('counts')