The purpose of this notebook is to track the progress of reprocessing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import utilix
import numpy as np
from tqdm import tqdm
import os
path_start = '/dali/lgrandi/rucio/xnt_'

In [17]:
def find_with_mode(rules, runs, mode):
    is_mode = np.zeros(len(rules), np.bool)
    for i,r in enumerate(rules):
        if int(r['runid'])<50126:
            if runs[int(r['runid'])]['mode'] == mode:
                is_mode[i] = True
    print(np.sum(is_mode))
    return rules[is_mode]

def size_vs_runs(rules, runid_min=0, runid_max=50200, nbins=100):
    runids = rules['runid'].astype(np.int32)
    bins_bound = np.linspace(runid_min, runid_max, nbins+1)
    sizes_tb = np.zeros(nbins)
    for b in range(nbins):
        selected_mask = runids>=bins_bound[b]
        selected_mask &= runids<=bins_bound[b+1]
        rules_selected = rules[selected_mask]
        if len(rules_selected):
            sizes_tb[b] = np.sum(rules_selected['size_gb']/1024)
    
    cum_sizes_tb = np.cumsum(sizes_tb)
    return cum_sizes_tb

def find_with_tags(rules, runs, tags):
    tagged = np.zeros(len(rules), np.bool)
    for i,r in enumerate(rules):
        for t in tags:
            if runs[int(r['runid'])][t]:
                tagged[i] = True
    print(np.sum(tagged))
    return rules[tagged]

def filter_out_rad(rules, runs):
    is_rad = np.zeros(len(rules), np.bool)
    for i,r in enumerate(rules):
        if runs[int(r['runid'])]['RAD_commissioning']:
            is_rad[i] = True
    return rules[~is_rad]

In [18]:
import pymongo
from utilix import xent_collection
coll = xent_collection()

runs_dtype = np.dtype([('number', np.int32), 
                       ('mode', 'O'),
                       ('bad', np.bool),
                       ('messy', np.bool),
                       ('hot_spot', np.bool),
                       ('ramp_down', np.bool),
                       ('ramp_up', np.bool),
                       ('pmt_trip', np.bool),
                       ('rn220_fast_alphas', np.bool),
                       ('after_rn220', np.bool),
                       ('abandon', np.bool),
                       ('RAD_commissioning', np.bool)])
runs = np.zeros(50126, dtype = runs_dtype)

for i in tqdm(range(50126)):
    query = {'number': i}
    doc = coll.find_one(query)
    runs[i]['number'] = i
    runs[i]['mode'] = doc['mode']
    try:
        tags = doc['tags']
        for t in tags:
            if t['name'] == 'bad':
                runs[i]['bad'] = True
            elif t['name'] == 'messy':
                runs[i]['messy'] = True
            elif t['name'] == 'hot_spot':
                runs[i]['hot_spot'] = True
            elif t['name'] == 'ramp_down':
                runs[i]['ramp_down'] = True
            elif t['name'] == 'ramp_up':
                runs[i]['ramp_up'] = True            
            elif t['name'] == 'pmt_trip':
                runs[i]['pmt_trip'] = True
            elif t['name'] == 'rn220_fast_alphas':
                runs[i]['rn220_fast_alphas'] = True
            elif t['name'] == 'after_rn220':
                runs[i]['after_rn220'] = True
            elif t['name'] == 'abandon':
                runs[i]['abandon'] = True
            elif t['name'] == 'RAD_commissioning' or (i in radon_presr0):
                runs[i]['RAD_commissioning'] = True
    except:
        pass

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ('bad', np.bool),
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ('messy', np.bool),
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ('hot_spot', np.bool),
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ('ramp_down', np.bool),
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ('ramp_up', np.bool),
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ('pmt_trip', np.bool),
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ('

In [19]:
for i in range(20):
    if i == 0:
        rules_info = np.load('/project2/lgrandi/yuanlq/shared/dali_cleanup/rucio_20230113_all_rules0.npy', 
                             allow_pickle=True)
    else:
        new = np.load('/project2/lgrandi/yuanlq/shared/dali_cleanup/rucio_20230113_all_rules%s.npy'%(i), 
                      allow_pickle=True)
        rules_info = np.concatenate((rules_info, new))

In [20]:
sr0_right = 34731
sr0_left  = 17918
sr1_left  = 43039
sr1_right = 50200

In [21]:
rules_info = rules_info[rules_info['UC_DALI_USERDISK']]

In [22]:
rules_info = rules_info[rules_info['runid'].astype(int)<50126]
rules_info_norad = filter_out_rad(rules=rules_info, runs=runs)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  is_rad = np.zeros(len(rules), np.bool)


In [8]:
size_all = rules_info['size_gb'].sum()
print('All rucio data on dali: %sTB'%(int(size_all/1024)))

All rucio data on dali: 154TB


In [9]:
size_peaklets = rules_info['size_gb'][rules_info['data_type']=='peaklets'].sum()
print('All peaklets on dali: %sTB'%(int(size_peaklets/1024)))

All peaklets on dali: 78TB


In [10]:
size_hitlets_nv = rules_info['size_gb'][rules_info['data_type']=='hitlets_nv'].sum()
print('All hitlets_nv on dali: %sTB'%(int(size_hitlets_nv/1024)))

All hitlets_nv on dali: 35TB


In [None]:
rules_info['size_gb'].sum() - \
rules_info['size_gb'][rules_info['data_type']=='peaklets'].sum() - \
rules_info['size_gb'][rules_info['data_type']=='hitlets_nv'].sum() - rules_info['size_gb'][rules_info['data_type']=='merged_s2s'].sum()

In [None]:
all_dtypes = np.unique(rules_info['data_type'])
names = []
sizes_gb = []
for dt in all_dtypes:
    selected = rules_info[rules_info['data_type']==dt]
    names.append(dt)
    sizes_gb.append(selected['size_gb'].sum())
names = np.array(names)
sizes_gb = np.array(sizes_gb)
argsort = sizes_gb.argsort()
names_sort = names[argsort]
sizes_gb_sort = sizes_gb[argsort]
for i in range(len(names)):
    if int(sizes_gb_sort[i]/1024) >= 1:
        print(names_sort[i], int(sizes_gb_sort[i]/1024), 'TB')

In [None]:
cum_sizes_tb_pkl = size_vs_runs(rules_info_norad[rules_info_norad['data_type']=='peaklets'])
cum_sizes_tb_hnv = size_vs_runs(rules_info_norad[rules_info_norad['data_type']=='hitlets_nv'])
cum_sizes_tb_ms2 = size_vs_runs(rules_info_norad[rules_info_norad['data_type']=='merged_s2s'])

In [None]:
plt.figure(dpi=100)
plt.plot(np.linspace(0, 50200, 100), cum_sizes_tb_pkl, label='peaklets')
plt.plot(np.linspace(0, 50200, 100), cum_sizes_tb_hnv, label='hitlets_nv')
plt.plot(np.linspace(0, 50200, 100), cum_sizes_tb_ms2, label='merged_s2s')

plt.xlabel('RunID')
plt.title('DaLI Top 3 Datatype')
plt.ylabel('Size [TB]')
plt.axvspan(sr0_left, sr0_right, alpha=0.3, color='k', label='SR0')
plt.axvspan(sr1_left, sr1_right, alpha=0.3, color='r', label='SR1')
#plt.yscale('log')
plt.gca().set_ylim(top=80)
plt.legend()

In [None]:
import cutax
st = cutax.contexts.xenonnt_online()
all_runs = st.select_runs()
modes = set(all_runs["mode"])
modes_list = list(modes)

In [None]:
total_pkl = 0

size_pkl = []

for m in modes_list:
    rules_info_mode = find_with_mode(rules=rules_info_norad, runs=runs, mode=m)
    cum_sizes_tb_pkl_mode = size_vs_runs(rules_info_mode[(rules_info_mode['data_type']=='peaklets')])
        
    total_pkl += cum_sizes_tb_pkl_mode[-1]
    
    size_pkl.append(int(cum_sizes_tb_pkl_mode[-1]))
    
    if cum_sizes_tb_pkl_mode[-1] > 0.2: 
    
        plt.figure(dpi=100)
        plt.plot(np.linspace(0, 50200, 100), cum_sizes_tb_pkl_mode, label='peaklets')
        plt.xlabel('RunID')
        plt.title('Mode %s'%(m))
        plt.ylabel('Size [TB]')
        plt.axvspan(sr0_left, sr0_right, alpha=0.3, color='k', label='SR0')
        plt.axvspan(sr1_left, sr1_right, alpha=0.3, color='r', label='SR1')
        #plt.yscale('log')
        #plt.gca().set_ylim(bottom=1)
        plt.legend()
        plt.savefig('osg_straxdata_mode_%s'%(m))
        plt.show()

size_pkl = np.array(size_pkl)   

modes_list = np.array(modes_list)

for i in range(len(modes_list)):
    if size_pkl[size_pkl.argsort()][i] > 0.2:
        print(modes_list[size_pkl.argsort()][i], size_pkl[size_pkl.argsort()][i], 'TB')

In [None]:
total_ms2 = 0

size_ms2 = []

for m in modes_list:
    rules_info_mode = find_with_mode(rules=rules_info_norad, runs=runs, mode=m)
    cum_sizes_tb_ms2_mode = size_vs_runs(rules_info_mode[(rules_info_mode['data_type']=='merged_s2s')])
        
    total_ms2 += cum_sizes_tb_ms2_mode[-1]
    
    size_ms2.append(int(cum_sizes_tb_ms2_mode[-1]))
    
    if cum_sizes_tb_ms2_mode[-1] > 0.2: 
    
        plt.figure(dpi=100)
        plt.plot(np.linspace(0, 50200, 100), cum_sizes_tb_ms2_mode, label='merged_s2s')
        plt.xlabel('RunID')
        plt.title('Mode %s'%(m))
        plt.ylabel('Size [TB]')
        plt.axvspan(sr0_left, sr0_right, alpha=0.3, color='k', label='SR0')
        plt.axvspan(sr1_left, sr1_right, alpha=0.3, color='r', label='SR1')
        #plt.yscale('log')
        #plt.gca().set_ylim(bottom=1)
        plt.legend()
        plt.savefig('osg_straxdata_mode_%s'%(m))
        plt.show()

size_ms2 = np.array(size_ms2)   

modes_list = np.array(modes_list)

for i in range(len(modes_list)):
    if size_ms2[size_ms2.argsort()][i] > 0.2:
        print(modes_list[size_ms2.argsort()][i], size_ms2[size_ms2.argsort()][i], 'TB')

In [None]:
total_rrnv = 0

size_rrnv = []

for m in modes_list:
    rules_info_mode = find_with_mode(rules=rules_info_norad, runs=runs, mode=m)
    cum_sizes_tb_rrnv_mode = size_vs_runs(rules_info_mode[(rules_info_mode['data_type']=='hitlets_nv')])
        
    total_rrnv += cum_sizes_tb_rrnv_mode[-1]
    
    size_rrnv.append(int(cum_sizes_tb_rrnv_mode[-1]))
    
    if cum_sizes_tb_rrnv_mode[-1] > 0.2: 
    
        plt.figure(dpi=100)
        plt.plot(np.linspace(0, 50200, 100), cum_sizes_tb_rrnv_mode, label='hitlets_nv')
        plt.xlabel('RunID')
        plt.title('Mode %s'%(m))
        plt.ylabel('Size [TB]')
        plt.axvspan(sr0_left, sr0_right, alpha=0.3, color='k', label='SR0')
        plt.axvspan(sr1_left, sr1_right, alpha=0.3, color='r', label='SR1')
        #plt.yscale('log')
        #plt.gca().set_ylim(bottom=1)
        plt.legend()
        plt.savefig('osg_straxdata_mode_%s'%(m))
        plt.show()

size_rrnv = np.array(size_rrnv)   

modes_list = np.array(modes_list)

for i in range(len(modes_list)):
    if size_rrnv[size_rrnv.argsort()][i] > 0.2:
        print(modes_list[size_rrnv.argsort()][i], size_rrnv[size_rrnv.argsort()][i], 'TB')

In [None]:
rules_info_lowq = find_with_tags(rules=rules_info, runs=runs, tags=['bad', 'messy', 'abandon',
                                                                    'ramp_down', 'ramp_up',
                                                                    'pmt_trip', 'rn220_fast_alphas',
                                                                    'after_rn220', 'abandon'])

In [None]:
np.sum(rules_info_lowq['size_gb'])

In [None]:
cum_sizes_tb_pkl = size_vs_runs(rules_info_norad[rules_info_norad['data_type']=='peaklets'])

In [None]:
#np.save('delete_dali_20230216/low_quality_data.npy', rules_info_lowq)

# Find Duplicate Outdated 

In [11]:
len(rules_info)

805014

In [None]:
import utilix
import pandas as pd

pd.DataFrame(
    utilix.rundb.xent_collection(collection='contexts').find(
    {'hashes.peaklets' :'wfhfmcbq5z'})
)

Logic:
- Loop over datatype for a certain run:
    - check if this guy has an older version.

In [23]:
peaklets = rules_info[rules_info['data_type']=='peaklets']
hitlets_nv = rules_info[rules_info['data_type']=='hitlets_nv']
merged_s2s = rules_info[rules_info['data_type']=='merged_s2s']
lone_hits = rules_info[rules_info['data_type']=='lone_hits']
peak_basics = rules_info[rules_info['data_type']=='peak_basics']

In [28]:
type(pd.DataFrame(
    utilix.rundb.xent_collection(collection='contexts').find(
    {'hashes.peaklets' :'wfhfmcbq5z'})
).straxen_version)

pandas.core.series.Series

In [29]:
from tqdm import tqdm
def determine_context(rules_info, dtype='peaklets'):
    contexts = []
    for rule in tqdm(rules_info):
        p = pd.DataFrame(
            utilix.rundb.xent_collection(collection='contexts').find(
            {'hashes.%s'%(dtype) :rule['hash']})
            )
        try:
            names = p.straxen_version
            contexts.append(names.max())
        except:
            contexts.append('0.0.0')
    return contexts

In [None]:
pd.DataFrame(
            utilix.rundb.xent_collection(collection='contexts').find(
            {'hashes.%s'%('merged_s2s') :'f3qbwvvh2w'})
            )

In [None]:
merged_s2s['hash']

In [15]:
peaklets_contexts = determine_context(rules_info=peaklets, dtype='peaklets')

100%|██████████| 25614/25614 [02:54<00:00, 146.63it/s]


In [16]:
lone_hits_contexts = determine_context(rules_info=lone_hits, dtype='lone_hits')

100%|██████████| 22571/22571 [02:37<00:00, 143.74it/s]


In [17]:
peak_basics_contexts = determine_context(rules_info=peak_basics, dtype='peak_basics')

100%|██████████| 28395/28395 [03:35<00:00, 131.95it/s]


In [18]:
merged_s2s_contexts = determine_context(rules_info=merged_s2s, dtype='merged_s2s')

100%|██████████| 17205/17205 [02:21<00:00, 121.98it/s]


In [19]:
hitlets_nv_contexts = determine_context(rules_info=hitlets_nv, dtype='hitlets_nv')

100%|██████████| 25544/25544 [03:01<00:00, 140.48it/s]


For peaklets, if there is a lower version, delete that one.

In [None]:
peaklets['runid']

In [None]:
peaklets_redundant_mask = np.zeros(len(peaklets), dtype=np.bool)
peaklets_to_delete = []
peaklets_contexts = np.array(peaklets_contexts)
for i,rule in tqdm(enumerate(peaklets)):
    same_run_mask = peaklets['runid']==rule['runid']
    contexts = peaklets_contexts[same_run_mask]
    if peaklets_contexts[i] != np.max(pd.Series(contexts)):
        peaklets_to_delete.append(rule)
peaklets_to_delete = np.array(peaklets_to_delete)

In [None]:
lone_hits_redundant_mask = np.zeros(len(lone_hits), dtype=np.bool)
lone_hits_to_delete = []
lone_hits_contexts = np.array(lone_hits_contexts)
for i,rule in tqdm(enumerate(lone_hits)):
    same_run_mask = lone_hits['runid']==rule['runid']
    contexts = lone_hits_contexts[same_run_mask]
    if lone_hits_contexts[i] != np.max(pd.Series(contexts)):
        lone_hits_to_delete.append(rule)
lone_hits_to_delete = np.array(lone_hits_to_delete)

In [None]:
len(peaklets_to_delete)

In [None]:
len(peaklets)

In [None]:
peaklets['size_gb'].sum()

In [None]:
peaklets_to_delete['size_gb'].sum()

In [None]:
np.save('delete_dali_20230216/duplicated_peaklets.npy', peaklets_to_delete)

In [None]:
np.save('delete_dali_20230216/duplicated_lone_hits.npy', lone_hits_to_delete)

In [None]:
plt.hist(peaklets['runid'].astype(int), bins=np.linspace(12000,50000,100), label='All rules')
plt.hist(peaklets_to_delete['runid'].astype(int), bins=np.linspace(12000,50000,100), label='Outdated rules')
plt.xlabel('RunID')
plt.legend()
plt.title('peaklets on DaLI')
plt.ylabel('Counts')

In [None]:
plt.hist(lone_hits['runid'].astype(int), bins=np.linspace(12000,50000,100), label='All rules')
plt.hist(lone_hits_to_delete['runid'].astype(int), bins=np.linspace(12000,50000,100), label='Outdated rules')
plt.xlabel('RunID')
plt.legend()
plt.title('lone_hits on DaLI')
plt.ylabel('Counts')

In [None]:
lone_hits_to_delete['size_gb'].sum()

In [None]:
merged_s2s_to_delete = merged_s2s[merged_s2s['hash']=='xlkyjhsuda']

In [None]:
np.sum(merged_s2s_to_delete['size_gb'])

In [None]:
np.save('delete_dali_20230216/duplicated_merged_s2s.npy', merged_s2s_to_delete)

In [None]:
#v3 and v7
hitlets_nv_to_delete = hitlets_nv[(hitlets_nv['hash']=='jg5y7hv3iq')|(hitlets_nv['hash']=='w2vwlxy2wh')]

In [None]:
np.sum(hitlets_nv_to_delete['size_gb'])

In [None]:
np.save('delete_dali_20230216/duplicated_hitlets_nv.npy', hitlets_nv_to_delete)

In [45]:
peaklets_to_delete = np.load('delete_dali_20230216/duplicated_peaklets.npy', allow_pickle=True)

In [6]:
hitlets_nv_to_delete = np.load('delete_dali_20230216/duplicated_hitlets_nv.npy', allow_pickle=True)

In [46]:
lone_hits_to_delete = np.load('delete_dali_20230216/duplicated_lone_hits.npy', allow_pickle=True)

In [4]:
np.unique(peaklets_to_delete['hash'])

array(['6et4xh7yqh', 'az7dbwq5pl', 'gwiwppglt4', 'pvdcwix5ar',
       'siob2wxxm5', 'wfhfmcbq5z'], dtype=object)

In [9]:
np.unique(lone_hits_to_delete['hash'])

array(['gwiwppglt4', 'pvdcwix5ar', 'q3fr7osh7u', 'siob2wxxm5',
       'uhfusstvab', 'wfhfmcbq5z'], dtype=object)

In [7]:
np.unique(hitlets_nv_to_delete['hash'])

array(['jg5y7hv3iq', 'w2vwlxy2wh'], dtype=object)

In [49]:
v8_like_peaklets = peaklets_to_delete[peaklets_to_delete['hash']=='siob2wxxm5']['runid']
v8_like_lonehits = lone_hits_to_delete[lone_hits_to_delete['hash']=='siob2wxxm5']['runid']

In [37]:
outdated = []

for runid in v8_like_lonehits:
    selected = lone_hits[(lone_hits['hash']!='siob2wxxm5') & (lone_hits['runid']==runid)]
    print(runid, selected['hash'])

018159 ['pvdcwix5ar' 'xl4tm6igvz']
018253 ['pvdcwix5ar' 'xl4tm6igvz']
018362 ['pvdcwix5ar' 'xl4tm6igvz']
018415 ['pvdcwix5ar' 'xl4tm6igvz']
018483 ['pvdcwix5ar' 'xl4tm6igvz']
018548 ['pvdcwix5ar' 'xl4tm6igvz']
018604 ['pvdcwix5ar' 'xl4tm6igvz']
018660 ['pvdcwix5ar' 'xl4tm6igvz']
018705 ['pvdcwix5ar' 'xl4tm6igvz']
018761 ['pvdcwix5ar' 'xl4tm6igvz']
018812 ['pvdcwix5ar' 'xl4tm6igvz']
018867 ['pvdcwix5ar' 'xl4tm6igvz']
018947 ['pvdcwix5ar' 'xl4tm6igvz']
019150 ['pvdcwix5ar' 'xl4tm6igvz']
019227 ['pvdcwix5ar' 'xl4tm6igvz']
019769 ['pvdcwix5ar' 'xl4tm6igvz']
019820 ['pvdcwix5ar' 'xl4tm6igvz']
019997 ['pvdcwix5ar' 'xl4tm6igvz']
020045 ['pvdcwix5ar' 'xl4tm6igvz']
020087 ['pvdcwix5ar' 'xl4tm6igvz']
020138 ['pvdcwix5ar' 'xl4tm6igvz']
021509 ['pvdcwix5ar' 'xl4tm6igvz']
021743 ['pvdcwix5ar' 'xl4tm6igvz']
021993 ['pvdcwix5ar' 'xl4tm6igvz']
022058 ['pvdcwix5ar' 'xl4tm6igvz']
022228 ['pvdcwix5ar' 'xl4tm6igvz']
022354 ['pvdcwix5ar' 'xl4tm6igvz']
022477 ['pvdcwix5ar' 'xl4tm6igvz']
022543 ['pvdcwix5ar'

In [27]:
outdated = []

for runid in v8_like_peaklets:
    selected = peaklets[(peaklets['hash']!='siob2wxxm5') & (peaklets['runid']==runid)]
    print(runid, selected['hash'])

011949 ['xl4tm6igvz']
011954 ['xl4tm6igvz']
011961 ['xl4tm6igvz']
011969 ['xl4tm6igvz']
011970 ['xl4tm6igvz']
011971 ['xl4tm6igvz']
011973 ['xl4tm6igvz']
011974 ['xl4tm6igvz']
011975 ['xl4tm6igvz']
011976 ['xl4tm6igvz']
011978 ['xl4tm6igvz']
011979 ['xl4tm6igvz']
011982 ['xl4tm6igvz']
011984 ['xl4tm6igvz']
011985 ['xl4tm6igvz']
011995 ['xl4tm6igvz']
011997 ['xl4tm6igvz']
011999 ['xl4tm6igvz']
012000 ['xl4tm6igvz']
012001 ['xl4tm6igvz']
012003 ['xl4tm6igvz']
012006 ['xl4tm6igvz']
012007 ['xl4tm6igvz']
012008 ['xl4tm6igvz']
012010 ['xl4tm6igvz']
012011 ['xl4tm6igvz']
012014 ['xl4tm6igvz']
012015 ['xl4tm6igvz']
012017 ['xl4tm6igvz']
012039 ['xl4tm6igvz']
012052 ['xl4tm6igvz']
012061 ['xl4tm6igvz']
018159 ['6et4xh7yqh' 'pvdcwix5ar' 'xl4tm6igvz']
018253 ['6et4xh7yqh' 'pvdcwix5ar' 'xl4tm6igvz']
018362 ['6et4xh7yqh' 'pvdcwix5ar' 'xl4tm6igvz']
018415 ['6et4xh7yqh' 'pvdcwix5ar' 'xl4tm6igvz']
018483 ['6et4xh7yqh' 'pvdcwix5ar' 'xl4tm6igvz']
018548 ['6et4xh7yqh' 'pvdcwix5ar' 'xl4tm6igvz']
018604 [

In [31]:
sr0_left

17918

In [35]:
sr0_right

34731

We must keep v8 peaklets and lone_hits ("siob2wxxm5") for SR0! Excluding these guys from the list.

In [50]:
peaklets_to_delete = peaklets_to_delete[~((peaklets_to_delete['runid'].astype(int)<=sr0_right)
                      &(peaklets_to_delete['runid'].astype(int)>=sr0_left)
                      &(peaklets_to_delete['hash']=='siob2wxxm5'))]

In [51]:
lone_hits_to_delete = lone_hits_to_delete[~((lone_hits_to_delete['runid'].astype(int)<=sr0_right)
                      &(lone_hits_to_delete['runid'].astype(int)>=sr0_left)
                      &(lone_hits_to_delete['hash']=='siob2wxxm5'))]

In [42]:
peaklets_to_delete['size_gb'].sum()

21085.64

In [44]:
lone_hits_to_delete['size_gb'].sum()

1996.166

In [47]:
peaklets_to_delete['size_gb'].sum()

21267.469

In [48]:
lone_hits_to_delete['size_gb'].sum()

2032.3486

In [None]:
np.save('delete_dali_20230216/duplicated_peaklets.npy', peaklets_to_delete)

In [None]:
# After 