### Imports

First, import the packages that the rest of the notebook relies on.

In [1]:
import glob
import multiprocessing as mp
import os
import sys
from uuid import uuid4 as uuidgen

import h5py
import numpy as np
import torch
import tqdm

sys.path.append('/scratch') # Set up local python environment
from Core import utils
from SparseNOvA import datasets

In [2]:
def match_truth(file, im_idx):
    '''Locate the trainingdata row(s) that belong to one slicemap.

    The slicemap at row im_idx is identified by its run, subrun, cycle, evt
    and subevt values; a trainingdata row matches when all five agree.

    Returns the array of matching row indices (empty when nothing matches).
    Raises Exception when more than one trainingdata row matches.
    '''
    id_fields = ['run','subrun','cycle','evt','subevt']
    slicemaps = file['rec.training.slicemaps']
    # Read the identifying values of the requested slicemap up front
    wanted = [slicemaps[name][im_idx,0] for name in id_fields]

    # AND together one boolean column comparison per identifying field
    combined = None
    for name, value in zip(id_fields, wanted):
        same = (file['rec.training.trainingdata'][name][:,0] == value)
        if combined is None:
            combined = same
        else:
            combined = combined & same

    true_idx = combined.nonzero()[0]
    # A slicemap may have zero or one truth, never several
    if len(true_idx) > 1:
        raise Exception(f'{len(true_idx)} truths found for slicemap {im_idx}.')
    return true_idx

def get_alias(flav):
    '''Collapse an interaction enum value into one of five class labels.

    Labels: 0 == nu_mu, 1 == nu_e, 2 == nu_tau, 3 == NC, 4 == others.
    '''
    # NC is a single enum value rather than a range
    if flav == 13:
        return 3
    # Each charged-current flavour occupies a half-open block of four values
    flavour_ranges = ((0, 4), (4, 8), (8, 12))
    for label, (low, high) in enumerate(flavour_ranges):
        if low <= flav < high:
            return label
    # Anything else (including 12 and values past 13) is lumped into "others"
    return 4

def process_file(filename, outdir='/data/mp5/processed'):
    '''Convert the single-neutrino slices of one HDF5 file into PyTorch files.

    Rows of rec.mc with nnu == 1 are selected, and each selected row index is
    used to fetch the slicemap and its matching trainingdata row (through
    match_truth). Slices without a matching truth are skipped. For the others
    the slicemap is split into an x view and a y view (448 x 384 each), both
    are converted to sparse form, and the result is saved together with the
    aliased interaction label.

    Args:
        filename: path of the input HDF5 file.
        outdir: directory that receives the .pt files, one per slice, each
            named with a random UUID. Defaults to the previously hard-coded
            location.

    Raises:
        Exception: propagated from match_truth when a slicemap matches more
            than one trainingdata row.
    '''
    # The context manager releases the HDF5 handle even if a slice raises;
    # without it every pool worker keeps its input files open.
    with h5py.File(filename, 'r') as file:
        # NOTE(review): row indices taken from rec.mc are reused below to index
        # rec.training.slicemaps - assumes both tables share row order; confirm.
        mask = np.nonzero(file['rec.mc']['nnu'][:,0]==1)[0]

        # Loop over each neutrino image to look for the associated truth
        for nu in mask:
            true_idx = match_truth(file, nu)
            if len(true_idx) == 0: continue
            image = file['rec.training.slicemaps']['slicemap'][nu]
            # The flat slicemap stores the two views back to back
            xview, yview = image.reshape(2, 448, 384)
            truth = get_alias(file['rec.training.trainingdata']['interaction'][true_idx,0][0])
            xsparse = torch.tensor(xview).float().to_sparse()
            ysparse = torch.tensor(yview).float().to_sparse()
            # Features are (n_hits, 1) values; coords are (n_hits, 2) pixel indices
            data = { 'xfeats': xsparse._values().unsqueeze(dim=-1),
                     'xcoords': xsparse._indices().T.int(),
                     'yfeats': ysparse._values().unsqueeze(dim=-1),
                     'ycoords': ysparse._indices().T.int(),
                     'truth': torch.tensor(truth).long() }
            torch.save(data, f'{outdir}/{uuidgen()}.pt')

### Preprocessing

Pull the interesting events out of the HDF5 files, preprocess them, and write them as individual PyTorch files instead.

In [None]:
# Collect the input HDF5 files of each sample, sorted within a sample
nonswap = sorted(glob.glob('/data/mp5/nonswap/*.h5'))
fluxswap = sorted(glob.glob('/data/mp5/fluxswap/*.h5'))
tauswap = sorted(glob.glob('/data/mp5/tauswap/*.h5'))
# Keep the sample order: nonswap first, then fluxswap, then tauswap
files = [*nonswap, *fluxswap, *tauswap]

# Fan the files out over 50 worker processes; the list returned by map is
# discarded because process_file writes its results to disk itself.
with mp.Pool(processes=50) as pool:
    pool.map(process_file, files)

### Filtering

Remove every image that has no hits in at least one of its two views.

In [11]:
# Sweep every processed event and delete the ones where a view is empty.
# This removes files in place, so the cell is destructive on disk.
files = glob.glob('/data/mp5/processed/*.pt')
# Wrap the sweep in a progress bar: each file has to be loaded to be checked
for filename in tqdm.tqdm(files):
    data = torch.load(filename)
    # The coords tensors hold one row per non-zero pixel, so zero rows means
    # that view recorded nothing
    if data['xcoords'].shape[0] == 0 or data['ycoords'].shape[0] == 0:
        print(f'{filename} is bad, removing.')
        os.remove(filename)



  0%|          | 11791/3105407 [00:20<12:03, 4273.01it/s][A[A

/data/mp5/processed/8af5a52a-125e-4a48-b3e7-45b5e79ce8e4.pt is bad, removing.
/data/mp5/processed/b40c50a6-9f95-4df9-9f75-09c21d45e998.pt is bad, removing.
/data/mp5/processed/d91bd237-7b61-47d1-bee9-2f0e8857af9c.pt is bad, removing.
/data/mp5/processed/afead1d3-ec1f-4d4f-8412-976315bf3efa.pt is bad, removing.
/data/mp5/processed/ec3f31ec-7270-4908-82c5-b87962c5c5f6.pt is bad, removing.
/data/mp5/processed/3fb08f09-0885-4549-b9ae-c3c85b3e584c.pt is bad, removing.
/data/mp5/processed/35b566d6-3c90-4b01-86a0-6a0bd5f00524.pt is bad, removing.
/data/mp5/processed/dd0b6ddc-7d67-4f81-a374-8124e5ea2b20.pt is bad, removing.
/data/mp5/processed/5ebc6c13-1a1b-4d27-a3be-0198f944fbd1.pt is bad, removing.
/data/mp5/processed/727ab1d7-0043-43e6-af34-e48c65ffa05d.pt is bad, removing.
/data/mp5/processed/9798f053-c403-4327-a082-44c6beca0bc3.pt is bad, removing.
/data/mp5/processed/4cfae11a-06cf-4099-a81f-33e449825839.pt is bad, removing.
/data/mp5/processed/d14fafba-357a-4961-941c-5f4f082e8a8a.pt is b