In [1]:
# import external modules
import os
import sys
import json
import joblib
import numpy as np
import matplotlib.pyplot as plt
import importlib

In [2]:
# import pyarrow / parquet tools and define the helper functions used below

import pyarrow as pa
from pyarrow.parquet import ParquetFile, ParquetDataset

def read_parquet(path, verbose=False, 
                 columns=None, batch_size=None, first_batch=None, last_batch=None, batch_ids=None):
    """Read a parquet file (fully, or only selected row batches) into a pandas DataFrame.

    Parameters
    ----------
    path : path to the parquet file.
    verbose : if True, print parquet metadata, warnings and a summary of the result.
    columns : optional list of column names to read (default: all columns).
        Applied in both the full and the batched reading mode.
    batch_size : number of rows per batch. If None, the whole file is read
        and all batch-related arguments are ignored.
    first_batch, last_batch : inclusive range of batch indices to read
        (batched mode only; ignored if batch_ids is given).
    batch_ids : explicit collection of batch indices to read (batched mode only).
        Takes precedence over first_batch / last_batch.

    Returns
    -------
    A pandas DataFrame with the selected rows (empty if no requested batch exists),
    or None if batched mode was requested without a valid batch selection.
    """
    if batch_size is None:
        # standard case where all rows are read
        # (local import: pandas is not imported at the top of this notebook)
        import pandas as pd
        df = pd.read_parquet(path, columns=columns)
    else:
        # more involved case where only a section of rows is read

        # validate the batch selection before touching the file,
        # so that invalid arguments are reported without any I/O
        if batch_ids is not None:
            if first_batch is not None or last_batch is not None:
                if verbose:
                    msg = 'WARNING in read_parquet: cannot provide both batch_ids and first_batch / last_batch;'
                    msg += ' first_batch and last_batch will be ignored.'
                    print(msg)
        else:
            if first_batch is None or last_batch is None:
                msg = 'ERROR in read_parquet: in batched mode, either batch_ids, or first_batch and last_batch'
                msg += ' must be provided; returning None.'
                print(msg)
                return None
            if last_batch < first_batch:
                last_batch = first_batch
                if verbose:
                    msg = f'WARNING in read_parquet: setting last_batch to {last_batch}'
                    msg += ' as values smaller than first_batch are not supported.'
                    print(msg)
            batch_ids = range(first_batch, last_batch+1)
        # a set gives O(1) membership tests and tolerates duplicate ids
        wanted = set(batch_ids)

        pf = ParquetFile(path)
        if verbose:
            print('Found following parquet metadata:')
            print(pf.metadata)

        # check available rows and batches
        # (ceiling division; gives 0 batches for an empty file)
        num_rows = pf.metadata.num_rows
        num_batches = -(-num_rows // batch_size)
        last_wanted = max(wanted) if wanted else -1
        if last_wanted >= num_batches:
            if verbose:
                msg = f'WARNING in read_parquet: batch indices greater than {num_batches-1} will be ignored.'
                print(msg)

        # iterate through the batches (in file order), keeping only the requested ones
        # and stopping as soon as the last requested batch has been read
        batches = []
        if wanted:
            iterobj = pf.iter_batches(batch_size=batch_size, columns=columns)
            for batch_idx, batch in enumerate(iterobj):
                if batch_idx in wanted: batches.append(batch)
                if batch_idx >= last_wanted: break

        if batches:
            table = pa.Table.from_batches(batches)
        else:
            # nothing selected: return an empty frame with the expected columns
            # (Table.from_batches cannot infer a schema from an empty list)
            table = pf.schema_arrow.empty_table()
            if columns is not None: table = table.select(columns)
        df = table.to_pandas()
        
    if verbose:
        print(f'Read dataframe with {len(df)} rows and {len(df.columns)} columns.')
    return df

def get_mes(df, datacolumn='data', xbinscolumn='xbins', ybinscolumn='ybins',
            runcolumn='run', lumicolumn='lumi',
            runs=None, lumis=None):
    """Extract the monitoring elements of a dataframe as a 3D numpy array.

    Parameters
    ----------
    df : dataframe with one monitoring element per row.
    datacolumn : name of the column holding the histogram contents
        (each entry is a sequence of 1D arrays).
    xbinscolumn, ybinscolumn : names of the columns holding the number of bins;
        the values of the first row are used for all rows.
    runcolumn, lumicolumn : names of the run and lumisection number columns.
    runs, lumis : optional selections, forwarded to select_runs / select_ls.
        NOTE(review): select_runs and select_ls are not defined in the visible
        part of this notebook - confirm they are available before using these.

    Returns
    -------
    Tuple (mes, runs, lumis): array of shape (number of rows, ybins, xbins),
    and the run and lumisection numbers of the (selected) rows.
    """
    if runs is not None:
        df = select_runs(df, runs, runcolumn=runcolumn)
    if lumis is not None:
        df = select_ls(df, lumis, lumicolumn=lumicolumn)
    # histogram dimensions are taken from the first row
    n_xbins = int(df[xbinscolumn].values[0])
    n_ybins = int(df[ybinscolumn].values[0])
    # each entry of the data column is an array of 1d arrays;
    # np.stack turns it into a proper 2d array before reshaping
    raw_entries = df[datacolumn].values
    histograms = []
    for row_idx in range(len(df)):
        histograms.append(np.stack(raw_entries[row_idx]).reshape(n_ybins, n_xbins))
    mes = np.array(histograms)
    return (mes, df[runcolumn].values, df[lumicolumn].values)

def find_oms_indices(runs, lumis, omsjson,
                     run_key='run_number', lumi_key='lumisection_number',
                     verbose=True):
    """Find, for each (run, lumisection) pair, its position in the OMS data.

    Parameters
    ----------
    runs, lumis : 1D array-likes of run and lumisection numbers (same length).
    omsjson : dict mapping attribute names to equally ordered lists of values;
        must contain run_key and lumi_key.
    run_key, lumi_key : keys of the run and lumisection numbers in omsjson.
    verbose : if True, print a warning listing lumisections missing in omsjson.

    Returns
    -------
    1D integer array with, for each provided lumisection, the index of the
    matching entry in omsjson, or -1 if the lumisection is not present.

    Raises
    ------
    Exception if a key is missing in omsjson or if runs / lumis are not 1D.
    """
    # check if run_key and lumi_key are present in the omsjson
    if run_key not in omsjson.keys():
        msg = 'Run key "{}" not found in provided omsjson.'.format(run_key)
        msg += ' Available keys are: {}'.format(omsjson.keys())
        raise Exception(msg)
    if lumi_key not in omsjson.keys():
        msg = 'Lumi key "{}" not found in provided omsjson.'.format(lumi_key)
        msg += ' Available keys are: {}'.format(omsjson.keys())
        raise Exception(msg)
    # parse runs and lumis
    # note: use an explicit 64-bit type; the platform default int can be 32 bits,
    # in which case run*idfactor overflows for 6-digit run numbers.
    runs = np.array(runs).astype(np.int64)
    lumis = np.array(lumis).astype(np.int64)
    if len(runs.shape)!=1:
        raise Exception(f'Provided run numbers array has unexpected shape: {runs.shape}')
    if len(lumis.shape)!=1:
        raise Exception(f'Provided lumisection numbers array has unexpected shape: {lumis.shape}')
    # make sure provided oms runs and lumis are in the same data format
    omsruns = np.array(omsjson[run_key]).astype(np.int64)
    omslumis = np.array(omsjson[lumi_key]).astype(np.int64)
    # combine run and lumisection number into a single unique number
    # (unique as long as lumisection numbers stay below idfactor)
    idfactor = 10000 # warning: do not use 1e4 instead of 10000 to avoid conversion from int to float
    ids = runs*idfactor + lumis
    omsids = omsruns*idfactor + omslumis
    # check if all ids are in omsids
    # note: reduce from error to warning,
    # since it seems some lumisections are intrinsically missing in OMS,
    # e.g. run 380147, LS 186.
    threshold = None
    missing_mask = np.isin(ids, omsids, invert=True)
    if np.any(missing_mask):
        missing_ids = ids[missing_mask]
        if verbose:
            msg = 'WARNING: not all provided lumisections could be found in the oms data.'
            msg += f' Missing lumisections are: {missing_ids}'
            msg += f' ({len(missing_ids)} / {len(ids)})'
            print(msg)
        # temporarily add the missing ids to omsids, so that the lookup below
        # always finds a match; matches at or beyond the threshold
        # (i.e. in the appended part) are replaced by -1 afterwards
        threshold = len(omsids)
        omsids = np.concatenate((omsids, missing_ids))
    # find indices of ids in omsids:
    # search in the sorted ids, then map back to positions in the original order
    omsids_sorted_inds = np.argsort(omsids)
    omsids_sorted = omsids[omsids_sorted_inds]
    indices = np.searchsorted(omsids_sorted, ids, side='left')
    indices = omsids_sorted_inds[indices]
    if threshold is not None: indices = np.where(indices>=threshold, -1, indices)
    return indices

def find_oms_attr_for_lumisections(runs, lumis, omsjson, omsattr, **kwargs):
    """Retrieve the values of one OMS attribute for the given lumisections.

    Parameters
    ----------
    runs, lumis : run and lumisection numbers to look up.
    omsjson : dict mapping attribute names to lists of values.
    omsattr : name of the attribute to retrieve; must be a key of omsjson.
    kwargs : forwarded to find_oms_indices.

    Returns
    -------
    Numpy array with one value per provided lumisection;
    lumisections that are missing in omsjson (index -1) get the value 0.

    Raises
    ------
    Exception if omsattr is not a key of omsjson.
    """
    # the requested attribute must be available
    if omsattr not in omsjson.keys():
        raise Exception(
            'Attribute "{}" not found in provided omsjson.'.format(omsattr)
            + ' Available keys are: {}'.format(omsjson.keys())
        )
    # positions of the provided lumisections in the oms data
    positions = find_oms_indices(runs, lumis, omsjson, **kwargs)
    # pick the attribute value at each position, with 0 as the default
    # for lumisections that could not be found (position -1)
    attr_values = omsjson[omsattr]
    picked = []
    for pos in positions:
        if pos > -1:
            picked.append(attr_values[pos])
        else:
            picked.append(0)
    return np.array(picked)

In [4]:
# Load the Ring 2 monitoring element (ZeroBias, 2024 era I v1) from parquet

menames = ['Ring2']
datadir = '../Development/data/'
parquet = 'ZeroBias-Run2024I-PromptReco-v1-DQMIO-PixelPhase1-Phase1_MechanicalView-PXForward-clusters_per_SignedDiskCoord_per_SignedBladePanelCoord_PXRing_2.parquet'
#parquet = 'ZeroBias-Run2025D-PromptReco-v1-DQMIO-PixelPhase1-Phase1_MechanicalView-PXForward-clusters_per_SignedDiskCoord_per_SignedBladePanelCoord_PXRing_2.parquet'

# read a single batch of 750 rows per monitoring element
# (alternative batch selections are kept below for reference)
X = {}
for mename in menames:
    f = os.path.join(datadir, parquet)
    X[mename] = read_parquet(
        f,
        verbose=False,
        batch_size=750,
        batch_ids=[29],
    )
#    X[mename] = read_parquet(f, verbose=False, batch_size=750, batch_ids=[114])
#    X[mename] = read_parquet(f, verbose=False, batch_size=2000, batch_ids=[42])

# Run and lumisection numbers of the loaded rows; print the distinct runs
first_me = X[menames[0]]
run_numbers = first_me['run_number'].values
ls_numbers = first_me['ls_number'].values
runs = np.unique(run_numbers)
print(runs)
#print(np.unique(ls_numbers))
#[386660 386661 386663 386665 386667 386668]

# In this run range, there is only 1 Multi-Disk anomaly (the Ring2 model finds only LS 103) and 3 Single-Disk anomalies (the Ring 2 model finds all of them, but they are ignored)
#386661 - 103 - 104 - 2 - Disk +1+2+3 - Multi-Disk
#386661 - 303 - 305 - 3 - Disk +3 - Single-Disk

[386660 386661 386663 386665 386667 386668]


In [5]:
# Convert each dataframe into a numpy array of 2D histograms

X_data = {}
for mename in menames:
    # get_mes returns (histograms, runs, lumis); only the histograms are kept here
    X_data[mename] = get_mes(
        X[mename],
        xbinscolumn='x_bin', ybinscolumn='y_bin',
        runcolumn='run_number', lumicolumn='ls_number',
    )[0]

In [7]:
# Load OMS data
oms_data_dir = '../Development/omsdata/'
oms_info_file = f'{oms_data_dir}omsdata_Run2024I-v1.json'
#oms_info_file = f'{oms_data_dir}omsdata_Run2025D-v1.json'
with open(oms_info_file, 'r') as f:
    oms_info = json.load(f)

# OMS attributes to attach to the input data
oms_attrs = [
    "beams_stable",
    "cms_active",
    "bpix_ready",
    "fpix_ready",
    "tibtid_ready",
    "tob_ready",
    "tecp_ready",
    "tecm_ready",
    "pileup"
]

# keep only the requested attributes (in the order of the json file),
# matched to the lumisections of the loaded data
oms_info_new = {
    'oms__' + key: find_oms_attr_for_lumisections(run_numbers, ls_numbers, oms_info, key)
    for key in oms_info
    if key in oms_attrs
}
oms_info = oms_info_new

# add filter info to input data
filter_info = dict(oms_info)
filter_info['oms__run_number'] = run_numbers
filter_info['oms__lumisection_number'] = ls_numbers
print('Filter info:')
for key, val in filter_info.items():
    print(f'  - {key}: {val.shape}')
X_data.update(filter_info)
print('Input data keys:')
print(X_data.keys())

Filter info:
  - oms__pileup: (750,)
  - oms__beams_stable: (750,)
  - oms__cms_active: (750,)
  - oms__bpix_ready: (750,)
  - oms__fpix_ready: (750,)
  - oms__tibtid_ready: (750,)
  - oms__tob_ready: (750,)
  - oms__tecp_ready: (750,)
  - oms__tecm_ready: (750,)
  - oms__run_number: (750,)
  - oms__lumisection_number: (750,)
Input data keys:
dict_keys(['Ring2', 'oms__pileup', 'oms__beams_stable', 'oms__cms_active', 'oms__bpix_ready', 'oms__fpix_ready', 'oms__tibtid_ready', 'oms__tob_ready', 'oms__tecp_ready', 'oms__tecm_ready', 'oms__run_number', 'oms__lumisection_number'])


In [8]:
# Write the input data to a .pkl file
import pickle

# keep everything except the OMS entries
# (OMS data is not saved to .pkl; it will be retrieved on the fly by DIALS),
# then add the run and lumisection numbers
X_data_towrite = {
    key: val
    for key, val in X_data.items()
    if not key.startswith('oms__')
}
X_data_towrite['run_number'] = run_numbers
X_data_towrite['ls_number'] = ls_numbers

# printouts for checking
print('Will write following arrays:')
for key, val in X_data_towrite.items():
    print(f'  - {key}: {val.shape}')

with open('test_data.pkl', 'wb') as f:
    pickle.dump(X_data_towrite, f)

Will write following arrays:
  - Ring2: (750, 140, 56)
  - run_number: (750,)
  - ls_number: (750,)


In [9]:
# Load the pickle file again (check file content)
# note: only unpickle files you created yourself; pickle can execute arbitrary code
with open("test_data.pkl", "rb") as f:
    data = pickle.load(f)

# Print the dictionary keys
print("Keys:", list(data))

# Print the shape and dtype of each stored array
for key in data:
    value = data[key]
    print(f"{key}: shape = {value.shape}, dtype = {value.dtype}")

Keys: ['Ring2', 'run_number', 'ls_number']
Ring2: shape = (750, 140, 56), dtype = float64
run_number: shape = (750,), dtype = int64
ls_number: shape = (750,), dtype = int64
