# Imports

In [6]:
import os
import pickle
import waffles.input.raw_hdf5_reader as reader
from waffles.np04_analysis.LED_calibration.run_number_to_LED_configuration import run_to_config

# Tools definitions

In [7]:
# Some convenient definitions

def dump_object_to_pickle(
        object, 
        saving_folderpath : str,
        output_filename : str,
        verbose : bool = True) -> None:
    """This function gets the following positional argument:

    - object
    - saving_folderpath (str): Path to the folder
    where to save the file.
    - output_filename (str): Name of the output 
    pickle file.

    And the following keyword argument:

    - verbose (bool): Whether to print functioning
    related messages.
    
    It saves the given object, object, to a pickle file 
    which is stored in the path given by saving_filepath"""

    # If the saving folder does not exist, create it
    if not os.path.exists(saving_folderpath):

        if verbose:
            print(f"In function dump_object_to_pickle(): Folder {saving_folderpath} does not exist. It will be created.")

        os.makedirs(saving_folderpath)

    # Create the output filepath
    output_filepath = os.path.join(
        saving_folderpath, 
        output_filename)
    
    with open(
        output_filepath, 
        "wb") as output_file:

        pickle.dump(object, output_file)

        return

def save_run_to_pickled_WaveformSet(
    run : int,
    saving_folderpath : str,
    average_wfs_per_channel : int = 4000,
    channels_no : int = 40,
    rucio_filepaths_folderpath : str = "/eos/experiment/neutplatform/protodune/experiments/ProtoDUNE-II/PDS_Commissioning/waffles/1_rucio_paths/",
    read_full_streaming_data : bool = False,
    subsample_seed : int = 3,
    verbose : bool = True):
    """This function gets the following positional arguments:

    - run (int): Number of the run whose data we want to convert
    to a pickle'd WaveformSet.
    - saving_folderpath (str): Path to the folder where to save
    the pickle'd WaveformSet(s).

    This function gets the following keyword arguments:

    - average_wfs_per_channel (int): Assuming that the read data
    is homogeneously distributed along the detector channels,
    the pickle'd WaveformSet object(s) will contain, on average,
    average_wfs_per_channel Waveform objects per detector
    channel.
    - channels_no (int): Number of channels in the detector.
    - rucio_filepaths_folderpath (str): Path to the folder
    where the files with the rucio filepaths are stored.
    The file which contains the rucio filepaths for a
    given run number, <run>, is assumed to be called 
    '0<run>.txt',
    - read_full_streaming_data (bool): Whether to read the
    full-streaming data of the self-trigger data.
    - subsample_seed (int): The seed for the subsample
    parameter. This parameter is decreased unit by unit
    until the number of Waveform objects in the resulting
    WaveformSet object reaches
    average_wfs_per_channel * channels_no. This parameter
    is given to the 'subsample' parameter of the
    WaveformSet_from_hdf5_file() function. Check such function
    docstring for more information.
    - verbose (bool): Whether to print functioning-related
    messages.
    
    This function looks for a file called '0<run>.txt' within
    the folder whose path is given by 
    rucio_filepaths_folderpath. Such file is assumed to be a
    text file with a list of filepaths. Parsing such file is
    delegated to the get_filepaths_from_rucio() function of
    'raw_hdf5_reader.py' module. If it is found, then 
    it starts reading WaveformSet(s), one per filepath, until
    the total number of read waveforms have reached
    average_wfs_per_channel * channels_no waveforms. Reading
    the WaveformSet(s) is delegated to the
    WaveformSet_from_hdf5_file() function of the 
    'raw_hdf5_reader.py' module. The read WaveformSet(s) are 
    pickle'd to files which are saved in the folder pointed 
    to by saving_folderpath. The WaveformSet coming from 
    the i-th filepath is saved to the file named
    'run_<run>_chunk_<i>.pkl'
    """

    aux = rucio_filepaths_folderpath+f"/0{run}.txt"

    try:
        rucio_filepaths = reader.get_filepaths_from_rucio(aux)
    # Happens if there are no rucio filepaths for this run in rucio_filepaths_folderpath
    except Exception:
        print(
            f"--> WARNING: Did not find the rucio paths for run {run}. Ending execution "
            f"of save_run_to_pickled_WaveformSet({run}, ...).")
        return

    if verbose:
        print(f"--> Processing run {run}: Found {len(rucio_filepaths)} chunks...")

    fGoForAnotherChunk = True
    wvfs_left_to_read_for_this_run = average_wfs_per_channel * channels_no
    current_chunk_iterator = 0

    while fGoForAnotherChunk:

        if verbose:
            print(f"\t --> Processing chunk {current_chunk_iterator+1}/{len(rucio_filepaths)} ...")

        subsample = subsample_seed
        fReadSameChunkAgain = True

        while fReadSameChunkAgain:

            aux_wfset = reader.WaveformSet_from_hdf5_file(
                rucio_filepaths[current_chunk_iterator],
                read_full_streaming_data=read_full_streaming_data,
                subsample=subsample,
                # WaveformSet_from_hdf5_file apparently subsamples from
                # the [0, wvfm_count] range. Therefore, if we set
                # wvfm_count to wvfs_left_to_read_for_this_run we
                # will get, at most, wvfs_left_to_read_for_this_run/subsample
                wvfm_count=wvfs_left_to_read_for_this_run*subsample,
                )
            
            # In this case, we already have what we need for this run
            if len(aux_wfset.waveforms) == wvfs_left_to_read_for_this_run:
                fReadSameChunkAgain = False
                fGoForAnotherChunk = False

                if verbose:
                    print(f"--> Got enough waveforms ({len(aux_wfset.waveforms)}) "
                          f"from chunk {current_chunk_iterator+1}/{len(rucio_filepaths)} "
                          f"of run {run}")
                    print(f"--> Now saving it to a pickle file ...")

                dump_object_to_pickle(
                    aux_wfset,
                    saving_folderpath,
                    f"run_{run}_chunk_{current_chunk_iterator}.pkl",
                    verbose=verbose)
                
                if verbose:
                    print(f"--> Successfully saved WaveformSet of run {run}")

            # In this case, we need more waveforms for this run
            elif len(aux_wfset.waveforms) < wvfs_left_to_read_for_this_run:

                if verbose:
                    print(f"--> Didn't get enough waveforms from chunk "
                          f"{current_chunk_iterator+1}/{len(rucio_filepaths)} "
                          f"of run {run}")
                    print(f"--> Expected {wvfs_left_to_read_for_this_run}, but only read {len(aux_wfset.waveforms)}")

                # In this case, try to read the same file but with a finer subsampling
                if subsample > 1:
                    # fReadSameChunkAgain is True by default
                    subsample -= 1

                    if verbose:
                        print(f"--> Switching 'subsample' from {subsample+1} to {subsample} and reading it again...")

                # In this case, we read every waveform from this chunk, but we still
                # haven't got enough waveforms, so go for the following chunk
                else:
                    subsample = subsample_seed
                    fReadSameChunkAgain = False
                    # fGoForAnotherChunk is True by default

                    if verbose:
                        print(f"--> All of the waveforms from this chunk were read")
                        print(f"--> Saving them and proceeding to look for "
                              f"{wvfs_left_to_read_for_this_run-len(aux_wfset.waveforms)} "
                              f"(={wvfs_left_to_read_for_this_run}-{len(aux_wfset.waveforms)}) "
                              f"waveforms from the following chunk ({current_chunk_iterator+2}/{len(rucio_filepaths)}) "
                              f"of this run ({run}).")

                    dump_object_to_pickle(
                        aux_wfset,
                        saving_folderpath,
                        f"run_{run}_chunk_{current_chunk_iterator}.pkl",
                        verbose=verbose)

                    # Switch to next chunk
                    current_chunk_iterator += 1
                    # But only read the waveforms that we need to add up to 
                    # average_wfs_per_channel * channels_no
                    wvfs_left_to_read_for_this_run -= len(aux_wfset.waveforms)
                    
            # In this case, WaveformSet_from_hdf5_file() is misbehaving
            else:
                raise Exception(f"WaveformSet_from_hdf5_file() is misbehaving. It read"
                                f" more waveforms ({len(aux_wfset.waveforms)}) than "
                                f"specified (wvfm_count={average_wfs_per_channel * channels_no})")

# Parameter input

In [8]:
batch_no = 2
apa_no = 2

In [9]:
saving_folderpath = "/afs/cern.ch/work/j/jurenago/private/repositories/waffles/src/waffles/np04_analysis/LED_calibration/pickles"
average_wfs_per_channel = 4000
channels_per_apa = 40
acquired_apas = 2
rucio_filepaths_folderpath = "/eos/experiment/neutplatform/protodune/experiments/ProtoDUNE-II/PDS_Commissioning/waffles/1_rucio_paths/"
subsample_seed = 3
verbose = True

In [10]:
pde = 0.5
for run in run_to_config[batch_no][3][pde].keys():

    print(f"\t --> Now retrieving data for run = {run}")
    
    aux = saving_folderpath + f"/batch_{batch_no}/apa_{apa_no}/{pde}/"

    save_run_to_pickled_WaveformSet(
        run,
        aux,
        average_wfs_per_channel=average_wfs_per_channel,
        channels_no=channels_per_apa*acquired_apas,
        rucio_filepaths_folderpath=rucio_filepaths_folderpath,
        read_full_streaming_data=True if apa_no == 1 else False,
        subsample_seed=subsample_seed,
        verbose=verbose)

	 --> Now retrieving data for run = 28176

Your files are stored around the world. 
--> Processing run 28176: Found 8 chunks...
	 --> Processing chunk 1/8 ...
Using XROOTD


Run: [ERROR] Local error: file exists:  (destination)


run_numb= 28176


118it [00:27,  4.33it/s]


--> Got enough waveforms (160000) from chunk 1/8 of run 28176
--> Now saving it to a pickle file ...
--> Successfully saved WaveformSet of run 28176
	 --> Now retrieving data for run = 28177
	 --> Now retrieving data for run = 28179

Your files are stored around the world. 
--> Processing run 28179: Found 8 chunks...
	 --> Processing chunk 1/8 ...
Using XROOTD


Run: [ERROR] Local error: file exists:  (destination)


run_numb= 28179


118it [00:28,  4.20it/s]


--> Got enough waveforms (160000) from chunk 1/8 of run 28179
--> Now saving it to a pickle file ...
--> Successfully saved WaveformSet of run 28179
	 --> Now retrieving data for run = 28180

Your files are stored around the world. 
--> Processing run 28180: Found 8 chunks...
	 --> Processing chunk 1/8 ...
Using XROOTD


Run: [ERROR] Local error: file exists:  (destination)


run_numb= 28180


34it [00:06,  5.20it/s]HDF5-DIAG: Error detected in HDF5 (1.12.0) thread 0:
  #000: /tmp/root/spack-stage/spack-stage-hdf5-1.12.0-nwlxllzr7neqtcsd575mrf2jhie4sfbd/spack-src/src/H5D.c line 280 in H5Dopen2(): name parameter cannot be an empty string
    major: Invalid arguments to routine
    minor: Bad value
HDF5-DIAG: Error detected in HDF5 (1.12.0) thread 0:
  #000: /tmp/root/spack-stage/spack-stage-hdf5-1.12.0-nwlxllzr7neqtcsd575mrf2jhie4sfbd/spack-src/src/H5D.c line 280 in H5Dopen2(): name parameter cannot be an empty string
    major: Invalid arguments to routine
    minor: Bad value
HDF5-DIAG: Error detected in HDF5 (1.12.0) thread 0:
  #000: /tmp/root/spack-stage/spack-stage-hdf5-1.12.0-nwlxllzr7neqtcsd575mrf2jhie4sfbd/spack-src/src/H5D.c line 280 in H5Dopen2(): name parameter cannot be an empty string
    major: Invalid arguments to routine
    minor: Bad value
HDF5-DIAG: Error detected in HDF5 (1.12.0) thread 0:
  #000: /tmp/root/spack-stage/spack-stage-hdf5-1.12.0-nwlxllzr7neq

Corrupted fragment:
 <daqdataformats._daq_daqdataformats_py.Fragment object at 0x7f8c32267df0>
<daqdataformats._daq_daqdataformats_py.TriggerRecordHeader object at 0x7f8c30d4f170>
(280, 0)
47244771330
Corrupted fragment:
 <daqdataformats._daq_daqdataformats_py.Fragment object at 0x7f8c32267df0>
<daqdataformats._daq_daqdataformats_py.TriggerRecordHeader object at 0x7f8c30d4f170>
(280, 0)
51539738626
Corrupted fragment:
 <daqdataformats._daq_daqdataformats_py.Fragment object at 0x7f8c28180af0>
<daqdataformats._daq_daqdataformats_py.TriggerRecordHeader object at 0x7f8c30d4f0f0>
(288, 0)
47244771330
Corrupted fragment:
 <daqdataformats._daq_daqdataformats_py.Fragment object at 0x7f8c28180af0>
<daqdataformats._daq_daqdataformats_py.TriggerRecordHeader object at 0x7f8c30d4f0f0>
(288, 0)
51539738626
Corrupted fragment:
 <daqdataformats._daq_daqdataformats_py.Fragment object at 0x7f8c31098270>
<daqdataformats._daq_daqdataformats_py.TriggerRecordHeader object at 0x7f8c32746fb0>
(296, 0)
4724477

120it [00:25,  4.79it/s]


--> Got enough waveforms (160000) from chunk 1/8 of run 28180
--> Now saving it to a pickle file ...
--> Successfully saved WaveformSet of run 28180
	 --> Now retrieving data for run = 28181

Your files are stored around the world. 
--> Processing run 28181: Found 8 chunks...
	 --> Processing chunk 1/8 ...
Using XROOTD


Run: [ERROR] Local error: file exists:  (destination)


run_numb= 28181


34it [00:06,  6.78it/s]HDF5-DIAG: Error detected in HDF5 (1.12.0) thread 0:
  #000: /tmp/root/spack-stage/spack-stage-hdf5-1.12.0-nwlxllzr7neqtcsd575mrf2jhie4sfbd/spack-src/src/H5D.c line 280 in H5Dopen2(): name parameter cannot be an empty string
    major: Invalid arguments to routine
    minor: Bad value
HDF5-DIAG: Error detected in HDF5 (1.12.0) thread 0:
  #000: /tmp/root/spack-stage/spack-stage-hdf5-1.12.0-nwlxllzr7neqtcsd575mrf2jhie4sfbd/spack-src/src/H5D.c line 280 in H5Dopen2(): name parameter cannot be an empty string
    major: Invalid arguments to routine
    minor: Bad value
HDF5-DIAG: Error detected in HDF5 (1.12.0) thread 0:
  #000: /tmp/root/spack-stage/spack-stage-hdf5-1.12.0-nwlxllzr7neqtcsd575mrf2jhie4sfbd/spack-src/src/H5D.c line 280 in H5Dopen2(): name parameter cannot be an empty string
    major: Invalid arguments to routine
    minor: Bad value
HDF5-DIAG: Error detected in HDF5 (1.12.0) thread 0:
  #000: /tmp/root/spack-stage/spack-stage-hdf5-1.12.0-nwlxllzr7neq

Corrupted fragment:
 <daqdataformats._daq_daqdataformats_py.Fragment object at 0x7f8c54525330>
<daqdataformats._daq_daqdataformats_py.TriggerRecordHeader object at 0x7f8c2b9b8030>
(280, 0)
55834705922
Corrupted fragment:
 <daqdataformats._daq_daqdataformats_py.Fragment object at 0x7f8c54525330>
<daqdataformats._daq_daqdataformats_py.TriggerRecordHeader object at 0x7f8c2b9b8030>
(280, 0)
47244771330
Corrupted fragment:
 <daqdataformats._daq_daqdataformats_py.Fragment object at 0x7f8c54525330>
<daqdataformats._daq_daqdataformats_py.TriggerRecordHeader object at 0x7f8c2b9b8030>
(280, 0)
51539738626
Corrupted fragment:
 <daqdataformats._daq_daqdataformats_py.Fragment object at 0x7f8c322fb730>
<daqdataformats._daq_daqdataformats_py.TriggerRecordHeader object at 0x7f8c322fb0b0>
(288, 0)
47244771330
Corrupted fragment:
 <daqdataformats._daq_daqdataformats_py.Fragment object at 0x7f8c322fb730>
<daqdataformats._daq_daqdataformats_py.TriggerRecordHeader object at 0x7f8c322fb0b0>
(288, 0)
5153973

120it [00:25,  4.71it/s]


--> Got enough waveforms (160000) from chunk 1/8 of run 28181
--> Now saving it to a pickle file ...
--> Successfully saved WaveformSet of run 28181


In [None]:
for pde in run_to_config[batch_no][apa_no].keys():

    print(f"--> Now retrieving data for PDE = {pde}")

    for run in run_to_config[batch_no][apa_no][pde].keys():

        print(f"\t --> Now retrieving data for run = {run}")
        
        aux = saving_folderpath + f"/batch_{batch_no}/apa_{apa_no}/{pde}/"

        save_run_to_pickled_WaveformSet(
            run,
            aux,
            average_wfs_per_channel=average_wfs_per_channel,
            channels_no=channels_no,
            rucio_filepaths_folderpath=rucio_filepaths_folderpath,
            read_full_streaming_data=True if apa_no == 1 else False,
            subsample_seed=subsample_seed,
            verbose=verbose)

--> Now retrieving data for PDE = 0.4
	 --> Now retrieving data for run = 28148

Your files are stored around the world. 
--> Processing run 28148: Found 8 chunks...
	 --> Processing chunk 1/8 ...
Using XROOTD
run_numb= 28148


357it [00:04, 85.23it/s]


--> Got enough waveforms (160000) from chunk 1/8 of run 28148
--> Now saving it to a pickle file ...
In function dump_object_to_pickle(): Folder /afs/cern.ch/work/j/jurenago/private/repositories/waffles/src/waffles/np04_analysis/LED_calibration/pickles/batch_2/apa_2/0.4/ does not exist. It will be created.
--> Successfully saved WaveformSet of run 28148
	 --> Now retrieving data for run = 28149
	 --> Now retrieving data for run = 28150

Your files are stored around the world. 
--> Processing run 28150: Found 8 chunks...
	 --> Processing chunk 1/8 ...
Using XROOTD
run_numb= 28150


356it [00:04, 84.78it/s] 


--> Got enough waveforms (160000) from chunk 1/8 of run 28150
--> Now saving it to a pickle file ...
--> Successfully saved WaveformSet of run 28150
	 --> Now retrieving data for run = 28151

Your files are stored around the world. 
--> Processing run 28151: Found 8 chunks...
	 --> Processing chunk 1/8 ...
Using XROOTD
run_numb= 28151


357it [00:04, 86.93it/s] 


--> Got enough waveforms (160000) from chunk 1/8 of run 28151
--> Now saving it to a pickle file ...
--> Successfully saved WaveformSet of run 28151
	 --> Now retrieving data for run = 28152

Your files are stored around the world. 
--> Processing run 28152: Found 8 chunks...
	 --> Processing chunk 1/8 ...
Using XROOTD
run_numb= 28152


356it [00:04, 85.79it/s] 


--> Got enough waveforms (160000) from chunk 1/8 of run 28152
--> Now saving it to a pickle file ...
--> Successfully saved WaveformSet of run 28152
	 --> Now retrieving data for run = 28153

Your files are stored around the world. 
--> Processing run 28153: Found 8 chunks...
	 --> Processing chunk 1/8 ...
Using XROOTD
run_numb= 28153


356it [00:04, 82.53it/s]


--> Got enough waveforms (160000) from chunk 1/8 of run 28153
--> Now saving it to a pickle file ...
--> Successfully saved WaveformSet of run 28153
--> Now retrieving data for PDE = 0.45
	 --> Now retrieving data for run = 28159

Your files are stored around the world. 
--> Processing run 28159: Found 8 chunks...
	 --> Processing chunk 1/8 ...
Using XROOTD


# Debugging

In [None]:
for i in range(len(runs)):

    aux = rucio_filepaths_folderpath+f"/0{runs[i]}.txt"

    try:
        rucio_filepaths = reader.get_filepaths_from_rucio(aux)
    # Happens if there are no rucio filepaths for this run in rucio_filepaths_folderpath
    except Exception:
        print(f"--> WARNING: Did not find the rucio paths for run {runs[i]}. Skipping this run.")
        continue

    if verbose:
        print(f"--> Processing run {runs[i]} ({i+1}/{len(runs)}): Found {len(rucio_filepaths)} chunks...")

    fGoForAnotherChunk = True
    wvfs_left_to_read_for_this_run = average_wfs_per_channel * channels_no
    current_chunk_iterator = 0

    while fGoForAnotherChunk:

        if verbose:
            print(f"\t --> Processing chunk {current_chunk_iterator+1}/{len(rucio_filepaths)} ...")

        subsample = subsample_seed
        fReadSameChunkAgain = True

        while fReadSameChunkAgain:

            aux_wfset = reader.WaveformSet_from_hdf5_file( 
                rucio_filepaths[current_chunk_iterator],
                read_full_streaming_data=read_full_streaming_data
                subsample=subsample,
                # WaveformSet_from_hdf5_file apparently subsamples from
                # the [0, wvfm_count] range. Therefore, if we set
                # wvfm_count to wvfs_left_to_read_for_this_run we
                # will get, at most, wvfs_left_to_read_for_this_run/subsample
                wvfm_count=wvfs_left_to_read_for_this_run*subsample,
                )
            
            # In this case, we already have what we need for this run
            if len(aux_wfset.waveforms) == wvfs_left_to_read_for_this_run:
                fReadSameChunkAgain = False
                fGoForAnotherChunk = False

                if verbose:
                    print(f"--> Got enough waveforms ({len(aux_wfset.waveforms)}) "
                          f"from chunk {current_chunk_iterator+1}/{len(rucio_filepaths)} "
                          f"of run {runs[i]}")
                    print(f"--> Now saving it to a pickle file ...")

                dump_object_to_pickle(
                    aux_wfset,
                    saving_folderpath+f"{runs[i]}_chunk_{current_chunk_iterator}.pkl")
                
                if verbose:
                    try:
                        print(f"--> Switching to next run {runs[i+1]}")
                    # Happens if all runs were already processed
                    except IndexError:
                        pass

            # In this case, we need more waveforms for this run
            elif len(aux_wfset.waveforms) < wvfs_left_to_read_for_this_run:

                if verbose:
                    print(f"--> Didn't get enough waveforms from chunk "
                          f"{current_chunk_iterator+1}/{len(rucio_filepaths)} "
                          f"of run {runs[i]}")
                    print(f"--> Expected {wvfs_left_to_read_for_this_run}, but only read {len(aux_wfset.waveforms)}")

                # In this case, try to read the same file but with a finer subsampling
                if subsample > 1:
                    # fReadSameChunkAgain is True by default
                    subsample -= 1

                    if verbose:
                        print(f"--> Switching 'subsample' from {subsample+1} to {subsample} and reading it again...")

                # In this case, we read every waveform from this chunk, but we still
                # haven't got enough waveforms, so go for the following chunk
                else:
                    subsample = subsample_seed
                    fReadSameChunkAgain = False
                    # fGoForAnotherChunk is True by default

                    if verbose:
                        print(f"--> All of the waveforms from this chunk were read")
                        print(f"--> Saving them and proceeding to look for "
                              f"{wvfs_left_to_read_for_this_run-len(aux_wfset.waveforms)} "
                              f"(={wvfs_left_to_read_for_this_run}-{len(aux_wfset.waveforms)}) "
                              f"waveforms from the following chunk ({current_chunk_iterator+2}/{len(rucio_filepaths)}) "
                              f"of this run ({runs[i]}).")

                    dump_object_to_pickle(
                        aux_wfset,
                        saving_folderpath+f"{runs[i]}_chunk_{current_chunk_iterator}.pkl")

                    # Switch to next chunk
                    current_chunk_iterator += 1
                    # But only read the waveforms that we need to add up to 
                    # average_wfs_per_channel * channels_no
                    wvfs_left_to_read_for_this_run -= len(aux_wfset.waveforms)
                    
            # In this case, WaveformSet_from_hdf5_file() is misbehaving
            else:
                raise Exception(f"WaveformSet_from_hdf5_file() read more waveforms"
                                f" ({len(aux_wfset.waveforms)}) than specified (wvfm_count="
                                f"{average_wfs_per_channel * channels_no})")