
Fixing peaklet baseline bias #486

Merged
50 commits merged on Aug 25, 2021
Changes from 43 commits
b0ce5d9
Fixing peaklet baseline bias
WenzDaniel Jul 13, 2021
81fdc11
Fix multi-record_i problem
WenzDaniel Jul 19, 2021
b5a937c
Revert "Fix multi-record_i problem"
WenzDaniel Jul 19, 2021
3a1cfac
Fix record_i in multi-peaks
WenzDaniel Jul 19, 2021
1dd3de4
Fixed bug of wrong area compared to wf
WenzDaniel Jul 20, 2021
8951217
Revert order for record_i and time check
WenzDaniel Jul 21, 2021
2a76bdc
Add found next start in case peak ends
WenzDaniel Jul 21, 2021
9a8d53d
Changed beyond peak case for peak splitting within hit
WenzDaniel Jul 21, 2021
45f65de
Merge branch 'master' into fix_peaklet_bias2
WenzDaniel Jul 21, 2021
db346aa
Modified splitting test accordingly
WenzDaniel Jul 21, 2021
89a31c1
Extended tests according to change.
WenzDaniel Jul 22, 2021
7225799
Rename todo to please codefactor...
WenzDaniel Jul 22, 2021
e08662e
Fix empty inputs
WenzDaniel Jul 22, 2021
85df6cd
Allow integration bounds beyond record
WenzDaniel Jul 23, 2021
e4c50e9
Make find_hit_integration_bounds non private.
WenzDaniel Jul 23, 2021
3ab80c3
Unify return
WenzDaniel Jul 23, 2021
6361656
Remove return as things are modified inplace
WenzDaniel Jul 23, 2021
b1904c7
Add test for hit integration bounds
WenzDaniel Jul 23, 2021
29ce3e1
Refactored summed waveform to include hit integration bounds.
WenzDaniel Jul 23, 2021
feb88cb
Updated splitting accordingly
WenzDaniel Jul 23, 2021
5fd04f2
forgot left_hit_i
WenzDaniel Jul 23, 2021
130e896
Minor fixes
WenzDaniel Jul 23, 2021
7ddab03
Command in le/re bounds outside record
WenzDaniel Jul 23, 2021
bfc08b4
Fix peak area estimate
WenzDaniel Jul 24, 2021
9f5c0c6
Fix small bug n saturated channels
WenzDaniel Jul 26, 2021
fa01e89
Add additional arguments to function calls in tests
WenzDaniel Jul 26, 2021
7388f3a
Added test and small clean up.
WenzDaniel Jul 26, 2021
13c822f
Fixed test
WenzDaniel Jul 26, 2021
41782e9
Merge branch 'master' into fix_peaklet_bias2
WenzDaniel Jul 29, 2021
8a7cefb
Updated test to obey peak_finding rules for hits
WenzDaniel Aug 2, 2021
49d86d6
Merge branch 'fix_peaklet_bias2' of https://github.com/AxFoundation/s…
WenzDaniel Aug 2, 2021
558a365
Merge branch 'master' into fix_peaklet_bias2
WenzDaniel Aug 2, 2021
c64157a
Merge branch 'master' into fix_peaklet_bias2
WenzDaniel Aug 3, 2021
366304b
Merge branch 'master' into fix_peaklet_bias2
WenzDaniel Aug 5, 2021
8979d8f
Remove print statements
WenzDaniel Aug 11, 2021
7a10f02
Change hit_waveform to buffer
WenzDaniel Aug 11, 2021
10ca471
Updated doc string
WenzDaniel Aug 11, 2021
b7ef9e7
Small fix
WenzDaniel Aug 11, 2021
2e0189e
Revert "Updated doc string"
WenzDaniel Aug 11, 2021
a3ad937
Add docs again
WenzDaniel Aug 11, 2021
f4a4524
Merge branch 'master' into fix_peaklet_bias2
WenzDaniel Aug 11, 2021
537f9c8
Updated doc string of peak splitting
WenzDaniel Aug 11, 2021
1fc700f
Merge branch 'fix_peaklet_bias2' of https://github.com/AxFoundation/s…
WenzDaniel Aug 11, 2021
992b5e6
Merge branch 'master' into fix_peaklet_bias2
WenzDaniel Aug 20, 2021
f86e8db
Update doc-string
WenzDaniel Aug 20, 2021
b00a385
Merge branch 'fix_peaklet_bias2' of https://github.com/AxFoundation/s…
WenzDaniel Aug 20, 2021
1131914
Refactored function changed doc string removed todo
WenzDaniel Aug 20, 2021
ed1958f
Merge branch 'master' into fix_peaklet_bias2
WenzDaniel Aug 20, 2021
0eb1a04
Remove comment
WenzDaniel Aug 20, 2021
c114788
Merge branch 'fix_peaklet_bias2' of https://github.com/AxFoundation/s…
WenzDaniel Aug 20, 2021
167 changes: 117 additions & 50 deletions strax/processing/peak_building.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,17 @@ def store_downsampled_waveform(p, wv_buffer):

@export
@numba.jit(nopython=True, nogil=True, cache=True)
def sum_waveform(peaks, records, adc_to_pe, select_peaks_indices=None):
"""Compute sum waveforms for all peaks in peaks
def sum_waveform(peaks, hits, records, record_links, adc_to_pe, select_peaks_indices=None):
"""Compute sum waveforms for all peaks in peaks. Only builds the summed
waveform over regions in which hits were found. This is required
to avoid any bias due to zero-padding and baselining.
Will downsample sum waveforms if they do not fit in the per-peak buffer.

:param peaks: Peaks for which the summed waveform should be built.
:param hits: Hits which are inside peaks. Must be sorted according
to record_i.
:param records: Records to be used to build peaks.
:param record_links: Tuple of previous and next records.
:param select_peaks_indices: Indices of the peaks for partial
processing. In the form of np.array([np.int, np.int, ..]). If
None (default), all the peaks are used for the summation.
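Throughout the new code, hit/peak and hit/record alignment goes through strax.overlap_indices. As a rough illustration of its semantics (a pure-Python sketch, not the numba-compiled strax implementation):

```python
# Hypothetical pure-Python sketch of strax.overlap_indices semantics.
def overlap_indices(a_start, a_length, b_start, b_length):
    """Return ((a_lo, a_hi), (b_lo, b_hi)): slice bounds of the overlap
    of interval a with interval b, in each interval's own index space."""
    lo = max(a_start, b_start)                        # first shared sample
    hi = min(a_start + a_length, b_start + b_length)  # one past the last
    if hi <= lo:
        return (0, 0), (0, 0)                         # no overlap
    return (lo - a_start, hi - a_start), (lo - b_start, hi - b_start)

# A hit starting at sample 10 (8 samples) inside a peak starting at sample 8:
print(overlap_indices(10, 8, 8, 20))  # → ((0, 8), (2, 10))
```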
Expand All @@ -173,18 +180,21 @@ def sum_waveform(peaks, records, adc_to_pe, select_peaks_indices=None):
if not len(select_peaks_indices):
return
dt = records[0]['dt']
n_samples_record = len(records[0]['data'])
prev_record_i, next_record_i = record_links

# Big buffer to hold even largest sum waveforms
# Need a little more even for downsampling..
swv_buffer = np.zeros(peaks['length'].max() * 2, dtype=np.float32)

# Index of first record that could still contribute to subsequent peaks
# Records before this do not need to be considered anymore
left_r_i = 0

n_channels = len(peaks[0]['area_per_channel'])
area_per_channel = np.zeros(n_channels, dtype=np.float32)

# Hit index for hits in peaks
left_h_i = 0
# Create hit waveform buffer
hit_waveform = np.zeros(hits['length'].max(), dtype=np.float32)

for peak_i in select_peaks_indices:
p = peaks[peak_i]
# Clear the relevant part of the swv buffer for use
Expand All @@ -197,55 +207,68 @@ def sum_waveform(peaks, records, adc_to_pe, select_peaks_indices=None):
area_per_channel *= 0
p['area'] = 0

# Find first record that contributes to this peak
for left_r_i in range(left_r_i, len(records)):
r = records[left_r_i]
# Find first hit that contributes to this peak
for left_h_i in range(left_h_i, len(hits)):
h = hits[left_h_i]
# TODO: need test that fails if we replace < with <= here
if p['time'] < r['time'] + r['length'] * dt:
if p['time'] < h['time'] + h['length'] * dt:
break
else:
# Records exhausted before peaks exhausted
# Hits exhausted before peaks exhausted
# TODO: this is a strange case, maybe raise warning/error?
break

# Scan over records that overlap
for right_r_i in range(left_r_i, len(records)):
r = records[right_r_i]
ch = r['channel']
multiplier = 2**r['amplitude_bit_shift']
assert p['dt'] == r['dt'], "Records and peaks must have same dt"
# Scan over hits that overlap with peak
for right_h_i in range(left_h_i, len(hits)):
h = hits[right_h_i]
record_i = h['record_i']
ch = h['channel']
assert p['dt'] == h['dt'], "Hits and peaks must have same dt"

shift = (p['time'] - r['time']) // dt
n_r = r['length']
n_p = p_length
shift = (p['time'] - h['time']) // dt
n_samples_hit = h['length']
n_samples_peak = p_length

if shift <= -n_p:
# Record is completely to the right of the peak;
if shift <= -n_samples_peak:
# Hit is completely to the right of the peak;
# we've seen all overlapping records
break

if n_r <= shift:
if n_samples_hit <= shift:
# The (real) data in this record does not actually overlap
# with the peak
# (although a previous, longer record did overlap)
# (although a previous, longer hit did overlap)
continue

(r_start, r_end), (p_start, p_end) = strax.overlap_indices(
r['time'] // dt, n_r,
p['time'] // dt, n_p)
# Get overlapping samples between hit and peak:
(h_start, h_end), (p_start, p_end) = strax.overlap_indices(
h['time'] // dt, n_samples_hit,
p['time'] // dt, n_samples_peak)

hit_waveform[:] = 0

# Get record which belongs to main part of hit (wo integration bounds):
r = records[record_i]

is_saturated = _build_hit_waveform(h, r, hit_waveform)

max_in_record = r['data'][r_start:r_end].max() * multiplier
p['saturated_channel'][ch] |= np.int8(max_in_record >= np.int16(r['baseline']))
# Now check if we also have to go to prev/next record due to integration bounds.
# If bounds are outside of peak we chop when building the summed waveform later.
if h['left_integration'] < 0 and prev_record_i[record_i] != -1:
r = records[prev_record_i[record_i]]
is_saturated |= _build_hit_waveform(h, r, hit_waveform)

bl_fpart = r['baseline'] % 1
# TODO: check numba does casting correctly here!
pe_waveform = adc_to_pe[ch] * (
multiplier * r['data'][r_start:r_end]
+ bl_fpart)
if h['right_integration'] > n_samples_record and next_record_i[record_i] != -1:
r = records[next_record_i[record_i]]
is_saturated |= _build_hit_waveform(h, r, hit_waveform)

swv_buffer[p_start:p_end] += pe_waveform
p['saturated_channel'][ch] |= is_saturated

area_pe = pe_waveform.sum()
hit_data = hit_waveform[h_start:h_end]
hit_data *= adc_to_pe[ch]
swv_buffer[p_start:p_end] += hit_data

area_pe = hit_data.sum()
area_per_channel[ch] += area_pe
p['area'] += area_pe

Expand All @@ -255,6 +278,31 @@ def sum_waveform(peaks, records, adc_to_pe, select_peaks_indices=None):
p['area_per_channel'][:] = area_per_channel


@numba.njit(cache=True, nogil=True)
def _build_hit_waveform(hit, record, hit_waveform):
"""
Adds information for overlapping record and hit to hit_waveform.
Updates hit_waveform inplace. Result is still in ADC counts.

:returns: Boolean if record saturated within the hit.
"""
(h_start_record, h_end_record), (r_start, r_end) = strax.overlap_indices(
hit['time'] // hit['dt'], hit['length'],
record['time'] // record['dt'], record['length'])

# Get record properties:
record_data = record['data'][r_start:r_end]
multiplier = 2**record['amplitude_bit_shift']
bl_fpart = record['baseline'] % 1
# TODO: check numba does casting correctly here!
max_in_record = record_data.max() * multiplier

# Build hit waveform:
hit_waveform[h_start_record:h_end_record] = (multiplier * record_data + bl_fpart)

return np.int8(max_in_record >= np.int16(record['baseline']))
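The arithmetic inside _build_hit_waveform can be sketched without numba as follows. This is a dict-based illustration under the assumption that records store bit-shifted ADC counts and an integerized baseline; build_hit_waveform here is a hypothetical stand-in, not the strax function:

```python
import numpy as np

# Hedged sketch of the hit-waveform reconstruction done by
# _build_hit_waveform; field names follow the strax record dtype,
# but plain dicts stand in for structured arrays.
def build_hit_waveform(hit, record, hit_waveform):
    dt = record['dt']
    # Overlap of hit and record, in samples since time zero:
    start = max(hit['time'], record['time']) // dt
    end = min(hit['time'] + hit['length'] * dt,
              record['time'] + record['length'] * dt) // dt
    h_lo, h_hi = start - hit['time'] // dt, end - hit['time'] // dt
    r_lo, r_hi = start - record['time'] // dt, end - record['time'] // dt

    data = record['data'][r_lo:r_hi]
    multiplier = 2 ** record['amplitude_bit_shift']  # undo dynamic-range bit shift
    bl_fpart = record['baseline'] % 1                # re-add fractional baseline

    hit_waveform[h_lo:h_hi] = multiplier * data + bl_fpart
    # Channel counts as saturated if the restored amplitude reaches the baseline:
    return bool(data.max() * multiplier >= int(record['baseline']))

record = {'dt': 10, 'time': 0, 'length': 10, 'data': np.arange(1, 11),
          'amplitude_bit_shift': 1, 'baseline': 20.5}
hit = {'time': 20, 'length': 3, 'dt': 10}
wf = np.zeros(3)
print(build_hit_waveform(hit, record, wf), wf)
```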


@export
def find_peak_groups(peaks, gap_threshold,
left_extension=0, right_extension=0,
Expand Down Expand Up @@ -292,13 +340,26 @@ def find_peak_groups(peaks, gap_threshold,
##
# Lone hit integration
##

@export
@numba.njit(nogil=True, cache=True)
def _find_hit_integration_bounds(
lone_hits, peaks, records, save_outside_hits, n_channels):
""""Update lone hits to include integration bounds

save_outside_hits: in ns!!
def find_hit_integration_bounds(
lone_hits, peaks, records, save_outside_hits, n_channels,
allow_bounds_beyond_records=False):
"""Update (lone) hits to include integration bounds. Note that
the time and length of the original hit are not changed!

:param lone_hits: Hits or lone hits which should be extended by
integration bounds.
:param peaks: Regions into which hits must not extend, e.g. peaks
for lone hits. If not needed, pass a zero-length
strax.time_fields array.
:param records: Records in which the hits were found.
:param save_outside_hits: Hit extension to the left and right in
ns, not samples!
:param n_channels: Number of channels for the given detector.
:param allow_bounds_beyond_records: If True, extend left/
right_integration beyond record boundaries, e.g. to negative
samples on the left side.
"""
result = np.zeros((len(lone_hits), 2), dtype=np.int64)
if not len(lone_hits):
Expand Down Expand Up @@ -330,12 +391,18 @@ def _find_hit_integration_bounds(
# Ensure we do not integrate parts of peaks
# or (at least for now) beyond the record in which the hit was found
r = records[h['record_i']]
result[hit_i][0] = max(prev_p_end,
r['time'],
result[hit_i][0])
result[hit_i][1] = min(next_p_start,
strax.endtime(r),
result[hit_i][1])
if allow_bounds_beyond_records:
result[hit_i][0] = max(prev_p_end,
result[hit_i][0])
result[hit_i][1] = min(next_p_start,
result[hit_i][1])
else:
result[hit_i][0] = max(prev_p_end,
r['time'],
result[hit_i][0])
result[hit_i][1] = min(next_p_start,
strax.endtime(r),
result[hit_i][1])
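The branch above chooses between two clamps: always stay out of neighbouring peaks, and optionally also stay inside the hit's own record. The same logic for a single hit, with plain integers and hypothetical names:

```python
def clamp_bounds(left, right, prev_peak_end, next_peak_start,
                 record_start, record_end, allow_beyond_records=False):
    """Clamp a hit's integration bounds so they never reach into a
    neighbouring peak and, unless allowed, never leave the record."""
    left = max(left, prev_peak_end)
    right = min(right, next_peak_start)
    if not allow_beyond_records:
        left = max(left, record_start)
        right = min(right, record_end)
    return left, right

# Extension wants [-20, 120] but the record only covers [0, 100]:
print(clamp_bounds(-20, 120, -10**9, 10**9, 0, 100))       # → (0, 100)
print(clamp_bounds(-20, 120, -10**9, 10**9, 0, 100,
                   allow_beyond_records=True))             # → (-20, 120)
```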

if last_hit_index[ch] != NO_EARLIER_HIT:
# Ensure previous hit does not integrate the over-threshold region
Expand Down Expand Up @@ -373,8 +440,8 @@ def integrate_lone_hits(

TODO: this doesn't extend the integration range beyond record boundaries
"""
_find_hit_integration_bounds(
lone_hits, peaks, records, save_outside_hits, n_channels)
find_hit_integration_bounds(lone_hits, peaks, records, save_outside_hits,
n_channels)
for hit_i, h in enumerate(lone_hits):
r = records[h['record_i']]
start, end = h['left_integration'], h['right_integration']
Expand Down
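integrate_lone_hits then simply sums the record data between the bounds just computed. A hedged, dict-based sketch of the per-hit step (integrate_lone_hit is a hypothetical helper; bounds are assumed relative to the record start):

```python
import numpy as np

def integrate_lone_hit(hit, record, adc_to_pe):
    """Integrate one lone hit between its (already clamped)
    integration bounds; returns the area in PE."""
    start, end = hit['left_integration'], hit['right_integration']
    data = record['data'][start:end].astype(np.float64)
    multiplier = 2 ** record['amplitude_bit_shift']  # undo bit shift
    bl_fpart = record['baseline'] % 1                # fractional baseline
    return adc_to_pe * float((multiplier * data + bl_fpart).sum())

# Three samples of 10 ADC, no bit shift, integer baseline, 0.5 PE/ADC:
hit = {'left_integration': 0, 'right_integration': 3}
record = {'data': np.array([10, 10, 10]), 'amplitude_bit_shift': 0,
          'baseline': 100.0}
print(integrate_lone_hit(hit, record, 0.5))  # → 15.0
```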
19 changes: 14 additions & 5 deletions strax/processing/peak_splitting.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,22 @@


@export
def split_peaks(peaks, records, to_pe, algorithm='local_minimum',
def split_peaks(peaks, hits, records, rlinks, to_pe, algorithm='local_minimum',
data_type='peaks', **kwargs):
"""Return peaks split according to algorithm, with waveforms summed
and widths computed.

Note:
Can also be used for hitlet splitting with the local_minimum
splitter; just pass hitlets instead of peaks.

:param peaks: Original peaks. Sum waveform must have been built
and properties must have been computed (if you use them)
:param hits: Hits found in records (or None in the case of hitlet
splitting).
:param records: Records from which peaks were built
:param rlinks: strax.record_links for the given records
(or None in the case of hitlet splitting.)
:param to_pe: ADC to PE conversion factor array (of n_channels)
:param algorithm: 'local_minimum' or 'natural_breaks'.
:param data_type: 'peaks' or 'hitlets'. Specifies whether to use
Expand All @@ -29,7 +37,7 @@ def split_peaks(peaks, records, to_pe, algorithm='local_minimum',
data_type_is_not_supported = data_type not in ('hitlets', 'peaks')
if data_type_is_not_supported:
raise TypeError(f'Data_type "{data_type}" is not supported.')
return splitter(peaks, records, to_pe, data_type, **kwargs)
return splitter(peaks, hits, records, rlinks, to_pe, data_type, **kwargs)


NO_MORE_SPLITS = -9999999
Expand All @@ -40,6 +48,7 @@ class PeakSplitter:
:param peaks: Original peaks. Sum waveform must have been built
and properties must have been computed (if you use them).
:param records: Records from which peaks were built.
:param rlinks: strax.record_links for given records.
:param to_pe: ADC to PE conversion factor array (of n_channels).
:param data_type: 'peaks' or 'hitlets'. Specifies whether to use
sum_waveform or get_hitlets_data to compute the waveform of the
Expand All @@ -55,7 +64,7 @@ class PeakSplitter:
"""
find_split_args_defaults: tuple

def __call__(self, peaks, records, to_pe, data_type,
def __call__(self, peaks, hits, records, rlinks, to_pe, data_type,
do_iterations=1, min_area=0, **kwargs):
if not len(records) or not len(peaks) or not do_iterations:
return peaks
Expand Down Expand Up @@ -93,14 +102,14 @@ def __call__(self, peaks, records, to_pe, data_type,
if is_split.sum() != 0:
# Found new peaks: compute basic properties
if data_type == 'peaks':
strax.sum_waveform(new_peaks, records, to_pe)
strax.sum_waveform(new_peaks, hits, records, rlinks, to_pe)
strax.compute_widths(new_peaks)
elif data_type == 'hitlets':
# Add record fields here
new_peaks = strax.sort_by_time(new_peaks) # Hitlets are not necessarily sorted after splitting
new_peaks = strax.get_hitlets_data(new_peaks, records, to_pe)
# ... and recurse (if needed)
new_peaks = self(new_peaks, records, to_pe, data_type,
new_peaks = self(new_peaks, hits, records, rlinks, to_pe, data_type,
do_iterations=do_iterations - 1,
min_area=min_area, **kwargs)
if np.any(new_peaks['length'] == 0):
Expand Down
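The recursion in __call__ (calling self with do_iterations - 1) follows a simple pattern: split, re-derive properties, and recurse only while new fragments appear. A toy list-based sketch of that pattern (split_recursive and at_min are hypothetical, not strax APIs):

```python
def split_recursive(segments, find_split, do_iterations=3, min_size=2):
    """Split each segment at the index returned by find_split (or None),
    then recurse on the result until no split happens or the
    iteration budget runs out."""
    if not do_iterations:
        return segments
    out, split_any = [], False
    for seg in segments:
        i = find_split(seg) if len(seg) >= min_size else None
        if i is None:
            out.append(seg)
        else:
            out.extend([seg[:i], seg[i:]])
            split_any = True
    if split_any:  # only recurse if something changed
        return split_recursive(out, find_split, do_iterations - 1, min_size)
    return out

def at_min(seg):
    """Split at the segment's minimum, unless it sits on an edge."""
    i = seg.index(min(seg))
    return i if 0 < i < len(seg) else None

print(split_recursive([[5, 3, 1, 4, 2, 6]], at_min))  # → [[5], [3], [1, 4, 2, 6]]
```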
54 changes: 54 additions & 0 deletions tests/test_lone_hit_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from strax.testutils import several_fake_records
import numpy as np
from hypothesis import given, settings
import hypothesis.strategies as st

import strax


@settings(deadline=None)
@given(several_fake_records,
st.integers(min_value=0, max_value=100),
st.integers(min_value=0, max_value=100),
)
def test_lone_hits_integration_bounds(records, left_extension, right_extension):
"""
Loops over hits and tests if integration bounds overlap.
"""
n_channel = 0
if len(records):
n_channel = records['channel'].max()+1

hits = strax.find_hits(records, np.ones(n_channel))

strax.find_hit_integration_bounds(hits,
np.zeros(0, dtype=strax.time_dt_fields),
records,
(left_extension, right_extension),
n_channel,
allow_bounds_beyond_records=False
)
_test_overlap(hits)

hits['left_integration'] = 0
hits['right_integration'] = 0

strax.find_hit_integration_bounds(hits,
np.zeros(0, dtype=strax.time_dt_fields),
records,
(left_extension, right_extension),
n_channel,
allow_bounds_beyond_records=True
)
_test_overlap(hits)


def _test_overlap(hits):
tester = np.zeros(len(hits), dtype=strax.time_fields)
tester['time'] = hits['time'] - (hits['left_integration'] - hits['left'])*hits['dt']
tester['endtime'] = hits['time'] + (hits['right_integration'] - hits['left'])*hits['dt']

for ch in np.unique(hits['channel']):
mask = hits['channel'] == ch
test_ch = np.all((tester[mask]['endtime'][:-1] - tester[mask]['time'][1:]) <= 0)
assert np.all(test_ch), 'Hits overlap!'
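The check in _test_overlap relies on hits within a channel being time-sorted, so comparing each extended end time with the next start suffices. The same invariant as a standalone sketch:

```python
import numpy as np

def intervals_overlap(starts, ends):
    """True if any pair of time-sorted intervals overlaps."""
    order = np.argsort(starts)
    starts, ends = starts[order], ends[order]
    # With sorted starts, only adjacent intervals can overlap:
    return bool(np.any(ends[:-1] > starts[1:]))

print(intervals_overlap(np.array([0, 10, 20]), np.array([10, 20, 30])))  # → False
print(intervals_overlap(np.array([0, 10]), np.array([15, 20])))          # → True
```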