Add chunk yielding plugin and tests #769

Merged: 9 commits, Nov 23, 2023
1 change: 1 addition & 0 deletions strax/plugins/__init__.py
@@ -4,3 +4,4 @@
from .merge_only_plugin import *
from .overlap_window_plugin import *
from .parrallel_source_plugin import *
from .down_chunking_plugin import *
40 changes: 40 additions & 0 deletions strax/plugins/down_chunking_plugin.py
@@ -0,0 +1,40 @@
import strax
import types
import inspect
from .plugin import Plugin
export, __all__ = strax.exporter()


##
# Plugin which allows the use of yield in a plugin's compute method.
# Allows chunking down the output before storing it to disk.
# Only works if multiprocessing is omitted.
##

@export
class DownChunkingPlugin(Plugin):
    """Plugin whose compute method may yield several smaller chunks per input
    chunk, so the output can be chunked down before it is stored to disk.
    Not supported with multi-threading/processing.
    """

def iter(self, iters, executor=None):

        _plugin_uses_multi_threading = (
            self.parallel
            and executor is not None
            and inspect.isgeneratorfunction(self.compute)
        )
if _plugin_uses_multi_threading:
raise NotImplementedError(
                f'Plugin "{self.__class__.__name__}" uses a generator as its compute method. '
                'This is not supported with multi-threading/processing.')
return super().iter(iters, executor=None)

def _iter_return(self, chunk_i, **inputs_merged):
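        # For a yielding compute method, do_compute returns a generator of
        # chunks (see _fix_output below), which the base iter() then yields from.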
return self.do_compute(chunk_i=chunk_i, **inputs_merged)

def _fix_output(self, result, start, end, _dtype=None):
"""Wrapper around _fix_output to support the return
of iterators.
"""
if isinstance(result, types.GeneratorType):
return result
return super()._fix_output(result, start, end)
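
Illustrative sketch (not part of this diff): one way a plugin built on DownChunkingPlugin could yield several chunks per compute call. The class name, provides target, and midpoint split are made up for illustration and assume no record straddles the midpoint; DownSampleRecords in strax/testutils.py further down is the test implementation actually added by this PR.

class SplitInTwo(strax.DownChunkingPlugin):
    provides = 'records_split'
    depends_on = 'records'
    dtype = strax.record_dtype()
    rechunk_on_save = False

    def compute(self, records, start, end):
        # Split each input chunk at its midpoint and yield two chunks.
        middle = start + (end - start) // 2
        below = records[strax.endtime(records) <= middle]
        above = records[strax.endtime(records) > middle]
        yield self.chunk(start=start, end=middle, data=below)
        yield self.chunk(start=middle, end=end, data=above)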
6 changes: 5 additions & 1 deletion strax/plugins/plugin.py
@@ -496,7 +496,7 @@ class IterDone(Exception):
pending_futures = [f for f in pending_futures if not f.done()]
yield new_future
else:
yield self.do_compute(chunk_i=chunk_i, **inputs_merged)
yield from self._iter_return(chunk_i=chunk_i, **inputs_merged)

except IterDone:
# Check all sources are exhausted.
@@ -523,6 +523,10 @@ class IterDone(Exception):
finally:
self.cleanup(wait_for=pending_futures)

def _iter_return(self, chunk_i, **inputs_merged):
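        # Default behaviour: wrap the single output of do_compute as a
        # one-chunk generator. DownChunkingPlugin overrides this to return the
        # generator produced by a yielding compute method instead.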
yield self.do_compute(chunk_i=chunk_i, **inputs_merged)


def cleanup(self, wait_for):
pass
# A standard plugin doesn't need to do anything here
68 changes: 68 additions & 0 deletions strax/testutils.py
@@ -260,6 +260,74 @@ def compute(self, peaks):
return dict(peak_classification=p,
lone_hits=lh)


# Plugins with time structure within chunks,
# used to test down chunking within plugin compute.
@strax.takes_config(
strax.Option('n_chunks', type=int, default=10, track=False),
strax.Option('recs_per_chunk', type=int, default=10, track=False),
)
class RecordsWithTimeStructure(strax.Plugin):
provides = 'records'
parallel = 'process'
depends_on = tuple()
dtype = strax.record_dtype()

rechunk_on_save = False

def source_finished(self):
return True

def is_ready(self, chunk_i):
return chunk_i < self.config['n_chunks']

def setup(self):
self.last_end = 0

def compute(self, chunk_i):

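        # Give the records some time structure within the chunk: the first
        # record starts 5 ns after the chunk start and the chunk end extends
        # beyond the last record.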
r = np.zeros(self.config['recs_per_chunk'], self.dtype)
r['time'] = self.last_end + np.arange(self.config['recs_per_chunk']) + 5
r['length'] = r['dt'] = 1
r['channel'] = np.arange(len(r))

end = self.last_end + self.config['recs_per_chunk'] + 10
chunk = self.chunk(start=self.last_end, end=end, data=r)
self.last_end = end

return chunk


class DownSampleRecords(strax.DownChunkingPlugin):
    """Plugin to test the down-chunking of chunks during compute. Needed
    for simulations.
    """

provides = 'records_down_chunked'
depends_on = 'records'
dtype = strax.record_dtype()
rechunk_on_save = False
    parallel = 'process'

def compute(self, records, start, end):
offset = 0
last_start = start

        count = 0
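        # Split each incoming chunk in two: the first five records are emitted
        # as their own chunk once the loop reaches them, the remainder after
        # the loop has finished.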
for count, r in enumerate(records):
if count == 5:
res = records[offset:count]
chunk_end = np.max(strax.endtime(res))
offset = count
chunk = self.chunk(start=last_start, end=chunk_end, data=res)
last_start = chunk_end
yield chunk

res = records[offset:count+1]
chunk = self.chunk(start=last_start, end=end, data=res)
yield chunk


# Used in test_core.py
run_id = '0'

35 changes: 32 additions & 3 deletions tests/test_context.py
@@ -1,5 +1,5 @@
import strax
from strax.testutils import Records, Peaks, PeaksWoPerRunDefault, PeakClassification, run_id
from strax.testutils import Records, Peaks, PeaksWoPerRunDefault, PeakClassification, RecordsWithTimeStructure, DownSampleRecords, run_id
import tempfile
import numpy as np
from hypothesis import given, settings
@@ -215,6 +215,34 @@ def tearDown(self):
if os.path.exists(self.tempdir):
shutil.rmtree(self.tempdir)

def test_down_chunking(self):
st = self.get_context(False)
st.register(RecordsWithTimeStructure)
st.register(DownSampleRecords)

st.make(run_id, 'records')
st.make(run_id, 'records_down_chunked')

chunks_records = st.get_meta(run_id, 'records')['chunks']
chunks_records_down_chunked = st.get_meta(run_id, 'records_down_chunked')['chunks']

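        # Each input chunk should be split into two contiguous output chunks.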
        _chunks_are_downsampled = len(chunks_records) * 2 == len(chunks_records_down_chunked)
        assert _chunks_are_downsampled

        _chunks_are_continuous = np.all(
            [chunks_records_down_chunked[i]['end'] == chunks_records_down_chunked[i + 1]['start']
             for i in range(len(chunks_records_down_chunked) - 1)])
        assert _chunks_are_continuous

def test_down_chunking_multi_processing(self):
st = self.get_context(False, allow_multiprocess=True)
st.set_context_config({'use_per_run_defaults': False})
st.register(RecordsWithTimeStructure)
st.register(DownSampleRecords)

st.make(run_id, 'records', max_workers=1)
with self.assertRaises(NotImplementedError):
st.make(run_id, 'records_down_chunked', max_workers=2)


def test_get_plugins_with_cache(self):
st = self.get_context(False)
st.register(Records)
Expand Down Expand Up @@ -283,11 +311,12 @@ def test_deregister(self):
st.deregister_plugins_with_missing_dependencies()
assert st._plugin_class_registry.pop('peaks', None) is None

def get_context(self, use_defaults):
def get_context(self, use_defaults, **kwargs):
"""Get simple context where we have one mock run in the only storage frontend"""
assert isinstance(use_defaults, bool)
st = strax.Context(storage=self.get_mock_sf(),
check_available=('records',)
check_available=('records',),
**kwargs
)
st.set_context_config({'use_per_run_defaults': use_defaults})
return st