Merge pull request #235 from AxFoundation/chunk_metadata

Explicit time tracking, remove raw_records fields
AxFoundation · Mar 1, 2020 · 4cea268 · 4cea268
2 parents 427d923 + 4a1b082
commit 4cea268
Show file tree

Hide file tree

Showing 28 changed files with 1,189 additions and 1,045 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.8.8
+current_version = 0.9.0
 files = setup.py strax/__init__.py docs/source/conf.py
 commit = True
 tag = True

diff --git a/.pylintrc b/.pylintrc
@@ -0,0 +1,8 @@
+[MESSAGES CONTROL]
+# Jelle: CodeFactor has a whitelist of pylint messages
+# I removed:
+#  - cyclic-import    (we use this all the time in strax, see __init__.py)
+#  - no-else-return   (I think this makes sense for symmetric conditions, see https://dmerej.info/blog/post/else-after-return-yea-or-nay/)
+#  - len-as-condition (if you do 'if data' on a numpy array it will crash)
+disable=all
+enable=assert-on-tuple,astroid-error,bad-except-order,bad-inline-option,bad-option-value,bad-reversed-sequence,bare-except,binary-op-exception,boolean-datetime,catching-non-exception,cell-var-from-loop,confusing-with-statement,consider-merging-isinstance,consider-using-enumerate,consider-using-ternary,continue-in-finally,deprecated-pragma,django-not-available,duplicate-except,duplicate-key,eval-used,exec-used,expression-not-assigned,fatal,file-ignored,fixme,global-at-module-level,global-statement,global-variable-not-assigned,global-variable-undefined,http-response-with-content-type-json,http-response-with-json-dumps,invalid-all-object,invalid-characters-in-docstring,literal-comparison,locally-disabled,locally-enabled,lost-exception,lowercase-l-suffix,misplaced-bare-raise,missing-final-newline,missing-kwoa,mixed-line-endings,model-has-unicode,model-missing-unicode,model-no-explicit-unicode,model-unicode-not-callable,multiple-imports,multiple-statements,new-db-field-with-default,no-else-raise,non-ascii-bytes-literals,nonexistent-operator,not-an-iterable,not-in-loop,notimplemented-raised,overlapping-except,parse-error,pointless-statement,pointless-string-statement,raising-bad-type,raising-non-exception,raw-checker-failed,redefine-in-handler,redefined-argument-from-local,redefined-builtin,redundant-content-type-for-json-response,reimported,relative-import,return-outside-function,simplifiable-if-statement,singleton-comparison,syntax-error,trailing-comma-tuple,trailing-newlines,unbalanced-tuple-unpacking,undefined-all-variable,undefined-loop-variable,unexpected-line-ending-format,unidiomatic-typecheck,unnecessary-lambda,unnecessary-pass,unnecessary-semicolon,unneeded-not,unpacking-non-sequence,unreachable,unrecognized-inline-option,used-before-assignment,useless-else-on-loop,using-constant-test,wildcard-import,yield-outside-function,useless-return
diff --git a/.travis.yml b/.travis.yml
@@ -25,7 +25,7 @@ jobs:
     - name: "Python 3.7"
       env: PYTHON=3.7 DEPLOY_ME=true
     - name: "Python 3.7 numbaless (for coverage)"
-      env: PYTHON=3.7 NUMBA_DISABLE_JIT=true
+      env: PYTHON=3.7 NUMBA_DISABLE_JIT=1
     - name: "Python 3.6 (legacy)"
       env: PYTHON=3.6
 

diff --git a/docs/source/advanced/chunking.rst b/docs/source/advanced/chunking.rst
@@ -0,0 +1,51 @@
+Strax data model
+=================
+
+Data type and kind
+-------------------
+
+All data lives in *data types*, such as `raw_records` or `peak_basics`. Each of these has a fixed numpy datatype.
+
+If a single row of two data types refers to the same physical / logical thing, such as an event or a peak, we say those data types have the same `data_kind`.
+
+
+The Laws of Chunking
+---------------------
+You shall obey them.
+
+1. Each data row corresponds to a time interval. Time and (endtime or (dt and length)) are mandatory fields in all datatypes.
+2. Strax handles data in chunks. A chunk is also an interval (containing rows of data which are individually intervals).
+3. Suppose you have a chunk of some datatype reaching from [t0, t1), then
+
+   a. It contains all and only data that starts >= t0 and ends <= t1;
+   b. All data outside the chunk ends <= t0, or starts >= t1. (Remember intervals are half-open; the boundary cases are not ambiguous.)
+   c. In particular, every data row lies completely in one chunk. No data whatsoever lies partially in more than one chunk. This means chunks cannot be split at arbitrary times.
+
+4. Zero-duration data rows are not allowed. Zero-duration chunks are allowed, but they cannot contain data.
+
+
+Incoming data
+-------------
+From the perspective of a plugin, all incoming data is time-synchronized and merged by kind. Specifically:
+
+* Data of the same kind is merged into a single array. If you depend on `events`, `peaks` and `peak_basics`, you will get two arrays: `events` and `peaks`. The second will be the merged array of `peaks` and `peak_basics`.
+* Data of different kinds is synchronized by time. Strax will fetch a chunk of the first kind (`events`), then fetch as much as needed from the second kind (`peaks`) until you have all peaks that end before or at exactly the same time as the last event.
+
+This example is a bit odd: when loading data of multiple kinds that are contained in each other, e.g. events and peaks, you very often want to use a `LoopPlugin` rather than a straight-up Plugin.
+
+Outgoing data
+-------------
+Plugins can chunk their output as they wish, including withholding some data until the next chunk is sent out. Of course this requires keeping state, which means you cannot parallelize: see the chunk boundary handling section later in this documentation.
+
+Savers, too, are free to chunk their data as they like; for example, to create files of convenient sizes. This affects the chunks you get when loading or reprocessing data. If you don't want this, e.g. if the next plugin in line assumes a particular kind of chunking you want to preserve, set the attribute `rechunk_on_save = False`.
+
+
+Sorted output requirement
+--------------------------
+Strax requires that all output is sorted by time inside chunks.
+
+Additionally, most or all plugins will assume that incoming data is time-ordered between chunks. That is, a subsequent chunk should not contain any data that starts before an item from a previous chunk ends. Strax data must either consist of disjoint things, or if there are overlaps, chunk boundaries must fall in places where gaps exist.
+
+It would be much harder to code an algorithm if you did not know when you have seen all input before a certain time. Essentially you would have to wait until the end of the run before you can process any data, which goes against the idea of processing your data as a stream.
+
+If your plugin removes or adds items from the original incoming array, it must output a different *data kind*. For example, during the initial data reduction steps, we remove items from 'raw_records' to make a new data kind 'records'. Here we change data kind, even though the fields in the output data type are identical to the fields in the input data type.
diff --git a/docs/source/advanced/overview.rst b/docs/source/advanced/overview.rst
@@ -1,5 +1,5 @@
-Strax overview
-==============
+Strax overview (old)
+====================
 
 .. image:: architecture.svg
 

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -65,9 +65,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.8.8'
+version = '0.9.0'
 # The full version, including alpha/beta/rc tags.
-release = '0.8.8'
+release = '0.9.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/source/developer/chunking.rst b/docs/source/developer/chunking.rst
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -30,15 +30,15 @@ You might also find these presentations useful:
     :maxdepth: 1
     :caption: Advanced usage
 
-    advanced/overview
+    advanced/chunking
     advanced/superrun
     advanced/plugin_dev
+    advanced/overview
 
 .. toctree::
     :maxdepth: 1
     :caption: Developer documentation
 
-    developer/chunking
     developer/pipeline
     developer/parallel
     developer/overlaps

diff --git a/setup.py b/setup.py
@@ -12,7 +12,7 @@
     history = file.read()
 
 setuptools.setup(name='strax',
-                 version='0.8.8',
+                 version='0.9.0',
                  description='Streaming analysis for xenon TPCs',
                  author='Jelle Aalbers',
                  url='https://github.com/AxFoundation/strax',
@@ -40,4 +40,4 @@
                      'Programming Language :: Python :: Implementation :: CPython',
                      'Topic :: Scientific/Engineering :: Physics',
                  ],
-                 zip_safe = False)
+                 zip_safe=False)
diff --git a/strax/__init__.py b/strax/__init__.py
@@ -1,13 +1,13 @@
 # flake8: noqa
-__version__ = '0.8.8'
+__version__ = '0.9.0'
 
 # Glue the package together
 # See https://www.youtube.com/watch?v=0oTh1CXRaQ0 if this confuses you
 # The order of subpackages is not invariant, since we use strax.xxx inside strax
 from .utils import *
+from .chunk import *
 from .dtypes import *
 from strax.processing.general import *
-from .chunk_arrays import *
 
 from .storage.common import *
 from .storage.files import *

diff --git a/strax/__main__.py b/strax/__main__.py
@@ -1,4 +1,4 @@
 # Rather boring __main__, makes it possible to test if strax imports with
 # python -m strax
 import strax    # noqa
-print("Strax says hi!")
+print(f"Strax version {strax.__version__} says hi!")