Merge pull request #217 from Crunch-io/handle-smoothing-transform-173…

…943536 Handle smoothing transform 173943536
Crunch-io · Sep 21, 2020 · b3fbd03 · b3fbd03
2 parents 57de896 + 3d4be27
commit b3fbd03
Show file tree

Hide file tree

Showing 50 changed files with 8,337 additions and 2,694 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,5 +1,52 @@
 # History of Changes
 
+#### 1.11.31
+- Use bases instead of margin for MR `standard_error` calculation
+
+#### 1.11.30
+- Fix `standard_error` calculation for MR types 
+
+#### 1.11.29
+- Fix `standard_error` denominator for `Strand` 
+
+#### 1.11.28
+- Fix collapsed `scale-mean-pairwise-indices`
+
+#### 1.11.27
+- Standard deviation and standard error for `Strand`
+
+#### 1.11.26
+- Fix `pairwise_indices()` array collapse when all values empty
+
+#### 1.11.25
+- Expose two-level pairwise-t-test
+
+#### 1.11.24
+- Bug fix for scale_median calculation
+
+#### 1.11.23
+- Expose population fraction in cube partitions
+
+#### 1.11.22
+- Additional summary measures for scale (`std_dev`, `std_error`, `median`)
+
+#### 1.11.21
+- Fix slicing for CA + single col filter
+
+#### 1.11.20
+- Fix cube title payload discrepancy
+
+#### 1.11.19
+- Fix problem where pre-ordering anchor-idx was used for locating inserted subtotal vectors
+- Enable handling of filter-only multitable-template placeholders.
+- New measures: table and columns standard deviation and standard error
+
+#### 1.11.18
+- Fix wrong proportions and base values when explicit order is expressed
+
+#### 1.11.17
+- Fix incorrect means values after hiding
+
 #### 1.11.16
 - New base for t_stats overlaps
 

diff --git a/README.md b/README.md
@@ -112,6 +112,9 @@ The detailed description can be found
 
 ## Changes
 
+### 1.12.1
+- Smoothing on column percentages and column index
+
 ### 1.11.37
 - PR 216: Document matrix.py classes and properties
 
@@ -130,51 +133,5 @@ The detailed description can be found
 #### 1.11.32
 - Handle hidden option for insertions
 
-#### 1.11.31
-- Use bases instead of margin for MR `standard_error` calculation
-
-#### 1.11.30
-- Fix `standard_error` calculation for MR types 
-
-#### 1.11.29
-- Fix `standard_error` denominator for `Strand` 
-
-#### 1.11.28
-- Fix collapsed `scale-mean-pairwise-indices`
-
-#### 1.11.27
-- Standard deviation and standard error for `Strand`
-
-#### 1.11.26
-- Fix `pairwise_indices()` array collapse when all values empty
-
-#### 1.11.25
-- Expose two-level pairwise-t-test
-
-#### 1.11.24
-- Bug fix for scale_median calculation
-
-#### 1.11.23
-- Expose population fraction in cube partitions
-
-#### 1.11.22
-- Additional summary measures for scale (`std_dev`, `std_error`, `median`)
-
-#### 1.11.21
-- Fix slicing for CA + single col filter
-
-#### 1.11.20
-- Fix cube title payload discrepancy
-
-#### 1.11.19
-- Fix problem where pre-ordering anchor-idx was used for locating inserted subtotal vectors
-- Enable handling of filter-only multitable-template placeholders.
-- New measures: table and columns standard deviation and standard error
-
-#### 1.11.18
-- Fix wrong proportions and base values when explicit order is expressed
-
-#### 1.11.17
-- Fix incorrect means values after hiding
 
 For a complete list of changes see [history](https://github.com/Crunch-io/crunch-cube/blob/master/HISTORY.md).
diff --git a/src/cr/cube/__init__.py b/src/cr/cube/__init__.py
@@ -2,6 +2,6 @@
 
 """Initialization module for crunch-cube package."""
 
-__version__ = "1.11.37"
+__version__ = "1.12.1"
 
 # NOTE: We'll be switching to 2.0.0 once we throw out the old cube and slice
diff --git a/src/cr/cube/cube.py b/src/cr/cube/cube.py
@@ -428,7 +428,7 @@ def _cube_dict(self):
 
     @lazyproperty
     def _is_single_filter_col_cube(self):
-        """ -> bool, determines if it is a single column filter cube."""
+        """Value of `is_single_col_cube` in the cube result, False when absent."""
         return self._cube_dict["result"].get("is_single_col_cube", False)
 
     def _measure(self, weighted):

diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py
@@ -596,7 +596,6 @@ def _array_type_std_res(self, counts, total, colsum, rowsum):
         are calculated for each of them separately, and then stacked together
         in the resulting array.
         """
-        # __import__("ipdb").set_trace()
         if self.mr_dim_ind == 0:
             # --This is a special case where broadcasting cannot be
             # --automatically done. We need to "inflate" the single dimensional

diff --git a/src/cr/cube/cubepart.py b/src/cr/cube/cubepart.py
@@ -288,6 +288,7 @@ def columns_std_err(self):
 
     @lazyproperty
     def counts(self):
+        """2D np.float64 ndarray of weighted cube counts.
+
+        Built by stacking the `counts` of each row of the matrix, in row order.
+        """
         return np.array([row.counts for row in self._matrix.rows])
 
     @lazyproperty
@@ -865,10 +866,12 @@ def _transform_dicts(self):
         item is an empty dict (`{}`) when no transforms are specified for that
         dimension.
         """
-        return (
-            self._transforms_dict.get("rows_dimension", {}),
-            self._transforms_dict.get("columns_dimension", {}),
-        )
+        transforms_dict = self._transforms_dict
+        rows_dimension_dict = transforms_dict.get("rows_dimension", {})
+        columns_dimension_dict = transforms_dict.get("columns_dimension", {})
+        if "smoothing" in transforms_dict:
+            columns_dimension_dict["smoothing"] = transforms_dict["smoothing"]
+        return (rows_dimension_dict, columns_dimension_dict)
 
 
 class _Strand(CubePartition):
@@ -892,6 +895,7 @@ def bases(self):
 
     @lazyproperty
     def counts(self):
+        """tuple, 1D cube counts.
+
+        Holds the `count` of each row of the stripe, in row order.
+        """
         return tuple(row.count for row in self._stripe.rows)
 
     @lazyproperty
@@ -1177,7 +1181,11 @@ def _rows_dimension(self):
     @lazyproperty
     def _row_transforms_dict(self):
         """Transforms dict for the single (rows) dimension of this strand."""
-        return self._transforms_dict.get("rows_dimension", {})
+        transforms_dict = self._transforms_dict
+        rows_dimension_dict = transforms_dict.get("rows_dimension", {})
+        if "smoothing" in transforms_dict:
+            rows_dimension_dict["smoothing"] = transforms_dict["smoothing"]
+        return rows_dimension_dict
 
     @lazyproperty
     def _stripe(self):

diff --git a/src/cr/cube/dimension.py b/src/cr/cube/dimension.py
@@ -3,6 +3,7 @@
 """Provides the Dimension class."""
 
 import sys
+import warnings
 
 if sys.version_info >= (3, 3):
     from collections.abc import Sequence
@@ -385,6 +386,19 @@ def prune(self):
     def shape(self):
         return len(self.all_elements)
 
+    @lazyproperty
+    def smooth(self):
+        """Function performing smoothing for this dimension, based on transform."""
+
+        def null_smooth(values):
+            return values
+
+        return (
+            _SingleSideMovingAvgSmoother.smoothing_function(self._smoothing_window)
+            if self._show_smoothing
+            else null_smooth
+        )
+
     @lazyproperty
     def sort(self):
         """A _BaseSort-subclass object or None, describing the applied sort method.
@@ -439,6 +453,49 @@ def _dimension_transforms_dict(self):
             else {}
         )
 
+    @lazyproperty
+    def _is_cat_date(self):
+        """True for a categorical dimension having date defined on all valid categories.
+
+        Only meaningful when the dimension is known to be categorical
+        (has base-type `categorical`).
+        """
+        categories = self._dimension_dict["type"].get("categories", [])
+        if not categories:
+            return False
+        return all(
+            category.get("date")
+            for category in categories
+            if not category.get("missing", False)
+        )
+
+    @lazyproperty
+    def _show_smoothing(self):
+        """Return True if a smoothing transform is active for this dimension."""
+        smoothing = self._dimension_transforms_dict.get("smoothing")
+        # --- default is no smoothing when smoothing transform is not present ---
+        if not smoothing:
+            return False
+        # --- no smoothing when dimension is not a categorical date ---
+        if not self._is_cat_date:
+            return False
+        # --- no smoothing when the smoothing transform is inactive ---
+        if not smoothing.get("show", True):
+            return False
+        return True
+
+    @lazyproperty
+    def _smoothing_window(self):
+        """size of the moving window.
+
+        This is the number of observations used for calculating the statistic. Each
+        window will be a fixed size.
+        """
+        smoothing = self._dimension_transforms_dict.get("smoothing")
+        if not smoothing:
+            return None
+        return smoothing.get("window", 3)
+
 
 class _BaseElements(Sequence):
     """Base class for element sequence containers."""
@@ -976,3 +1033,103 @@ def label(self):
     def prune(self):
         """True if this subtotal should not appear when empty."""
         return self._prune
+
+
+class _SingleSideMovingAvgSmoother(object):
+    """Create and configure smoothing function for one-sided moving average."""
+
+    def __init__(self, window):
+        self._window = window
+
+    @classmethod
+    def smoothing_function(cls, window):
+        """Returns function that smooths a sequence of numeric values in `window`."""
+        return cls(window)._smoothing_function
+
+    @lazyproperty
+    def _smoothing_function(self):
+        """function that returns an array of smoothed values."""
+
+        def smooth(values):
+            """ -> 1D/2D float64 ndarray of smootehd values including additional nans.
+
+            Given a series of numbers and a fixed subset size, the first element of the
+            moving average is obtained by taking the average of the initial fixed subset
+            of the number series. Then the subset is modified by `shifting forward` the
+            values. A moving average is commonly used with time series data to smooth
+            out short-term fluctuations and highlight longer-term trends or cycles.
+
+            The below examples will show 1D and 2D array rolling mean calculations with
+            window sizes of 2 and 3, respectively.
+
+                                        [window = 2]
+            ----------------------------------------------------------------------------
+                x    |   smooth(x)                  x     |        smooth(x)
+            ---------+--------------         -------------+------------------------
+                1    |    NaN                 1  3  2  3  |   NaN  2.0  2.5  2.5
+                2    |    1.5                 2  3  3  2  |   NaN  2.5  3.0  2.5
+                3    |    2.5                 3  2  4  4  |   NaN  2.5  3.0  4.0
+                4    |    3.5                 4  1  5  1  |   NaN  2.5  3.0  3.0
+
+                                        [window = 3]
+            ----------------------------------------------------------------------------
+                x    |   smooth(x)                  x     |        smooth(x)
+            ---------+--------------         -------------+------------------------
+                1    |    NaN                 1  3  2  3  |   NaN  NaN   2.0  2.67
+                2    |    NaN                 2  3  3  2  |   NaN  NaN  2.67  2.67
+                3    |     2                  3  2  4  4  |   NaN  NaN   3.0  3.33
+                4    |     3                  4  1  5  1  |   NaN  NaN  3.33  2.33
+
+            This is performed just taking the average of the last 2 or 3 rows according
+            to the window, all the way down the column.
+            """
+            if not self._valid_window(values.shape[-1]):
+                warnings.warn(
+                    "No smoothing performed. Window (value: {}) parameter is not "
+                    "valid: window must be less than equal to the total period "
+                    "(value: {}) and positive".format(self._window, values.shape[-1]),
+                    UserWarning,
+                )
+                return values
+            smoothed_values = self._smoother(values)
+            # offset between original values and smoothed values
+            offset = [values.shape[-1] - smoothed_values.shape[-1]]
+            additional_nans = np.full(list(values.shape[:-1]) + offset, np.nan)
+            return np.concatenate(
+                [additional_nans, smoothed_values], axis=values.ndim - 1
+            )
+
+        return smooth
+
+    def _smoother(self, values):
+        """ -> np.ndarray, provide smoothing algorithm on the given values.
+
+        In this case the moving average smoother is performed using the np.convolve
+        (https://numpy.org/doc/stable/reference/generated/numpy.convolve.html)
+        operator that returns the discrete, linear convolution of two one-dimensional
+        sequences.
+        A moving average is a form of a convolution often used in time series analysis
+        to smooth out noise in data by replacing a data point with the average of
+        neighboring values in a moving window. A moving average is essentially a
+        low-pass filter because it removes short-term fluctuations to highlight a deeper
+        underlying trend.
+        """
+        w = self._window
+        return (
+            np.array(tuple(np.convolve(values, np.ones(w), mode="valid") / w))
+            if values.ndim == 1
+            else np.array(
+                [tuple(np.convolve(v, np.ones(w), mode="valid") / w) for v in values]
+            )
+        )
+
+    def _valid_window(self, total_period):
+        """ -> bool, the validity of the window parameter.
+
+        Return last dimension size, if window is greater than the the last dimension
+        size because we cannot have a moving window grater than the number of elements
+        of each column.
+        """
+        if self._window > total_period or self._window == 0:
+            return False
+        return True