Skip to content

Commit

Permalink
Merge branch 'sentry-cannot-broadcast-161845317'
Browse files Browse the repository at this point in the history
This branch fixes the Sentry error from ticket 161845317 and makes some
nice refactoring improvements along the way.
  • Loading branch information
scanny committed Nov 14, 2018
2 parents 543ee6c + eb3d79e commit 55963e8
Show file tree
Hide file tree
Showing 9 changed files with 14,714 additions and 2,771 deletions.
3 changes: 1 addition & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ testpaths =
show-source = True
max-line-length = 80
ignore =
# E123: closing bracket does not match indentation of opening bracket's line
E123
W504 # line break after binary operator (e.g. 'and')

[isort]
line_length = 80
Expand Down
113 changes: 26 additions & 87 deletions src/cr/cube/crunch_cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
import warnings

import numpy as np
from scipy.stats import norm
from scipy.stats.contingency import expected_freq

from cr.cube.cube_slice import CubeSlice
from cr.cube.dimension import AllDimensions
Expand Down Expand Up @@ -72,15 +70,9 @@ def __repr__(self):
"""
try:
dimensionality = ' x '.join(dt.name for dt in self.dim_types)
slice_reprs = (
'\n'.join(
'slices[%d]: %s' % (idx, repr(s))
for idx, s in enumerate(self.slices)
)
)
return (
"%s(name='%s', dim_types='%s')\n%s" %
(type(self).__name__, self.name, dimensionality, slice_reprs)
"%s(name='%s', dim_types='%s')" %
(type(self).__name__, self.name, dimensionality)
)
except Exception:
return super(CrunchCube, self).__repr__()
Expand Down Expand Up @@ -620,26 +612,21 @@ def hs_dims_for_den(hs_dims, axis):
return res

def pvals(self, weighted=True, prune=False, hs_dims=None):
    """Return ndarray with calculated p-vals.

    This function calculates statistically significant results for
    categorical contingency tables. The values are calculated for 2D tables
    only. For 3D cubes, the slices' results are stacked together and
    returned as an ndarray.

    :param weighted: Use weighted counts for zscores
    :param prune: Prune based on unweighted counts
    :param hs_dims: Include headers and subtotals (as NaN values)
    :returns: 2 or 3 Dimensional ndarray, representing the p-values for
        each cell of the table-like representation of the crunch cube.
    """
    # ---delegate the actual calculation to each slice; a 2D cube has a
    # ---single slice, so its result is returned directly---
    res = [s.pvals(weighted, prune, hs_dims) for s in self.slices]
    return np.array(res) if self.ndim == 3 else res[0]

@lazyproperty
def row_direction_axis(self):
Expand Down Expand Up @@ -706,35 +693,20 @@ def univariate_ca_main_axis(self):
return self.dim_types.index(DT.CA_CAT)

def zscore(self, weighted=True, prune=False, hs_dims=None):
    """Return ndarray with cube's zscore measurements.

    Zscore is a measure of statistical significance of observed vs.
    expected counts. It's only applicable to 2D contingency tables.
    For 3D cubes, the measures of separate slices are stacked together
    and returned as the result.

    :param weighted: Use weighted counts for zscores
    :param prune: Prune based on unweighted counts
    :param hs_dims: Include headers and subtotals (as NaN values)
    :returns zscore: ndarray representing zscore measurements
    """
    # ---delegate the actual calculation to each slice; a 2D cube has a
    # ---single slice, so its result is returned directly---
    res = [s.zscore(weighted, prune, hs_dims) for s in self.slices]
    return np.array(res) if self.ndim == 3 else res[0]

def _adjust_axis(self, axis):
"""Return raw axis/axes corresponding to apparent axis/axes.
Expand Down Expand Up @@ -911,31 +883,6 @@ def _calculate_constraints_sum(cls, prop_table, prop_margin, axis):
# (because of the inner matrix dimensions).
return np.dot(prop_margin, V)

def _calculate_std_res(self, counts, total, colsum, rowsum, slice_):
has_mr_or_ca = set(slice_.dim_types) & DT.ARRAY_TYPES
if has_mr_or_ca:
if (not self.is_double_mr and
(self.mr_dim_ind == 0 or
self.mr_dim_ind == 1 and self.ndim == 3)):
total = total[:, np.newaxis]
rowsum = rowsum[:, np.newaxis]

expected = rowsum * colsum / total
variance = (
rowsum * colsum * (total - rowsum) * (total - colsum) /
total ** 3
)
res = (counts - expected) / np.sqrt(variance)
else:
expected_counts = expected_freq(counts)
residuals = counts - expected_counts
variance = (
np.outer(rowsum, colsum) *
np.outer(total - rowsum, total - colsum) / total ** 3
)
res = residuals / np.sqrt(variance)
return res

@lazyproperty
def _col_direction_axis(self):
return self.ndim - 2
Expand Down Expand Up @@ -1077,14 +1024,6 @@ def iter_insertions():

return [insertion for insertion in iter_insertions()]

def _intersperse_hs_in_std_res(self, hs_dims, res):
for dim, inds in enumerate(self.inserted_hs_indices()):
for i in inds:
if dim not in hs_dims:
continue
res = np.insert(res, i, np.nan, axis=(dim - self.ndim))
return res

def _is_axis_allowed(self, axis):
"""Check if axis are allowed.
Expand Down
111 changes: 106 additions & 5 deletions src/cr/cube/cube_slice.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

import numpy as np
from tabulate import tabulate
from scipy.stats import norm
from scipy.stats.contingency import expected_freq

from cr.cube.enum import DIMENSION_TYPE as DT
from cr.cube.measures.scale_means import ScaleMeans
Expand Down Expand Up @@ -174,7 +176,7 @@ def index_table(self, axis=None, baseline=None, prune=False):

indexes = proportions / baseline * 100

return self._apply_pruning_mask(indexes, prune)
return self._apply_pruning_mask(indexes) if prune else indexes

@lazyproperty
def is_double_mr(self):
Expand Down Expand Up @@ -299,16 +301,94 @@ def table_name(self):
table_name = self._cube.labels()[0][self._index]
return '%s: %s' % (title, table_name)

def _apply_pruning_mask(self, res, prune):
if not prune:
return res
def pvals(self, weighted=True, prune=False, hs_dims=None):
    """Return 2D ndarray with calculated p-vals.

    This function calculates statistically significant results for
    categorical contingency tables. The values are calculated for 2D tables
    only.

    :param weighted: Use weighted counts for zscores
    :param prune: Prune based on unweighted counts
    :param hs_dims: Include headers and subtotals (as NaN values)
    :returns: 2D ndarray, representing the p-values for each cell of the
        table-like representation of the crunch cube.
    """
    stats = self.zscore(weighted=weighted, prune=prune, hs_dims=hs_dims)
    # ---two-tailed p-value from the standard normal distribution---
    p_values = 2 * (1 - norm.cdf(np.abs(stats)))

    return self._apply_pruning_mask(p_values, hs_dims) if prune else p_values

def zscore(self, weighted=True, prune=False, hs_dims=None):
    """Return ndarray with slice's zscore measurements.

    Zscore is a measure of statistical significance of observed vs.
    expected counts. It's only applicable to 2D contingency tables.

    :param weighted: Use weighted counts for zscores
    :param prune: Prune based on unweighted counts
    :param hs_dims: Include headers and subtotals (as NaN values)
    :returns zscore: ndarray representing zscore measurements
    """
    counts = self.as_array(weighted=weighted)
    total = self.margin(weighted=weighted)
    colsum = self.margin(axis=0, weighted=weighted)
    rowsum = self.margin(axis=1, weighted=weighted)
    zscore = self._calculate_std_res(
        counts, total, colsum, rowsum,
    )

    if hs_dims:
        zscore = self._intersperse_hs_in_std_res(hs_dims, zscore)

    if prune:
        return self._apply_pruning_mask(zscore, hs_dims)

    return zscore

def _apply_pruning_mask(self, res, hs_dims=None):
    """Return *res* with this slice's pruning mask applied.

    :param res: ndarray of measurement values to be masked
    :param hs_dims: Include headers and subtotals when computing the mask
    :returns: masked ndarray, or *res* unchanged when nothing is pruned
    """
    # ---hs_dims must be forwarded so the mask's shape matches a result
    # ---that has H&S rows/cols interspersed (stale non-hs_dims call
    # ---removed; it discarded the hs_dims-aware array)---
    array = self.as_array(prune=True, include_transforms_for_dims=hs_dims)

    if not isinstance(array, np.ma.core.MaskedArray):
        return res

    return np.ma.masked_array(res, mask=array.mask)

def _array_type_std_res(self, counts, total, colsum, rowsum):
    """Return ndarray containing standard residuals for array values.

    The shape of the return value is the same as that of *counts*.

    Array variables require special processing because of the underlying
    math: the variable dimensions are mutually independent, so standard
    residuals are calculated for each of them separately and then stacked
    together in the resulting array.
    """
    if self.mr_dim_ind == 0:
        # --Broadcasting cannot be done automatically here. "Inflate" the
        # --1D margins into Nx1 columns so the subsequent multiplication
        # --happens column-wise: (rowsum * colsum) / total.
        total = total[:, np.newaxis]
        rowsum = rowsum[:, np.newaxis]

    expected = rowsum * colsum / total
    variance = (
        rowsum * colsum * (total - rowsum) * (total - colsum) / total ** 3
    )
    return (counts - expected) / np.sqrt(variance)

def _calculate_std_res(self, counts, total, colsum, rowsum):
    """Return ndarray containing standard residuals.

    The shape of the return value is the same as that of *counts*.
    """
    # ---MR/CA (array) dimensions need special-cased math---
    has_array_dim = bool(set(self.dim_types) & DT.ARRAY_TYPES)
    if has_array_dim:
        return self._array_type_std_res(counts, total, colsum, rowsum)
    return self._scalar_type_std_res(counts, total, colsum, rowsum)

def _call_cube_method(self, method, *args, **kwargs):
kwargs = self._update_args(kwargs)
result = getattr(self._cube, method)(*args, **kwargs)
Expand All @@ -318,6 +398,14 @@ def _call_cube_method(self, method, *args, **kwargs):
return result
return self._update_result(result)

def _intersperse_hs_in_std_res(self, hs_dims, res):
    """Return *res* with NaNs inserted at inserted-H&S positions.

    Only dimensions listed in *hs_dims* receive insertions; negative axis
    indexing (dim - ndim) keeps the behavior correct for stacked results.
    """
    for dim, insertion_inds in enumerate(self.inserted_hs_indices()):
        if dim not in hs_dims:
            continue
        for idx in insertion_inds:
            res = np.insert(res, idx, np.nan, axis=(dim - self.ndim))
    return res

def _prepare_index_baseline(self, axis):
# First get the margin of the opposite direction of the index axis.
# We need this in order to end up with the right shape of the
Expand Down Expand Up @@ -347,6 +435,19 @@ def _prepare_index_baseline(self, axis):
baseline = baseline / np.sum(baseline)
return baseline / np.sum(baseline, axis=0)

def _scalar_type_std_res(self, counts, total, colsum, rowsum):
    """Return ndarray containing standard residuals for category values.

    The shape of the return value is the same as that of *counts*.
    """
    residuals = counts - expected_freq(counts)
    # ---variance of each cell under the independence model---
    variance = (
        np.outer(rowsum, colsum)
        * np.outer(total - rowsum, total - colsum)
        / total ** 3
    )
    return residuals / np.sqrt(variance)

def _update_args(self, kwargs):
if self._cube.ndim < 3:
# If cube is 2D it doesn't actually have slices (itself is a slice).
Expand Down
Loading

0 comments on commit 55963e8

Please sign in to comment.