Merge fd0a35a into c1c26cb

Crunch-io · Nov 29, 2018 · 2ef6f73 · 2ef6f73
2 parents c1c26cb + fd0a35a
commit 2ef6f73
Show file tree

Hide file tree

Showing 17 changed files with 7,222 additions and 204 deletions.
diff --git a/src/cr/cube/crunch_cube.py b/src/cr/cube/crunch_cube.py
@@ -314,69 +314,18 @@ def margin(self, axis=None, weighted=True, include_missing=False,
             ])
         """
 
-        def hs_dims_for_den(hs_dims, axis):
-            if axis is None or hs_dims is None:
-                return None
-            if isinstance(axis, int):
-                axis = [axis]
-            return [dim for dim in hs_dims if dim not in axis]
-
-        table = self._counts(weighted).raw_cube_array
-        new_axis = self._adjust_axis(axis)
-        index = tuple(
-            None if i in new_axis else slice(None)
-            for i, _ in enumerate(table.shape)
+        den = self._denominator(
+            axis, weighted, include_missing,
+            include_transforms_for_dims, prune,
         )
 
-        # Calculate denominator. Only include those H&S dimensions, across
-        # which we DON'T sum. These H&S are needed because of the shape, when
-        # dividing. Those across dims which are summed across MUST NOT be
-        # included, because they would change the result.
-        hs_dims = hs_dims_for_den(include_transforms_for_dims, axis)
-        den = self._apply_missings_and_insertions(
-            table, hs_dims, include_missing=include_missing
-        )
+        # Calculate "margin" from denominator
+        margin = self._drop_mr_cat_dims(den, fix_valids=include_missing)
 
-        # Apply correct mask (based on the as_array shape)
-        arr = self._as_array(
-            include_transforms_for_dims=hs_dims,
-            include_missing=include_missing
-        )
+        if margin.shape[0] == 1 and len(margin.shape) > 1 and self.ndim < 3:
+            margin = margin.reshape(margin.shape[1:])
 
-        # ---prune array if pruning was requested---
-        if prune:
-            arr = self._prune_body(arr, transforms=hs_dims)
-
-        arr = self._drop_mr_cat_dims(arr, fix_valids=include_missing)
-
-        if isinstance(arr, np.ma.core.MaskedArray):
-            inflate_ind = tuple(
-                (
-                    None
-                    if (
-                        d.dimension_type == DT.MR_CAT or
-                        n <= 1 or
-                        len(d.elements()) <= 1
-                    ) else
-                    slice(None)
-                )
-                for d, n in zip(self._all_dimensions, table.shape)
-            )
-            mask = np.logical_or(
-                np.zeros(den.shape, dtype=bool),
-                arr.mask[inflate_ind],
-            )
-            den = np.ma.masked_array(den, mask)
-
-        if (self.ndim != 1 or axis is None or
-                axis == 0 and len(self._all_dimensions) == 1):
-            # Special case for 1D cube wigh MR, for "Table" direction
-            den = np.sum(den, axis=new_axis)[index]
-
-        den = self._drop_mr_cat_dims(den, fix_valids=include_missing)
-        if den.shape[0] == 1 and len(den.shape) > 1 and self.ndim < 3:
-            den = den.reshape(den.shape[1:])
-        return den
+        return margin
 
     @lazyproperty
     def missing(self):
@@ -572,36 +521,20 @@ def proportions(self, axis=None, weighted=True,
             ])
         """
 
-        def hs_dims_for_den(hs_dims, axis):
-            if axis is None or hs_dims is None:
-                return None
-            if isinstance(axis, int):
-                axis = [axis]
-            return [dim for dim in hs_dims if dim not in axis]
-
-        table = self._measure(weighted).raw_cube_array
-        new_axis = self._adjust_axis(axis)
-        index = tuple(
-            None if i in new_axis else slice(None)
-            for i, _ in enumerate(table.shape)
-        )
-
-        # Calculate denominator. Only include those H&S dimensions, across
-        # which we DON'T sum. These H&S are needed because of the shape, when
-        # dividing. Those across dims which are summed across MUST NOT be
-        # included, because they would change the result.
-        hs_dims = hs_dims_for_den(include_transforms_for_dims, axis)
-        den = self._apply_missings_and_insertions(table, hs_dims)
-        den = np.sum(den, axis=new_axis)[index]
-
         # Calculate numerator from table (include all H&S dimensions).
         num = self._apply_missings_and_insertions(
-            table, include_transforms_for_dims
+            self._measure(weighted).raw_cube_array,
+            include_transforms_for_dims,
+        )
+        # Always use unpruned denominator (bases), because pruning is based on
+        # unweighted bases explicitly
+        den = self._denominator(
+            axis, weighted, False, include_transforms_for_dims, False
         )
 
         res = self._drop_mr_cat_dims(num / den)
 
-        # Apply correct mask (based on the as_array shape)
+        # Apply correct pruning mask (based on the as_array shape)
         arr = self.as_array(
             prune=prune,
             include_transforms_for_dims=include_transforms_for_dims,
@@ -928,6 +861,32 @@ def _cube_dict(self):
                 '(str) or dict.' % type(self._cube_response_arg).__name__
             )
 
+    def _denominator(self, axis, weighted, include_missing,
+                     include_transforms_for_dims, prune=False):
+        """Return the counts summed over *axis*, for use as a denominator.
+
+        *axis* is the requested direction; it is translated by
+        `_adjust_axis()` before summing. *weighted* selects which counts
+        `_counts()` provides. *include_missing* is passed through to
+        `_apply_missings_and_insertions()`. *include_transforms_for_dims*
+        lists the dimensions whose insertions (H&S) may be applied; those
+        that also appear in *axis* are filtered out first. When *prune* is
+        true, cells whose unweighted count is zero are masked before the
+        insertions are applied and the sum is taken.
+
+        The summed axes are re-inserted as length-1 dimensions by indexing
+        the sum with `None` at those positions. If summing raises
+        `np.AxisError`, the unsummed array is returned instead.
+        """
+        table = self._counts(weighted).raw_cube_array
+        new_axis = self._adjust_axis(axis)
+        # `None` at each summed position re-inserts that axis (length 1)
+        # after np.sum() has removed it; all other positions are kept whole.
+        index = tuple(
+            None if i in new_axis else slice(None)
+            for i, _ in enumerate(table.shape)
+        )
+
+        if prune:
+            # Always prune only based on _unweighted_ counts
+            mask = self._counts(False).raw_cube_array == 0
+            table = np.ma.masked_array(table, mask)
+        # Calculate denominator. Only include those H&S dimensions, across
+        # which we DON'T sum. These H&S are needed because of the shape, when
+        # dividing. Those across dims which are summed across MUST NOT be
+        # included, because they would change the result.
+        hs_dims = self._hs_dims_for_den(include_transforms_for_dims, axis)
+        den = self._apply_missings_and_insertions(
+            table, hs_dims, include_missing=include_missing
+        )
+        try:
+            return np.sum(den, axis=new_axis)[index]
+        except np.AxisError:
+            # NOTE(review): presumably reached when *new_axis* names an axis
+            # that *den* does not have -- confirm which cubes hit this
+            # fallback. The array is returned without summing.
+            return den
+
     def _drop_mr_cat_dims(self, array, fix_valids=False):
         """Return ndarray reflecting *array* with MR_CAT dims dropped.
 
@@ -991,6 +950,14 @@ def _fix_valid_indices(cls, valid_indices, insertion_index, dim):
         valid_indices[dim] = indices.tolist()
         return valid_indices
 
+    @staticmethod
+    def _hs_dims_for_den(hs_dims, axis):
+        """Return the entries of *hs_dims* that are not in *axis*.
+
+        Returns None when either *axis* or *hs_dims* is None. An int *axis*
+        is treated as a one-element list; any other value is used as given
+        for the membership test.
+        """
+        if axis is None or hs_dims is None:
+            return None
+        if isinstance(axis, int):
+            # Wrap a single axis so the `not in` test below works uniformly
+            axis = [axis]
+        return [dim for dim in hs_dims if dim not in axis]
+
     def _inserted_dim_inds(self, transform_dims, axis):
         dim_ind = axis if self.ndim < 3 else axis + 1
         if not transform_dims or dim_ind not in transform_dims:
@@ -1030,6 +997,10 @@ def _is_axis_allowed(self, axis):
         In case the calculation is requested over CA items dimension, it is not
         valid. It's valid in all other cases.
         """
+        if not self.dimensions:
+            # With no dimensions, no direction is allowed
+            return False
+
         if axis is None:
             # If table direction was requested, we must ensure that each slice
             # doesn't have the CA items dimension (thus the [-2:] part). It's
@@ -1300,16 +1271,25 @@ def _update_result(self, result, insertions, dimension_index,
         """Insert subtotals into resulting ndarray."""
         # TODO: valid_indices should be a tuple as a parameter and as a return
         # value
+        masked = type(result) == np.ma.core.MaskedArray
+        if masked:
+            mask = result.mask
         for j, (ind_insertion, value) in enumerate(insertions):
             result = np.insert(
                 result, ind_insertion + j + 1, value, axis=dimension_index
             )
+            if masked:
+                mask = np.insert(
+                    mask, ind_insertion + j + 1, False, axis=dimension_index
+                )
             valid_indices = (
                 valid_indices and
                 self._fix_valid_indices(
                     valid_indices, ind_insertion + j, dimension_index
                 )
             )
+        if masked:
+            result = np.ma.masked_array(result, mask)
         return result, valid_indices