Merge branch 'prune-slice-shape-160788065' into rel-5.2.393

Crunch-io · Oct 3, 2018 · 1b274e3 · 1b274e3
2 parents 20df047 + c59ecbb
commit 1b274e3
Show file tree

Hide file tree

Showing 4 changed files with 121 additions and 69 deletions.
diff --git a/README.md b/README.md
@@ -96,69 +96,14 @@ The detailed description can be found
 
 ## Changes
 
-### 1.0 Initial release
+#### 1.6.11 Deprecate `shape`
+- Deprecate the `CubeSlice` `shape` property
+- Use `get_shape(prune=False)` instead
+- Will be removed in future versions
 
-### 1.1 Fix stray ipdb.
-
-### 1.2 Support exporter
-
-### 1.3 Implement Headers & Subtotals
-
-### 1.4 Update based on tabbook tests from `cr.lib`
-
-#### 1.4.1 Update based on deck tests from `cr.server`
-
-#### 1.4.2 Fix bugs discovered by first `cr.exporter` deploy to alpha
-
-#### 1.4.3 Fix bug (exporting 2D crtab with H&S on row only)
-
-#### 1.4.4 Implement obtaining labels with category ids (useful for H&S in exporter)
-
-#### 1.4.5 Fix MR x MR proportions calculation
-
-### 1.5.0 Start implementing index table functionality
-
-#### 1.5.1 Implement index for MR x MR
-
-#### 1.5.2 Fix bugs with `anchor: 0` for H&S
-
-#### 1.5.3 Fix bugs with invalid input data for H&S
-
-### 1.6.0 Z-Score and bug fixes.
-
-#### 1.6.1 `standardized_residuals` are now included.
-
-#### 1.6.2 support "Before" and "After" in variable transformations since they exist in zz9 data.
-
-#### 1.6.4 Fixes for 3d Pruning.
-
-#### 1.6.5 Fixes for Pruning and Headers and subtotals.
-- Population size support.
-- Fx various calculations in 3d cubes.
-
-#### 1.6.6 Added support for CubeSlice, which always represents a
-- 2D cube (even if they're the slices of a 3D cube).
-- Various fixes for support of wide-export
-
-#### 1.6.7 Population fraction
-- Various bugfixes and optimizations.
-- Add property `population_fraction`. This is needed for the exporter to be able to calculate the correct population counts, based on weighted/unweighted and filtered/unfiltered states of the cube.
-- Apply newly added `population_fraction` to the calculation of `population_counts`.
-- Modify API for `scale_means`. It now accepts additional parameters `hs_dims` (defaults to `None`) and `prune` (defaults to `False`). Also, the format of the return value is slightly different in nature. It is a list of lists of numpy arrrays. It functions like this:
-
-    - The outermost list corresponds to cube slices. If cube.ndim < 3, then it's a single-element list
-    - Inner lists have either 1 or 2 elements (if they're a 1D cube slice, or a 2D cube slice, respectively).
-    - If there are scale means defined on the corresponding dimension of the cube slice, then the inner list element is a numpy array with scale means. If it doesn't have scale means defined (numeric values), then the element is `None`.
-
-- Add property `ca_dim_ind` to `CubeSlice`.
-- Add property `is_double_mr` to `CubeSlice` (which is needed since it differs from the interpretation of the cube. E.g. MR x CA x MR will render slices which are *not* double MRs).
-- Add `shape`, `ndim`, and `scale_means` to `CubeSlice`, for accessibility.
-- `index` now also operates on slices (no api change).
-
-#### 1.6.8 Scale Means Marginal
-- Add capability to calculate the scale means marginal. This is used when analysing a 2D cube, and obtaining a sort of a "scale mean _total_" for each of the variables constituting a cube.
+#### 1.6.10 Fix README on pypi
 
 #### 1.6.9 Bugfix
 - When Categorical Array variable is selected in multitable export, and Scale Means is selected, the cube fails, because it tries to access the non-existing slice (the CA is only _interpreted_ as multiple slices in tabbooks). This fix makes sure that the export cube doesn't fail in such case.
 
-#### 1.6.10 Fix README on pypi
+For a complete list of changes see [history](https://github.com/Crunch-io/crunch-cube/blob/master/HISTORY.md).
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 
 from setuptools import setup, find_packages
 
-version = '1.6.10'
+version = '1.6.11'
 
 
 def get_long_desc():

diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py
@@ -1,11 +1,11 @@
 '''Home of the CubeSlice class.'''
 
 from functools import partial
+import warnings
 import numpy as np
 
 from cr.cube.measures.scale_means import ScaleMeans
-
-from .utils import lazyproperty
+from .utils import lazyproperty, compress_pruned, memoize
 
 
 # pylint: disable=too-few-public-methods
@@ -34,6 +34,7 @@ def __init__(self, cube, index, ca_as_0th=False):
 
     @lazyproperty
     def col_dim_ind(self):
+        """Return 1 if not categorical array as 0th, 0 otherwise."""
         return 1 if not self.ca_as_0th else 0
 
     def __getattr__(self, attr):
@@ -160,6 +161,19 @@ def has_ca(self):
 
     @lazyproperty
     def ca_dim_ind(self):
+        """Return items dimension index if there is one.
+
+        If the slice is a part of a cube that has Categorical Array variable,
+        return the index of the items dimension (if it belongs to the slice).
+        Examples:
+        - For a CA(items) x CAT => returns 0
+        - For CAT x CA(items) => returns 1
+        - For CAT x CA(items) x CAT => returns 0 (because the items is the 0th
+          dimension of each slice)
+        - For CA(items) x CAT x CAT => returns None (because the 0th items
+          dimension doesn't belong to any one slice, and is itself used for
+          slicing the cube).
+        """
         index = self._cube.ca_dim_ind
         if index is None:
             return None
@@ -204,7 +218,7 @@ def ca_main_axis(self):
         try:
             ca_ind = self.dim_types.index('categorical_array')
             return 1 - ca_ind
-        except Exception:
+        except ValueError:
             return None
 
     def labels(self, hs_dims=None, prune=False):
@@ -252,17 +266,73 @@ def is_double_mr(self):
         return self.dim_types == ['multiple_response'] * 2
 
     def scale_means(self, hs_dims=None, prune=False):
+        """Return list of column and row scale means for this slice.
+
+        If a row/col doesn't have numerical values, return None for the
+        corresponding dimension. If a slice has only one dimension, return
+        only the column scale mean (as numpy array). If both row and col
+        scale means are present, return them as two numpy arrays in a list.
+        """
         if self.ca_as_0th:
             return [None, None]
         return self._cube.scale_means(hs_dims, prune)[self._index]
 
     @lazyproperty
     def ndim(self):
+        """Number of slice dimensions
+
+        Returns 2 if the origin cube has 3 or 2 dimensions.  Returns 1 if the
+        cube (and the slice) has 1 dimension.  Returns 0 if the cube doesn't
+        have any dimensions.
+        """
         return min(self._cube.ndim, 2)
 
     @lazyproperty
     def shape(self):
-        return self.as_array().shape
+        """Tuple of array dimensions' lengths.
+
+        It returns a tuple of ints, each representing the length of a cube
+        dimension, in the order those dimensions appear in the cube.
 
+        This property is deprecated; use 'get_shape' instead. Pruning is not
+        supported here (it is supported in 'get_shape').
+        """
+        deprecation_msg = 'Deprecated. Use `get_shape` instead.'
+        warnings.warn(deprecation_msg, DeprecationWarning)
+        return self.get_shape()
+
+    @memoize
     def scale_means_margin(self, axis):
+        """Get scale means margin for 2D slice.
+
+        This value represents the scale mean of a single variable that
+        constitutes a 2D slice. There's one for each axis, if there are
+        numerical values on the corresponding (opposite) dimension. The
+        numerical values are filtered by the non-missing criterion of the
+        opposite dimension.
+        """
         return ScaleMeans(self).margin(axis)
+
+    @memoize
+    def get_shape(self, prune=False):
+        """Tuple of array dimensions' lengths.
+
+        It returns a tuple of ints, each representing the length of a cube
+        dimension, in the order those dimensions appear in the cube.
+        Pruning is supported. Dimensions that get reduced to a single element
+        (e.g. due to pruning) are removed from the returning shape, thus
+        allowing for the differentiation between true 2D cubes (over which
+        statistical testing can be performed) and essentially
+        1D cubes (over which it can't).
+
+        Usage:
+        >>> shape = slice_.get_shape()
+        >>> pruned_shape = slice_.get_shape(prune=True)
+        """
+        if not prune:
+            return self.as_array().shape
+
+        shape = compress_pruned(self.as_array(prune=True)).shape
+        # Eliminate dimensions that get reduced to 1
+        # (e.g. single element categoricals)
+        return tuple(n for n in shape if n > 1)
diff --git a/tests/unit/test_cube_slice.py b/tests/unit/test_cube_slice.py
@@ -1,13 +1,15 @@
 '''Unit tests for the CubeSlice class.'''
-from unittest import TestCase
+
 from mock import Mock, patch
 import numpy as np
+import pytest
 
 from cr.cube.cube_slice import CubeSlice
 
 
 # pylint: disable=invalid-name, no-self-use, protected-access
-class TestCubeSlice(TestCase):
+# pylint: disable=too-many-public-methods, missing-docstring
+class TestCubeSlice(object):
     '''Test class for the CubeSlice unit tests.'''
 
     def test_init(self):
@@ -25,7 +27,7 @@ def test_init_ca_as_0th(self):
         assert CubeSlice(cube, 0, ca_as_0th=True)
 
         cube.dim_types = ['categorical', 'categorical']
-        with self.assertRaises(ValueError):
+        with pytest.raises(ValueError):
             CubeSlice(cube, 0, ca_as_0th=True)
 
     def test_ndim_invokes_ndim_from_cube(self):
@@ -379,6 +381,7 @@ def test_ca_dim_ind(self):
     @patch('cr.cube.measures.scale_means.ScaleMeans.margin')
     @patch('cr.cube.measures.scale_means.ScaleMeans.__init__')
     def test_scale_means_marginal(self, mock_sm_init, mock_sm_margin):
+        """Test if slice method invokes cube method."""
         mock_sm_init.return_value = None
 
         cs = CubeSlice({}, 0)
@@ -392,3 +395,37 @@ def test_scale_means_for_ca_as_0th(self):
         cube.dim_types = ['categorical_array']
         cs = CubeSlice(cube, 0, ca_as_0th=True)
         assert cs.scale_means() == [None, None]
+
+    def test_shape_property_deprecated(self):
+        cube = Mock()
+
+        cube.ndim = 2
+        cube.as_array.return_value = np.zeros((3, 2))
+        cs = CubeSlice(cube, 0)
+        with pytest.warns(DeprecationWarning):
+            # TODO: Remove once 'shape' is removed
+            assert cs.shape == (3, 2)
+
+    def test_get_shape(self, shape_fixture):
+        """Test shape based on 'as_array' and pruning."""
+        slice_, prune, expected = shape_fixture
+        actual = slice_.get_shape(prune=prune)
+        assert actual == expected
+
+    @pytest.fixture(params=[
+        (False, None, (3, 2)),
+        (True, [[True, False], [True, False], [True, False]], (3,)),
+        (True, [[False, False], [True, True], [True, True]], (2,)),
+        (True, [[False, False], [True, True], [False, False]], (2, 2)),
+        (True, [[True, True], [True, True], [True, True]], ()),
+    ])
+    def shape_fixture(self, request):
+        prune, mask, expected = request.param
+        array = np.zeros((3, 2))
+        cube = Mock()
+        cube.ndim = 2
+        if mask is not None:
+            array = np.ma.masked_array(array, np.array(mask))
+        cube.as_array.return_value = array
+        cs = CubeSlice(cube, 0)
+        return cs, prune, expected