Merge 1e7412f into f46f645

Crunch-io · Oct 17, 2018 · 7dfa003 · 7dfa003
2 parents f46f645 + 1e7412f
commit 7dfa003
Show file tree

Hide file tree

Showing 12 changed files with 7,745 additions and 22 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,5 +1,10 @@
 # History of Changes
 
+#### 1.6.11 Deprecate `shape`
+- Deprecate the `CubeSlice` `shape` property
+- Use `get_shape(prune=False)` instead
+- Will be removed in future versions
+
 #### 1.6.10 Fix README on pypi
 
 #### 1.6.9 Bugfix

diff --git a/README.md b/README.md
@@ -96,6 +96,10 @@ The detailed description can be found
 
 ## Changes
 
+#### 1.7.2
+- Implement correct index table functionality
+- Deprecate old index functionality
+
 #### 1.7.1 Fix index error
 - Fix peculiar case of CA x CAT (single elem) index error
 - Support with unit tests
@@ -106,9 +110,4 @@ The detailed description can be found
 - Fixed pesky numpy warnings
 - Replaced vulnerable lazyproperty implementation
 
-#### 1.6.11 Deprecate `shape`
-- Deprecate the `CubeSlice` `shape` property
-- Use `get_shape(prune=False)` instead
-- Will be removed in future versions
-
 For a complete list of changes see [history](https://github.com/Crunch-io/crunch-cube/blob/master/HISTORY.md).
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 
 from setuptools import setup, find_packages
 
-version = '1.7.1'
+version = '1.7.2'
 
 
 def get_long_desc():

diff --git a/src/cr/cube/crunch_cube.py b/src/cr/cube/crunch_cube.py
@@ -10,6 +10,7 @@
 
 import json
 import numpy as np
+import warnings
 
 from scipy.stats import norm
 from scipy.stats.contingency import expected_freq
@@ -255,7 +256,12 @@ def has_mr(self):
         return self.mr_dim_ind is not None
 
     def index(self, weighted=True, prune=False):
-        """Get cube index measurement."""
+        """Return cube index measurement.
+
+        This function is deprecated. Use index_table from CubeSlice.
+        """
+        msg = "Deprecated. Use index_table from CubeSlice"
+        warnings.warn(msg, DeprecationWarning, stacklevel=2)
         return Index.data(self, weighted, prune)
 
     def inserted_hs_indices(self, prune=False):
@@ -324,7 +330,7 @@ def labels(self, include_missing=False, include_transforms_for_dims=False):
             for dim in self.dimensions
         ]
 
-    def margin(self, axis=None, weighted=True,
+    def margin(self, axis=None, weighted=True, include_missing=False,
                include_transforms_for_dims=None, prune=False):
         """Get margin for the selected axis.
 
@@ -402,14 +408,18 @@ def hs_dims_for_den(hs_dims, axis):
         # dividing. Those across dims which are summed across MUST NOT be
         # included, because they would change the result.
         hs_dims = hs_dims_for_den(include_transforms_for_dims, axis)
-        den = self._transform(table, hs_dims, inflate=True, fix=True)
+        den = self._transform(
+            table, hs_dims, inflate=True, fix=True,
+            include_missing=include_missing,
+        )
 
         # Apply correct mask (based on the as_array shape)
-        arr = self.as_array(
+        arr = self._as_array(
             prune=prune,
-            # include_transforms_for_dims=include_transforms_for_dims,
             include_transforms_for_dims=hs_dims,
+            include_missing=include_missing,
         )
+        arr = self._fix_shape(arr, fix_valids=include_missing)
         if isinstance(arr, np.ma.core.MaskedArray):
 
             inflate_ind = tuple(
@@ -435,7 +445,7 @@ def hs_dims_for_den(hs_dims, axis):
             # Special case for 1D cube with MR, for "Table" direction
             den = np.sum(den, axis=new_axis)[index]
 
-        den = self._fix_shape(den)
+        den = self._fix_shape(den, fix_valids=include_missing)
         if den.shape[0] == 1 and len(den.shape) > 1 and self.ndim < 3:
             den = den.reshape(den.shape[1:])
         return den
@@ -788,10 +798,12 @@ def univariate_ca_main_axis(self):
         dim_types = [dim.type for dim in self.dimensions]
         return dim_types.index('categorical')
 
-    @lazyproperty
-    def valid_indices_with_selections(self):
+    def valid_indices_with_selections(self, include_missing=False):
         """Get all valid indices (including MR selections)."""
-        return [dim.valid_indices(False) for dim in self.all_dimensions]
+        return [
+            dim.valid_indices(include_missing)
+            for dim in self.all_dimensions
+        ]
 
     def zscore(self, weighted=True, prune=False, hs_dims=None):
         """Get cube zscore measurement."""
@@ -917,7 +929,10 @@ def _as_array(self, include_missing=False, get_non_selected=False,
         dimensions = self.all_dimensions
         shape = [len(dim.elements(include_missing=True)) for dim in dimensions]
         res = np.array(values).reshape(shape)
-        res = self._transform(res, include_transforms_for_dims, inflate=True)
+        res = self._transform(
+            res, include_transforms_for_dims, inflate=True,
+            include_missing=include_missing,
+        )
         res = res + adjusted
 
         if prune:
@@ -987,11 +1002,10 @@ def _create_mask(res, row_prune_inds, col_prune_inds):
         )
         return np.logical_or(mask_rows, mask_cols)
 
-    def _fix_shape(self, array):
+    def _fix_shape(self, array, fix_valids=False):
         """Fixes shape of MR variables.
         For MR variables, where 'selections' dims are dropped, the ndarray
         needs to be reshaped, in order to seem as if those dims never existed.
-
         Also, if any (except 1st) dimension has a single element, it is
         flattened in the resulting array (which is more convenient for the
         users of the CrunchCube). If the original shape of the cube is
@@ -1000,6 +1014,7 @@ def _fix_shape(self, array):
         general, use private methods, if operating inside CrunchCube. API
         methods should only be used from outside CrunchCube.
         """
+
         if not array.shape or len(array.shape) != len(self.all_dimensions):
             # This condition covers two cases:
             # 1. In case of no dimensions, the shape of the array is empty
@@ -1016,8 +1031,11 @@ def _fix_shape(self, array):
 
         display_ind = tuple(
             0 if dim.is_mr_selections(self.all_dimensions) else slice(None)
-            for dim in self.all_dimensions
-        )
+            for dim, n in zip(self.all_dimensions, array.shape)
+        ) if not fix_valids else np.ix_(*[
+            dim.valid_indices(False) if n > 1 else [0]
+            for dim, n in zip(self.all_dimensions, array.shape)
+        ])
         array = array[display_ind]
 
         # If a first dimension only has one element, we don't want to
@@ -1291,9 +1309,12 @@ def _shape(self):
         return tuple([dim.shape for dim in self.all_dimensions])
 
     def _transform(self, res, include_transforms_for_dims,
-                   inflate=False, fix=True):
-        valid_indices = self.valid_indices_with_selections if fix else None
+                   inflate=False, fix=True, include_missing=False):
         """Transform the shape of the resulting ndarray."""
+        valid_indices = (
+            self.valid_indices_with_selections(include_missing)
+            if fix else None
+        )
         if not include_transforms_for_dims:
             return res[np.ix_(*valid_indices)] if valid_indices else res
 

diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py
@@ -2,6 +2,8 @@
 
 """CubeSlice class."""
 
+from __future__ import division
+
 from functools import partial
 import warnings
 import numpy as np
@@ -56,6 +58,35 @@ def __getattr__(self, attr):
         # ---otherwise, the property value is the same for cube or slice---
         return cube_attr
 
+    def _prepare_index_baseline(self, axis):
+        """Return the baseline proportions used to index along `axis`."""
+        # Start from the margin of the opposite direction (1 - axis), so
+        # that numerator and denominator end up with matching shapes.
+        baseline = self.margin(axis=(1 - axis), include_missing=True)
+
+        # Now check if the shape of the marginal needs to be fixed, because
+        # different versions of the MR-containing cubes, combined with
+        # different margin directions, provide marginals of different shapes.
+        # We also need to calculate the percentage marginals correctly,
+        # so we need to perform the addition (to get the denominator)
+        # across the correct axis.
+        if axis == self.mr_dim_ind:
+            baseline = baseline / np.sum(baseline, axis=1)[:, None]
+            return baseline[:, 0]
+        elif isinstance(self.mr_dim_ind, tuple) and axis in self.mr_dim_ind:
+            total = np.sum(baseline, axis=(axis + 1))
+            if axis == 0:
+                return baseline[:, 0, 0] / total[:, 0]
+            return baseline[0, :, 0] / total[0]
+
+        if axis == 0 and self.mr_dim_ind is not None:
+            baseline = baseline[:, 0]
+            return baseline / np.sum(baseline)
+
+        baseline = baseline if len(baseline.shape) <= 1 else baseline[0]
+        baseline = baseline / np.sum(baseline)
+        return baseline / np.sum(baseline, axis=0)
+
     @lazyproperty
     def ca_dim_ind(self):
         """Return items dimension index if there is one.
@@ -146,6 +177,26 @@ def has_mr(self):
         """
         return 'multiple_response' in self.dim_types
 
+    def index_table(self, axis=None, baseline=None):
+        """Return index percentages for a given axis and baseline.
+
+        Each value is the slice proportion divided by its baseline, times
+        100. If `baseline` is None it is derived from this slice's margin,
+        which computes `1 - axis`, so `axis` must not be None in that case.
+        """
+        proportions = self.proportions(axis=axis)
+        baseline = (
+            baseline
+            if baseline is not None else
+            self._prepare_index_baseline(axis)
+        )
+
+        # Make a 1D baseline a column vector to enable correct broadcasting
+        if axis == 0 and len(baseline.shape) <= 1:
+            baseline = baseline[:, None]
+
+        return proportions / baseline * 100
+
     @lazyproperty
     def is_double_mr(self):
         """This has to be overridden from cr.cube.

diff --git a/tests/integration/fixtures/__init__.py b/tests/integration/fixtures/__init__.py
@@ -144,3 +144,6 @@ def _load(cube_file):
 CAT_X_MR_X_CAT = _load('cat-x-mr-x-cat.json')
 CAT_X_CAT_FILTERED_POP = _load('cat-x-cat-filtered-population.json')
 UNIV_MR_WITH_HS = _load('univ-mr-with-hs.json')
+FULL_CUBE = _load('full-cube.json')
+NATREP = _load('natrep-cube.json')
+MR_X_MR_INDEX_TABLE = _load('mr-x-mr-index-table.json')