Skip to content

Commit

Permalink
Merge 9d354fd into 848408a
Browse files Browse the repository at this point in the history
  • Loading branch information
slobodan-ilic committed Feb 13, 2019
2 parents 848408a + 9d354fd commit 9cf4270
Show file tree
Hide file tree
Showing 4 changed files with 433 additions and 3 deletions.
25 changes: 22 additions & 3 deletions src/cr/cube/cube_slice.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from scipy.stats.contingency import expected_freq

from cr.cube.enum import DIMENSION_TYPE as DT
from cr.cube.min_base_size_mask import MinBaseSizeMask
from cr.cube.measures.scale_means import ScaleMeans
from cr.cube.measures.pairwise_pvalues import PairwisePvalues
from cr.cube.util import compress_pruned, lazyproperty, memoize
Expand Down Expand Up @@ -126,7 +127,7 @@ def dim_types(self):
return self._cube.dim_types[-2:]

@memoize
def get_shape(self, prune=False):
def get_shape(self, prune=False, hs_dims=None):
"""Tuple of array dimensions' lengths.
It returns a tuple of ints, each representing the length of a cube
Expand All @@ -143,9 +144,11 @@ def get_shape(self, prune=False):
>>> pruned_shape = get_shape(prune=True)
"""
if not prune:
return self.as_array().shape
return self.as_array(include_transforms_for_dims=hs_dims).shape

shape = compress_pruned(self.as_array(prune=True)).shape
shape = compress_pruned(
self.as_array(prune=True, include_transforms_for_dims=hs_dims)
).shape
# Eliminate dimensions that get reduced to 1
# (e.g. single element categoricals)
return tuple(n for n in shape if n > 1)
Expand Down Expand Up @@ -275,6 +278,22 @@ def margin(

return self._extract_slice_result_from_cube(margin)

def min_base_size_mask(self, size, hs_dims=None):
    """Returns MinBaseSizeMask object with correct row, col and table masks.

    The returned object stores the necessary information about the base size, as
    well as about the base values. It can create corresponding masks in the row,
    column, and table directions, based on the corresponding base values
    (the values of the unweighted margins).

    `size` is the minimum base size and `hs_dims` is the optional collection of
    dimensions for which transforms are included; both are passed through
    unchanged, together with this slice, to the `MinBaseSizeMask` constructor.

    Usage:
    >>> slice_ = cube.slices[0] # obtain a valid cube slice
    >>> slice_.min_base_size_mask(30).row_mask
    >>> slice_.min_base_size_mask(50).column_mask
    >>> slice_.min_base_size_mask(22).table_mask
    """
    return MinBaseSizeMask(self, size, hs_dims)

@lazyproperty
def mr_dim_ind(self):
"""Get the correct index of the MR dimension in the cube slice."""
Expand Down
86 changes: 86 additions & 0 deletions src/cr/cube/min_base_size_mask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# encoding: utf-8

"""MinBaseSizeMask class."""

from __future__ import division
import numpy as np

from cr.cube.util import lazyproperty
from cr.cube.enum import DIMENSION_TYPE as DT


class MinBaseSizeMask(object):
    """Helper for deciding which rows/columns to suppress, based on min base size.

    If a base value that is used when calculating percentages is less than a given
    minimum base size, then all of the values obtained in such a way need to be
    suppressed. We achieve this by generating a mask, based on row/column/table
    marginal values and the shape of the underlying slice.

    Each mask is a boolean ndarray, True where the corresponding unweighted
    margin is strictly less than `size` (a margin equal to `size` is *not*
    masked).
    """

    # Inherits from `object` explicitly so this is a new-style class on Python 2
    # as well (the module still targets it, see the `__future__` import).

    def __init__(self, slice_, size, hs_dims=None):
        # slice_: the cube slice whose margins and shape drive the masks.
        # size: the minimum base size that margins are compared against.
        # hs_dims: forwarded as `include_transforms_for_dims` / `hs_dims` to the
        # slice when fetching the row/column margins and the shape.
        self._slice = slice_
        self._size = size
        self._hs_dims = hs_dims

    @lazyproperty
    def column_mask(self):
        """ndarray, True where column margin < min_base_size, same shape as slice."""
        margin = self._slice.margin(
            axis=0, weighted=False, include_transforms_for_dims=self._hs_dims
        )
        mask = margin < self._size

        if margin.shape == self._shape:
            # If margin shape is the same as slice's (such as in a col margin for
            # MR x CAT), don't broadcast the mask to the array shape, since
            # they're already the same.
            return mask

        # If the column margin is a row vector - broadcast its mask to the array
        # shape
        return self._broadcast_to_slice_shape(mask)

    @lazyproperty
    def row_mask(self):
        """ndarray, True where row margin < min_base_size, same shape as slice."""
        margin = self._slice.margin(
            axis=1, weighted=False, include_transforms_for_dims=self._hs_dims
        )
        mask = margin < self._size

        if margin.shape == self._shape:
            # If margin shape is the same as slice's (such as in a row margin for
            # CAT x MR), don't broadcast the mask to the array shape, since
            # they're already the same.
            return mask

        # If the row margin is a column vector - broadcast its mask to the array
        # shape
        return self._broadcast_to_slice_shape(mask[:, None])

    @lazyproperty
    def table_mask(self):
        """ndarray, True where table margin < min_base_size, same shape as slice."""
        # NOTE(review): unlike the row/column masks, the table margin is fetched
        # without `include_transforms_for_dims` - confirm this is intentional.
        margin = self._slice.margin(axis=None, weighted=False)
        mask = margin < self._size

        if margin.shape == self._shape:
            return mask

        if self._slice.dim_types[0] == DT.MR:
            # If the margin is a column vector - broadcast its mask to the array
            # shape
            return self._broadcast_to_slice_shape(mask[:, None])

        return self._broadcast_to_slice_shape(mask)

    def _broadcast_to_slice_shape(self, mask):
        """Return `mask` OR-ed onto an all-False boolean array of the slice's shape.

        NumPy broadcasting stretches a scalar, row-vector or column-vector mask
        across the all-False array, and the result is a new array.
        """
        return np.logical_or(np.zeros(self._shape, dtype=bool), mask)

    @lazyproperty
    def _shape(self):
        """Shape used for the masks, as reported by the slice's `get_shape`.

        When the number of reported dimensions differs from the slice's `ndim`,
        the shape is replaced by `(shape[0], 1)`.
        """
        shape = self._slice.get_shape(hs_dims=self._hs_dims)

        if len(shape) != self._slice.ndim:
            # TODO: This is an ugly hack that needs to happen due to the fact that we
            # purge dimensions with the count of 1, when getting the slice shape. This
            # will be addressed in a PR (already on the way) that strives to abandon
            # the ad-hoc purging of 1-element dimensions altogether.
            shape = (shape[0], 1)

        return shape
153 changes: 153 additions & 0 deletions tests/integration/test_multiple_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -1832,3 +1832,156 @@ def test_mr_by_cat_hs_cell_percentage():
)
actual = cube.proportions(axis=None, include_transforms_for_dims=[0, 1])
np.testing.assert_almost_equal(actual, expected)


def test_mr_x_cat_min_base_size_mask():
    slice_ = CrunchCube(CR.MR_X_CAT_HS).slices[0]

    # --- table direction ---
    # The table margin evaluates to:
    #
    # array([176.36555176, 211.42058767, 247.74073787, 457.05095566, 471.93176847])
    #
    # With a min base size of 220 only the first two MR items fall below the
    # threshold. The MR dimension is not collapsed by the table-direction base
    # calculation, so each item's flag is expected to be broadcast across the
    # columns (in the row direction, i.e. axis=1).
    table_mask = slice_.min_base_size_mask(220).table_mask
    np.testing.assert_array_equal(
        table_mask, np.array([[True] * 6] * 2 + [[False] * 6] * 3)
    )

    # --- column direction ---
    # The column margin evaluates to:
    #
    # np.array(
    #     [
    #         [15, 24, 0, 57, 69, 0],
    #         [15, 34, 0, 75, 86, 0],
    #         [13, 37, 0, 81, 111, 0],
    #         [20, 50, 0, 159, 221, 0],
    #         [32, 69, 0, 167, 208, 0],
    #     ]
    # )
    #
    # It already has the shape of the slice, so with a min base size of 30 the
    # mask is expected to be the element-wise comparison, with no broadcasting.
    column_mask = slice_.min_base_size_mask(30).column_mask
    np.testing.assert_array_equal(
        column_mask,
        np.array(
            [
                [True, True, True, False, False, True],
                [True, False, True, False, False, True],
                [True, False, True, False, False, True],
                [True, False, True, False, False, True],
                [False, False, True, False, False, True],
            ]
        ),
    )

    # --- row direction ---
    # The row margin evaluates to:
    #
    # np.array([31.63152104, 70.73073413, 125.75911351, 366.88839144, 376.76564059])
    #
    # With a min base size of 80 the first two MR items fall below the threshold.
    # The MR dimension is not collapsed by the row-direction base calculation, so
    # each item's flag is expected to be broadcast across the columns (in the row
    # direction, i.e. axis=1).
    row_mask = slice_.min_base_size_mask(80).row_mask
    np.testing.assert_array_equal(
        row_mask, np.array([[True] * 6] * 2 + [[False] * 6] * 3)
    )


def test_cat_x_mr_min_base_size_mask():
    slice_ = CrunchCube(CR.CAT_X_MR).slices[0]

    # --- table direction ---
    # The table margin evaluates to:
    #
    # array([80, 79, 70])
    #
    # With a min base size of 75 only the last MR item falls below the threshold.
    # The MR dimension is not collapsed by the table-direction base calculation,
    # so each item's flag is expected to be broadcast across the rows (in the col
    # direction, i.e. axis=0).
    table_mask = slice_.min_base_size_mask(75).table_mask
    np.testing.assert_array_equal(
        table_mask, np.array([[False, False, True], [False, False, True]])
    )

    # --- column direction ---
    # The column margin evaluates to:
    #
    # np.array([40, 34, 38])
    #
    # With a min base size of 35 only the middle MR item falls below the
    # threshold. The MR dimension is not collapsed by the column-direction base
    # calculation, so each item's flag is expected to be broadcast across the
    # rows (in the col direction, i.e. axis=0).
    column_mask = slice_.min_base_size_mask(35).column_mask
    np.testing.assert_array_equal(
        column_mask, np.array([[False, True, False], [False, True, False]])
    )

    # --- row direction ---
    # The row margin evaluates to:
    #
    # np.array([[28, 25, 23], [52, 54, 47]])
    #
    # It already has the shape of the slice, so no broadcasting is expected. With
    # a min base size of 25, only the value 23 is masked; the value equal to the
    # threshold (25) is not.
    row_mask = slice_.min_base_size_mask(25).row_mask
    np.testing.assert_array_equal(
        row_mask, np.array([[False, False, True], [False, False, False]])
    )


def test_mr_x_mr_min_base_size_mask():
    slice_ = CrunchCube(CR.CAT_X_MR_X_MR).slices[0]

    # --- table direction ---
    # The table margin evaluates to:
    #
    # array([[10000, 10000],
    #        [10000, 10000],
    #        [10000, 10000]])
    #
    # With a min base size of 11000 every value falls below the threshold, so the
    # whole mask is expected to be True.
    table_mask = slice_.min_base_size_mask(11000).table_mask
    np.testing.assert_array_equal(table_mask, np.array([[True, True]] * 3))

    # --- column direction ---
    # The column margin evaluates to:
    #
    # array([[1914, 5958],
    #        [1914, 5958],
    #        [1914, 5958]])
    #
    # With a min base size of 2000 the first column falls below the threshold in
    # every row, while the second column does not.
    column_mask = slice_.min_base_size_mask(2000).column_mask
    np.testing.assert_array_equal(column_mask, np.array([[True, False]] * 3))

    # --- row direction ---
    # The row margin evaluates to:
    #
    # array([[6046, 6046],
    #        [1008, 1008],
    #        [ 974,  974]])
    #
    # With a min base size of 1000 only the last row falls below the threshold.
    row_mask = slice_.min_base_size_mask(1000).row_mask
    np.testing.assert_array_equal(
        row_mask, np.array([[False, False], [False, False], [True, True]])
    )
Loading

0 comments on commit 9cf4270

Please sign in to comment.