From 98aa3128e14750fd7dd461f17cdbf4b115f97f0f Mon Sep 17 00:00:00 2001 From: Slobodan Ilic Date: Wed, 17 Jan 2018 22:49:29 +0100 Subject: [PATCH 1/2] Add tests for pruning within cr.cube --- tests/integration/fixtures/__init__.py | 3 + tests/integration/fixtures/cubes/binned.json | 277 ++++++++++++++++++ .../cubes/cat-x-cat-with-empty-cols.json | 222 ++++++++++++++ tests/integration/test_crunch_cube.py | 169 +++++++++++ tests/unit/test_crunch_cube.py | 5 +- 5 files changed, 675 insertions(+), 1 deletion(-) create mode 100644 tests/integration/fixtures/cubes/binned.json create mode 100644 tests/integration/fixtures/cubes/cat-x-cat-with-empty-cols.json diff --git a/tests/integration/fixtures/__init__.py b/tests/integration/fixtures/__init__.py index 8112e6567..7c59e959b 100644 --- a/tests/integration/fixtures/__init__.py +++ b/tests/integration/fixtures/__init__.py @@ -104,3 +104,6 @@ FIXT_ARRAY_X_MR = load_fixture(CUBES_DIR, 'array-by-mr.json') FIXT_PROFILES_PERCENTS = load_fixture(CUBES_DIR, 'test-profiles-percentages.json') +FIXT_CAT_X_CAT_WITH_EMPTY_COLS = load_fixture(CUBES_DIR, + 'cat-x-cat-with-empty-cols.json') +FIXT_BINNED = load_fixture(CUBES_DIR, 'binned.json') diff --git a/tests/integration/fixtures/cubes/binned.json b/tests/integration/fixtures/cubes/binned.json new file mode 100644 index 000000000..e03b22b93 --- /dev/null +++ b/tests/integration/fixtures/cubes/binned.json @@ -0,0 +1,277 @@ +{ + "query": { + "dimensions": [ + { + "args": [ + { + "variable": "/api/datasets/63fe72a5f0504e7786cdb2b9db58c399/variables/989632aec2a0482cad6ae803a1196a16/" + } + ], + "function": "bin" + } + ], + "filters": [], + "measures": { + "count": { + "args": [], + "function": "cube_count" + } + }, + "weight": "/api/datasets/63fe72a5f0504e7786cdb2b9db58c399/variables/ae78fa132f47402c97c67a73664bb85a/" + }, + "result": { + "dimensions": [ + { + "derived": true, + "references": { + "alias": "netpromoter:nps_3", + "description": "Net Promoter Score", + "name": "Firefox NPS" + }, + "type": { + "class": "enum", + "elements": [ + { + "id": -1, + "missing": true, + "value": { + "?": -1 + } + }, + { + "id": 1, + "missing": false, + "value": [ + -100, + -90 + ] + }, + { + "id": 2, + "missing": false, + "value": [ + -90, + -80 + ] + }, + { + "id": 3, + "missing": false, + "value": [ + -80, + -70 + ] + }, + { + "id": 4, + "missing": false, + "value": [ + -70, + -60 + ] + }, + { + "id": 5, + "missing": false, + "value": [ + -60, + -50 + ] + }, + { + "id": 6, + "missing": false, + "value": [ + -50, + -40 + ] + }, + { + "id": 7, + "missing": false, + "value": [ + -40, + -30 + ] + }, + { + "id": 8, + "missing": false, + "value": [ + -30, + -20 + ] + }, + { + "id": 9, + "missing": false, + "value": [ + -20, + -10 + ] + }, + { + "id": 10, + "missing": false, + "value": [ + -10, + 0 + ] + }, + { + "id": 11, + "missing": false, + "value": [ + 0, + 10 + ] + }, + { + "id": 12, + "missing": false, + "value": [ + 10, + 20 + ] + }, + { + "id": 13, + "missing": false, + "value": [ + 20, + 30 + ] + }, + { + "id": 14, + "missing": false, + "value": [ + 30, + 40 + ] + }, + { + "id": 15, + "missing": false, + "value": [ + 40, + 50 + ] + }, + { + "id": 16, + "missing": false, + "value": [ + 50, + 60 + ] + }, + { + "id": 17, + "missing": false, + "value": [ + 60, + 70 + ] + }, + { + "id": 18, + "missing": false, + "value": [ + 70, + 80 + ] + }, + { + "id": 19, + "missing": false, + "value": [ + 80, + 90 + ] + }, + { + "id": 20, + "missing": false, + "value": [ + 90, + 100 + ] + } + ] + } + } + ], + "element": "crunch:cube", + "margins": { + "data": [ + 575543.0542529409 + ] + }, + "measures": { + "count": { + "data": [ + 118853.42316535371, + 118504.40402203641, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 155261.27236309808, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 182923.9547024528 + ], + "metadata": { + "derived": true, + "references": {}, + "type": { + "class": "numeric", + "integer": false, + "missing_reasons": { + "No Data": -1 + }, + "missing_rules": {} + } + }, + "n_missing": 116732 + } + }, + "missing": 116732, + "n": 575543, + "counts": [ + 118153, + 118104, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 155161, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 182123 + ] + } +} diff --git a/tests/integration/fixtures/cubes/cat-x-cat-with-empty-cols.json b/tests/integration/fixtures/cubes/cat-x-cat-with-empty-cols.json new file mode 100644 index 000000000..24f76999b --- /dev/null +++ b/tests/integration/fixtures/cubes/cat-x-cat-with-empty-cols.json @@ -0,0 +1,222 @@ +{ + "element": "shoji:view", + "self": "https://app.crunch.io/api/datasets/5c0735442f214b95ade0c24fbff187b9/cube/?filter=%5B%5D&query=%7B%22dimensions%22:%5B%7B%22variable%22:%22https:%2F%2Fapp.crunch.io%2Fapi%2Fdatasets%2F5c0735442f214b95ade0c24fbff187b9%2Fvariables%2F000001%2F%22%7D,%7B%22variable%22:%22https:%2F%2Fapp.crunch.io%2Fapi%2Fdatasets%2F5c0735442f214b95ade0c24fbff187b9%2Fvariables%2F000000%2F%22%7D%5D,%22measures%22:%7B%22count%22:%7B%22function%22:%22cube_count%22,%22args%22:%5B%5D%7D%7D,%22weight%22:null%7D", + "value": { + "query": { + "measures": { + "count": { + "function": "cube_count", + "args": [] + } + }, + "dimensions": [ + { + "variable": "https://app.crunch.io/api/datasets/5c0735442f214b95ade0c24fbff187b9/variables/000001/" + }, + { + "variable": "https://app.crunch.io/api/datasets/5c0735442f214b95ade0c24fbff187b9/variables/000000/" + } + ], + "weight": null + }, + "query_environment": { + "filter": [] + }, + "result": { + "dimensions": [ + { + "derived": false, + "references": { + "alias": "cat2", + "name": "cat2" + }, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "id": 1, + "name": "a", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "b", + "missing": false + }, + { + "numeric_value": 3, + "id": 3, + "name": "c", + "missing": false + }, + { + "numeric_value": 4, + "id": 4, + "name": "d", + "missing": false + }, + { + "numeric_value": 5, + "id": 5, + "name": "e", + "missing": false + }, + { + "numeric_value": 6, + "id": 6, + "name": "f", + "missing": false + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + }, + { + "derived": false, + "references": { + "alias": "cat1", + "name": "cat1" + }, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "id": 1, + "name": "four", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "one", + "missing": false + }, + { + "numeric_value": 3, + "id": 3, + "name": "three", + "missing": false + }, + { + "numeric_value": 4, + "id": 4, + "name": "two", + "missing": false + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + } + ], + "missing": 87, + "measures": { + "count": { + "data": [ + 2, + 2, + 0, + 1, + 9, + 0, + 0, + 0, + 0, + 6, + 0, + 1, + 0, + 2, + 13, + 0, + 2, + 0, + 0, + 8, + 0, + 2, + 0, + 1, + 4, + 0, + 1, + 0, + 0, + 0, + 2, + 3, + 6, + 3, + 33 + ], + "n_missing": 87, + "metadata": { + "references": {}, + "derived": true, + "type": { + "integer": true, + "missing_rules": {}, + "missing_reasons": { + "No Data": -1 + }, + "class": "numeric" + } + } + } + }, + "element": "crunch:cube", + "counts": [ + 2, + 2, + 0, + 1, + 9, + 0, + 0, + 0, + 0, + 6, + 0, + 1, + 0, + 2, + 13, + 0, + 2, + 0, + 0, + 8, + 0, + 2, + 0, + 1, + 4, + 0, + 1, + 0, + 0, + 0, + 2, + 3, + 6, + 3, + 33 + ], + "n": 101 + } + } +} diff --git a/tests/integration/test_crunch_cube.py b/tests/integration/test_crunch_cube.py index 7168e2d3e..369c80c54 100644 --- a/tests/integration/test_crunch_cube.py +++ b/tests/integration/test_crunch_cube.py @@ -43,6 +43,8 @@ FIXT_SELECTED_3_WAY, FIXT_ARRAY_X_MR, FIXT_PROFILES_PERCENTS, + FIXT_CAT_X_CAT_WITH_EMPTY_COLS, + FIXT_BINNED, ) from cr.cube.crunch_cube import CrunchCube @@ -2108,3 +2110,170 @@ def test_profiles_percentages_add_up_to_100(self): actual_sum = np.sum(props, axis=1) expected_sum = np.ones(props.shape[0]) * 100 np.testing.assert_almost_equal(actual_sum, expected_sum) + + def test_cat_x_cat_as_array_prune_cols(self): + cube = CrunchCube(FIXT_CAT_X_CAT_WITH_EMPTY_COLS) + expected = np.array([ + [2, 2, 0, 1], + [0, 0, 0, 0], + [0, 1, 0, 2], + [0, 2, 0, 0], + [0, 2, 0, 1], + [0, 1, 0, 0], + ]) + actual = cube.as_array(prune=False) + expected = np.array([ + [2, 2, 1], + [0, 1, 2], + [0, 2, 0], + [0, 2, 1], + [0, 1, 0], + ]) + actual = cube.as_array(prune=True) + np.testing.assert_array_equal(actual, expected) + + def test_cat_x_cat_props_by_col_prune_cols(self): + cube = CrunchCube(FIXT_CAT_X_CAT_WITH_EMPTY_COLS) + expected = np.array([ + [1., 0.25, np.nan, 0.25], + [0., 0., np.nan, 0.], + [0., 0.125, np.nan, 0.5], + [0., 0.25, np.nan, 0.], + [0., 0.25, np.nan, 0.25], + [0., 0.125, np.nan, 0.] + ]) + actual = cube.proportions(axis=0, prune=False) + expected = np.array([ + [1., 0.25, 0.25], + [0., 0.125, 0.5], + [0., 0.25, 0.], + [0., 0.25, 0.25], + [0., 0.125, 0.] + ]) + actual = cube.proportions(axis=0, prune=True) + np.testing.assert_array_equal(actual, expected) + + def test_cat_x_cat_props_by_row_prune_cols(self): + cube = CrunchCube(FIXT_CAT_X_CAT_WITH_EMPTY_COLS) + expected = np.array([ + [0.4, 0.4, 0., 0.2], + [np.nan, np.nan, np.nan, np.nan], + [0., 0.33333333, 0., 0.66666667], + [0., 1., 0., 0.], + [0., 0.66666667, 0., 0.33333333], + [0., 1., 0., 0.], + ]) + actual = cube.proportions(axis=1, prune=False) + expected = np.array([ + [0.4, 0.4, 0.2], + [0., 0.33333333, 0.66666667], + [0., 1., 0.], + [0., 0.66666667, 0.33333333], + [0., 1., 0.] + ]) + actual = cube.proportions(axis=1, prune=True) + np.testing.assert_almost_equal(actual, expected) + + def test_cat_x_cat_props_by_cell_prune_cols(self): + cube = CrunchCube(FIXT_CAT_X_CAT_WITH_EMPTY_COLS) + expected = np.array([ + [0.14285714, 0.14285714, 0., 0.07142857], + [0., 0., 0., 0.], + [0., 0.07142857, 0., 0.14285714], + [0., 0.14285714, 0., 0.], + [0., 0.14285714, 0., 0.07142857], + [0., 0.07142857, 0., 0.], + ]) + actual = cube.proportions(axis=None, prune=False) + expected = np.array([ + [0.14285714, 0.14285714, 0.07142857], + [0., 0.07142857, 0.14285714], + [0., 0.14285714, 0.], + [0., 0.14285714, 0.07142857], + [0., 0.07142857, 0.], + ]) + actual = cube.proportions(axis=None, prune=True) + np.testing.assert_almost_equal(actual, expected) + + def test_cat_x_cat_index_by_col_prune_cols(self): + cube = CrunchCube(FIXT_CAT_X_CAT_WITH_EMPTY_COLS) + expected = np.array([ + [2.8, 0.7, np.nan, 0.7], + [np.nan, np.nan, np.nan, np.nan], + [0., 0.58333333, np.nan, 2.33333333], + [0., 1.75, np.nan, 0.], + [0., 1.16666667, np.nan, 1.16666667], + [0., 1.75, np.nan, 0.] + ]) + actual = cube.index(axis=0, prune=False) + expected = np.array([ + [2.8, 0.7, 0.7], + [0., 0.58333333, 2.33333333], + [0., 1.75, 0.], + [0., 1.16666667, 1.16666667], + [0., 1.75, 0.] + ]) + actual = cube.index(axis=0, prune=True) + np.testing.assert_almost_equal(actual, expected) + + def test_hs_as_props_by_col_not_affected_by_prune(self): + cube = CrunchCube(FIXT_ECON_US_PROBLEM_X_BIGGER_PROBLEM) + expected = np.array([ + [0.93244626, 0.66023166], + [0.63664278, 0.23166023], + [0.29580348, 0.42857143], + [0.04401228, 0.21428571], + [0.00307062, 0.06177606], + [0.02047083, 0.06370656], + ]) + actual = cube.proportions(axis=0, include_transforms_for_dims=[0, 1], + prune=True) + np.testing.assert_almost_equal(actual, expected) + + def test_hs_as_props_by_row_not_affected_by_prune(self): + cube = CrunchCube(FIXT_ECON_US_PROBLEM_X_BIGGER_PROBLEM) + expected = np.array([ + [0.72705507, 0.27294493], + [0.83827493, 0.16172507], + [0.56555773, 0.43444227], + [0.27922078, 0.72077922], + [0.08571429, 0.91428571], + [0.37735849, 0.62264151], + ]) + actual = cube.proportions(axis=1, include_transforms_for_dims=[0, 1], + prune=True) + np.testing.assert_almost_equal(actual, expected) + + def test_hs_as_props_by_cell_not_affected_by_prune(self): + cube = CrunchCube(FIXT_ECON_US_PROBLEM_X_BIGGER_PROBLEM) + expected = np.array([ + [0.60936455, 0.22876254], + [0.41605351, 0.08026756], + [0.19331104, 0.14849498], + [0.02876254, 0.07424749], + [0.00200669, 0.02140468], + [0.01337793, 0.02207358], + ]) + actual = cube.proportions(include_transforms_for_dims=[0, 1], + prune=True) + np.testing.assert_almost_equal(actual, expected) + + def test_prune_univariate_cat(self): + cube = CrunchCube(FIXT_BINNED) + expected = np.array([ + 118504.40402204, + 155261.2723631, + 182923.95470245, + ]) + actual = cube.as_array(prune=True) + np.testing.assert_almost_equal(actual, expected) + + def test_cat_x_mr_with_hs_and_prune_params(self): + '''Test that HS and prune params don't break CAT x MR.''' + cube = CrunchCube(FIXT_CAT_X_MR) + # Only ensure that the call doesn't break + cube.as_array( + include_transforms_for_dims=[0, 1], + prune=True, + ) + assert True diff --git a/tests/unit/test_crunch_cube.py b/tests/unit/test_crunch_cube.py index 88c0c900f..e33de2b43 100644 --- a/tests/unit/test_crunch_cube.py +++ b/tests/unit/test_crunch_cube.py @@ -151,11 +151,13 @@ def test_transform_param_propagation(self, mock_margin, fake_dims = Mock() fake_axis = Mock() fake_weighted = Mock() + fake_prune = Mock() # Make the call cube.proportions( axis=fake_axis, weighted=fake_weighted, - include_transforms_for_dims=fake_dims + include_transforms_for_dims=fake_dims, + prune=fake_prune, ) # Assert parameter propagation mock_margin.assert_called_once_with( @@ -163,6 +165,7 @@ def test_transform_param_propagation(self, mock_margin, weighted=fake_weighted, adjusted=False, include_transforms_for_dims=fake_dims, + prune=fake_prune, ) @patch('cr.cube.crunch_cube.CrunchCube._get_dimensions') From 168b54dacc61365e97cfd28494840be25a1597da Mon Sep 17 00:00:00 2001 From: Slobodan Ilic Date: Wed, 17 Jan 2018 22:49:58 +0100 Subject: [PATCH 2/2] Implement pruning within cr.cube --- src/cr/cube/crunch_cube.py | 85 +++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 20 deletions(-) diff --git a/src/cr/cube/crunch_cube.py b/src/cr/cube/crunch_cube.py index 5fc5e07db..e126e612b 100644 --- a/src/cr/cube/crunch_cube.py +++ b/src/cr/cube/crunch_cube.py @@ -183,7 +183,8 @@ def _get_table(self, weighted): def _as_array(self, include_missing=False, get_non_selected=False, weighted=True, adjusted=False, - include_transforms_for_dims=False): + include_transforms_for_dims=False, + prune=False): '''Get crunch cube as ndarray. Args @@ -231,7 +232,32 @@ def _as_array(self, include_missing=False, get_non_selected=False, res = res[np.ix_(*valid_indices)] adjustment = 1 if adjusted else 0 - return self._fix_shape(res) + adjustment + res = self._fix_shape(res) + adjustment + + if prune and not self.has_mr: + # Remove columns and rows where marginal value is 0 or np.nan + if len(self.dimensions) == 1: + margin = self.margin( + include_transforms_for_dims=include_transforms_for_dims, + axis=1, + ) + ind_prune = np.logical_or(margin == 0, np.isnan(margin)) + return res[~ind_prune] + + col_margin = self.margin( + include_transforms_for_dims=include_transforms_for_dims, + axis=0, + ) + ind_prune = np.logical_or(col_margin == 0, np.isnan(col_margin)) + res = res[:, ~ind_prune] + row_margin = self.margin( + include_transforms_for_dims=include_transforms_for_dims, + axis=1 + ) + ind_prune = np.logical_or(row_margin == 0, np.isnan(row_margin)) + return res[~ind_prune, :] + + return res @classmethod def _fix_valid_indices(cls, valid_indices, insertion_index, dim): @@ -368,7 +394,7 @@ def _mr_margin(self, axis, weighted, adjusted): return margin def _margin(self, axis=None, weighted=True, adjusted=False, - include_transforms_for_dims=None): + include_transforms_for_dims=None, prune=False): # MR margins are calculated differently, so they need a separate method # for them. A good example of this is the rcrunch functionality. @@ -386,6 +412,7 @@ def _margin(self, axis=None, weighted=True, adjusted=False, weighted=weighted, adjusted=adjusted, include_transforms_for_dims=transform_dims, + # prune=prune, ) if axis and axis > 0 and len(array.shape) == 1: @@ -395,7 +422,14 @@ def _margin(self, axis=None, weighted=True, adjusted=False, # and axes, and we need to restore one dimension in this case. array = array[:, np.newaxis] - return np.sum(array, axis) + res = np.sum(array, axis) + + if prune: + # Remove values if 0 or np.nan + ind_prune = np.logical_or(res == 0, np.isnan(res)) + res = res[~ind_prune] + + return res def _mr_proportions(self, axis, weighted): if self.is_double_mr: @@ -442,7 +476,8 @@ def _mr_proportions(self, axis, weighted): return res[np.ix_(*valid_indices)] def _proportions(self, axis=None, weighted=True, adjusted=False, - include_transforms_for_dims=None, include_missing=False): + include_transforms_for_dims=None, include_missing=False, + prune=False): if self.has_mr: return self._mr_proportions(axis, weighted) @@ -450,7 +485,8 @@ def _proportions(self, axis=None, weighted=True, adjusted=False, axis=axis, weighted=weighted, adjusted=adjusted, - include_transforms_for_dims=include_transforms_for_dims + include_transforms_for_dims=include_transforms_for_dims, + prune=prune, ) if axis == 1: margin = margin[:, np.newaxis] @@ -462,6 +498,7 @@ def _proportions(self, axis=None, weighted=True, adjusted=False, weighted=weighted, adjusted=adjusted, include_transforms_for_dims=include_transforms_for_dims, + prune=prune, ) / margin # Properties @@ -521,10 +558,10 @@ def is_weighted(self): ) return weighted - @property - def has_means(self): - '''Check if cube has means.''' - return self.has_means + # @property + # def has_means(self): + # '''Check if cube has means.''' + # return self.has_means @property def filter_annotation(self): @@ -533,6 +570,7 @@ def filter_annotation(self): @property def has_means(self): + '''Check if cube has means.''' return self._cube['result']['measures'].get('mean', None) is not None @property @@ -553,11 +591,13 @@ def labels(self, include_missing=False, include_transforms_for_dims=False): Returns labels (list of lists): Labels for each dimension ''' - return [dim.labels(include_missing, include_transforms_for_dims) - for dim in self.dimensions] + return [ + dim.labels(include_missing, include_transforms_for_dims) + for dim in self.dimensions + ] def as_array(self, include_missing=False, weighted=True, adjusted=False, - include_transforms_for_dims=None): + include_transforms_for_dims=None, prune=False): '''Get crunch cube as ndarray. Returns the tabular representation of the crunch cube. The returning @@ -592,10 +632,11 @@ def as_array(self, include_missing=False, weighted=True, adjusted=False, weighted=weighted, adjusted=adjusted, include_transforms_for_dims=include_transforms_for_dims, + prune=prune, ) def margin(self, axis=None, weighted=True, - include_transforms_for_dims=None): + include_transforms_for_dims=None, prune=False): '''Get margin for the selected axis. the selected axis. For MR variables, this is the sum of the selected @@ -657,10 +698,12 @@ def margin(self, axis=None, weighted=True, weighted=weighted, adjusted=False, include_transforms_for_dims=include_transforms_for_dims, + prune=prune, ) def proportions(self, axis=None, weighted=True, - include_transforms_for_dims=None, include_missing=False): + include_transforms_for_dims=None, include_missing=False, + prune=False): '''Get proportions of a crunch cube. This function calculates the proportions across the selected axis @@ -717,6 +760,7 @@ def proportions(self, axis=None, weighted=True, adjusted=False, include_transforms_for_dims=include_transforms_for_dims, include_missing=include_missing, + prune=prune, ) def percentages(self, axis=None): @@ -850,22 +894,23 @@ def _mr_index(self, axis, weighted): raise ValueError('Unexpected dimension types for cube with MR.') - def index(self, axis, weighted=True): + def index(self, axis, weighted=True, prune=False): '''Return table index by margin.''' if self.has_mr: return self._mr_index(axis, weighted) - margin = np.true_divide( - self.margin(axis=axis, weighted=weighted), - self.margin(weighted=weighted), + margin = ( + self.margin(axis=axis, weighted=weighted, prune=prune) / + self.margin(weighted=weighted, prune=prune) ) props = self.proportions( axis=(1 - axis), weighted=weighted, + prune=prune, ) if axis == 1: margin = margin[:, np.newaxis] - return np.true_divide(props, margin) + return props / margin