From 7c615cefe925b0fe46d12409bba51f563d10f2fa Mon Sep 17 00:00:00 2001 From: Slobodan Ilic Date: Mon, 22 Apr 2019 15:44:19 +0200 Subject: [PATCH 1/3] Add pairwise summary * Support with unit tests * Support with integration tests * Implement t-stats, p-vals, and pairwise indices for column summary --- src/cr/cube/cube_slice.py | 5 + src/cr/cube/measures/pairwise_significance.py | 53 +++++++++-- .../integration/test_pairwise_significance.py | 13 +++ .../test_wishart_pairwise_significance.py | 92 +++++++++++++++++++ 4 files changed, 154 insertions(+), 9 deletions(-) diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py index 6a9497219..16125c0ee 100644 --- a/src/cr/cube/cube_slice.py +++ b/src/cr/cube/cube_slice.py @@ -563,6 +563,11 @@ def pairwise_indices(self, alpha=0.05, only_larger=True, hs_dims=None): self, alpha=alpha, only_larger=only_larger, hs_dims=hs_dims ).pairwise_indices + def summary_pairwise_indices(self, alpha=0.05, only_larger=True, hs_dims=None): + return PairwiseSignificance( + self, alpha=alpha, only_larger=only_larger, hs_dims=hs_dims + ).summary_pairwise_indices + def pairwise_significance_tests(self, column_idx, hs_dims=None): """list of _ColumnPairwiseSignificance tests. diff --git a/src/cr/cube/measures/pairwise_significance.py b/src/cr/cube/measures/pairwise_significance.py index 72a338535..2d94caabb 100644 --- a/src/cr/cube/measures/pairwise_significance.py +++ b/src/cr/cube/measures/pairwise_significance.py @@ -56,6 +56,11 @@ def pairwise_indices(self): """ndarray containing tuples of pairwise indices.""" return np.array([sig.pairwise_indices for sig in self.values]).T + @lazyproperty + def summary_pairwise_indices(self): + """ndarray containing tuples of pairwise indices for the column summary.""" + return np.array([sig.summary_pairwise_indices for sig in self.values]).T + # pylint: disable=too-few-public-methods class _ColumnPairwiseSignificance: @@ -79,26 +84,25 @@ def __init__( self._only_larger = only_larger self._hs_dims = hs_dims + @lazyproperty + def _unweighted_col_margin(self): + return self._slice.margin( + axis=0, weighted=False, include_transforms_for_dims=self._hs_dims + ) + @lazyproperty def t_stats(self): props = self._slice.proportions( axis=0, include_transforms_for_dims=self._hs_dims ) diff = props - props[:, [self._col_idx]] - unweighted_margin = self._slice.margin( - axis=0, weighted=False, include_transforms_for_dims=self._hs_dims - ) - var_props = props * (1.0 - props) / unweighted_margin + var_props = props * (1.0 - props) / self._unweighted_col_margin se_diff = np.sqrt(var_props + var_props[:, [self._col_idx]]) return diff / se_diff @lazyproperty def p_vals(self): - unweighted_n = self._slice.margin( - axis=0, weighted=False, include_transforms_for_dims=self._hs_dims - ) - df = unweighted_n + unweighted_n[self._col_idx] - 2 - return 2 * (1 - t.cdf(abs(self.t_stats), df=df)) + return 2 * (1 - t.cdf(abs(self.t_stats), df=self._df)) @lazyproperty def pairwise_indices(self): @@ -106,3 +110,34 @@ def pairwise_indices(self): if self._only_larger: significance = np.logical_and(self.t_stats < 0, significance) return [tuple(np.where(sig_row)[0]) for sig_row in significance] + + @lazyproperty + def summary_pairwise_indices(self): + significance = self.summary_p_vals < self._alpha + if self._only_larger: + significance = np.logical_and(self.summary_t_stats < 0, significance) + spwi = tuple(np.where(significance[0])[0]) + return spwi + + @lazyproperty + def summary_t_stats(self): + total_margin = self._slice.margin(weighted=self._weighted) + col_margin_props = self._unweighted_col_margin / total_margin + diff = col_margin_props - col_margin_props[self._col_idx] + var_props = col_margin_props * (1.0 - col_margin_props) / total_margin + se_diff = np.sqrt(var_props + var_props[self._col_idx]) + return diff / se_diff + + @lazyproperty + def summary_p_vals(self): + return 2 * (1 - t.cdf(abs(self.summary_t_stats), df=self._df)) + + @lazyproperty + def _df(self): + return self._unweighted_n + self._unweighted_n[self._col_idx] - 2 + + @lazyproperty + def _unweighted_n(self): + return self._slice.margin( + axis=0, weighted=False, include_transforms_for_dims=self._hs_dims + ) diff --git a/tests/integration/test_pairwise_significance.py b/tests/integration/test_pairwise_significance.py index 1b42913af..1ce18e019 100644 --- a/tests/integration/test_pairwise_significance.py +++ b/tests/integration/test_pairwise_significance.py @@ -353,6 +353,19 @@ def test_pairwise_indices_larger_and_smaller(self): pairwise_indices = cube.slices[0].pairwise_indices(only_larger=False) np.testing.assert_array_equal(pairwise_indices, expected_indices) + def test_summary_pairwise_indices(self): + slice_ = CrunchCube(CR.PAIRWISE_HIROTSU_OCCUPATION_X_ILLNESS).slices[0] + + # Only larger + pairwise_indices = slice_.summary_pairwise_indices() + expected_indices = np.array([(), (0,), ()]) + np.testing.assert_array_equal(pairwise_indices, expected_indices) + + # Larger and smaller + pairwise_indices = slice_.summary_pairwise_indices(only_larger=False) + expected_indices = np.array([(), (0,), (0,)]) + np.testing.assert_array_equal(pairwise_indices, expected_indices) + def test_ttests_use_unweighted_n_for_variance(self): """The weights on this cube demonstrate much higher variance (less extreme t values, and higher associated p-values) than if weighted_n diff --git a/tests/unit/test_wishart_pairwise_significance.py b/tests/unit/test_wishart_pairwise_significance.py index 3abbe2711..3d0bcb2a6 100644 --- a/tests/unit/test_wishart_pairwise_significance.py +++ b/tests/unit/test_wishart_pairwise_significance.py @@ -57,8 +57,82 @@ def it_knows_its_pairwise_indices( cps = _ColumnPairwiseSignificance(slice_, None, only_larger=only_larger) assert cps.pairwise_indices == pairwise_indices + def it_can_calculate_summary_t_stats( + self, slice_, _unweighted_col_margin_prop_, summary_t_stats_fixture + ): + margin, col_idx, expected, _ = summary_t_stats_fixture + _unweighted_col_margin_prop_.return_value = margin + slice_.margin.return_value = np.sum(margin) + np.testing.assert_almost_equal( + _ColumnPairwiseSignificance(slice_, col_idx).summary_t_stats, expected + ) + + def it_can_calculate_summary_p_vals( + self, + slice_, + _unweighted_col_margin_prop_, + _unweighted_n_prop, + summary_t_stats_fixture, + ): + margin, col_idx, _, expected = summary_t_stats_fixture + _unweighted_col_margin_prop_.return_value = margin + slice_.margin.return_value = np.sum(margin) + _unweighted_n_prop.return_value = margin + np.testing.assert_almost_equal( + _ColumnPairwiseSignificance(slice_, col_idx).summary_p_vals, expected + ) + + def it_can_calculate_summary_pairwise_indices( + self, + slice_, + summary_pairwise_indices_fixture, + summary_p_vals_prop, + summary_t_stats_prop, + ): + only_larger, col_idx, t_stats, p_vals, expected = ( + summary_pairwise_indices_fixture + ) + summary_p_vals_prop.return_value = p_vals + summary_t_stats_prop.return_value = t_stats + np.testing.assert_array_equal( + _ColumnPairwiseSignificance( + slice_, col_idx, only_larger=only_larger + ).summary_pairwise_indices, + expected, + ) + # fixtures ------------------------------------------------------- + @pytest.fixture( + params=[ + (True, 1, [-0.6793662, 0.0, -1], [0.6201015, 1.0, 0.01], (2,)), + (False, 0, None, [1, 0.01, 0.01], (1, 2)), + ] + ) + def summary_pairwise_indices_fixture(self, request): + only_larger, col_idx, t_stats, p_vals, expected = request.param + return only_larger, col_idx, np.array([t_stats]), np.array([p_vals]), expected + + @pytest.fixture( + params=[ + ( + [1, 2, 3], + 0, + [0.0, 0.67936622, 1.30930734], + [np.nan, 0.62010151, 0.32063378], + ), + ([1, 2, 3], 1, [-0.6793662, 0.0, 0.5940885], [0.6201015, 1.0, 0.5942728]), + ] + ) + def summary_t_stats_fixture(self, request): + margin, col_idx, expected_t_stats, expected_p_vals = request.param + return ( + np.array(margin), + col_idx, + np.array(expected_t_stats), + np.array(expected_p_vals), + ) + @pytest.fixture( params=[ ( @@ -177,10 +251,28 @@ def t_stats_fixture(self, request): def slice_(self, request): return instance_mock(request, CubeSlice) + @pytest.fixture + def _unweighted_col_margin_prop_(self, request): + return property_mock( + request, _ColumnPairwiseSignificance, "_unweighted_col_margin" + ) + + @pytest.fixture + def _unweighted_n_prop(self, request): + return property_mock(request, _ColumnPairwiseSignificance, "_unweighted_n") + @pytest.fixture def t_stats_prop_(self, request): return property_mock(request, _ColumnPairwiseSignificance, "t_stats") + @pytest.fixture + def summary_p_vals_prop(self, request): + return property_mock(request, _ColumnPairwiseSignificance, "summary_p_vals") + + @pytest.fixture + def summary_t_stats_prop(self, request): + return property_mock(request, _ColumnPairwiseSignificance, "summary_t_stats") + @pytest.fixture def p_vals_prop_(self, request): return property_mock(request, _ColumnPairwiseSignificance, "p_vals") From ec990238eb55b945383f63ae93fc050b284e6058 Mon Sep 17 00:00:00 2001 From: Slobodan Ilic Date: Tue, 23 Apr 2019 12:28:35 +0200 Subject: [PATCH 2/3] Add tests for all the cases with MR --- src/cr/cube/measures/pairwise_significance.py | 7 ++- .../integration/test_pairwise_significance.py | 46 ++++++++++++++++++- .../test_wishart_pairwise_significance.py | 8 +++- 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/src/cr/cube/measures/pairwise_significance.py b/src/cr/cube/measures/pairwise_significance.py index 2d94caabb..688fa34f2 100644 --- a/src/cr/cube/measures/pairwise_significance.py +++ b/src/cr/cube/measures/pairwise_significance.py @@ -134,7 +134,12 @@ def summary_p_vals(self): @lazyproperty def _df(self): - return self._unweighted_n + self._unweighted_n[self._col_idx] - 2 + selected_unweighted_n = ( + self._unweighted_n[self._col_idx] + if self._unweighted_n.ndim < 2 + else self._unweighted_n[:, self._col_idx][:, None] + ) + return self._unweighted_n + selected_unweighted_n - 2 @lazyproperty def _unweighted_n(self): diff --git a/tests/integration/test_pairwise_significance.py b/tests/integration/test_pairwise_significance.py index 1ce18e019..3e7e5e6f4 100644 --- a/tests/integration/test_pairwise_significance.py +++ b/tests/integration/test_pairwise_significance.py @@ -315,7 +315,7 @@ def test_compare_to_column(self): np.testing.assert_almost_equal(actual.t_stats, expected_tstats) np.testing.assert_almost_equal(actual.p_vals, expected_pvals) - def test_pairwise_indices_only_larger(self): + def test_cat_x_cat_pairwise_indices_only_larger(self): cube = CrunchCube(CR.PAIRWISE_HIROTSU_OCCUPATION_X_ILLNESS) expected_indices = np.array( [ @@ -334,6 +334,48 @@ def test_pairwise_indices_only_larger(self): pairwise_indices = cube.slices[0].pairwise_indices() np.testing.assert_array_equal(pairwise_indices, expected_indices) + def test_mr_x_cat_pairwise_indices_only_larger(self): + cube = CrunchCube(CR.MR_X_CAT_HS) + expected_indices = np.array( + [ + [(1, 3, 4), (), (), (), (), ()], + [(), (), (), (), (), ()], + [(), (), (), (0,), (0,), ()], + [(), (), (), (), (1,), ()], + [(), (), (), (), (), ()], + ] + ) + pairwise_indices = cube.slices[0].pairwise_indices() + np.testing.assert_array_equal(pairwise_indices, expected_indices) + + def test_cat_x_mr_pairwise_indices_only_larger(self): + cube = CrunchCube(CR.CAT_X_MR_HS) + expected_indices = np.array( + [ + [(1, 2, 3, 4), (2, 3), (), (), (2,)], + [(), (), (), (), (3,)], + [(), (), (), (), ()], + [(), (0,), (0,), (0,), (0,)], + [(), (), (), (0, 1, 4), ()], + [(), (), (), (), ()], + ] + ) + pairwise_indices = cube.slices[0].pairwise_indices() + np.testing.assert_array_equal(pairwise_indices, expected_indices) + + def test_mr_x_mr_pairwise_indices_only_larger(self): + cube = CrunchCube(CR.MR_X_MR) + expected_indices = np.array( + [ + [(1, 2, 3), (), (), ()], + [(), (0, 2, 3), (), (2,)], + [(), (), (0, 1, 3), (1,)], + [(), (), (), ()], + ] + ) + pairwise_indices = cube.slices[0].pairwise_indices() + np.testing.assert_array_equal(pairwise_indices, expected_indices) + def test_pairwise_indices_larger_and_smaller(self): cube = CrunchCube(CR.PAIRWISE_HIROTSU_OCCUPATION_X_ILLNESS) expected_indices = np.array( @@ -353,7 +395,7 @@ def test_pairwise_indices_larger_and_smaller(self): pairwise_indices = cube.slices[0].pairwise_indices(only_larger=False) np.testing.assert_array_equal(pairwise_indices, expected_indices) - def test_summary_pairwise_indices(self): + def test_cat_x_cat_summary_pairwise_indices(self): slice_ = CrunchCube(CR.PAIRWISE_HIROTSU_OCCUPATION_X_ILLNESS).slices[0] # Only larger diff --git a/tests/unit/test_wishart_pairwise_significance.py b/tests/unit/test_wishart_pairwise_significance.py index 3d0bcb2a6..445bf09d7 100644 --- a/tests/unit/test_wishart_pairwise_significance.py +++ b/tests/unit/test_wishart_pairwise_significance.py @@ -238,7 +238,13 @@ def p_vals_fixture(self, request): [0.0, -0.8586079707543924, -1.1774569464270872], [0.0, 4.663801762560106, 3.743253010905157], ], - ) + ), + ( + 0, + [[0.25, 0.75], [0.75, 0.25]], + [[1, 2], [3, 4]], + [[0.0, 0.94280904], [0.0, -1.51185789]], + ), ] ) def t_stats_fixture(self, request): From 30d851d015e4c966b2514dfabc43acb45adbc039 Mon Sep 17 00:00:00 2001 From: Slobodan Ilic Date: Tue, 23 Apr 2019 13:01:29 +0200 Subject: [PATCH 3/3] Fix error in indexing and ndarray dtype --- src/cr/cube/measures/pairwise_significance.py | 11 ++++++++--- tests/integration/test_pairwise_significance.py | 4 ++-- tests/unit/test_wishart_pairwise_significance.py | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/cr/cube/measures/pairwise_significance.py b/src/cr/cube/measures/pairwise_significance.py index 688fa34f2..3fc0da478 100644 --- a/src/cr/cube/measures/pairwise_significance.py +++ b/src/cr/cube/measures/pairwise_significance.py @@ -59,7 +59,13 @@ def pairwise_indices(self): @lazyproperty def summary_pairwise_indices(self): """ndarray containing tuples of pairwise indices for the column summary.""" - return np.array([sig.summary_pairwise_indices for sig in self.values]).T + summary_pairwise_indices = np.empty( + self.values[0].t_stats.shape[1], dtype=object + ) + summary_pairwise_indices[:] = [ + sig.summary_pairwise_indices for sig in self.values + ] + return summary_pairwise_indices # pylint: disable=too-few-public-methods @@ -116,8 +122,7 @@ def summary_pairwise_indices(self): significance = self.summary_p_vals < self._alpha if self._only_larger: significance = np.logical_and(self.summary_t_stats < 0, significance) - spwi = tuple(np.where(significance[0])[0]) - return spwi + return tuple(np.where(significance)[0]) @lazyproperty def summary_t_stats(self): diff --git a/tests/integration/test_pairwise_significance.py b/tests/integration/test_pairwise_significance.py index 3e7e5e6f4..5d436ffac 100644 --- a/tests/integration/test_pairwise_significance.py +++ b/tests/integration/test_pairwise_significance.py @@ -400,12 +400,12 @@ def test_cat_x_cat_summary_pairwise_indices(self): # Only larger pairwise_indices = slice_.summary_pairwise_indices() - expected_indices = np.array([(), (0,), ()]) + expected_indices = np.array([(2,), (0, 2), ()]) np.testing.assert_array_equal(pairwise_indices, expected_indices) # Larger and smaller pairwise_indices = slice_.summary_pairwise_indices(only_larger=False) - expected_indices = np.array([(), (0,), (0,)]) + expected_indices = np.array([(1, 2), (0, 2), (0, 1)], dtype="i,i") np.testing.assert_array_equal(pairwise_indices, expected_indices) def test_ttests_use_unweighted_n_for_variance(self): diff --git a/tests/unit/test_wishart_pairwise_significance.py b/tests/unit/test_wishart_pairwise_significance.py index 445bf09d7..64f88564f 100644 --- a/tests/unit/test_wishart_pairwise_significance.py +++ b/tests/unit/test_wishart_pairwise_significance.py @@ -111,7 +111,7 @@ def it_can_calculate_summary_pairwise_indices( ) def summary_pairwise_indices_fixture(self, request): only_larger, col_idx, t_stats, p_vals, expected = request.param - return only_larger, col_idx, np.array([t_stats]), np.array([p_vals]), expected + return only_larger, col_idx, np.array(t_stats), np.array(p_vals), expected @pytest.fixture( params=[