diff --git a/README.md b/README.md index cce553b2c..94bf77d88 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,9 @@ The detailed description can be found ## Changes +#### 1.9.16 +- Proper t-stats for cubes with H&S + #### 1.9.15 - Implement pairwise indices for Wishart, directly in cube diff --git a/src/cr/cube/__init__.py b/src/cr/cube/__init__.py index 0fda6b5c9..50579922b 100644 --- a/src/cr/cube/__init__.py +++ b/src/cr/cube/__init__.py @@ -2,4 +2,4 @@ """Initialization module for crunch-cube package.""" -__version__ = "1.9.15" +__version__ = "1.9.16" diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py index cf02f9b54..6a9497219 100644 --- a/src/cr/cube/cube_slice.py +++ b/src/cr/cube/cube_slice.py @@ -563,14 +563,14 @@ def pairwise_indices(self, alpha=0.05, only_larger=True, hs_dims=None): self, alpha=alpha, only_larger=only_larger, hs_dims=hs_dims ).pairwise_indices - def pairwise_significance_tests(self, column_idx): + def pairwise_significance_tests(self, column_idx, hs_dims=None): """list of _ColumnPairwiseSignificance tests. Result has as many elements as there are columns in the slice. Each significance test contains `p_vals` and `t_stats` (ndarrays that represent probability values and statistical scores). """ - return PairwiseSignificance(self).values[column_idx] + return PairwiseSignificance(self, hs_dims=hs_dims).values[column_idx] def _apply_pruning_mask(self, res, hs_dims=None): array = self.as_array(prune=True, include_transforms_for_dims=hs_dims) diff --git a/src/cr/cube/measures/pairwise_significance.py b/src/cr/cube/measures/pairwise_significance.py index 705486c99..8a2c4ca3c 100644 --- a/src/cr/cube/measures/pairwise_significance.py +++ b/src/cr/cube/measures/pairwise_significance.py @@ -7,7 +7,7 @@ import numpy as np from scipy.stats import t -from cr.cube.util import lazyproperty, intersperse_hs_in_std_res +from cr.cube.util import lazyproperty try: xrange @@ -48,21 +48,13 @@ def values(self): self._only_larger, self._hs_dims, ) - for col_idx in range(self._slice.shape[1]) + for col_idx in range(self._slice.get_shape(hs_dims=self._hs_dims)[1]) ] @lazyproperty def pairwise_indices(self): """ndarray containing tuples of pairwise indices.""" - pwi = np.array([sig.pairwise_indices for sig in self.values]).T - - if self._hs_dims and 1 in self._hs_dims: - # If we need to account for the dimension 1 in pairwise indices, we need - # to intersperse with NaNs. The dimension 0 is already tackled - # when determining the indices. - pwi = intersperse_hs_in_std_res(self._slice, (1,), pwi) - - return pwi + return np.array([sig.pairwise_indices for sig in self.values]).T # pylint: disable=too-few-public-methods @@ -88,24 +80,25 @@ def __init__( self._hs_dims = hs_dims @lazyproperty - def _t_stats(self): - props = self._slice.proportions(axis=0) + def t_stats(self): + props = self._slice.proportions( + axis=0, include_transforms_for_dims=self._hs_dims + ) diff = props - props[:, [self._col_idx]] - margin = self._slice.margin(axis=0, weighted=self._weighted) + margin = self._slice.margin( + axis=0, weighted=self._weighted, include_transforms_for_dims=self._hs_dims + ) var_props = props * (1.0 - props) / margin se_diff = np.sqrt(var_props + var_props[:, [self._col_idx]]) return diff / se_diff - @lazyproperty - def t_stats(self): - return intersperse_hs_in_std_res(self._slice, self._hs_dims, self._t_stats) - @lazyproperty def p_vals(self): - unweighted_n = self._slice.margin(axis=0, weighted=False) + unweighted_n = self._slice.margin( + axis=0, weighted=False, include_transforms_for_dims=self._hs_dims + ) df = unweighted_n + unweighted_n[self._col_idx] - 2 - p_vals = 2 * (1 - t.cdf(abs(self._t_stats), df=df)) - return intersperse_hs_in_std_res(self._slice, self._hs_dims, p_vals) + return 2 * (1 - t.cdf(abs(self.t_stats), df=df)) @lazyproperty def pairwise_indices(self): diff --git a/src/cr/cube/util.py b/src/cr/cube/util.py index fc022799a..96e1b69ae 100644 --- a/src/cr/cube/util.py +++ b/src/cr/cube/util.py @@ -37,12 +37,7 @@ def compress_pruned(table): def intersperse_hs_in_std_res(slice_, hs_dims, res): - - if not hs_dims: - # Don't intersperse anything, just return the result - return res - - # Perform the insertions of place-holding rows and cols for insertions + """Perform the insertions of place-holding rows and cols for insertions.""" for dim, inds in enumerate(slice_.inserted_hs_indices()): if dim not in hs_dims: continue diff --git a/tests/integration/test_pairwise_significance.py b/tests/integration/test_pairwise_significance.py index 9cb7fc234..2e1f8eb6e 100644 --- a/tests/integration/test_pairwise_significance.py +++ b/tests/integration/test_pairwise_significance.py @@ -96,17 +96,115 @@ def test_hirotsu_pairwise_indices(self): expected = np.array([(), (), (3, 4, 7, 9), (2,), (2,), (), (), (2,), (), (2,)]) np.testing.assert_array_equal(actual_pairwise_indices, expected) + def test_pairwise_t_stats_with_hs(self): + slice_ = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION_WITH_HS).slices[0] + expected = np.array( + [ + [ + 0.0, + -0.06178448, + 0.30342874, + -3.18865018, + -3.82130608, + -2.99560531, + 0.07456344, + -0.68932699, + -2.95469238, + -0.46970468, + -4.14956044, + ], + [ + 0.0, + 1.18922394, + 0.38254102, + 3.3306654, + 3.45013209, + 2.7520633, + 0.59216241, + 0.86352416, + 2.54171145, + 1.10130414, + 3.3839919, + ], + [ + 0.0, + -1.7080666, + -0.931165, + -0.87419923, + -0.24915622, + -0.2367748, + -0.97198009, + -0.38504801, + -0.03910193, + -1.02720423, + 0.19773989, + ], + ] + ) + t_stats = slice_.pairwise_significance_tests( + column_idx=0, hs_dims=(0, 1) + ).t_stats + np.testing.assert_almost_equal(t_stats, expected) + + def test_pairwise_p_vals_with_hs(self): + slice_ = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION_WITH_HS).slices[0] + expected = np.array( + [ + [ + 1.00000000e00, + 9.50744855e-01, + 7.61580875e-01, + 1.45494511e-03, + 1.35271514e-04, + 2.75262668e-03, + 9.40575477e-01, + 4.90755141e-01, + 3.16822332e-03, + 6.38672254e-01, + 3.44275537e-05, + ], + [ + 1.00000000e00, + 2.34589189e-01, + 7.02082945e-01, + 8.84605265e-04, + 5.67562813e-04, + 5.94375533e-03, + 5.53862215e-01, + 3.88027539e-01, + 1.11098015e-02, + 2.71039211e-01, + 7.25442977e-04, + ], + [ + 1.00000000e00, + 8.78852262e-02, + 3.51831360e-01, + 3.82131035e-01, + 8.03255937e-01, + 8.12841336e-01, + 3.31271828e-01, + 7.00272311e-01, + 9.68813218e-01, + 3.04581978e-01, + 8.43264694e-01, + ], + ] + ) + p_vals = slice_.pairwise_significance_tests(column_idx=0, hs_dims=(0, 1)).p_vals + np.testing.assert_almost_equal(p_vals, expected) + def test_pairwise_indices_with_hs(self): - cube = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION_WITH_HS) + slice_ = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION_WITH_HS).slices[0] expected = [ [ - (3, 4, 8, 10), - (3, 4, 8, 10), - (3, 4, 8, 10), + (3, 4, 5, 8, 10), + (3, 4, 5, 8, 10), + (3, 4, 5, 8, 10), (), (), - np.nan, - (3, 4, 8, 10), + (10,), + (3, 4, 5, 8, 10), (3, 4, 10), (), (4, 10), @@ -118,17 +216,17 @@ def test_pairwise_indices_with_hs(self): (), (0, 2, 6, 7), (0, 2, 6, 7), - np.nan, + (0, 2), (), (), (0, 2), (), (0, 2, 6, 7), ], - [(), (), (), (), (), np.nan, (), (), (), (), (1,)], + [(), (), (), (), (), (1,), (), (), (), (), (1,)], ] - actual = cube.slices[0].pairwise_indices(hs_dims=[0, 1]).tolist() - assert actual == expected + pairwise_indices = slice_.pairwise_indices(hs_dims=(0, 1)).tolist() + assert pairwise_indices == expected def test_hirotsu_pvals_with_hs(self): """The shape of the result should be 11 x 11, with H&S (at index 5).""" diff --git a/tests/unit/test_wishart_pairwise_significance.py b/tests/unit/test_wishart_pairwise_significance.py index 30fb85e87..3abbe2711 100644 --- a/tests/unit/test_wishart_pairwise_significance.py +++ b/tests/unit/test_wishart_pairwise_significance.py @@ -17,7 +17,7 @@ class DescribePairwiseSignificance: def it_provides_access_to_its_values(self, request, slice_): shape = (2, 2) - slice_.shape = shape + slice_.get_shape.return_value = shape expected_test_values = PairwiseSignificance(slice_).values assert len(expected_test_values) == 2 for i, column_pairwise_significance in enumerate(expected_test_values): @@ -37,13 +37,13 @@ def it_can_calculate_t_stats(self, t_stats_fixture, slice_): slice_.proportions.return_value = props slice_.margin.return_value = margin np.testing.assert_almost_equal( - _ColumnPairwiseSignificance(slice_, col_idx)._t_stats, t_stats + _ColumnPairwiseSignificance(slice_, col_idx).t_stats, t_stats ) - def it_can_calculate_p_vals(self, p_vals_fixture, slice_, _t_stats_prop_): + def it_can_calculate_p_vals(self, p_vals_fixture, slice_, t_stats_prop_): col_idx, t_stats, margin, p_vals = p_vals_fixture slice_.margin.return_value = margin - _t_stats_prop_.return_value = t_stats + t_stats_prop_.return_value = t_stats np.testing.assert_almost_equal( _ColumnPairwiseSignificance(slice_, col_idx).p_vals, p_vals ) @@ -177,10 +177,6 @@ def t_stats_fixture(self, request): def slice_(self, request): return instance_mock(request, CubeSlice) - @pytest.fixture - def _t_stats_prop_(self, request): - return property_mock(request, _ColumnPairwiseSignificance, "_t_stats") - @pytest.fixture def t_stats_prop_(self, request): return property_mock(request, _ColumnPairwiseSignificance, "t_stats")