Merge 30d851d into b1d5f3c

Crunch-io · Apr 23, 2019 · cb2ac12 · cb2ac12
2 parents b1d5f3c + 30d851d
commit cb2ac12
Show file tree

Hide file tree

Showing 4 changed files with 214 additions and 11 deletions.
diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py
@@ -563,6 +563,11 @@ def pairwise_indices(self, alpha=0.05, only_larger=True, hs_dims=None):
             self, alpha=alpha, only_larger=only_larger, hs_dims=hs_dims
         ).pairwise_indices
 
+    def summary_pairwise_indices(self, alpha=0.05, only_larger=True, hs_dims=None):
+        return PairwiseSignificance(
+            self, alpha=alpha, only_larger=only_larger, hs_dims=hs_dims
+        ).summary_pairwise_indices
+
     def pairwise_significance_tests(self, column_idx, hs_dims=None):
         """list of _ColumnPairwiseSignificance tests.
 

diff --git a/src/cr/cube/measures/pairwise_significance.py b/src/cr/cube/measures/pairwise_significance.py
@@ -56,6 +56,17 @@ def pairwise_indices(self):
         """ndarray containing tuples of pairwise indices."""
         return np.array([sig.pairwise_indices for sig in self.values]).T
 
+    @lazyproperty
+    def summary_pairwise_indices(self):
+        """ndarray containing tuples of pairwise indices for the column summary."""
+        summary_pairwise_indices = np.empty(
+            self.values[0].t_stats.shape[1], dtype=object
+        )
+        summary_pairwise_indices[:] = [
+            sig.summary_pairwise_indices for sig in self.values
+        ]
+        return summary_pairwise_indices
+
 
 # pylint: disable=too-few-public-methods
 class _ColumnPairwiseSignificance:
@@ -79,30 +90,64 @@ def __init__(
         self._only_larger = only_larger
         self._hs_dims = hs_dims
 
+    @lazyproperty
+    def _unweighted_col_margin(self):
+        return self._slice.margin(
+            axis=0, weighted=False, include_transforms_for_dims=self._hs_dims
+        )
+
     @lazyproperty
     def t_stats(self):
         props = self._slice.proportions(
             axis=0, include_transforms_for_dims=self._hs_dims
         )
         diff = props - props[:, [self._col_idx]]
-        unweighted_margin = self._slice.margin(
-            axis=0, weighted=False, include_transforms_for_dims=self._hs_dims
-        )
-        var_props = props * (1.0 - props) / unweighted_margin
+        var_props = props * (1.0 - props) / self._unweighted_col_margin
         se_diff = np.sqrt(var_props + var_props[:, [self._col_idx]])
         return diff / se_diff
 
     @lazyproperty
     def p_vals(self):
-        unweighted_n = self._slice.margin(
-            axis=0, weighted=False, include_transforms_for_dims=self._hs_dims
-        )
-        df = unweighted_n + unweighted_n[self._col_idx] - 2
-        return 2 * (1 - t.cdf(abs(self.t_stats), df=df))
+        return 2 * (1 - t.cdf(abs(self.t_stats), df=self._df))
 
     @lazyproperty
     def pairwise_indices(self):
         significance = self.p_vals < self._alpha
         if self._only_larger:
             significance = np.logical_and(self.t_stats < 0, significance)
         return [tuple(np.where(sig_row)[0]) for sig_row in significance]
+
+    @lazyproperty
+    def summary_pairwise_indices(self):
+        significance = self.summary_p_vals < self._alpha
+        if self._only_larger:
+            significance = np.logical_and(self.summary_t_stats < 0, significance)
+        return tuple(np.where(significance)[0])
+
+    @lazyproperty
+    def summary_t_stats(self):
+        total_margin = self._slice.margin(weighted=self._weighted)
+        col_margin_props = self._unweighted_col_margin / total_margin
+        diff = col_margin_props - col_margin_props[self._col_idx]
+        var_props = col_margin_props * (1.0 - col_margin_props) / total_margin
+        se_diff = np.sqrt(var_props + var_props[self._col_idx])
+        return diff / se_diff
+
+    @lazyproperty
+    def summary_p_vals(self):
+        return 2 * (1 - t.cdf(abs(self.summary_t_stats), df=self._df))
+
+    @lazyproperty
+    def _df(self):
+        selected_unweighted_n = (
+            self._unweighted_n[self._col_idx]
+            if self._unweighted_n.ndim < 2
+            else self._unweighted_n[:, self._col_idx][:, None]
+        )
+        return self._unweighted_n + selected_unweighted_n - 2
+
+    @lazyproperty
+    def _unweighted_n(self):
+        return self._slice.margin(
+            axis=0, weighted=False, include_transforms_for_dims=self._hs_dims
+        )
diff --git a/tests/integration/test_pairwise_significance.py b/tests/integration/test_pairwise_significance.py
@@ -315,7 +315,7 @@ def test_compare_to_column(self):
         np.testing.assert_almost_equal(actual.t_stats, expected_tstats)
         np.testing.assert_almost_equal(actual.p_vals, expected_pvals)
 
-    def test_pairwise_indices_only_larger(self):
+    def test_cat_x_cat_pairwise_indices_only_larger(self):
         cube = CrunchCube(CR.PAIRWISE_HIROTSU_OCCUPATION_X_ILLNESS)
         expected_indices = np.array(
             [
@@ -334,6 +334,48 @@ def test_pairwise_indices_only_larger(self):
         pairwise_indices = cube.slices[0].pairwise_indices()
         np.testing.assert_array_equal(pairwise_indices, expected_indices)
 
+    def test_mr_x_cat_pairwise_indices_only_larger(self):
+        cube = CrunchCube(CR.MR_X_CAT_HS)
+        expected_indices = np.array(
+            [
+                [(1, 3, 4), (), (), (), (), ()],
+                [(), (), (), (), (), ()],
+                [(), (), (), (0,), (0,), ()],
+                [(), (), (), (), (1,), ()],
+                [(), (), (), (), (), ()],
+            ]
+        )
+        pairwise_indices = cube.slices[0].pairwise_indices()
+        np.testing.assert_array_equal(pairwise_indices, expected_indices)
+
+    def test_cat_x_mr_pairwise_indices_only_larger(self):
+        cube = CrunchCube(CR.CAT_X_MR_HS)
+        expected_indices = np.array(
+            [
+                [(1, 2, 3, 4), (2, 3), (), (), (2,)],
+                [(), (), (), (), (3,)],
+                [(), (), (), (), ()],
+                [(), (0,), (0,), (0,), (0,)],
+                [(), (), (), (0, 1, 4), ()],
+                [(), (), (), (), ()],
+            ]
+        )
+        pairwise_indices = cube.slices[0].pairwise_indices()
+        np.testing.assert_array_equal(pairwise_indices, expected_indices)
+
+    def test_mr_x_mr_pairwise_indices_only_larger(self):
+        cube = CrunchCube(CR.MR_X_MR)
+        expected_indices = np.array(
+            [
+                [(1, 2, 3), (), (), ()],
+                [(), (0, 2, 3), (), (2,)],
+                [(), (), (0, 1, 3), (1,)],
+                [(), (), (), ()],
+            ]
+        )
+        pairwise_indices = cube.slices[0].pairwise_indices()
+        np.testing.assert_array_equal(pairwise_indices, expected_indices)
+
     def test_pairwise_indices_larger_and_smaller(self):
         cube = CrunchCube(CR.PAIRWISE_HIROTSU_OCCUPATION_X_ILLNESS)
         expected_indices = np.array(
@@ -353,6 +395,19 @@ def test_pairwise_indices_larger_and_smaller(self):
         pairwise_indices = cube.slices[0].pairwise_indices(only_larger=False)
         np.testing.assert_array_equal(pairwise_indices, expected_indices)
 
+    def test_cat_x_cat_summary_pairwise_indices(self):
+        slice_ = CrunchCube(CR.PAIRWISE_HIROTSU_OCCUPATION_X_ILLNESS).slices[0]
+
+        # Only larger
+        pairwise_indices = slice_.summary_pairwise_indices()
+        expected_indices = np.array([(2,), (0, 2), ()])
+        np.testing.assert_array_equal(pairwise_indices, expected_indices)
+
+        # Larger and smaller
+        pairwise_indices = slice_.summary_pairwise_indices(only_larger=False)
+        expected_indices = np.array([(1, 2), (0, 2), (0, 1)], dtype="i,i")
+        np.testing.assert_array_equal(pairwise_indices, expected_indices)
+
     def test_ttests_use_unweighted_n_for_variance(self):
         """The weights on this cube demonstrate much higher variance (less
         extreme t values, and higher associated p-values) than if weighted_n

diff --git a/tests/unit/test_wishart_pairwise_significance.py b/tests/unit/test_wishart_pairwise_significance.py
@@ -57,8 +57,82 @@ def it_knows_its_pairwise_indices(
         cps = _ColumnPairwiseSignificance(slice_, None, only_larger=only_larger)
         assert cps.pairwise_indices == pairwise_indices
 
+    def it_can_calculate_summary_t_stats(
+        self, slice_, _unweighted_col_margin_prop_, summary_t_stats_fixture
+    ):
+        margin, col_idx, expected, _ = summary_t_stats_fixture
+        _unweighted_col_margin_prop_.return_value = margin
+        slice_.margin.return_value = np.sum(margin)
+        np.testing.assert_almost_equal(
+            _ColumnPairwiseSignificance(slice_, col_idx).summary_t_stats, expected
+        )
+
+    def it_can_calculate_summary_p_vals(
+        self,
+        slice_,
+        _unweighted_col_margin_prop_,
+        _unweighted_n_prop,
+        summary_t_stats_fixture,
+    ):
+        margin, col_idx, _, expected = summary_t_stats_fixture
+        _unweighted_col_margin_prop_.return_value = margin
+        slice_.margin.return_value = np.sum(margin)
+        _unweighted_n_prop.return_value = margin
+        np.testing.assert_almost_equal(
+            _ColumnPairwiseSignificance(slice_, col_idx).summary_p_vals, expected
+        )
+
+    def it_can_calculate_summary_pairwise_indices(
+        self,
+        slice_,
+        summary_pairwise_indices_fixture,
+        summary_p_vals_prop,
+        summary_t_stats_prop,
+    ):
+        only_larger, col_idx, t_stats, p_vals, expected = (
+            summary_pairwise_indices_fixture
+        )
+        summary_p_vals_prop.return_value = p_vals
+        summary_t_stats_prop.return_value = t_stats
+        np.testing.assert_array_equal(
+            _ColumnPairwiseSignificance(
+                slice_, col_idx, only_larger=only_larger
+            ).summary_pairwise_indices,
+            expected,
+        )
+
     # fixtures -------------------------------------------------------
 
+    @pytest.fixture(
+        params=[
+            (True, 1, [-0.6793662, 0.0, -1], [0.6201015, 1.0, 0.01], (2,)),
+            (False, 0, None, [1, 0.01, 0.01], (1, 2)),
+        ]
+    )
+    def summary_pairwise_indices_fixture(self, request):
+        only_larger, col_idx, t_stats, p_vals, expected = request.param
+        return only_larger, col_idx, np.array(t_stats), np.array(p_vals), expected
+
+    @pytest.fixture(
+        params=[
+            (
+                [1, 2, 3],
+                0,
+                [0.0, 0.67936622, 1.30930734],
+                [np.nan, 0.62010151, 0.32063378],
+            ),
+            ([1, 2, 3], 1, [-0.6793662, 0.0, 0.5940885], [0.6201015, 1.0, 0.5942728]),
+        ]
+    )
+    def summary_t_stats_fixture(self, request):
+        margin, col_idx, expected_t_stats, expected_p_vals = request.param
+        return (
+            np.array(margin),
+            col_idx,
+            np.array(expected_t_stats),
+            np.array(expected_p_vals),
+        )
+
     @pytest.fixture(
         params=[
             (
@@ -164,7 +238,13 @@ def p_vals_fixture(self, request):
                     [0.0, -0.8586079707543924, -1.1774569464270872],
                     [0.0, 4.663801762560106, 3.743253010905157],
                 ],
-            )
+            ),
+            (
+                0,
+                [[0.25, 0.75], [0.75, 0.25]],
+                [[1, 2], [3, 4]],
+                [[0.0, 0.94280904], [0.0, -1.51185789]],
+            ),
         ]
     )
     def t_stats_fixture(self, request):
@@ -177,10 +257,28 @@ def t_stats_fixture(self, request):
     def slice_(self, request):
         return instance_mock(request, CubeSlice)
 
+    @pytest.fixture
+    def _unweighted_col_margin_prop_(self, request):
+        return property_mock(
+            request, _ColumnPairwiseSignificance, "_unweighted_col_margin"
+        )
+
+    @pytest.fixture
+    def _unweighted_n_prop(self, request):
+        return property_mock(request, _ColumnPairwiseSignificance, "_unweighted_n")
+
     @pytest.fixture
     def t_stats_prop_(self, request):
         return property_mock(request, _ColumnPairwiseSignificance, "t_stats")
 
+    @pytest.fixture
+    def summary_p_vals_prop(self, request):
+        return property_mock(request, _ColumnPairwiseSignificance, "summary_p_vals")
+
+    @pytest.fixture
+    def summary_t_stats_prop(self, request):
+        return property_mock(request, _ColumnPairwiseSignificance, "summary_t_stats")
+
     @pytest.fixture
     def p_vals_prop_(self, request):
         return property_mock(request, _ColumnPairwiseSignificance, "p_vals")