From 7c615cefe925b0fe46d12409bba51f563d10f2fa Mon Sep 17 00:00:00 2001
From: Slobodan Ilic <slobodan@crunch.io>
Date: Mon, 22 Apr 2019 15:44:19 +0200
Subject: [PATCH 1/3] Add pairwise summary

* Support with unit tests
* Support with integration tests
* Implement t-stats, p-vals, and pairwise indices for column summary
---
 src/cr/cube/cube_slice.py                     |  5 +
 src/cr/cube/measures/pairwise_significance.py | 53 +++++++++--
 .../integration/test_pairwise_significance.py | 13 +++
 .../test_wishart_pairwise_significance.py     | 92 +++++++++++++++++++
 4 files changed, 154 insertions(+), 9 deletions(-)

diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py
index 6a9497219..16125c0ee 100644
--- a/src/cr/cube/cube_slice.py
+++ b/src/cr/cube/cube_slice.py
@@ -563,6 +563,11 @@ def pairwise_indices(self, alpha=0.05, only_larger=True, hs_dims=None):
             self, alpha=alpha, only_larger=only_larger, hs_dims=hs_dims
         ).pairwise_indices
 
+    def summary_pairwise_indices(self, alpha=0.05, only_larger=True, hs_dims=None):
+        return PairwiseSignificance(
+            self, alpha=alpha, only_larger=only_larger, hs_dims=hs_dims
+        ).summary_pairwise_indices
+
     def pairwise_significance_tests(self, column_idx, hs_dims=None):
         """list of _ColumnPairwiseSignificance tests.
 
diff --git a/src/cr/cube/measures/pairwise_significance.py b/src/cr/cube/measures/pairwise_significance.py
index 72a338535..2d94caabb 100644
--- a/src/cr/cube/measures/pairwise_significance.py
+++ b/src/cr/cube/measures/pairwise_significance.py
@@ -56,6 +56,11 @@ def pairwise_indices(self):
         """ndarray containing tuples of pairwise indices."""
         return np.array([sig.pairwise_indices for sig in self.values]).T
 
+    @lazyproperty
+    def summary_pairwise_indices(self):
+        """ndarray containing tuples of pairwise indices for the column summary."""
+        return np.array([sig.summary_pairwise_indices for sig in self.values]).T
+
 
 # pylint: disable=too-few-public-methods
 class _ColumnPairwiseSignificance:
@@ -79,26 +84,25 @@ def __init__(
         self._only_larger = only_larger
         self._hs_dims = hs_dims
 
+    @lazyproperty
+    def _unweighted_col_margin(self):
+        return self._slice.margin(
+            axis=0, weighted=False, include_transforms_for_dims=self._hs_dims
+        )
+
     @lazyproperty
     def t_stats(self):
         props = self._slice.proportions(
             axis=0, include_transforms_for_dims=self._hs_dims
         )
         diff = props - props[:, [self._col_idx]]
-        unweighted_margin = self._slice.margin(
-            axis=0, weighted=False, include_transforms_for_dims=self._hs_dims
-        )
-        var_props = props * (1.0 - props) / unweighted_margin
+        var_props = props * (1.0 - props) / self._unweighted_col_margin
         se_diff = np.sqrt(var_props + var_props[:, [self._col_idx]])
         return diff / se_diff
 
     @lazyproperty
     def p_vals(self):
-        unweighted_n = self._slice.margin(
-            axis=0, weighted=False, include_transforms_for_dims=self._hs_dims
-        )
-        df = unweighted_n + unweighted_n[self._col_idx] - 2
-        return 2 * (1 - t.cdf(abs(self.t_stats), df=df))
+        return 2 * (1 - t.cdf(abs(self.t_stats), df=self._df))
 
     @lazyproperty
     def pairwise_indices(self):
@@ -106,3 +110,34 @@ def pairwise_indices(self):
         if self._only_larger:
             significance = np.logical_and(self.t_stats < 0, significance)
         return [tuple(np.where(sig_row)[0]) for sig_row in significance]
+
+    @lazyproperty
+    def summary_pairwise_indices(self):
+        significance = self.summary_p_vals < self._alpha
+        if self._only_larger:
+            significance = np.logical_and(self.summary_t_stats < 0, significance)
+        spwi = tuple(np.where(significance[0])[0])
+        return spwi
+
+    @lazyproperty
+    def summary_t_stats(self):
+        total_margin = self._slice.margin(weighted=self._weighted)
+        col_margin_props = self._unweighted_col_margin / total_margin
+        diff = col_margin_props - col_margin_props[self._col_idx]
+        var_props = col_margin_props * (1.0 - col_margin_props) / total_margin
+        se_diff = np.sqrt(var_props + var_props[self._col_idx])
+        return diff / se_diff
+
+    @lazyproperty
+    def summary_p_vals(self):
+        return 2 * (1 - t.cdf(abs(self.summary_t_stats), df=self._df))
+
+    @lazyproperty
+    def _df(self):
+        return self._unweighted_n + self._unweighted_n[self._col_idx] - 2
+
+    @lazyproperty
+    def _unweighted_n(self):
+        return self._slice.margin(
+            axis=0, weighted=False, include_transforms_for_dims=self._hs_dims
+        )
diff --git a/tests/integration/test_pairwise_significance.py b/tests/integration/test_pairwise_significance.py
index 1b42913af..1ce18e019 100644
--- a/tests/integration/test_pairwise_significance.py
+++ b/tests/integration/test_pairwise_significance.py
@@ -353,6 +353,19 @@ def test_pairwise_indices_larger_and_smaller(self):
         pairwise_indices = cube.slices[0].pairwise_indices(only_larger=False)
         np.testing.assert_array_equal(pairwise_indices, expected_indices)
 
+    def test_summary_pairwise_indices(self):
+        slice_ = CrunchCube(CR.PAIRWISE_HIROTSU_OCCUPATION_X_ILLNESS).slices[0]
+
+        # Only larger
+        pairwise_indices = slice_.summary_pairwise_indices()
+        expected_indices = np.array([(), (0,), ()])
+        np.testing.assert_array_equal(pairwise_indices, expected_indices)
+
+        # Larger and smaller
+        pairwise_indices = slice_.summary_pairwise_indices(only_larger=False)
+        expected_indices = np.array([(), (0,), (0,)])
+        np.testing.assert_array_equal(pairwise_indices, expected_indices)
+
     def test_ttests_use_unweighted_n_for_variance(self):
         """The weights on this cube demonstrate much higher variance (less
         extreme t values, and higher associated p-values) than if weighted_n
diff --git a/tests/unit/test_wishart_pairwise_significance.py b/tests/unit/test_wishart_pairwise_significance.py
index 3abbe2711..3d0bcb2a6 100644
--- a/tests/unit/test_wishart_pairwise_significance.py
+++ b/tests/unit/test_wishart_pairwise_significance.py
@@ -57,8 +57,82 @@ def it_knows_its_pairwise_indices(
         cps = _ColumnPairwiseSignificance(slice_, None, only_larger=only_larger)
         assert cps.pairwise_indices == pairwise_indices
 
+    def it_can_calculate_summary_t_stats(
+        self, slice_, _unweighted_col_margin_prop_, summary_t_stats_fixture
+    ):
+        margin, col_idx, expected, _ = summary_t_stats_fixture
+        _unweighted_col_margin_prop_.return_value = margin
+        slice_.margin.return_value = np.sum(margin)
+        np.testing.assert_almost_equal(
+            _ColumnPairwiseSignificance(slice_, col_idx).summary_t_stats, expected
+        )
+
+    def it_can_calculate_summary_p_vals(
+        self,
+        slice_,
+        _unweighted_col_margin_prop_,
+        _unweighted_n_prop,
+        summary_t_stats_fixture,
+    ):
+        margin, col_idx, _, expected = summary_t_stats_fixture
+        _unweighted_col_margin_prop_.return_value = margin
+        slice_.margin.return_value = np.sum(margin)
+        _unweighted_n_prop.return_value = margin
+        np.testing.assert_almost_equal(
+            _ColumnPairwiseSignificance(slice_, col_idx).summary_p_vals, expected
+        )
+
+    def it_can_calculate_summary_pairwise_indices(
+        self,
+        slice_,
+        summary_pairwise_indices_fixture,
+        summary_p_vals_prop,
+        summary_t_stats_prop,
+    ):
+        only_larger, col_idx, t_stats, p_vals, expected = (
+            summary_pairwise_indices_fixture
+        )
+        summary_p_vals_prop.return_value = p_vals
+        summary_t_stats_prop.return_value = t_stats
+        np.testing.assert_array_equal(
+            _ColumnPairwiseSignificance(
+                slice_, col_idx, only_larger=only_larger
+            ).summary_pairwise_indices,
+            expected,
+        )
+
     # fixtures -------------------------------------------------------
 
+    @pytest.fixture(
+        params=[
+            (True, 1, [-0.6793662, 0.0, -1], [0.6201015, 1.0, 0.01], (2,)),
+            (False, 0, None, [1, 0.01, 0.01], (1, 2)),
+        ]
+    )
+    def summary_pairwise_indices_fixture(self, request):
+        only_larger, col_idx, t_stats, p_vals, expected = request.param
+        return only_larger, col_idx, np.array([t_stats]), np.array([p_vals]), expected
+
+    @pytest.fixture(
+        params=[
+            (
+                [1, 2, 3],
+                0,
+                [0.0, 0.67936622, 1.30930734],
+                [np.nan, 0.62010151, 0.32063378],
+            ),
+            ([1, 2, 3], 1, [-0.6793662, 0.0, 0.5940885], [0.6201015, 1.0, 0.5942728]),
+        ]
+    )
+    def summary_t_stats_fixture(self, request):
+        margin, col_idx, expected_t_stats, expected_p_vals = request.param
+        return (
+            np.array(margin),
+            col_idx,
+            np.array(expected_t_stats),
+            np.array(expected_p_vals),
+        )
+
     @pytest.fixture(
         params=[
             (
@@ -177,10 +251,28 @@ def t_stats_fixture(self, request):
     def slice_(self, request):
         return instance_mock(request, CubeSlice)
 
+    @pytest.fixture
+    def _unweighted_col_margin_prop_(self, request):
+        return property_mock(
+            request, _ColumnPairwiseSignificance, "_unweighted_col_margin"
+        )
+
+    @pytest.fixture
+    def _unweighted_n_prop(self, request):
+        return property_mock(request, _ColumnPairwiseSignificance, "_unweighted_n")
+
     @pytest.fixture
     def t_stats_prop_(self, request):
         return property_mock(request, _ColumnPairwiseSignificance, "t_stats")
 
+    @pytest.fixture
+    def summary_p_vals_prop(self, request):
+        return property_mock(request, _ColumnPairwiseSignificance, "summary_p_vals")
+
+    @pytest.fixture
+    def summary_t_stats_prop(self, request):
+        return property_mock(request, _ColumnPairwiseSignificance, "summary_t_stats")
+
     @pytest.fixture
     def p_vals_prop_(self, request):
         return property_mock(request, _ColumnPairwiseSignificance, "p_vals")

From ec990238eb55b945383f63ae93fc050b284e6058 Mon Sep 17 00:00:00 2001
From: Slobodan Ilic <slobodan@crunch.io>
Date: Tue, 23 Apr 2019 12:28:35 +0200
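Subject: [PATCH 2/3] Add tests for all the cases with MR

* Add integration tests for MR x CAT, CAT x MR, and MR x MR pairwise indices
* Add a 2D case to the unit-test t-stats fixture
* Make `_df` select the reference column along axis 1 (kept as a column)
  when the unweighted N is not 1D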
---
 src/cr/cube/measures/pairwise_significance.py |  7 ++-
 .../integration/test_pairwise_significance.py | 46 ++++++++++++++++++-
 .../test_wishart_pairwise_significance.py     |  8 +++-
 3 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/src/cr/cube/measures/pairwise_significance.py b/src/cr/cube/measures/pairwise_significance.py
index 2d94caabb..688fa34f2 100644
--- a/src/cr/cube/measures/pairwise_significance.py
+++ b/src/cr/cube/measures/pairwise_significance.py
@@ -134,7 +134,12 @@ def summary_p_vals(self):
 
     @lazyproperty
     def _df(self):
-        return self._unweighted_n + self._unweighted_n[self._col_idx] - 2
+        selected_unweighted_n = (
+            self._unweighted_n[self._col_idx]
+            if self._unweighted_n.ndim < 2
+            else self._unweighted_n[:, self._col_idx][:, None]
+        )
+        return self._unweighted_n + selected_unweighted_n - 2
 
     @lazyproperty
     def _unweighted_n(self):
diff --git a/tests/integration/test_pairwise_significance.py b/tests/integration/test_pairwise_significance.py
index 1ce18e019..3e7e5e6f4 100644
--- a/tests/integration/test_pairwise_significance.py
+++ b/tests/integration/test_pairwise_significance.py
@@ -315,7 +315,7 @@ def test_compare_to_column(self):
         np.testing.assert_almost_equal(actual.t_stats, expected_tstats)
         np.testing.assert_almost_equal(actual.p_vals, expected_pvals)
 
-    def test_pairwise_indices_only_larger(self):
+    def test_cat_x_cat_pairwise_indices_only_larger(self):
         cube = CrunchCube(CR.PAIRWISE_HIROTSU_OCCUPATION_X_ILLNESS)
         expected_indices = np.array(
             [
@@ -334,6 +334,48 @@ def test_pairwise_indices_only_larger(self):
         pairwise_indices = cube.slices[0].pairwise_indices()
         np.testing.assert_array_equal(pairwise_indices, expected_indices)
 
+    def test_mr_x_cat_pairwise_indices_only_larger(self):
+        cube = CrunchCube(CR.MR_X_CAT_HS)
+        expected_indices = np.array(
+            [
+                [(1, 3, 4), (), (), (), (), ()],
+                [(), (), (), (), (), ()],
+                [(), (), (), (0,), (0,), ()],
+                [(), (), (), (), (1,), ()],
+                [(), (), (), (), (), ()],
+            ]
+        )
+        pairwise_indices = cube.slices[0].pairwise_indices()
+        np.testing.assert_array_equal(pairwise_indices, expected_indices)
+
+    def test_cat_x_mr_pairwise_indices_only_larger(self):
+        cube = CrunchCube(CR.CAT_X_MR_HS)
+        expected_indices = np.array(
+            [
+                [(1, 2, 3, 4), (2, 3), (), (), (2,)],
+                [(), (), (), (), (3,)],
+                [(), (), (), (), ()],
+                [(), (0,), (0,), (0,), (0,)],
+                [(), (), (), (0, 1, 4), ()],
+                [(), (), (), (), ()],
+            ]
+        )
+        pairwise_indices = cube.slices[0].pairwise_indices()
+        np.testing.assert_array_equal(pairwise_indices, expected_indices)
+
+    def test_mr_x_mr_pairwise_indices_only_larger(self):
+        cube = CrunchCube(CR.MR_X_MR)
+        expected_indices = np.array(
+            [
+                [(1, 2, 3), (), (), ()],
+                [(), (0, 2, 3), (), (2,)],
+                [(), (), (0, 1, 3), (1,)],
+                [(), (), (), ()],
+            ]
+        )
+        pairwise_indices = cube.slices[0].pairwise_indices()
+        np.testing.assert_array_equal(pairwise_indices, expected_indices)
+
     def test_pairwise_indices_larger_and_smaller(self):
         cube = CrunchCube(CR.PAIRWISE_HIROTSU_OCCUPATION_X_ILLNESS)
         expected_indices = np.array(
@@ -353,7 +395,7 @@ def test_pairwise_indices_larger_and_smaller(self):
         pairwise_indices = cube.slices[0].pairwise_indices(only_larger=False)
         np.testing.assert_array_equal(pairwise_indices, expected_indices)
 
-    def test_summary_pairwise_indices(self):
+    def test_cat_x_cat_summary_pairwise_indices(self):
         slice_ = CrunchCube(CR.PAIRWISE_HIROTSU_OCCUPATION_X_ILLNESS).slices[0]
 
         # Only larger
diff --git a/tests/unit/test_wishart_pairwise_significance.py b/tests/unit/test_wishart_pairwise_significance.py
index 3d0bcb2a6..445bf09d7 100644
--- a/tests/unit/test_wishart_pairwise_significance.py
+++ b/tests/unit/test_wishart_pairwise_significance.py
@@ -238,7 +238,13 @@ def p_vals_fixture(self, request):
                     [0.0, -0.8586079707543924, -1.1774569464270872],
                     [0.0, 4.663801762560106, 3.743253010905157],
                 ],
-            )
+            ),
+            (
+                0,
+                [[0.25, 0.75], [0.75, 0.25]],
+                [[1, 2], [3, 4]],
+                [[0.0, 0.94280904], [0.0, -1.51185789]],
+            ),
         ]
     )
     def t_stats_fixture(self, request):

From 30d851d015e4c966b2514dfabc43acb45adbc039 Mon Sep 17 00:00:00 2001
From: Slobodan Ilic <slobodan@crunch.io>
Date: Tue, 23 Apr 2019 13:01:29 +0200
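Subject: [PATCH 3/3] Fix error in indexing and ndarray dtype

* Build the summary pairwise indices as a 1D object ndarray instead of a
  transposed `np.array(...)`
* Index the summary significance array directly instead of its first row
* Update the expected values and fixtures in the tests accordingly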
---
 src/cr/cube/measures/pairwise_significance.py    | 11 ++++++++---
 tests/integration/test_pairwise_significance.py  |  4 ++--
 tests/unit/test_wishart_pairwise_significance.py |  2 +-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/cr/cube/measures/pairwise_significance.py b/src/cr/cube/measures/pairwise_significance.py
index 688fa34f2..3fc0da478 100644
--- a/src/cr/cube/measures/pairwise_significance.py
+++ b/src/cr/cube/measures/pairwise_significance.py
@@ -59,7 +59,13 @@ def pairwise_indices(self):
     @lazyproperty
     def summary_pairwise_indices(self):
         """ndarray containing tuples of pairwise indices for the column summary."""
-        return np.array([sig.summary_pairwise_indices for sig in self.values]).T
+        summary_pairwise_indices = np.empty(
+            self.values[0].t_stats.shape[1], dtype=object
+        )
+        summary_pairwise_indices[:] = [
+            sig.summary_pairwise_indices for sig in self.values
+        ]
+        return summary_pairwise_indices
 
 
 # pylint: disable=too-few-public-methods
@@ -116,8 +122,7 @@ def summary_pairwise_indices(self):
         significance = self.summary_p_vals < self._alpha
         if self._only_larger:
             significance = np.logical_and(self.summary_t_stats < 0, significance)
-        spwi = tuple(np.where(significance[0])[0])
-        return spwi
+        return tuple(np.where(significance)[0])
 
     @lazyproperty
     def summary_t_stats(self):
diff --git a/tests/integration/test_pairwise_significance.py b/tests/integration/test_pairwise_significance.py
index 3e7e5e6f4..5d436ffac 100644
--- a/tests/integration/test_pairwise_significance.py
+++ b/tests/integration/test_pairwise_significance.py
@@ -400,12 +400,12 @@ def test_cat_x_cat_summary_pairwise_indices(self):
 
         # Only larger
         pairwise_indices = slice_.summary_pairwise_indices()
-        expected_indices = np.array([(), (0,), ()])
+        expected_indices = np.array([(2,), (0, 2), ()])
         np.testing.assert_array_equal(pairwise_indices, expected_indices)
 
         # Larger and smaller
         pairwise_indices = slice_.summary_pairwise_indices(only_larger=False)
-        expected_indices = np.array([(), (0,), (0,)])
+        expected_indices = np.array([(1, 2), (0, 2), (0, 1)], dtype="i,i")
         np.testing.assert_array_equal(pairwise_indices, expected_indices)
 
     def test_ttests_use_unweighted_n_for_variance(self):
diff --git a/tests/unit/test_wishart_pairwise_significance.py b/tests/unit/test_wishart_pairwise_significance.py
index 445bf09d7..64f88564f 100644
--- a/tests/unit/test_wishart_pairwise_significance.py
+++ b/tests/unit/test_wishart_pairwise_significance.py
@@ -111,7 +111,7 @@ def it_can_calculate_summary_pairwise_indices(
     )
     def summary_pairwise_indices_fixture(self, request):
         only_larger, col_idx, t_stats, p_vals, expected = request.param
-        return only_larger, col_idx, np.array([t_stats]), np.array([p_vals]), expected
+        return only_larger, col_idx, np.array(t_stats), np.array(p_vals), expected
 
     @pytest.fixture(
         params=[