[#177801573]: change overlaps percentages

* Change how percentages are calculated for overlaps measures
* Use weighted counts instead of (unweighted) overlaps selected counts
* Prevent regression by changing a fixture and an expectation in a test
Crunch-io · Apr 19, 2021 · 297a726 · 297a726
1 parent d6b96aa
commit 297a726
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 34 deletions.
diff --git a/src/cr/cube/matrix/measure.py b/src/cr/cube/matrix/measure.py
@@ -527,6 +527,7 @@ def t_stats(self):
             [
                 [
                     _PairwiseSignificaneBetweenSubvariablesHelper(
+                        self._cube_measures.weighted_cube_counts.weighted_counts,
                         self._cube_measures.cube_overlaps.overlaps,
                         self._cube_measures.cube_overlaps.valid_overlaps,
                         row_idx,
@@ -565,6 +566,7 @@ def p_vals(self):
             [
                 [
                     _PairwiseSignificaneBetweenSubvariablesHelper(
+                        self._cube_measures.weighted_cube_counts.weighted_counts,
                         self._cube_measures.cube_overlaps.overlaps,
                         self._cube_measures.cube_overlaps.valid_overlaps,
                         row_idx,
@@ -1241,7 +1243,8 @@ def blocks(self):
 class _PairwiseSignificaneBetweenSubvariablesHelper(object):
     """Helper for calculating overlaps significance between subvariables."""
 
-    def __init__(self, overlaps, valid_overlaps, row_idx, idx_a, idx_b):
+    def __init__(self, counts, overlaps, valid_overlaps, row_idx, idx_a, idx_b):
+        self._counts = counts
         self._overlaps = overlaps
         self._valid_overlaps = valid_overlaps
         self._row_idx = row_idx
@@ -1292,15 +1295,15 @@ def _column_proportions(self):
         the total of selected counts from that column (to which the cell belongs).
 
         """
-        # ---pa and pb are the selected counts of the
-        # ---cells at [row_idx, A], [row_idx, B],
-        pa = self._overlaps[self._row_idx, self._idx_a, self._idx_a]
-        pb = self._overlaps[self._row_idx, self._idx_b, self._idx_b]
-        # ---Sa and Sb are the totals of the selected
-        # ---counts from columns A and B.
-        Sa, Sb, _ = self._selected_counts
-        # ---pa/Sa and pb/Sb represent the column proportions of selected counts
-        return (pa / Sa, pb / Sb)
+        # ---pa and pb are the column percentages of weighted counts
+        # ---of the cells at positions [row_idx, A] and [row_idx, B]
+        pa = self._counts[self._row_idx, self._idx_a] / np.sum(
+            self._counts[:, self._idx_a], axis=0
+        )
+        pb = self._counts[self._row_idx, self._idx_b] / np.sum(
+            self._counts[:, self._idx_b], axis=0
+        )
+        return (pa, pb)
 
     @lazyproperty
     def _selected_counts(self):

diff --git a/tests/integration/test_pairwise_significance.py b/tests/integration/test_pairwise_significance.py
@@ -512,25 +512,29 @@ def test_pairwise_significance_mr_x_mr(self):
     def test_pairwise_cat_x_mr_gender_x_all_pets_owned(self):
         slice_ = Cube(OL.CAT_X_MR_GENDER_X_ALL_PETS_OWNED).partitions[0]
 
-        assert slice_.column_percentages.tolist() == [
-            [75.0, 20.0, 60.0],
-            [25.0, 80.0, 40.0],
-        ]
+        assert slice_.column_percentages.tolist() == pytest.approx(
+            np.array(
+                [
+                    [66.6667, 14.28571, 50.0],
+                    [33.33333, 85.714286, 50.0],
+                ]
+            )
+        )
 
         # Assert for first column (subvariable)
         assert slice_.pairwise_significance_t_stats(0).tolist() == pytest.approx(
             np.array(
                 [
-                    [0.0, -2.76314, -1.587178],
-                    [0.0, 2.76314, 1.587178],
+                    [0.0, -2.6315597, -1.76353],
+                    [0.0, 2.6315597, 1.76353],
                 ]
             ),
         )
         assert slice_.pairwise_significance_p_vals(0) == pytest.approx(
             np.array(
                 [
-                    [0.0, 0.0103743, 0.1229579],
-                    [0.0, 0.0103743, 0.1229579],
+                    [0.0, 0.01410448, 0.0879948],
+                    [0.0, 0.01410448, 0.0879948],
                 ]
             ),
         )
@@ -539,16 +543,16 @@ def test_pairwise_cat_x_mr_gender_x_all_pets_owned(self):
         assert slice_.pairwise_significance_t_stats(1).tolist() == pytest.approx(
             np.array(
                 [
-                    [2.76314, 0.0, 9.07697],
-                    [-2.76314, 0.0, -9.07697],
+                    [2.63156, 0.0, 8.10444],
+                    [-2.63156, 0.0, -8.10444],
                 ]
             ),
         )
         assert slice_.pairwise_significance_p_vals(1) == pytest.approx(
             np.array(
                 [
-                    [0.0103743, 0.0, 0.003145e-06],
-                    [0.0103743, 0.0, 0.003145e-06],
+                    [0.01410448, 0, 0.025067e-06],
+                    [0.01410448, 0, 0.025067e-06],
                 ]
             ),
         )
@@ -557,16 +561,16 @@ def test_pairwise_cat_x_mr_gender_x_all_pets_owned(self):
         assert slice_.pairwise_significance_t_stats(2).tolist() == pytest.approx(
             np.array(
                 [
-                    [1.587178, -9.07697, 0.0],
-                    [-1.587178, 9.07697, 0.0],
+                    [1.763531, -8.104439, 0.0],
+                    [-1.763531, 8.104439, 0.0],
                 ]
             ),
         )
         assert slice_.pairwise_significance_p_vals(2) == pytest.approx(
             np.array(
                 [
-                    [0.1229579, 0.003146e-06, 0.0],
-                    [0.1229579, 0.003146e-06, 0.0],
+                    [0.0879948, 0.025067e-06, 0],
+                    [0.0879948, 0.025067e-06, 0],
                 ]
             ),
         )
@@ -577,10 +581,14 @@ def test_pairwise_significance_indices(self):
             OL.CAT_X_MR_GENDER_X_ALL_PETS_OWNED, transforms=transforms
         ).partitions[0]
 
-        assert slice_.column_percentages.tolist() == [
-            [75.0, 20.0, 60.0],
-            [25.0, 80.0, 40.0],
-        ]
+        assert slice_.column_percentages.tolist() == pytest.approx(
+            np.array(
+                [
+                    [66.6667, 14.28571, 50.0],
+                    [33.33333, 85.714286, 50.0],
+                ]
+            )
+        )
 
         assert slice_.pairwise_indices.tolist() == [
             [(1,), (), (1,)],
@@ -603,10 +611,14 @@ def test_pairwise_significance_all_empty(self):
             OL.CAT_X_MR_GENDER_X_ALL_PETS_OWNED, transforms=transforms
         ).partitions[0]
 
-        assert slice_.column_percentages.tolist() == [
-            [75.0, 20.0, 60.0],
-            [25.0, 80.0, 40.0],
-        ]
+        assert slice_.column_percentages.tolist() == pytest.approx(
+            np.array(
+                [
+                    [66.6667, 14.28571, 50.0],
+                    [33.33333, 85.714286, 50.0],
+                ]
+            )
+        )
 
         assert slice_.pairwise_indices.tolist() == [
             [(), (), ()],