From 9a03e6f2ffd3d6a3e4f8db45a654e720d2e51fb7 Mon Sep 17 00:00:00 2001
From: Ernesto Arbitrio <ernesto@crunch.io>
Date: Tue, 20 Apr 2021 11:54:15 +0200
Subject: [PATCH] [#177824342]: refactor sig letters for means

---
 src/cr/cube/matrix/assembler.py               |  33 +--
 src/cr/cube/matrix/measure.py                 | 227 ++++++++++++------
 src/cr/cube/matrix/subtotals.py               |  12 +-
 .../integration/test_pairwise_significance.py |  12 +-
 4 files changed, 180 insertions(+), 104 deletions(-)

diff --git a/src/cr/cube/matrix/assembler.py b/src/cr/cube/matrix/assembler.py
index 219fe34cc..066b8711a 100644
--- a/src/cr/cube/matrix/assembler.py
+++ b/src/cr/cube/matrix/assembler.py
@@ -191,30 +191,11 @@ def pairwise_means_indices(self, alpha, only_larger):
 
         Raises `ValueError if the cube-result does not include `means` cube-measures.
         """
-
-        def pairwise_indices(p_vals, t_stats):
-            """1D ndarray of tuples of int pairwise indices of each column."""
-            significance = p_vals < alpha
-            if only_larger:
-                significance = np.logical_and(t_stats < 0, significance)
-            col_signif = np.empty((len(significance),), dtype=object)
-            col_signif[:] = [tuple(np.where(sig_row)[0]) for sig_row in significance]
-            return col_signif
-
-        t_stats = [
-            self.pairwise_significance_means_t_stats(col)
-            for col in range(len(self._column_order))
-        ]
-        p_vals = [
-            self.pairwise_significance_means_p_vals(col)
-            for col in range(len(self._column_order))
-        ]
-        indices = np.array([pairwise_indices(p, t) for p, t in zip(p_vals, t_stats)]).T
-        # --- a None value indicates "cannot calculate", which is distinct from
-        # --- () that means "not significance"
-        for idx in self.inserted_column_idxs:
-            indices[:, idx] = None
-        return indices
+        return self._assemble_matrix(
+            self._measures.pairwise_means_indices(
+                self._column_order, alpha, only_larger
+            ).blocks
+        )
 
     def pairwise_significance_p_vals(self, subvar_idx):
         """2D optional np.float64 ndarray of overlaps-p_vals matrices for subvar idx.
@@ -243,7 +224,7 @@ def pairwise_significance_means_p_vals(self, column_idx):
         """
         return self._assemble_matrix(
             self._measures.pairwise_significance_means_p_vals(
-                column_idx, self.inserted_column_idxs
+                column_idx, self._column_order
             ).blocks
         )
 
@@ -254,7 +235,7 @@ def pairwise_significance_means_t_stats(self, column_idx):
         """
         return self._assemble_matrix(
             self._measures.pairwise_significance_means_t_stats(
-                column_idx, self.inserted_column_idxs
+                column_idx, self._column_order
             ).blocks
         )
 
diff --git a/src/cr/cube/matrix/measure.py b/src/cr/cube/matrix/measure.py
index 6d8c29e5c..2c4956ff4 100644
--- a/src/cr/cube/matrix/measure.py
+++ b/src/cr/cube/matrix/measure.py
@@ -8,7 +8,7 @@
 from scipy.stats import t
 
 from cr.cube.matrix.cubemeasure import CubeMeasures
-from cr.cube.matrix.subtotals import SumSubtotals, NanSubtotals
+from cr.cube.matrix.subtotals import SumSubtotals, NanSubtotals, NoneSubtotals
 from cr.cube.util import lazyproperty
 
 
@@ -62,6 +62,17 @@ def pairwise_indices(self, alpha, only_larger):
             self._dimensions, self, self._cube_measures, alpha, only_larger
         )
 
+    def pairwise_means_indices(self, column_order, alpha, only_larger):
+        """_PairwiseMeansIndices measure object for this cube-result."""
+        return _PairwiseMeansIndices(
+            self._dimensions,
+            self,
+            self._cube_measures,
+            column_order,
+            alpha,
+            only_larger,
+        )
+
     def pairwise_p_vals_for_subvar(self, subvar_idx):
         """_PairwiseSigPVals measure object for this cube-result and selected subvar"""
         return _PairwiseSigPVals(
@@ -74,7 +85,7 @@ def pairwise_t_stats_for_subvar(self, subvar_idx):
             self._dimensions, self, self._cube_measures, subvar_idx
         )
 
-    def pairwise_significance_means_p_vals(self, column_idx, inserted_col_idxs):
+    def pairwise_significance_means_p_vals(self, column_idx, column_order):
         """_PairwiseMeansSigPVals measure object for this cube-result.
 
         The `column_idx` is the reference column on which calculate the pairwise sig
@@ -86,17 +97,17 @@ def pairwise_significance_means_p_vals(self, column_idx, inserted_col_idxs):
             self,
             self._cube_measures,
             column_idx,
-            inserted_col_idxs,
+            column_order,
         )
 
-    def pairwise_significance_means_t_stats(self, column_idx, inserted_col_idxs):
+    def pairwise_significance_means_t_stats(self, column_idx, column_order):
         """_PairwiseMeansSigTStats measure object for this cube-result."""
         return _PairwiseMeansSigTStats(
             self._dimensions,
             self,
             self._cube_measures,
             column_idx,
-            inserted_col_idxs,
+            column_order,
         )
 
     @lazyproperty
@@ -486,6 +497,138 @@ def blocks(self):
         )
 
 
+class _PairwiseIndices(_BaseSecondOrderMeasure):
+    """Provides pairwise significance indices measure for matrix."""
+
+    def __init__(
+        self, dimensions, second_order_measures, cube_measures, alpha, only_larger
+    ):
+        super(_PairwiseIndices, self).__init__(
+            dimensions, second_order_measures, cube_measures
+        )
+        self._alpha = alpha
+        self._only_larger = only_larger
+
+    @lazyproperty
+    def blocks(self):
+        """2D array of the four 2D "blocks" making up this measure."""
+        pairwise_indices = np.array(
+            [self._pairwise_indices(v.p_vals, v.t_stats) for v in self._values]
+        ).T
+        return NanSubtotals.blocks(pairwise_indices, self._dimensions)
+
+    def _pairwise_indices(self, p_vals, t_stats):
+        """1D ndarray containing tuples of int pairwise indices of each column."""
+        significance = p_vals < self._alpha
+        if self._only_larger:
+            significance = np.logical_and(t_stats < 0, significance)
+        col_significance = np.empty((len(significance),), dtype=object)
+        col_significance[:] = [tuple(np.where(sig_row)[0]) for sig_row in significance]
+        return col_significance
+
+    @lazyproperty
+    def _values(self):
+        """list of _PairwiseSigPVals tests objects.
+
+        Result has as many elements as there are columns in the slice. Each
+        significance test contains `p_vals` and `t_stats` significance tests.
+        """
+        return [
+            _PairwiseSigPVals(
+                self._dimensions,
+                self._second_order_measures,
+                self._cube_measures,
+                col_idx,
+            )
+            for col_idx in range(self._cube_measures.cube_overlaps.overlaps.shape[1])
+        ]
+
+
+class _PairwiseMeansIndices(_BaseSecondOrderMeasure):
+    def __init__(
+        self,
+        dimensions,
+        second_order_measures,
+        cube_measures,
+        column_order,
+        alpha,
+        only_larger,
+    ):
+        super(_PairwiseMeansIndices, self).__init__(
+            dimensions, second_order_measures, cube_measures
+        )
+        self._column_order = column_order
+        self._alpha = alpha
+        self._only_larger = only_larger
+
+    @lazyproperty
+    def blocks(self):
+        """2D array of the four 2D "blocks" making up this measure."""
+        pairwise_indices = np.array(
+            [self._pairwise_indices(v.p_vals, v.t_stats) for v in self._values]
+        ).T
+        return NoneSubtotals.blocks(pairwise_indices, self._dimensions)
+
+    def _pairwise_indices(self, p_vals, t_stats):
+        """1D ndarray containing tuples of int pairwise indices of each column."""
+
+        def _map_idxs(pairwise_idxs):
+            """Return tuple of `pairwise-idxs` adjusted to `col_order`.
+
+            If the indices without insertions are:
+            [
+              [(), (4,), (), (), ()],
+              [(), (), (), (), ()],
+              [(), (0, 2, 4), (), (0, 2, 4), ()],
+              [(), (), (), (), ()],
+            ]
+            with 4 insertions in the positions (0,3,5,8), they will be:
+            [
+              [(), (7,), (), (), ()],
+              [(), (), (), (), ()],
+              [(), (1, 4, 7), (), (1, 4, 7), ()],
+              [(), (), (), (), ()],
+            ]
+            """
+            column_order = self._column_order
+            mapped_idxs = [None] * len(column_order)
+
+            for sort_idx, item_idx in enumerate(column_order):
+                mapped_idxs[item_idx] = sort_idx
+
+            return tuple(
+                mapped_idxs[pairwise_idx]
+                for pairwise_idx in tuple(pairwise_idxs)
+                if mapped_idxs[pairwise_idx] is not None
+            )
+
+        significance = p_vals < self._alpha
+        if self._only_larger:
+            significance = np.logical_and(t_stats < 0, significance)
+        col_significance = np.empty((len(significance),), dtype=object)
+        col_significance[:] = [_map_idxs(np.where(row)[0]) for row in significance]
+        return col_significance
+
+    @lazyproperty
+    def _values(self):
+        """list of _PairwiseMeansSigPVals tests objects.
+
+        Result has one element per base column, skipping inserted (negative-order)
+        columns. Each significance test contains `p_vals` and `t_stats` values.
+        """
+        return [
+            _PairwiseMeansSigPVals(
+                self._dimensions,
+                self._second_order_measures,
+                self._cube_measures,
+                col_idx,
+                self._column_order,
+            )
+            for col_idx, col_order in enumerate(self._column_order)
+            if col_order >= 0
+        ]
+
+
 class _PairwiseSigTStats(_BaseSecondOrderMeasure):
     """Provides pairwise significance t-stats measure for matrix and selected subvar.
 
@@ -594,13 +737,13 @@ def __init__(
         second_order_measures,
         cube_measures,
         selected_column_idx,
-        inserted_col_idxs,
+        column_order,
     ):
         super(_PairwiseMeansSigTStats, self).__init__(
             dimensions, second_order_measures, cube_measures
         )
         self._selected_column_idx = selected_column_idx
-        self._inserted_col_idxs = inserted_col_idxs
+        self._column_order = column_order
 
     @lazyproperty
     def blocks(self):
@@ -612,13 +755,6 @@ def t_stats(self):
         """2D float64 ndarray of means t-stats significance for the selected column."""
         return self._t_stats[self._selected_column_idx]
 
-    @lazyproperty
-    def _n_cols(self):
-        """integer number of columns including subtotals."""
-        return self._cube_measures.cube_means.means.shape[-1] + len(
-            self._inserted_col_idxs
-        )
-
     @lazyproperty
     def _t_stats(self):
         """List of 2D float64 ndarrays representing t-stats for means pairwise testing.
@@ -633,14 +769,14 @@ def _t_stats(self):
         col_bases = self._cube_measures.unweighted_cube_counts.column_bases
         t_stats = []
         offset = 0
-        for col in range(self._n_cols):
-            if col in self._inserted_col_idxs:
+        for col_idx, col_order in enumerate(self._column_order):
+            if col_order < 0:
                 offset += 1
                 t_stats.append(np.full(means.shape, np.nan))
             else:
-                combined_variance = variance[:, col - offset] + variance.T
-                diff = means.T - means[:, col - offset]
-                n = col_bases[:, col - offset] + col_bases.T
+                combined_variance = variance[:, col_idx - offset] + variance.T
+                diff = means.T - means[:, col_idx - offset]
+                n = col_bases[:, col_idx - offset] + col_bases.T
                 t_stats.append(diff.T * np.sqrt(n.T / combined_variance.T))
         return t_stats
 
@@ -668,64 +804,17 @@ def _p_vals(self):
         col_bases = self._cube_measures.unweighted_cube_counts.column_bases
         p_vals = []
         offset = 0
-        for col in range(self._n_cols):
-            if col in self._inserted_col_idxs:
+        for col_idx, col_order in enumerate(self._column_order):
+            if col_order < 0:
                 offset += 1
                 p_vals.append(np.full(t_stats.shape, np.nan))
             else:
-                n = col_bases[:, col - offset] + col_bases.T
+                n = col_bases[:, col_idx - offset] + col_bases.T
                 df = 2 * (n - 1)
                 p_vals.append(2 * (1 - t.cdf(abs(t_stats), df=df.T)))
         return p_vals
 
 
-class _PairwiseIndices(_BaseSecondOrderMeasure):
-    """Provides pairwise significance indices measure for matrix."""
-
-    def __init__(
-        self, dimensions, second_order_measures, cube_measures, alpha, only_larger
-    ):
-        super(_PairwiseIndices, self).__init__(
-            dimensions, second_order_measures, cube_measures
-        )
-        self._alpha = alpha
-        self._only_larger = only_larger
-
-    @lazyproperty
-    def blocks(self):
-        """2D array of the four 2D "blocks" making up this measure."""
-        pairwise_indices = np.array(
-            [self._pairwise_indices(v.p_vals, v.t_stats) for v in self._values]
-        ).T
-        return NanSubtotals.blocks(pairwise_indices, self._dimensions)
-
-    def _pairwise_indices(self, p_vals, t_stats):
-        """1D ndarray containing tuples of int pairwise indices of each column."""
-        significance = p_vals < self._alpha
-        if self._only_larger:
-            significance = np.logical_and(t_stats < 0, significance)
-        col_significance = np.empty((len(significance),), dtype=object)
-        col_significance[:] = [tuple(np.where(sig_row)[0]) for sig_row in significance]
-        return col_significance
-
-    @lazyproperty
-    def _values(self):
-        """list of _PairwiseSigPVals tests objects.
-
-        Result has as many elements as there are columns in the slice. Each
-        significance test contains `p_vals` and `t_stats` significance tests.
-        """
-        return [
-            _PairwiseSigPVals(
-                self._dimensions,
-                self._second_order_measures,
-                self._cube_measures,
-                col_idx,
-            )
-            for col_idx in range(self._cube_measures.cube_overlaps.overlaps.shape[1])
-        ]
-
-
 class _RowProportions(_BaseSecondOrderMeasure):
     """Provides the row-proportions measure for a matrix.
 
diff --git a/src/cr/cube/matrix/subtotals.py b/src/cr/cube/matrix/subtotals.py
index 85cab51ef..f4b489bb4 100644
--- a/src/cr/cube/matrix/subtotals.py
+++ b/src/cr/cube/matrix/subtotals.py
@@ -136,17 +136,23 @@ class NanSubtotals(_BaseSubtotals):
     Each subtotal value (and intersection value) is `np.nan`.
     """
 
+    filler = np.nan
+
     def _intersection(self, row_subtotal, column_subtotal):
         """Unconditionally return np.nan for each intersection cell."""
-        return np.nan
+        return self.filler
 
     def _subtotal_column(self, subtotal):
         """Return (n_rows,) ndarray of np.nan values."""
-        return np.full(self._nrows, np.nan)
+        return np.full(self._nrows, self.filler)
 
     def _subtotal_row(self, subtotal):
         """Return (n_cols,) ndarray of np.nan values."""
-        return np.full(self._ncols, np.nan)
+        return np.full(self._ncols, self.filler)
+
+
+class NoneSubtotals(NanSubtotals):
+    filler = None
 
 
 class SumSubtotals(_BaseSubtotals):
diff --git a/tests/integration/test_pairwise_significance.py b/tests/integration/test_pairwise_significance.py
index 204858487..fa0527ef3 100644
--- a/tests/integration/test_pairwise_significance.py
+++ b/tests/integration/test_pairwise_significance.py
@@ -916,12 +916,12 @@ def test_mean_diff_significance_indices_num_array_grouped_by_cat_hs_weighted(sel
         ).partitions[0]
 
         assert slice_.pairwise_means_indices.tolist() == [
-            [None, None, None, None, (), (), (), ()],
-            [None, None, None, None, (), (), (), ()],
-            [None, None, None, None, (), (), (), (5,)],
+            [None, None, None, None, (), (), (), None],
+            [None, None, None, None, (), (), (), None],
+            [None, None, None, None, (), (), (5,), None],
         ]
         assert slice_.pairwise_means_indices_alt.tolist() == [
-            [None, None, None, None, (), (), (), ()],
-            [None, None, None, None, (), (), (), ()],
-            [None, None, None, None, (), (), (), (4, 5)],
+            [None, None, None, None, (), (), (), None],
+            [None, None, None, None, (), (), (), None],
+            [None, None, None, None, (), (), (4, 5), None],
         ]