From 9a03e6f2ffd3d6a3e4f8db45a654e720d2e51fb7 Mon Sep 17 00:00:00 2001 From: Ernesto Arbitrio Date: Tue, 20 Apr 2021 11:54:15 +0200 Subject: [PATCH] [#177824342]: refactor sig letters for means --- src/cr/cube/matrix/assembler.py | 33 +-- src/cr/cube/matrix/measure.py | 227 ++++++++++++------ src/cr/cube/matrix/subtotals.py | 12 +- .../integration/test_pairwise_significance.py | 12 +- 4 files changed, 180 insertions(+), 104 deletions(-) diff --git a/src/cr/cube/matrix/assembler.py b/src/cr/cube/matrix/assembler.py index 219fe34cc..066b8711a 100644 --- a/src/cr/cube/matrix/assembler.py +++ b/src/cr/cube/matrix/assembler.py @@ -191,30 +191,11 @@ def pairwise_means_indices(self, alpha, only_larger): Raises `ValueError if the cube-result does not include `means` cube-measures. """ - - def pairwise_indices(p_vals, t_stats): - """1D ndarray of tuples of int pairwise indices of each column.""" - significance = p_vals < alpha - if only_larger: - significance = np.logical_and(t_stats < 0, significance) - col_signif = np.empty((len(significance),), dtype=object) - col_signif[:] = [tuple(np.where(sig_row)[0]) for sig_row in significance] - return col_signif - - t_stats = [ - self.pairwise_significance_means_t_stats(col) - for col in range(len(self._column_order)) - ] - p_vals = [ - self.pairwise_significance_means_p_vals(col) - for col in range(len(self._column_order)) - ] - indices = np.array([pairwise_indices(p, t) for p, t in zip(p_vals, t_stats)]).T - # --- a None value indicates "cannot calculate", which is distinct from - # --- () that means "not significance" - for idx in self.inserted_column_idxs: - indices[:, idx] = None - return indices + return self._assemble_matrix( + self._measures.pairwise_means_indices( + self._column_order, alpha, only_larger + ).blocks + ) def pairwise_significance_p_vals(self, subvar_idx): """2D optional np.float64 ndarray of overlaps-p_vals matrices for subvar idx. @@ -243,7 +224,7 @@ def pairwise_significance_means_p_vals(self, column_idx): """ return self._assemble_matrix( self._measures.pairwise_significance_means_p_vals( - column_idx, self.inserted_column_idxs + column_idx, self._column_order ).blocks ) @@ -254,7 +235,7 @@ def pairwise_significance_means_t_stats(self, column_idx): """ return self._assemble_matrix( self._measures.pairwise_significance_means_t_stats( - column_idx, self.inserted_column_idxs + column_idx, self._column_order ).blocks ) diff --git a/src/cr/cube/matrix/measure.py b/src/cr/cube/matrix/measure.py index 6d8c29e5c..2c4956ff4 100644 --- a/src/cr/cube/matrix/measure.py +++ b/src/cr/cube/matrix/measure.py @@ -8,7 +8,7 @@ from scipy.stats import t from cr.cube.matrix.cubemeasure import CubeMeasures -from cr.cube.matrix.subtotals import SumSubtotals, NanSubtotals +from cr.cube.matrix.subtotals import SumSubtotals, NanSubtotals, NoneSubtotals from cr.cube.util import lazyproperty @@ -62,6 +62,17 @@ def pairwise_indices(self, alpha, only_larger): self._dimensions, self, self._cube_measures, alpha, only_larger ) + def pairwise_means_indices(self, column_order, alpha, only_larger): + """_PairwiseIndices measure object for this cube-result""" + return _PairwiseMeansIndices( + self._dimensions, + self, + self._cube_measures, + column_order, + alpha, + only_larger, + ) + def pairwise_p_vals_for_subvar(self, subvar_idx): """_PairwiseSigPVals measure object for this cube-result and selected subvar""" return _PairwiseSigPVals( @@ -74,7 +85,7 @@ def pairwise_t_stats_for_subvar(self, subvar_idx): self._dimensions, self, self._cube_measures, subvar_idx ) - def pairwise_significance_means_p_vals(self, column_idx, inserted_col_idxs): + def pairwise_significance_means_p_vals(self, column_idx, column_order): """_PairwiseMeansSigPVals measure object for this cube-result. The `column_idx` is the reference column on which calculate the pairwise sig @@ -86,17 +97,17 @@ def pairwise_significance_means_p_vals(self, column_idx, inserted_col_idxs): self, self._cube_measures, column_idx, - inserted_col_idxs, + column_order, ) - def pairwise_significance_means_t_stats(self, column_idx, inserted_col_idxs): + def pairwise_significance_means_t_stats(self, column_idx, column_order): """_PairwiseMeansSigTStats measure object for this cube-result.""" return _PairwiseMeansSigTStats( self._dimensions, self, self._cube_measures, column_idx, - inserted_col_idxs, + column_order, ) @lazyproperty @@ -486,6 +497,138 @@ def blocks(self): ) +class _PairwiseIndices(_BaseSecondOrderMeasure): + """Provides pairwise significance indices measure for matrix.""" + + def __init__( + self, dimensions, second_order_measures, cube_measures, alpha, only_larger + ): + super(_PairwiseIndices, self).__init__( + dimensions, second_order_measures, cube_measures + ) + self._alpha = alpha + self._only_larger = only_larger + + @lazyproperty + def blocks(self): + """2D array of the four 2D "blocks" making up this measure.""" + pairwise_indices = np.array( + [self._pairwise_indices(v.p_vals, v.t_stats) for v in self._values] + ).T + return NanSubtotals.blocks(pairwise_indices, self._dimensions) + + def _pairwise_indices(self, p_vals, t_stats): + """1D ndarray containing tuples of int pairwise indices of each column.""" + significance = p_vals < self._alpha + if self._only_larger: + significance = np.logical_and(t_stats < 0, significance) + col_significance = np.empty((len(significance),), dtype=object) + col_significance[:] = [tuple(np.where(sig_row)[0]) for sig_row in significance] + return col_significance + + @lazyproperty + def _values(self): + """list of _PairwiseSigPVals tests objects. + + Result has as many elements as there are columns in the slice. Each + significance test contains `p_vals` and `t_stats` significance tests. + """ + return [ + _PairwiseSigPVals( + self._dimensions, + self._second_order_measures, + self._cube_measures, + col_idx, + ) + for col_idx in range(self._cube_measures.cube_overlaps.overlaps.shape[1]) + ] + + +class _PairwiseMeansIndices(_BaseSecondOrderMeasure): + def __init__( + self, + dimensions, + second_order_measures, + cube_measures, + column_order, + alpha, + only_larger, + ): + super(_PairwiseMeansIndices, self).__init__( + dimensions, second_order_measures, cube_measures + ) + self._column_order = column_order + self._alpha = alpha + self._only_larger = only_larger + + @lazyproperty + def blocks(self): + """2D array of the four 2D "blocks" making up this measure.""" + pairwise_indices = np.array( + [self._pairwise_indices(v.p_vals, v.t_stats) for v in self._values] + ).T + return NoneSubtotals.blocks(pairwise_indices, self._dimensions) + + def _pairwise_indices(self, p_vals, t_stats): + """1D ndarray containing tuples of int pairwise indices of each column.""" + + def _map_idxs(pairwise_idxs): + """Return tuple of `pairwise-idxs` adjusted to `col_order`. + + If the indices without insertions are: + [ + [(), (4,), (), (), ()], + [(), (), (), (), ()], + [(), (0, 2, 4), (), (0, 2, 4), ()], + [(), (), (), (), ()], + ] + with 4 insertions in the positions (0,3,5,8), they will be: + [ + [(), (7,), (), (), ()], + [(), (), (), (), ()], + [(), (1, 4, 7), (), (1, 4, 7), ()], + [(), (), (), (), ()], + ] + """ + column_order = self._column_order + mapped_idxs = [None] * len(column_order) + + for sort_idx, item_idx in enumerate(column_order): + mapped_idxs[item_idx] = sort_idx + + return tuple( + mapped_idxs[pairwise_idx] + for pairwise_idx in tuple(pairwise_idxs) + if mapped_idxs[pairwise_idx] is not None + ) + + significance = p_vals < self._alpha + if self._only_larger: + significance = np.logical_and(t_stats < 0, significance) + col_significance = np.empty((len(significance),), dtype=object) + col_significance[:] = [_map_idxs(np.where(row)[0]) for row in significance] + return col_significance + + @lazyproperty + def _values(self): + """list of _PairwiseSigPVals tests objects. + + Result has as many elements as there are columns in the slice. Each + significance test contains `p_vals` and `t_stats` significance tests. + """ + return [ + _PairwiseMeansSigPVals( + self._dimensions, + self._second_order_measures, + self._cube_measures, + col_idx, + self._column_order, + ) + for col_idx, col_order in enumerate(self._column_order) + if col_order >= 0 + ] + + class _PairwiseSigTStats(_BaseSecondOrderMeasure): """Provides pairwise significance t-stats measure for matrix and selected subvar. @@ -594,13 +737,13 @@ def __init__( second_order_measures, cube_measures, selected_column_idx, - inserted_col_idxs, + column_order, ): super(_PairwiseMeansSigTStats, self).__init__( dimensions, second_order_measures, cube_measures ) self._selected_column_idx = selected_column_idx - self._inserted_col_idxs = inserted_col_idxs + self._column_order = column_order @lazyproperty def blocks(self): @@ -612,13 +755,6 @@ def t_stats(self): """2D float64 ndarray of means t-stats significance for the selected column.""" return self._t_stats[self._selected_column_idx] - @lazyproperty - def _n_cols(self): - """integer number of columns including subtotals.""" - return self._cube_measures.cube_means.means.shape[-1] + len( - self._inserted_col_idxs - ) - @lazyproperty def _t_stats(self): """List of 2D float64 ndarrays representing t-stats for means pairwise testing. @@ -633,14 +769,14 @@ def _t_stats(self): col_bases = self._cube_measures.unweighted_cube_counts.column_bases t_stats = [] offset = 0 - for col in range(self._n_cols): - if col in self._inserted_col_idxs: + for col_idx, col_order in enumerate(self._column_order): + if col_order < 0: offset += 1 t_stats.append(np.full(means.shape, np.nan)) else: - combined_variance = variance[:, col - offset] + variance.T - diff = means.T - means[:, col - offset] - n = col_bases[:, col - offset] + col_bases.T + combined_variance = variance[:, col_idx - offset] + variance.T + diff = means.T - means[:, col_idx - offset] + n = col_bases[:, col_idx - offset] + col_bases.T t_stats.append(diff.T * np.sqrt(n.T / combined_variance.T)) return t_stats @@ -668,64 +804,17 @@ def _p_vals(self): col_bases = self._cube_measures.unweighted_cube_counts.column_bases p_vals = [] offset = 0 - for col in range(self._n_cols): - if col in self._inserted_col_idxs: + for col_idx, col_order in enumerate(self._column_order): + if col_order < 0: offset += 1 p_vals.append(np.full(t_stats.shape, np.nan)) else: - n = col_bases[:, col - offset] + col_bases.T + n = col_bases[:, col_idx - offset] + col_bases.T df = 2 * (n - 1) p_vals.append(2 * (1 - t.cdf(abs(t_stats), df=df.T))) return p_vals -class _PairwiseIndices(_BaseSecondOrderMeasure): - """Provides pairwise significance indices measure for matrix.""" - - def __init__( - self, dimensions, second_order_measures, cube_measures, alpha, only_larger - ): - super(_PairwiseIndices, self).__init__( - dimensions, second_order_measures, cube_measures - ) - self._alpha = alpha - self._only_larger = only_larger - - @lazyproperty - def blocks(self): - """2D array of the four 2D "blocks" making up this measure.""" - pairwise_indices = np.array( - [self._pairwise_indices(v.p_vals, v.t_stats) for v in self._values] - ).T - return NanSubtotals.blocks(pairwise_indices, self._dimensions) - - def _pairwise_indices(self, p_vals, t_stats): - """1D ndarray containing tuples of int pairwise indices of each column.""" - significance = p_vals < self._alpha - if self._only_larger: - significance = np.logical_and(t_stats < 0, significance) - col_significance = np.empty((len(significance),), dtype=object) - col_significance[:] = [tuple(np.where(sig_row)[0]) for sig_row in significance] - return col_significance - - @lazyproperty - def _values(self): - """list of _PairwiseSigPVals tests objects. - - Result has as many elements as there are columns in the slice. Each - significance test contains `p_vals` and `t_stats` significance tests. - """ - return [ - _PairwiseSigPVals( - self._dimensions, - self._second_order_measures, - self._cube_measures, - col_idx, - ) - for col_idx in range(self._cube_measures.cube_overlaps.overlaps.shape[1]) - ] - - class _RowProportions(_BaseSecondOrderMeasure): """Provides the row-proportions measure for a matrix. diff --git a/src/cr/cube/matrix/subtotals.py b/src/cr/cube/matrix/subtotals.py index 85cab51ef..f4b489bb4 100644 --- a/src/cr/cube/matrix/subtotals.py +++ b/src/cr/cube/matrix/subtotals.py @@ -136,17 +136,23 @@ class NanSubtotals(_BaseSubtotals): Each subtotal value (and intersection value) is `np.nan`. """ + filler = np.nan + def _intersection(self, row_subtotal, column_subtotal): """Unconditionally return np.nan for each intersection cell.""" - return np.nan + return self.filler def _subtotal_column(self, subtotal): """Return (n_rows,) ndarray of np.nan values.""" - return np.full(self._nrows, np.nan) + return np.full(self._nrows, self.filler) def _subtotal_row(self, subtotal): """Return (n_cols,) ndarray of np.nan values.""" - return np.full(self._ncols, np.nan) + return np.full(self._ncols, self.filler) + + +class NoneSubtotals(NanSubtotals): + filler = None class SumSubtotals(_BaseSubtotals): diff --git a/tests/integration/test_pairwise_significance.py b/tests/integration/test_pairwise_significance.py index 204858487..fa0527ef3 100644 --- a/tests/integration/test_pairwise_significance.py +++ b/tests/integration/test_pairwise_significance.py @@ -916,12 +916,12 @@ def test_mean_diff_significance_indices_num_array_grouped_by_cat_hs_weighted(sel ).partitions[0] assert slice_.pairwise_means_indices.tolist() == [ - [None, None, None, None, (), (), (), ()], - [None, None, None, None, (), (), (), ()], - [None, None, None, None, (), (), (), (5,)], + [None, None, None, None, (), (), (), None], + [None, None, None, None, (), (), (), None], + [None, None, None, None, (), (), (5,), None], ] assert slice_.pairwise_means_indices_alt.tolist() == [ - [None, None, None, None, (), (), (), ()], - [None, None, None, None, (), (), (), ()], - [None, None, None, None, (), (), (), (4, 5)], + [None, None, None, None, (), (), (), None], + [None, None, None, None, (), (), (), None], + [None, None, None, None, (), (), (4, 5), None], ]