Skip to content

Commit

Permalink
[#177801573]: change overlaps percentages
Browse files Browse the repository at this point in the history
* Change how percentages are calculated for overlaps measures
* Use weighted cube counts instead of the (unweighted) selected counts taken from overlaps
* Prevent regression by changing a fixture and an expectation in a test
  • Loading branch information
slobodan-ilic committed Apr 19, 2021
1 parent d6b96aa commit 297a726
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 34 deletions.
23 changes: 13 additions & 10 deletions src/cr/cube/matrix/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,7 @@ def t_stats(self):
[
[
_PairwiseSignificaneBetweenSubvariablesHelper(
self._cube_measures.weighted_cube_counts.weighted_counts,
self._cube_measures.cube_overlaps.overlaps,
self._cube_measures.cube_overlaps.valid_overlaps,
row_idx,
Expand Down Expand Up @@ -565,6 +566,7 @@ def p_vals(self):
[
[
_PairwiseSignificaneBetweenSubvariablesHelper(
self._cube_measures.weighted_cube_counts.weighted_counts,
self._cube_measures.cube_overlaps.overlaps,
self._cube_measures.cube_overlaps.valid_overlaps,
row_idx,
Expand Down Expand Up @@ -1241,7 +1243,8 @@ def blocks(self):
class _PairwiseSignificaneBetweenSubvariablesHelper(object):
"""Helper for calculating overlaps significance between subvariables."""

def __init__(self, overlaps, valid_overlaps, row_idx, idx_a, idx_b):
def __init__(self, counts, overlaps, valid_overlaps, row_idx, idx_a, idx_b):
self._counts = counts
self._overlaps = overlaps
self._valid_overlaps = valid_overlaps
self._row_idx = row_idx
Expand Down Expand Up @@ -1292,15 +1295,15 @@ def _column_proportions(self):
the total of selected counts from that column (to which the cell belongs).
"""
# ---pa and pb are the selected counts of the
# ---cells at [row_idx, A], [row_idx, B],
pa = self._overlaps[self._row_idx, self._idx_a, self._idx_a]
pb = self._overlaps[self._row_idx, self._idx_b, self._idx_b]
# ---Sa and Sb are the totals of the selected
# ---counts from columns A and B.
Sa, Sb, _ = self._selected_counts
# ---pa/Sa and pb/Sb represent the column proportions of selected counts
return (pa / Sa, pb / Sb)
# ---pa and pb are the column percentages of weighted counts
# ---of the cells at positions [row_idx, A] and [row_idx, B]
pa = self._counts[self._row_idx, self._idx_a] / np.sum(
self._counts[:, self._idx_a], axis=0
)
pb = self._counts[self._row_idx, self._idx_b] / np.sum(
self._counts[:, self._idx_b], axis=0
)
return (pa, pb)

@lazyproperty
def _selected_counts(self):
Expand Down
60 changes: 36 additions & 24 deletions tests/integration/test_pairwise_significance.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,25 +512,29 @@ def test_pairwise_significance_mr_x_mr(self):
def test_pairwise_cat_x_mr_gender_x_all_pets_owned(self):
slice_ = Cube(OL.CAT_X_MR_GENDER_X_ALL_PETS_OWNED).partitions[0]

assert slice_.column_percentages.tolist() == [
[75.0, 20.0, 60.0],
[25.0, 80.0, 40.0],
]
assert slice_.column_percentages.tolist() == pytest.approx(
np.array(
[
[66.6667, 14.28571, 50.0],
[33.33333, 85.714286, 50.0],
]
)
)

# Assert for first column (subvariable)
assert slice_.pairwise_significance_t_stats(0).tolist() == pytest.approx(
np.array(
[
[0.0, -2.76314, -1.587178],
[0.0, 2.76314, 1.587178],
[0.0, -2.6315597, -1.76353],
[0.0, 2.6315597, 1.76353],
]
),
)
assert slice_.pairwise_significance_p_vals(0) == pytest.approx(
np.array(
[
[0.0, 0.0103743, 0.1229579],
[0.0, 0.0103743, 0.1229579],
[0.0, 0.01410448, 0.0879948],
[0.0, 0.01410448, 0.0879948],
]
),
)
Expand All @@ -539,16 +543,16 @@ def test_pairwise_cat_x_mr_gender_x_all_pets_owned(self):
assert slice_.pairwise_significance_t_stats(1).tolist() == pytest.approx(
np.array(
[
[2.76314, 0.0, 9.07697],
[-2.76314, 0.0, -9.07697],
[2.63156, 0.0, 8.10444],
[-2.63156, 0.0, -8.10444],
]
),
)
assert slice_.pairwise_significance_p_vals(1) == pytest.approx(
np.array(
[
[0.0103743, 0.0, 0.003145e-06],
[0.0103743, 0.0, 0.003145e-06],
[0.01410448, 0, 0.025067e-06],
[0.01410448, 0, 0.025067e-06],
]
),
)
Expand All @@ -557,16 +561,16 @@ def test_pairwise_cat_x_mr_gender_x_all_pets_owned(self):
assert slice_.pairwise_significance_t_stats(2).tolist() == pytest.approx(
np.array(
[
[1.587178, -9.07697, 0.0],
[-1.587178, 9.07697, 0.0],
[1.763531, -8.104439, 0.0],
[-1.763531, 8.104439, 0.0],
]
),
)
assert slice_.pairwise_significance_p_vals(2) == pytest.approx(
np.array(
[
[0.1229579, 0.003146e-06, 0.0],
[0.1229579, 0.003146e-06, 0.0],
[0.0879948, 0.025067e-06, 0],
[0.0879948, 0.025067e-06, 0],
]
),
)
Expand All @@ -577,10 +581,14 @@ def test_pairwise_significance_indices(self):
OL.CAT_X_MR_GENDER_X_ALL_PETS_OWNED, transforms=transforms
).partitions[0]

assert slice_.column_percentages.tolist() == [
[75.0, 20.0, 60.0],
[25.0, 80.0, 40.0],
]
assert slice_.column_percentages.tolist() == pytest.approx(
np.array(
[
[66.6667, 14.28571, 50.0],
[33.33333, 85.714286, 50.0],
]
)
)

assert slice_.pairwise_indices.tolist() == [
[(1,), (), (1,)],
Expand All @@ -603,10 +611,14 @@ def test_pairwise_significance_all_empty(self):
OL.CAT_X_MR_GENDER_X_ALL_PETS_OWNED, transforms=transforms
).partitions[0]

assert slice_.column_percentages.tolist() == [
[75.0, 20.0, 60.0],
[25.0, 80.0, 40.0],
]
assert slice_.column_percentages.tolist() == pytest.approx(
np.array(
[
[66.6667, 14.28571, 50.0],
[33.33333, 85.714286, 50.0],
]
)
)

assert slice_.pairwise_indices.tolist() == [
[(), (), ()],
Expand Down

0 comments on commit 297a726

Please sign in to comment.