Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Proper hs in t tests 165348655 #151

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ The detailed description can be found

## Changes

#### 1.9.16
- Proper t-stats and p-values for cubes with H&S (headings and subtotals)

#### 1.9.15
- Implement pairwise indices for Wishart, directly in cube

Expand Down
2 changes: 1 addition & 1 deletion src/cr/cube/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

"""Initialization module for crunch-cube package."""

__version__ = "1.9.15"
__version__ = "1.9.16"
4 changes: 2 additions & 2 deletions src/cr/cube/cube_slice.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,14 +563,14 @@ def pairwise_indices(self, alpha=0.05, only_larger=True, hs_dims=None):
self, alpha=alpha, only_larger=only_larger, hs_dims=hs_dims
).pairwise_indices

def pairwise_significance_tests(self, column_idx):
def pairwise_significance_tests(self, column_idx, hs_dims=None):
"""list of _ColumnPairwiseSignificance tests.

Result has as many elements as there are columns in the slice. Each
significance test contains `p_vals` and `t_stats` (ndarrays that represent
probability values and statistical scores).
"""
return PairwiseSignificance(self).values[column_idx]
return PairwiseSignificance(self, hs_dims=hs_dims).values[column_idx]

def _apply_pruning_mask(self, res, hs_dims=None):
array = self.as_array(prune=True, include_transforms_for_dims=hs_dims)
Expand Down
35 changes: 14 additions & 21 deletions src/cr/cube/measures/pairwise_significance.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import numpy as np
from scipy.stats import t

from cr.cube.util import lazyproperty, intersperse_hs_in_std_res
from cr.cube.util import lazyproperty

try:
xrange
Expand Down Expand Up @@ -48,21 +48,13 @@ def values(self):
self._only_larger,
self._hs_dims,
)
for col_idx in range(self._slice.shape[1])
for col_idx in range(self._slice.get_shape(hs_dims=self._hs_dims)[1])
]

@lazyproperty
def pairwise_indices(self):
"""ndarray containing tuples of pairwise indices."""
pwi = np.array([sig.pairwise_indices for sig in self.values]).T

if self._hs_dims and 1 in self._hs_dims:
# If we need to account for the dimension 1 in pairwise indices, we need
# to intersperse with NaNs. The dimension 0 is already tackled
# when determining the indices.
pwi = intersperse_hs_in_std_res(self._slice, (1,), pwi)

return pwi
return np.array([sig.pairwise_indices for sig in self.values]).T


# pylint: disable=too-few-public-methods
Expand All @@ -88,24 +80,25 @@ def __init__(
self._hs_dims = hs_dims

@lazyproperty
def _t_stats(self):
props = self._slice.proportions(axis=0)
def t_stats(self):
props = self._slice.proportions(
axis=0, include_transforms_for_dims=self._hs_dims
)
diff = props - props[:, [self._col_idx]]
margin = self._slice.margin(axis=0, weighted=self._weighted)
margin = self._slice.margin(
axis=0, weighted=self._weighted, include_transforms_for_dims=self._hs_dims
)
var_props = props * (1.0 - props) / margin
se_diff = np.sqrt(var_props + var_props[:, [self._col_idx]])
return diff / se_diff

@lazyproperty
def t_stats(self):
return intersperse_hs_in_std_res(self._slice, self._hs_dims, self._t_stats)

@lazyproperty
def p_vals(self):
unweighted_n = self._slice.margin(axis=0, weighted=False)
unweighted_n = self._slice.margin(
axis=0, weighted=False, include_transforms_for_dims=self._hs_dims
)
df = unweighted_n + unweighted_n[self._col_idx] - 2
p_vals = 2 * (1 - t.cdf(abs(self._t_stats), df=df))
return intersperse_hs_in_std_res(self._slice, self._hs_dims, p_vals)
return 2 * (1 - t.cdf(abs(self.t_stats), df=df))

@lazyproperty
def pairwise_indices(self):
Expand Down
7 changes: 1 addition & 6 deletions src/cr/cube/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,7 @@ def compress_pruned(table):


def intersperse_hs_in_std_res(slice_, hs_dims, res):

if not hs_dims:
# Don't intersperse anything, just return the result
return res

# Perform the insertions of place-holding rows and cols for insertions
"""Perform the insertions of place-holding rows and cols for insertions."""
for dim, inds in enumerate(slice_.inserted_hs_indices()):
if dim not in hs_dims:
continue
Expand Down
118 changes: 108 additions & 10 deletions tests/integration/test_pairwise_significance.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,115 @@ def test_hirotsu_pairwise_indices(self):
expected = np.array([(), (), (3, 4, 7, 9), (2,), (2,), (), (), (2,), (), (2,)])
np.testing.assert_array_equal(actual_pairwise_indices, expected)

def test_pairwise_t_stats_with_hs(self):
    """t-stats for column 0 match the reference matrix with `hs_dims=(0, 1)`.

    The reference matrix has 3 rows and 11 columns; its first column is
    all zeros.
    """
    cube = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION_WITH_HS)
    significance = cube.slices[0].pairwise_significance_tests(
        column_idx=0, hs_dims=(0, 1)
    )
    # One list per table row; kept compact so the matrix reads as a matrix.
    # fmt: off
    reference_rows = [
        [
            0.0, -0.06178448, 0.30342874, -3.18865018, -3.82130608, -2.99560531,
            0.07456344, -0.68932699, -2.95469238, -0.46970468, -4.14956044,
        ],
        [
            0.0, 1.18922394, 0.38254102, 3.3306654, 3.45013209, 2.7520633,
            0.59216241, 0.86352416, 2.54171145, 1.10130414, 3.3839919,
        ],
        [
            0.0, -1.7080666, -0.931165, -0.87419923, -0.24915622, -0.2367748,
            -0.97198009, -0.38504801, -0.03910193, -1.02720423, 0.19773989,
        ],
    ]
    # fmt: on
    np.testing.assert_almost_equal(significance.t_stats, np.array(reference_rows))

def test_pairwise_p_vals_with_hs(self):
    """p-values for column 0 match the reference matrix with `hs_dims=(0, 1)`.

    The reference matrix has 3 rows and 11 columns; its first column is
    all ones.
    """
    cube = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION_WITH_HS)
    significance = cube.slices[0].pairwise_significance_tests(
        column_idx=0, hs_dims=(0, 1)
    )
    # One list per table row; kept compact so the matrix reads as a matrix.
    # fmt: off
    reference_rows = [
        [
            1.00000000e00, 9.50744855e-01, 7.61580875e-01, 1.45494511e-03,
            1.35271514e-04, 2.75262668e-03, 9.40575477e-01, 4.90755141e-01,
            3.16822332e-03, 6.38672254e-01, 3.44275537e-05,
        ],
        [
            1.00000000e00, 2.34589189e-01, 7.02082945e-01, 8.84605265e-04,
            5.67562813e-04, 5.94375533e-03, 5.53862215e-01, 3.88027539e-01,
            1.11098015e-02, 2.71039211e-01, 7.25442977e-04,
        ],
        [
            1.00000000e00, 8.78852262e-02, 3.51831360e-01, 3.82131035e-01,
            8.03255937e-01, 8.12841336e-01, 3.31271828e-01, 7.00272311e-01,
            9.68813218e-01, 3.04581978e-01, 8.43264694e-01,
        ],
    ]
    # fmt: on
    np.testing.assert_almost_equal(significance.p_vals, np.array(reference_rows))

def test_pairwise_indices_with_hs(self):
cube = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION_WITH_HS)
slice_ = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION_WITH_HS).slices[0]
expected = [
[
(3, 4, 8, 10),
(3, 4, 8, 10),
(3, 4, 8, 10),
(3, 4, 5, 8, 10),
(3, 4, 5, 8, 10),
(3, 4, 5, 8, 10),
(),
(),
np.nan,
(3, 4, 8, 10),
(10,),
(3, 4, 5, 8, 10),
(3, 4, 10),
(),
(4, 10),
Expand All @@ -118,17 +216,17 @@ def test_pairwise_indices_with_hs(self):
(),
(0, 2, 6, 7),
(0, 2, 6, 7),
np.nan,
(0, 2),
(),
(),
(0, 2),
(),
(0, 2, 6, 7),
],
[(), (), (), (), (), np.nan, (), (), (), (), (1,)],
[(), (), (), (), (), (1,), (), (), (), (), (1,)],
]
actual = cube.slices[0].pairwise_indices(hs_dims=[0, 1]).tolist()
assert actual == expected
pairwise_indices = slice_.pairwise_indices(hs_dims=(0, 1)).tolist()
assert pairwise_indices == expected

def test_hirotsu_pvals_with_hs(self):
"""The shape of the result should be 11 x 11, with H&S (at index 5)."""
Expand Down
12 changes: 4 additions & 8 deletions tests/unit/test_wishart_pairwise_significance.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
class DescribePairwiseSignificance:
def it_provides_access_to_its_values(self, request, slice_):
shape = (2, 2)
slice_.shape = shape
slice_.get_shape.return_value = shape
expected_test_values = PairwiseSignificance(slice_).values
assert len(expected_test_values) == 2
for i, column_pairwise_significance in enumerate(expected_test_values):
Expand All @@ -37,13 +37,13 @@ def it_can_calculate_t_stats(self, t_stats_fixture, slice_):
slice_.proportions.return_value = props
slice_.margin.return_value = margin
np.testing.assert_almost_equal(
_ColumnPairwiseSignificance(slice_, col_idx)._t_stats, t_stats
_ColumnPairwiseSignificance(slice_, col_idx).t_stats, t_stats
)

def it_can_calculate_p_vals(self, p_vals_fixture, slice_, _t_stats_prop_):
def it_can_calculate_p_vals(self, p_vals_fixture, slice_, t_stats_prop_):
col_idx, t_stats, margin, p_vals = p_vals_fixture
slice_.margin.return_value = margin
_t_stats_prop_.return_value = t_stats
t_stats_prop_.return_value = t_stats
np.testing.assert_almost_equal(
_ColumnPairwiseSignificance(slice_, col_idx).p_vals, p_vals
)
Expand Down Expand Up @@ -177,10 +177,6 @@ def t_stats_fixture(self, request):
def slice_(self, request):
return instance_mock(request, CubeSlice)

@pytest.fixture
def _t_stats_prop_(self, request):
return property_mock(request, _ColumnPairwiseSignificance, "_t_stats")

@pytest.fixture
def t_stats_prop_(self, request):
return property_mock(request, _ColumnPairwiseSignificance, "t_stats")
Expand Down