Skip to content

Commit

Permalink
Merge pull request #146 from Crunch-io/add-hs-to-pairwise-indices
Browse files Browse the repository at this point in the history
Intersperse pairwise indices with NaNs where H&S
  • Loading branch information
slobodan-ilic authored Apr 4, 2019
2 parents 34eda08 + 59c76f2 commit 11e7c8f
Show file tree
Hide file tree
Showing 10 changed files with 149 additions and 386 deletions.
7 changes: 4 additions & 3 deletions src/cr/cube/crunch_cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,7 @@ def wishart_pairwise_pvals(self, axis=0):
"""
return [slice_.wishart_pairwise_pvals(axis=axis) for slice_ in self.slices]

def compare_to_column(self, slice=0, column=0):
def compare_to_column(self, slice_idx=0, column_idx=0):
"""Return matrices of column-comparison p-values as list of numpy.ndarrays.
*t*-statistic and associated p-values for one column of one slice
Expand All @@ -740,9 +740,10 @@ def compare_to_column(self, slice=0, column=0):
(It does not make sense to compare the same column at once across slices.)
*column* (int): Index of slice to compare to, by default 0, the first.
*slice_idx* (int): Index of slice for which we perform comparisons
*column_idx* (int): Index of column to compare to, by default 0 (the first)
"""
return self.slices[slice].pairwise_significance_tests[column]
return self.slices[slice_idx].pairwise_significance_tests(column_idx)

def _adjust_axis(self, axis):
"""Return raw axis/axes corresponding to apparent axis/axes.
Expand Down
34 changes: 17 additions & 17 deletions src/cr/cube/cube_slice.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
from cr.cube.measures.scale_means import ScaleMeans
from cr.cube.measures.wishart_pairwise_significance import WishartPairwiseSignificance
from cr.cube.measures.pairwise_significance import PairwiseSignificance
from cr.cube.util import compress_pruned, lazyproperty, memoize
from cr.cube.util import (
compress_pruned,
lazyproperty,
memoize,
intersperse_hs_in_std_res,
)

try:
xrange
Expand Down Expand Up @@ -532,14 +537,14 @@ def zscore(self, weighted=True, prune=False, hs_dims=None):
zscore = self._calculate_std_res(counts, total, colsum, rowsum)

if hs_dims:
zscore = self._intersperse_hs_in_std_res(hs_dims, zscore)
zscore = intersperse_hs_in_std_res(self, hs_dims, zscore)

if prune:
return self._apply_pruning_mask(zscore, hs_dims)

return zscore

def pairwise_indices(self, alpha=0.05, only_larger=True):
def pairwise_indices(self, alpha=0.05, only_larger=True, hs_dims=None):
"""Indices of columns where p < alpha for column-comparison t-tests
Returns an array of tuples of columns that are significant at p<alpha,
Expand All @@ -549,19 +554,22 @@ def pairwise_indices(self, alpha=0.05, only_larger=True):
False, however, only the index of values *significantly smaller* than
each cell are indicated.
"""
return PairwiseSignificance(
self, alpha=alpha, only_larger=only_larger
).pairwise_indices
return intersperse_hs_in_std_res(
self,
hs_dims,
PairwiseSignificance(
self, alpha=alpha, only_larger=only_larger, hs_dims=hs_dims
).pairwise_indices,
)

@lazyproperty
def pairwise_significance_tests(self):
def pairwise_significance_tests(self, column_idx):
"""list of _ColumnPairwiseSignificance tests.
Result has as many elements as there are columns in the slice. Each
significance test contains `p_vals` and `t_stats` (ndarrays that represent
probability values and statistical scores).
"""
return PairwiseSignificance(self).values
return PairwiseSignificance(self).values[column_idx]

def _apply_pruning_mask(self, res, hs_dims=None):
array = self.as_array(prune=True, include_transforms_for_dims=hs_dims)
Expand Down Expand Up @@ -655,14 +663,6 @@ def _hs_dims_for_cube(self, hs_dims):
# the indexes of the required dims.
return [d + 1 for d in hs_dims] if hs_dims is not None else None

def _intersperse_hs_in_std_res(self, hs_dims, res):
for dim, inds in enumerate(self.inserted_hs_indices()):
if dim not in hs_dims:
continue
for i in inds:
res = np.insert(res, i, np.nan, axis=(dim - self.ndim))
return res

def _prepare_index_baseline(self, axis):
# First get the margin of the opposite direction of the index axis.
# We need this in order to end up with the right shape of the
Expand Down
35 changes: 28 additions & 7 deletions src/cr/cube/measures/pairwise_significance.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import numpy as np
from scipy.stats import t

from cr.cube.util import lazyproperty
from cr.cube.util import lazyproperty, intersperse_hs_in_std_res

try:
xrange
Expand All @@ -19,12 +19,15 @@
class PairwiseSignificance:
"""Implementation of p-vals and t-tests for each column proportions comparison."""

def __init__(self, slice_, axis=0, weighted=True, alpha=0.05, only_larger=True):
def __init__(
self, slice_, axis=0, weighted=True, alpha=0.05, only_larger=True, hs_dims=None
):
self._slice = slice_
self._axis = axis
self._weighted = weighted
self._alpha = alpha
self._only_larger = only_larger
self._hs_dims = hs_dims

@lazyproperty
def values(self):
Expand All @@ -33,10 +36,15 @@ def values(self):
Result has as many elements as there are columns in the slice. Each
significance test contains `p_vals` and `t_stats` significance tests.
"""
slice_, axis, weighted = self._slice, self._axis, self._weighted
return [
_ColumnPairwiseSignificance(
slice_, col_idx, axis, weighted, self._alpha, self._only_larger
self._slice,
col_idx,
self._axis,
self._weighted,
self._alpha,
self._only_larger,
self._hs_dims,
)
for col_idx in range(self._slice.shape[1])
]
Expand All @@ -51,29 +59,42 @@ class _ColumnPairwiseSignificance:
"""Value object providing matrix of T-score based pairwise-comparison P-values"""

def __init__(
self, slice_, col_idx, axis=0, weighted=True, alpha=0.05, only_larger=True
self,
slice_,
col_idx,
axis=0,
weighted=True,
alpha=0.05,
only_larger=True,
hs_dims=None,
):
self._slice = slice_
self._col_idx = col_idx
self._axis = axis
self._weighted = weighted
self._alpha = alpha
self._only_larger = only_larger
self._hs_dims = hs_dims

@lazyproperty
def t_stats(self):
def _t_stats(self):
props = self._slice.proportions(axis=0)
diff = props - props[:, [self._col_idx]]
margin = self._slice.margin(axis=0, weighted=self._weighted)
var_props = props * (1.0 - props) / margin
se_diff = np.sqrt(var_props + var_props[:, [self._col_idx]])
return diff / se_diff

@lazyproperty
def t_stats(self):
return intersperse_hs_in_std_res(self._slice, self._hs_dims, self._t_stats)

@lazyproperty
def p_vals(self):
unweighted_n = self._slice.margin(axis=0, weighted=False)
df = unweighted_n + unweighted_n[self._col_idx] - 2
return 2 * (1 - t.cdf(abs(self.t_stats), df=df))
p_vals = 2 * (1 - t.cdf(abs(self._t_stats), df=df))
return intersperse_hs_in_std_res(self._slice, self._hs_dims, p_vals)

@lazyproperty
def pairwise_indices(self):
Expand Down
15 changes: 15 additions & 0 deletions src/cr/cube/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,21 @@ def compress_pruned(table):
return table


def intersperse_hs_in_std_res(slice_, hs_dims, res):
    """Return `res` with NaN placeholders inserted at the slice's H&S positions.

    `slice_` must provide `inserted_hs_indices()` (one sequence of insertion
    indices per dimension) and `ndim`. `hs_dims` is the collection of dimension
    numbers for which placeholders are wanted. When it is falsy (e.g. `None` or
    empty), `res` is returned as is.

    Placeholders are inserted one at a time, in the order the indices are
    reported. The axis is addressed from the end (`dim - slice_.ndim`).
    """
    if not hs_dims:
        return res

    # Flatten to (dimension, index) pairs, keeping only the requested dimensions.
    insertions = (
        (dim, idx)
        for dim, inds in enumerate(slice_.inserted_hs_indices())
        if dim in hs_dims
        for idx in inds
    )
    for dim, idx in insertions:
        res = np.insert(res, idx, np.nan, axis=dim - slice_.ndim)
    return res


class Counter(dict):
"""Mapping where default values are zero"""

Expand Down
12 changes: 12 additions & 0 deletions tests/expectations/cat_x_cat_hirotsu_chi_squared.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[
[0.0, 2.821910158116655, 0.9259711818781733, 12.780855448128131, 16.79727869630099, 0.924655442873681, 0.8008976269312448, 9.616972398702428, 1.4496863124510315, 18.556098937181705],
[2.821910158116655, 0.0, 1.6831132737959318, 8.683471852181562, 13.451053159265136, 0.38467827774871005, 1.5094961530071807, 9.081312924348003, 0.25833985406056126, 16.3533306337074],
[0.9259711818781733, 1.6831132737959318, 0.0, 24.348935423464653, 46.689386077899826, 0.18470822825752797, 1.376598707986204, 22.063658540387774, 1.0102118795109807, 47.62124004565971],
[12.780855448128131, 8.683471852181562, 24.348935423464653, 0.0, 0.8073979083263744, 8.490641259215641, 5.141740694105387, 1.2536004848874829, 3.576241745092247, 2.1974561987876613],
[16.79727869630099, 13.451053159265136, 46.689386077899826, 0.8073979083263744, 0.0, 11.792012011326468, 6.847609367845222, 0.743555569450378, 5.218390456727495, 0.725476017865348],
[0.924655442873681, 0.38467827774871005, 0.18470822825752797, 8.490641259215641, 11.792012011326468, 0.0, 0.7072537831958036, 7.620018353425002, 0.3321969685319031, 14.087591553810693],
[0.8008976269312448, 1.5094961530071807, 1.376598707986204, 5.141740694105387, 6.847609367845222, 0.7072537831958036, 0.0, 3.6724354409467352, 0.39674326208673527, 8.546159019524978],
[9.616972398702428, 9.081312924348003, 22.063658540387774, 1.2536004848874829, 0.743555569450378, 7.620018353425002, 3.6724354409467352, 0.0, 3.4464292421171003, 1.5916695633869193],
[1.4496863124510315, 0.25833985406056126, 1.0102118795109807, 3.576241745092247, 5.218390456727495, 0.3321969685319031, 0.39674326208673527, 3.4464292421171003, 0.0, 6.85424450468994],
[18.556098937181705, 16.3533306337074, 47.62124004565971, 2.1974561987876613, 0.725476017865348, 14.087591553810693, 8.546159019524978, 1.5916695633869193, 6.85424450468994, 0.0]
]
12 changes: 12 additions & 0 deletions tests/expectations/cat_x_cat_hirotsu_pvals.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[
[1.0, 0.9996037164438155, 0.9999999307678402, 0.43583018698994225, 0.17136567049444762, 0.9999999315817448, 0.9999999794278623, 0.7267408061224014, 0.9999973380474136, 0.1057077398991062],
[0.9996037164438155, 1.0, 0.9999913953960333, 0.8064071500427167, 0.38029664889866643, 0.9999999999618748, 0.999996333717649, 0.7735835820931571, 0.9999999999988359, 0.19237524618473767],
[0.9999999307678402, 0.9999913953960333, 1.0, 0.01727762317121584, 3.2901218932623877e-06, 0.9999999999999399, 0.9999982370458956, 0.03652731193295922, 0.9999998575555376, 2.2345630659170723e-06],
[0.43583018698994225, 0.8064071500427167, 0.01727762317121584, 1.0, 0.999999977981595, 0.8215867010430609, 0.9825731149524656, 0.9999991690270158, 0.9980410308375879, 0.9999346879689056],
[0.17136567049444762, 0.38029664889866643, 3.2901218932623877e-06, 0.999999977981595, 1.0, 0.5240635452028406, 0.9263228060483779, 0.9999999890011803, 0.9811003546079167, 0.9999999910679708],
[0.9999999315817448, 0.9999999999618748, 0.9999999999999399, 0.8215867010430609, 0.5240635452028406, 1.0, 0.9999999927991261, 0.8830256555030864, 0.9999999999894102, 0.3314956026407795],
[0.9999999794278623, 0.999996333717649, 0.9999982370458956, 0.9825731149524656, 0.9263228060483779, 0.9999999927991261, 1.0, 0.9976748629172819, 0.9999999999501099, 0.8172690111179404],
[0.7267408061224014, 0.7735835820931571, 0.03652731193295922, 0.9999991690270158, 0.9999999890011803, 0.8830256555030864, 0.9976748629172819, 1.0, 0.9984612271156079, 0.9999944364992435],
[0.9999973380474136, 0.9999999999988359, 0.9999998575555376, 0.9980410308375879, 0.9811003546079167, 0.9999999999894102, 0.9999999999501099, 0.9984612271156079, 1.0, 0.9259991259591228],
[0.1057077398991062, 0.19237524618473767, 2.2345630659170723e-06, 0.9999346879689056, 0.9999999910679708, 0.3314956026407795, 0.8172690111179404, 0.9999944364992435, 0.9259991259591228, 1.0]
]
13 changes: 13 additions & 0 deletions tests/expectations/cat_x_cat_with_hs_hirotsu_pvals.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[
[1.0, 0.9996037164438155, 0.9999999307678402, 0.43583018698994225, 0.17136567049444762, NaN, 0.9999999315817448, 0.9999999794278623, 0.7267408061224014, 0.9999973380474136, 0.1057077398991062],
[0.9996037164438155, 1.0, 0.9999913953960333, 0.8064071500427167, 0.38029664889866643, NaN, 0.9999999999618748, 0.999996333717649, 0.7735835820931571, 0.9999999999988359, 0.19237524618473767],
[0.9999999307678402, 0.9999913953960333, 1.0, 0.01727762317121584, 3.2901218932623877e-06, NaN, 0.9999999999999399, 0.9999982370458956, 0.03652731193295922, 0.9999998575555376, 2.2345630659170723e-06],
[0.43583018698994225, 0.8064071500427167, 0.01727762317121584, 1.0, 0.999999977981595, NaN, 0.8215867010430609, 0.9825731149524656, 0.9999991690270158, 0.9980410308375879, 0.9999346879689056],
[0.17136567049444762, 0.38029664889866643, 3.2901218932623877e-06, 0.999999977981595, 1.0, NaN, 0.5240635452028406, 0.9263228060483779, 0.9999999890011803, 0.9811003546079167, 0.9999999910679708],
[NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN],
[0.9999999315817448, 0.9999999999618748, 0.9999999999999399, 0.8215867010430609, 0.5240635452028406, NaN, 1.0, 0.9999999927991261, 0.8830256555030864, 0.9999999999894102, 0.3314956026407795],
[0.9999999794278623, 0.999996333717649, 0.9999982370458956, 0.9825731149524656, 0.9263228060483779, NaN, 0.9999999927991261, 1.0, 0.9976748629172819, 0.9999999999501099, 0.8172690111179404],
[0.7267408061224014, 0.7735835820931571, 0.03652731193295922, 0.9999991690270158, 0.9999999890011803, NaN, 0.8830256555030864, 0.9976748629172819, 1.0, 0.9984612271156079, 0.9999944364992435],
[0.9999973380474136, 0.9999999999988359, 0.9999998575555376, 0.9980410308375879, 0.9811003546079167, NaN, 0.9999999999894102, 0.9999999999501099, 0.9984612271156079, 1.0, 0.9259991259591228],
[0.1057077398991062, 0.19237524618473767, 2.2345630659170723e-06, 0.9999346879689056, 0.9999999910679708, NaN, 0.3314956026407795, 0.8172690111179404, 0.9999944364992435, 0.9259991259591228, 1.0]
]
3 changes: 2 additions & 1 deletion tests/expectations/mr_x_cat_chi_squared_rows.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
[0.00082446, 0.00666948, 0.0, 2.59397955, 4.62633098, 0.01916237],
[0.74758493, 1.49209171, 2.59397955, 0.0, 0.1780604, 1.26597337],
[1.23711308, 2.48290654, 4.62633098, 0.1780604, 0.0, 2.03470742],
[0.0050688, 0.00335435, 0.01916237, 1.26597337, 2.03470742, 0.0]],
[0.0050688, 0.00335435, 0.01916237, 1.26597337, 2.03470742, 0.0]
],
[
[0.0, 2.59881692, 2.25906878, 1.35337047, 1.4880658, 0.59888122],
[2.59881692, 0.0, 0.20143432, 0.75290712, 0.72974779, 0.97118359],
Expand Down
Loading

0 comments on commit 11e7c8f

Please sign in to comment.