-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
tstats correct for CATxMRxITSELF Cubes #186
Changes from 2 commits
d5720fe
031f14b
d05d25f
68404ab
9212bb9
8c168c6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -84,6 +84,7 @@ celerybeat-schedule | |
.env | ||
|
||
# virtualenv | ||
Pipfile | ||
.venv | ||
venv/ | ||
venv3/ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -268,6 +268,22 @@ def inflate(self): | |
self._mask_size, | ||
) | ||
|
||
@lazyproperty | ||
def is_mr_by_itself(self): | ||
""" | ||
Identifies whether the cube contains MRxItself as its last 2 dimensions. | ||
|
||
Returns True if the cube has at least 3 dimensions and its last 2 | ||
dimensions are both MR and share the same alias. | ||
""" | ||
return ( | ||
True | ||
if len(set([dimension.alias for dimension in self.dimensions[-2:]])) == 1 | ||
and all(dim_type == DT.MR for dim_type in self.dimension_types[-2:]) | ||
and self.ndim >= 3 | ||
else False | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The return (
# ---there are at least three dimensions---
self.ndim >= 3
# ---the last two are both MR---
and all(dim_type == DT.MR for dim_type in self.dimension_types[-2:])
# ---and they both have the same alias---
and len(set([dimension.alias for dimension in self.dimensions[-2:]])) == 1
) Also note the use of comments to clarify your intent with otherwise somewhat complex expressions. Also, the most general qualification ( |
||
|
||
@lazyproperty | ||
def is_weighted(self): | ||
"""True if cube response contains weighted data.""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -186,6 +186,10 @@ def inserted_row_idxs(self): | |
def is_empty(self): | ||
return any(s == 0 for s in self.shape) | ||
|
||
@lazyproperty | ||
def cube_is_mr_by_itself(self): | ||
return self._cube.is_mr_by_itself | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Respect alphabetical ordering when adding new methods; do not locate them randomly. This is especially important with a value object like this having many external methods. Locating a method in a long list by scanning is tedious and error-prone. |
||
@lazyproperty | ||
def means(self): | ||
return np.array([row.means for row in self._matrix.rows]) | ||
|
@@ -202,6 +206,11 @@ def name(self): | |
""" | ||
return self.rows_dimension_name | ||
|
||
@lazyproperty | ||
def overlaps_tstats(self): | ||
return self._matrix.overlaps_tstats | ||
# return self._matrix.overlaps_tstats[self._slice_idx] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't leave commented code in a commit. |
||
|
||
@lazyproperty | ||
def pairwise_indices(self): | ||
alpha = self._transforms_dict.get("pairwise_indices", {}).get("alpha", 0.05) | ||
|
@@ -458,6 +467,10 @@ def table_name(self): | |
|
||
title = self._cube.name | ||
table_name = self._cube.dimensions[0].valid_elements[self._slice_idx].label | ||
|
||
if self._cube.is_mr_by_itself: | ||
return title | ||
|
||
return "%s: %s" % (title, table_name) | ||
|
||
@lazyproperty | ||
|
@@ -562,6 +575,10 @@ def counts(self): | |
def is_empty(self): | ||
return any(s == 0 for s in self._shape) | ||
|
||
@lazyproperty | ||
def cube_is_mr_by_itself(self): | ||
return False | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this property required in a |
||
@lazyproperty | ||
def inserted_row_idxs(self): | ||
# TODO: add integration-test coverage for this. | ||
|
@@ -797,6 +814,10 @@ def base_count(self): | |
def is_empty(self): | ||
return False if self.base_count else True | ||
|
||
@lazyproperty | ||
def cube_is_mr_by_itself(self): | ||
return False | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not in alphabetical order and unclear what client requires this on a nub. |
||
@lazyproperty | ||
def means(self): | ||
return self._scalar.means | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -258,6 +258,10 @@ def __init__(self, dimension_dict, dimension_type, dimension_transforms=None): | |
self._dimension_type = dimension_type | ||
self._dimension_transforms_arg = dimension_transforms | ||
|
||
@lazyproperty | ||
def alias(self): | ||
return self._dimension_dict["references"]["alias"] | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are we sure that all dimensions include an alias? Add a docstring. |
||
@lazyproperty | ||
def all_elements(self): | ||
"""_AllElements object providing cats or subvars of this dimension. | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -15,7 +15,7 @@ | |||||||||||||||||||
from scipy.stats.contingency import expected_freq | ||||||||||||||||||||
|
||||||||||||||||||||
from cr.cube.enum import DIMENSION_TYPE as DT | ||||||||||||||||||||
from cr.cube.util import lazyproperty | ||||||||||||||||||||
from cr.cube.util import lazyproperty, calculate_overlap_tstats | ||||||||||||||||||||
|
||||||||||||||||||||
|
||||||||||||||||||||
class TransformedMatrix(object): | ||||||||||||||||||||
|
@@ -45,6 +45,14 @@ def rows(self): | |||||||||||||||||||
if not row.hidden | ||||||||||||||||||||
) | ||||||||||||||||||||
|
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def overlaps_tstats(self): | ||||||||||||||||||||
return ( | ||||||||||||||||||||
self._unordered_matrix.overlaps_tstats | ||||||||||||||||||||
if self._unordered_matrix._is_cat_x_mr_x_itself | ||||||||||||||||||||
else None | ||||||||||||||||||||
) | ||||||||||||||||||||
|
||||||||||||||||||||
Comment on lines
+40
to
+47
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not in alphabetical order. |
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def table_base(self): | ||||||||||||||||||||
return self.table_base_unpruned[ | ||||||||||||||||||||
|
@@ -218,7 +226,24 @@ def factory(cls, cube, dimensions, slice_idx): | |||||||||||||||||||
base_counts = cube.base_counts | ||||||||||||||||||||
counts_with_missings = cube.counts_with_missings | ||||||||||||||||||||
dimension_types = cube.dimension_types[-2:] | ||||||||||||||||||||
if cube.dimension_types == (DT.CAT, DT.MR, DT.MR) and cube.is_mr_by_itself: | ||||||||||||||||||||
|
||||||||||||||||||||
overlap_tstats = calculate_overlap_tstats( | ||||||||||||||||||||
_MrXMrMatrix, dimensions, counts, base_counts, counts_with_missings | ||||||||||||||||||||
) | ||||||||||||||||||||
|
||||||||||||||||||||
# These are apparent dimensions (user dimensions). Do we need to get all the dims? | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This question should be resolved before PR. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. (consider it resolved) |
||||||||||||||||||||
dimensions = cube.dimensions[:-1] | ||||||||||||||||||||
counts = np.sum(counts[:, :, :, 0], axis=3) | ||||||||||||||||||||
base_counts = np.sum(base_counts[:, :, :, 0], axis=3) | ||||||||||||||||||||
counts_with_missings = np.sum(counts_with_missings[:, :, :, 0], axis=3) | ||||||||||||||||||||
return _CatXMrMatrix( | ||||||||||||||||||||
dimensions, | ||||||||||||||||||||
counts, | ||||||||||||||||||||
base_counts, | ||||||||||||||||||||
counts_with_missings, | ||||||||||||||||||||
overlaps=overlap_tstats, | ||||||||||||||||||||
) | ||||||||||||||||||||
# For cubes with means, create one of the means-matrix types | ||||||||||||||||||||
if cube.has_means: | ||||||||||||||||||||
if cube.ndim == 3: | ||||||||||||||||||||
|
@@ -562,13 +587,25 @@ class _CatXMrMatrix(_MatrixWithMR): | |||||||||||||||||||
(which correspond to the MR dimension). | ||||||||||||||||||||
""" | ||||||||||||||||||||
|
||||||||||||||||||||
def __init__( | ||||||||||||||||||||
self, dimensions, counts, base_counts, counts_with_missings, overlaps=None | ||||||||||||||||||||
): | ||||||||||||||||||||
super(_CatXMrMatrix, self).__init__( | ||||||||||||||||||||
dimensions, counts, base_counts, counts_with_missings | ||||||||||||||||||||
) | ||||||||||||||||||||
self._overlaps = overlaps | ||||||||||||||||||||
|
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def columns(self): | ||||||||||||||||||||
return tuple( | ||||||||||||||||||||
_CatXMrVector(counts.T, base_counts.T, element, table_margin) | ||||||||||||||||||||
for counts, base_counts, element, table_margin in self._column_generator | ||||||||||||||||||||
) | ||||||||||||||||||||
|
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def overlaps_tstats(self): | ||||||||||||||||||||
return self._overlaps if self._is_cat_x_mr_x_itself else None | ||||||||||||||||||||
|
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def rows(self): | ||||||||||||||||||||
return tuple( | ||||||||||||||||||||
|
@@ -613,6 +650,10 @@ def _baseline(self): | |||||||||||||||||||
dim_sum = np.sum(self._all_counts, axis=2)[self._valid_rows_idxs] | ||||||||||||||||||||
return dim_sum / np.sum(dim_sum, axis=0) | ||||||||||||||||||||
|
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def _is_cat_x_mr_x_itself(self): | ||||||||||||||||||||
return True if self._overlaps is not None else False | ||||||||||||||||||||
|
||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not in alphabetical order (within private methods). |
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def _column_generator(self): | ||||||||||||||||||||
return zip( | ||||||||||||||||||||
|
@@ -626,6 +667,8 @@ def _column_generator(self): | |||||||||||||||||||
|
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def _zscores(self): | ||||||||||||||||||||
# if the cube is a special one (5D with MRxItself as last dims) | ||||||||||||||||||||
# the zscores should be the same as a 2D MRxMR matrix | ||||||||||||||||||||
return self._array_type_std_res( | ||||||||||||||||||||
self._counts[:, :, 0], | ||||||||||||||||||||
self.table_margin, | ||||||||||||||||||||
|
@@ -635,9 +678,11 @@ def _zscores(self): | |||||||||||||||||||
|
||||||||||||||||||||
|
||||||||||||||||||||
class _CatXMrMeansMatrix(_CatXMrMatrix): | ||||||||||||||||||||
def __init__(self, dimensions, means, base_counts): | ||||||||||||||||||||
def __init__(self, dimensions, means, base_counts, overlaps=None): | ||||||||||||||||||||
counts = np.zeros(means.shape) | ||||||||||||||||||||
super(_CatXMrMeansMatrix, self).__init__(dimensions, counts, base_counts) | ||||||||||||||||||||
super(_CatXMrMeansMatrix, self).__init__( | ||||||||||||||||||||
dimensions, counts, base_counts, overlaps | ||||||||||||||||||||
) | ||||||||||||||||||||
self._means = means | ||||||||||||||||||||
|
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
|
@@ -667,6 +712,10 @@ def columns(self): | |||||||||||||||||||
for counts, base_counts, element, table_margin in self._column_generator | ||||||||||||||||||||
) | ||||||||||||||||||||
|
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def _mr_shadow_proportions(self): | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. add a docstring explaining what a shadow proportion is. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @malecki can u provide a meaningful description for what a shadow prop is? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||
return self._counts[:, 0, :, 0] / self._pairwise_overlap_total | ||||||||||||||||||||
|
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def rows(self): | ||||||||||||||||||||
return tuple( | ||||||||||||||||||||
|
@@ -725,6 +774,10 @@ def _column_generator(self): | |||||||||||||||||||
self.table_margin.T, | ||||||||||||||||||||
) | ||||||||||||||||||||
|
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def _pairwise_overlap_total(self): | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Docstrings should be of the following PEP 257 form: def method(self):
"""Return 2D ndarray symmetric-square matrix of valid observations.
Given a 4D hypercube of multiple-response (MR) items, this method calculates the
*symmetric square matrix* which is a fancy word for {this simple idea}.
""" The key features are a single line to start, short enough not to wrap, that gives some gist of what to expect but should always start by mentioning the return-type, since that is the effective "type" of the function. Then, when more is called for, a separate passage, separated by a blank line that fills in any important remaining details. The key thing is to describe the contract of the method, i.e. what it gives back and what you need to give it, in that order. |
||||||||||||||||||||
return np.sum(np.sum(self._counts, axis=1), axis=2) | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. add a docstring. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @malecki also here :D |
||||||||||||||||||||
|
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def _row_generator(self): | ||||||||||||||||||||
return zip( | ||||||||||||||||||||
|
@@ -745,6 +798,53 @@ def _zscores(self): | |||||||||||||||||||
np.sum(self._counts, axis=3)[:, 0, :], | ||||||||||||||||||||
) | ||||||||||||||||||||
|
||||||||||||||||||||
@lazyproperty | ||||||||||||||||||||
def tstats_overlap(self): | ||||||||||||||||||||
""" | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Place this in alphabetical order within public instance methods. |
||||||||||||||||||||
ndarray of corrected t-stats values that account for overlapping observations | ||||||||||||||||||||
t = (p_i - p_j) / s.e.(p_i - p_j) | ||||||||||||||||||||
where | ||||||||||||||||||||
s.e.(p_i - p_j) = sqrt(p_i*(1-p_i)/n_i + p_j*(1-p_j)/n_j - 2*n_ij*(p_ij - p_i*p_j)/(n_i*n_j)) | ||||||||||||||||||||
n_i = base size for first subvar | ||||||||||||||||||||
n_j = base size for second subvar | ||||||||||||||||||||
n_ij = number of overlapping observations | ||||||||||||||||||||
p_ij = proportion for which both subvars are True (selected) | ||||||||||||||||||||
In this MRxMR case, the difference p_i - p_j is the pairwise subtraction of the diagonal of the | ||||||||||||||||||||
shadow_proportions; the denominator is the matrix containing the unweighted counts | ||||||||||||||||||||
of the cube. | ||||||||||||||||||||
""" | ||||||||||||||||||||
|
||||||||||||||||||||
# Subtraction of the proportions for each observation | ||||||||||||||||||||
diff = np.subtract.outer( | ||||||||||||||||||||
self._mr_shadow_proportions.diagonal(), | ||||||||||||||||||||
self._mr_shadow_proportions.diagonal(), | ||||||||||||||||||||
) | ||||||||||||||||||||
# Sum of the s.e. for each observation | ||||||||||||||||||||
se_pi_pj = np.add.outer( | ||||||||||||||||||||
self._mr_shadow_proportions.diagonal() | ||||||||||||||||||||
* (1 - self._mr_shadow_proportions.diagonal()) | ||||||||||||||||||||
/ self.table_base.diagonal(), | ||||||||||||||||||||
self._mr_shadow_proportions.diagonal() | ||||||||||||||||||||
* (1 - self._mr_shadow_proportions.diagonal()) | ||||||||||||||||||||
/ self.table_base.diagonal(), | ||||||||||||||||||||
) | ||||||||||||||||||||
# Correction factor considering the overlap | ||||||||||||||||||||
correction_factor = ( | ||||||||||||||||||||
2 | ||||||||||||||||||||
* self.table_base | ||||||||||||||||||||
* ( | ||||||||||||||||||||
self._mr_shadow_proportions | ||||||||||||||||||||
- np.multiply.outer( | ||||||||||||||||||||
self._mr_shadow_proportions.diagonal(), | ||||||||||||||||||||
self._mr_shadow_proportions.diagonal(), | ||||||||||||||||||||
) | ||||||||||||||||||||
) | ||||||||||||||||||||
) / np.multiply.outer(self.table_base.diagonal(), self.table_base.diagonal()) | ||||||||||||||||||||
se_diff = np.sqrt(se_pi_pj - correction_factor) | ||||||||||||||||||||
t_stats = diff / se_diff | ||||||||||||||||||||
np.fill_diagonal(t_stats, 0) | ||||||||||||||||||||
return t_stats | ||||||||||||||||||||
|
||||||||||||||||||||
|
||||||||||||||||||||
# ===INSERTION (SUBTOTAL) VECTORS=== | ||||||||||||||||||||
|
||||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is a PEP convention for docstring layout: https://www.python.org/dev/peps/pep-0257/
The first line goes on the same line as the opening """: