Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tstats correct for CATxMRxITSELF Cubes #186

Merged
merged 6 commits into from
Dec 5, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ celerybeat-schedule
.env

# virtualenv
Pipfile
.venv
venv/
venv3/
Expand Down
16 changes: 16 additions & 0 deletions src/cr/cube/cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,22 @@ def inflate(self):
self._mask_size,
)

@lazyproperty
def is_mr_by_itself(self):
"""
It identify if the cube contains MRxItself as last 2 dimensions.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a PEP convention for docstring layout: https://www.python.org/dev/peps/pep-0257/

The first line goes on the same line as the opening """:

def is_mr_by_itself(self):
    """True if cube contains MRxItself as last 2 dimensions.

    ...
    """


If the last 2 dimensions in cube (ndim>=3) are MR and they have
the same alias returns True
"""
return (
True
if len(set([dimension.alias for dimension in self.dimensions[-2:]])) == 1
and all(dim_type == DT.MR for dim_type in self.dimension_types[-2:])
and self.ndim >= 3
else False
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The True, False, if, and else are redundant here:

        return (
            # ---there are at least three dimensions---
            self.ndim >= 3
            # ---the last two are both MR---
            and all(dim_type == DT.MR for dim_type in self.dimension_types[-2:])
            # ---and they both have the same alias---
            and len(set([dimension.alias for dimension in self.dimensions[-2:]])) == 1
        )

Also note the use of comments to clarify your intent with otherwise somewhat complex expressions.

Also, the most general qualification (self.ndim >= 3) is moved to the top. The other two predicates would be True on a 1D MR array because [x][-2:] == [x].


@lazyproperty
def is_weighted(self):
"""True if cube response contains weighted data."""
Expand Down
21 changes: 21 additions & 0 deletions src/cr/cube/cubepart.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,10 @@ def inserted_row_idxs(self):
def is_empty(self):
return any(s == 0 for s in self.shape)

@lazyproperty
def cube_is_mr_by_itself(self):
return self._cube.is_mr_by_itself

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Respect alphabetical ordering when adding new methods; do not locate them randomly. This is especially important with a value object like this having many external methods. Locating a method in a long list by scanning is tedious and error-prone.

@lazyproperty
def means(self):
return np.array([row.means for row in self._matrix.rows])
Expand All @@ -202,6 +206,11 @@ def name(self):
"""
return self.rows_dimension_name

@lazyproperty
def overlaps_tstats(self):
return self._matrix.overlaps_tstats
# return self._matrix.overlaps_tstats[self._slice_idx]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't leave commented code in a commit.


@lazyproperty
def pairwise_indices(self):
alpha = self._transforms_dict.get("pairwise_indices", {}).get("alpha", 0.05)
Expand Down Expand Up @@ -458,6 +467,10 @@ def table_name(self):

title = self._cube.name
table_name = self._cube.dimensions[0].valid_elements[self._slice_idx].label

if self._cube.is_mr_by_itself:
return title

return "%s: %s" % (title, table_name)

@lazyproperty
Expand Down Expand Up @@ -562,6 +575,10 @@ def counts(self):
def is_empty(self):
return any(s == 0 for s in self._shape)

@lazyproperty
def cube_is_mr_by_itself(self):
return False

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this property required in a Strand or Nub? If it is, place this default case in the base class (in alphabetical order among public instance methods).

@lazyproperty
def inserted_row_idxs(self):
# TODO: add integration-test coverage for this.
Expand Down Expand Up @@ -797,6 +814,10 @@ def base_count(self):
def is_empty(self):
return False if self.base_count else True

@lazyproperty
def cube_is_mr_by_itself(self):
return False

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not in alphabetical order and unclear what client requires this on a nub.

@lazyproperty
def means(self):
return self._scalar.means
Expand Down
4 changes: 4 additions & 0 deletions src/cr/cube/dimension.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,10 @@ def __init__(self, dimension_dict, dimension_type, dimension_transforms=None):
self._dimension_type = dimension_type
self._dimension_transforms_arg = dimension_transforms

@lazyproperty
def alias(self):
return self._dimension_dict["references"]["alias"]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure that all dimensions include an alias?

Add a docstring.

@lazyproperty
def all_elements(self):
"""_AllElements object providing cats or subvars of this dimension.
Expand Down
106 changes: 103 additions & 3 deletions src/cr/cube/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from scipy.stats.contingency import expected_freq

from cr.cube.enum import DIMENSION_TYPE as DT
from cr.cube.util import lazyproperty
from cr.cube.util import lazyproperty, calculate_overlap_tstats


class TransformedMatrix(object):
Expand Down Expand Up @@ -45,6 +45,14 @@ def rows(self):
if not row.hidden
)

@lazyproperty
def overlaps_tstats(self):
return (
self._unordered_matrix.overlaps_tstats
if self._unordered_matrix._is_cat_x_mr_x_itself
else None
)

Comment on lines +40 to +47
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not in alphabetical order.

@lazyproperty
def table_base(self):
return self.table_base_unpruned[
Expand Down Expand Up @@ -218,7 +226,24 @@ def factory(cls, cube, dimensions, slice_idx):
base_counts = cube.base_counts
counts_with_missings = cube.counts_with_missings
dimension_types = cube.dimension_types[-2:]
if cube.dimension_types == (DT.CAT, DT.MR, DT.MR) and cube.is_mr_by_itself:

overlap_tstats = calculate_overlap_tstats(
_MrXMrMatrix, dimensions, counts, base_counts, counts_with_missings
)

# These are apparent dimensions (user dimensions). Do we need to get all the dims?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This question should be resolved before PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# These are apparent dimensions (user dimensions). Do we need to get all the dims?
# These are apparent dimensions which hide 'selections' dims behind 'MR'

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(consider it resolved)

dimensions = cube.dimensions[:-1]
counts = np.sum(counts[:, :, :, 0], axis=3)
base_counts = np.sum(base_counts[:, :, :, 0], axis=3)
counts_with_missings = np.sum(counts_with_missings[:, :, :, 0], axis=3)
return _CatXMrMatrix(
dimensions,
counts,
base_counts,
counts_with_missings,
overlaps=overlap_tstats,
)
# For cubes with means, create one of the means-matrix types
if cube.has_means:
if cube.ndim == 3:
Expand Down Expand Up @@ -562,13 +587,25 @@ class _CatXMrMatrix(_MatrixWithMR):
(which correspond to the MR dimension).
"""

def __init__(
self, dimensions, counts, base_counts, counts_with_missings, overlaps=None
):
super(_CatXMrMatrix, self).__init__(
dimensions, counts, base_counts, counts_with_missings
)
self._overlaps = overlaps

@lazyproperty
def columns(self):
return tuple(
_CatXMrVector(counts.T, base_counts.T, element, table_margin)
for counts, base_counts, element, table_margin in self._column_generator
)

@lazyproperty
def overlaps_tstats(self):
return self._overlaps if self._is_cat_x_mr_x_itself else None

@lazyproperty
def rows(self):
return tuple(
Expand Down Expand Up @@ -613,6 +650,10 @@ def _baseline(self):
dim_sum = np.sum(self._all_counts, axis=2)[self._valid_rows_idxs]
return dim_sum / np.sum(dim_sum, axis=0)

@lazyproperty
def _is_cat_x_mr_x_itself(self):
return True if self._overlaps is not None else False

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not in alphabetical order (within private methods).

@lazyproperty
def _column_generator(self):
return zip(
Expand All @@ -626,6 +667,8 @@ def _column_generator(self):

@lazyproperty
def _zscores(self):
# if the cube is a special one (5D with MRxItself as last dims)
# the zscores should be the same as a 2D MRxMR matrix
return self._array_type_std_res(
self._counts[:, :, 0],
self.table_margin,
Expand All @@ -635,9 +678,11 @@ def _zscores(self):


class _CatXMrMeansMatrix(_CatXMrMatrix):
def __init__(self, dimensions, means, base_counts):
def __init__(self, dimensions, means, base_counts, overlaps=None):
counts = np.zeros(means.shape)
super(_CatXMrMeansMatrix, self).__init__(dimensions, counts, base_counts)
super(_CatXMrMeansMatrix, self).__init__(
dimensions, counts, base_counts, overlaps
)
self._means = means

@lazyproperty
Expand Down Expand Up @@ -667,6 +712,10 @@ def columns(self):
for counts, base_counts, element, table_margin in self._column_generator
)

@lazyproperty
def _mr_shadow_proportions(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a docstring explaining what a shadow proportion is.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@malecki can u provide a meaningful description for what a shadow prop is?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def _mr_shadow_proportions(self):
def _mr_shadow_proportions(self):
"""
Cube containing item-wise selections, overlap, and nonoverlap
with all other items in a multiple response dimension, for each
element of any prepended dimensions:
A 1d interface to a 4d hypercube of underlying counts.
"""

return self._counts[:, 0, :, 0] / self._pairwise_overlap_total

@lazyproperty
def rows(self):
return tuple(
Expand Down Expand Up @@ -725,6 +774,10 @@ def _column_generator(self):
self.table_margin.T,
)

@lazyproperty
def _pairwise_overlap_total(self):
Copy link
Contributor

@malecki malecki Dec 4, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def _pairwise_overlap_total(self):
def _pairwise_overlap_total(self):
"""
Given a 4d hypercube of multiple response items, return the
symmetric square matrix of valid observations between all pairs.
n1 = 2; n2 = 2; n12 = 1; overlap total = 3
"""
return np.sum(np.sum(self._counts, axis=1), axis=2)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Docstrings should be of the following PEP 257 form:

def method(self):
    """Return 2D ndarray symmetric-square matrix of valid observations.

    Given a 4D hypercube of multiple-response (MR) items, this method calculates the
    *symmetric square matrix* which is a fancy word for {this simple idea}.
    """

The key features are a single line to start, short enough not to wrap, that gives some gist of what to expect but should always start by mentioning the return-type, since that is the effective "type" of the function. Then, when more is called for, a separate passage, separated by a blank line that fills in any important remaining details. The key thing is to describe the contract of the method, i.e. what it gives back and what you need to give it, in that order.

return np.sum(np.sum(self._counts, axis=1), axis=2)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a docstring.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@malecki also here :D


@lazyproperty
def _row_generator(self):
return zip(
Expand All @@ -745,6 +798,53 @@ def _zscores(self):
np.sum(self._counts, axis=3)[:, 0, :],
)

@lazyproperty
def tstats_overlap(self):
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Place this in alphabetical order within public instance methods.

ndarray of correct tstats values considering the overlapped observations
t = (pi-pj)/s.e.(pi-pj)
where
s.e.(pi-pj) = sqrt(p_i*(1-p_i)/n_i+p_j*(1-p_j)/n_j-2*n_ij*(p_ij-p_i*p_j)/(n_i*n_j))
ni = base size for first subvar
nj = base size for second subvar
nij = number of overlapping observations
pij = proportion for which both subvar are True (selected)
In this case MRxMR the diff pi-pj is the pairwise subtraction of the diagonal of the
shadow_proportions the denominator is the matrix containing the unweighted counts
of the cube
"""

# Subtraction of the proportions foreach observation
diff = np.subtract.outer(
self._mr_shadow_proportions.diagonal(),
self._mr_shadow_proportions.diagonal(),
)
# Sum of the s.e. for each observation
se_pi_pj = np.add.outer(
self._mr_shadow_proportions.diagonal()
* (1 - self._mr_shadow_proportions.diagonal())
/ self.table_base.diagonal(),
self._mr_shadow_proportions.diagonal()
* (1 - self._mr_shadow_proportions.diagonal())
/ self.table_base.diagonal(),
)
# Correction factor considering the overlap
correction_factor = (
2
* self.table_base
* (
self._mr_shadow_proportions
- np.multiply.outer(
self._mr_shadow_proportions.diagonal(),
self._mr_shadow_proportions.diagonal(),
)
)
) / np.multiply.outer(self.table_base.diagonal(), self.table_base.diagonal())
se_diff = np.sqrt(se_pi_pj - correction_factor)
t_stats = diff / se_diff
np.fill_diagonal(t_stats, 0)
return t_stats


# ===INSERTION (SUBTOTAL) VECTORS===

Expand Down
13 changes: 12 additions & 1 deletion src/cr/cube/measures/new_pairwise_significance.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,11 @@ def t_stats(self):
return diff / se_diff

@lazyproperty
def t_stats_scale_means(self):
def t_stats_correct(self):
return self._slice.overlaps_tstats[:, self._col_idx, :]

@lazyproperty
def t_stats_scale_means(self):
"""
This property calculates the Two-tailed t-test using the formula:
t = X1 - X2 / Sx1x2 * sqrt(1/n1 + 1/n2)
Expand Down Expand Up @@ -119,6 +122,10 @@ def p_vals_scale_means(self):

@lazyproperty
def p_vals(self):
# if the cube to which the slice belongs is a CATxMRxITSELF
# returns the pvals using the t_stats_correct values
if self._slice.cube_is_mr_by_itself:
return 2 * (1 - t.cdf(abs(self.t_stats_correct), df=self._df))
return 2 * (1 - t.cdf(abs(self.t_stats), df=self._df))

@lazyproperty
Expand Down Expand Up @@ -167,11 +174,15 @@ def summary_p_vals(self):

@lazyproperty
def _df(self):
# if the cube to which the slice belongs is a CATxMRxITSELF
# returns the n1 + n2 as degrees of freedom, n1 + n2 -2 otherwise
selected_unweighted_n = (
self._slice.column_base[self._col_idx]
if self._slice.column_base.ndim < 2
else self._slice.column_base[:, self._col_idx][:, None]
)
if self._slice.cube_is_mr_by_itself:
return self._slice.column_base + selected_unweighted_n
return self._slice.column_base + selected_unweighted_n - 2

@lazyproperty
Expand Down
15 changes: 15 additions & 0 deletions src/cr/cube/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,21 @@
from itertools import filterfalse as ifilterfalse


def calculate_overlap_tstats(
cls, mr_dimensions, mr_counts, mr_base_counts, mr_counts_with_missings
):
overlaps = np.zeros(np.array(mr_counts.shape)[[0, 1, 3]])
for slice_index in range(mr_counts.shape[0]):
overlap_slice = cls(
mr_dimensions,
mr_counts[slice_index],
mr_base_counts[slice_index],
mr_counts_with_missings[slice_index],
)
overlaps[slice_index] = overlap_slice.tstats_overlap
return overlaps


def compress_pruned(table):
"""Compress table based on pruning mask.

Expand Down
Loading