Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Subdiff v2 176856023 #253

Merged
merged 30 commits into from
Mar 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
941379c
Add integration tests for subtotal diffs
gergness Feb 18, 2021
a87a07c
add subtrahend_ids
gergness Feb 18, 2021
2864497
add subtrahend_idxs
gergness Feb 18, 2021
f9bfcf3
stripe.SumSubtotals now adds subtrahends
gergness Feb 18, 2021
1191278
can now subtract subrahends when necessary too
gergness Feb 18, 2021
498b7e5
use SumDiffSubtotal
gergness Feb 18, 2021
7aac0dd
matrix.SumSubtotal is now subtrahend aware
gergness Feb 18, 2021
d598472
tablestderr is difference-aware
gergness Feb 18, 2021
af53518
Zscore is subtrahend aware
gergness Feb 18, 2021
9405231
add matrix.SumDiffSubtotals
gergness Feb 18, 2021
a8acce4
use subtrahends in matrix
gergness Feb 18, 2021
b28bf1e
Oops need to subdiff in MR margin
gergness Feb 20, 2021
b625fd6
model row_proportions after column_proportions
gergness Feb 20, 2021
b88bcdc
use assembler.row_proportion
gergness Feb 20, 2021
690aa52
intersection of subdiffs should be undefined?
gergness Feb 22, 2021
577d31e
a fix for changing from int to float
gergness Feb 22, 2021
cf8568a
integration test for corresponding dim prop is undefined for subdiffs
gergness Feb 24, 2021
e9a6b90
SumDiffSubtotal can now override row or column
gergness Feb 24, 2021
6b71fc1
override corresponding dim proportion
gergness Feb 24, 2021
9863c72
start of steve comments
gergness Feb 24, 2021
67b5fa1
refactor integration tests
gergness Feb 25, 2021
46d33f1
convert to float at source
gergness Feb 25, 2021
faa25c4
subtotals no longer claim to control dtype
gergness Feb 25, 2021
b214681
fix test broken during rebase
gergness Mar 2, 2021
2ecae53
All _flat_values are np.array
gergness Mar 3, 2021
f9e0b2a
update docstrings to remove outdated np.int references
gergness Mar 3, 2021
4830e2b
respond to easy part of second review
gergness Mar 4, 2021
cdc0842
refactor where cube determines if means are available
gergness Mar 4, 2021
dbc4e70
refactor where weighted counts are optional
gergness Mar 4, 2021
e43aba4
oops I guess this wasn't needed
gergness Mar 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 75 additions & 86 deletions src/cr/cube/cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

import copy
import json

import numpy as np

from cr.cube.cubepart import CubePartition
Expand Down Expand Up @@ -69,7 +68,7 @@ def has_means(self):
@lazyproperty
def has_weighted_counts(self):
"""True if cube-responses include a weighted-count measure."""
return self._cubes[0].is_weighted
return self._cubes[0].has_weighted_counts

@lazyproperty
def is_ca_as_0th(self):
Expand Down Expand Up @@ -238,7 +237,7 @@ def counts(self):

@lazyproperty
def counts_with_missings(self):
return self._measure(self.is_weighted).raw_cube_array
return self._measure.raw_cube_array

@lazyproperty
def cube_index(self):
Expand Down Expand Up @@ -300,9 +299,9 @@ def inflate(self):
)

@lazyproperty
def is_weighted(self):
"""True if cube response contains weighted data."""
return self._measures.is_weighted
def has_weighted_counts(self):
"""True if cube response has weighted count data."""
return self._measures.weighted_counts is not None

@lazyproperty
def missing(self):
Expand Down Expand Up @@ -381,7 +380,7 @@ def unweighted_counts(self):
@lazyproperty
def valid_counts(self):
"""ndarray of valid counts, valid elements only."""
valid_counts = self._measure(self.is_weighted).valid_counts
valid_counts = self._measure.valid_counts
if valid_counts.any():
return valid_counts[self._valid_idxs]
return np.empty(0)
Expand Down Expand Up @@ -473,22 +472,19 @@ def _mean_subvariables(self):
metadata = cube_measures.get("mean", {}).get("metadata", {})
return metadata.get("type", {}).get("subvariables", [])

def _measure(self, weighted):
@lazyproperty
def _measure(self):
"""_BaseMeasure subclass representing primary measure for this cube.

If the cube response includes a means measure, the return value is
means. Otherwise it is counts, with the choice between weighted or
unweighted determined by *weighted*.

Note that weighted counts are provided on an "as-available" basis.
When *weighted* is True and the cube response is not weighted,
unweighted counts are returned.
unweighted determined by whether the cube has weighted counts.
"""
return (
self._measures.means
if self._measures.means is not None
else self._measures.weighted_counts
if weighted
if self._measures.weighted_counts is not None
else self._measures.unweighted_counts
)

Expand Down Expand Up @@ -552,7 +548,7 @@ def _valid_idxs(self):
)

def _reshape_idxs(valid_indices):
if self._measure(self.is_weighted).requires_array_transposition:
if self._measure.requires_array_transposition:
if len(self._all_dimensions) == 3:
# ---In case of 3D array and a numeric array is involved we have to
# ---change the order of the valid idxs, from [0,1,2] to [1,2,0].
Expand All @@ -576,45 +572,19 @@ def __init__(self, cube_dict, all_dimensions, cube_idx_arg=None):
self._all_dimensions = all_dimensions
self._cube_idx_arg = cube_idx_arg

@lazyproperty
def is_weighted(self):
"""True if weights have been applied to the measure(s) for this cube.

Unweighted counts are available for all cubes. Weighting applies to
any other measures provided by the cube.
"""
cube_dict = self._cube_dict
if cube_dict.get("query", {}).get("weight") is not None:
return True
if cube_dict.get("weight_var") is not None:
return True
if cube_dict.get("weight_url") is not None:
return True
unweighted_counts = cube_dict["result"]["counts"]
count_data = cube_dict["result"]["measures"].get("count", {}).get("data")
if unweighted_counts != count_data:
return True
return False

@lazyproperty
def means(self):
"""_MeanMeasure object providing access to means values.
"""Optional _MeanMeasure object providing access to means values.

None when the cube response does not contain a mean measure.
Will be None if no means are available on the counts.
"""
mean_measure_dict = (
self._cube_dict.get("result", {}).get("measures", {}).get("mean")
)
return (
_MeanMeasure(self._cube_dict, self._all_dimensions, self._cube_idx_arg)
if mean_measure_dict
else None
)
mean = _MeanMeasure(self._cube_dict, self._all_dimensions, self._cube_idx_arg)
return None if mean.raw_cube_array is None else mean

@lazyproperty
def missing_count(self):
"""numeric representing count of missing rows in cube response."""
if self.means:
if self.means is not None:
return self.means.missing_count
return self._cube_dict["result"].get("missing", 0)

Expand Down Expand Up @@ -671,21 +641,14 @@ def unweighted_counts(self):

@lazyproperty
def weighted_counts(self):
"""_WeightedCountMeasure object for this cube.
"""Optional _WeightedCountMeasure object for this cube.

This object provides access to weighted counts for this cube, if
available. If the cube response is not weighted, the
_UnweightedCountMeasure object for this cube is returned.
Can be None when the cube is unweighted.
"""
return (
_WeightedCountMeasure(
self._cube_dict, self._all_dimensions, self._cube_idx_arg
)
if self.is_weighted
else _UnweightedCountMeasure(
self._cube_dict, self._all_dimensions, self._cube_idx_arg
)
weighted_counts = _WeightedCountMeasure(
self._cube_dict, self._all_dimensions, self._cube_idx_arg
)
return weighted_counts if weighted_counts.raw_cube_array is not None else None


class _BaseMeasure(object):
Expand All @@ -698,13 +661,16 @@ def __init__(self, cube_dict, all_dimensions, cube_idx_arg=None):

@lazyproperty
def raw_cube_array(self):
"""Return read-only ndarray of measure values from cube-response.
"""Optional read-only ndarray of measure values from cube-response.

The shape of the ndarray mirrors the shape of the (raw) cube
response. Specifically, it includes values for missing elements, any
MR_CAT dimensions, and any prunable rows and columns.
MR_CAT dimensions, and any prunable rows and columns. Returns None
if the measure is not available in cube.
"""
raw_cube_array = np.array(self._flat_values).flatten().reshape(self._shape)
if self._flat_values is None:
return None
raw_cube_array = self._flat_values.reshape(self._shape)
# ---must be read-only to avoid hard-to-find bugs---
raw_cube_array.flags.writeable = False
return raw_cube_array
Expand Down Expand Up @@ -735,7 +701,7 @@ def valid_counts(self):

@lazyproperty
def _flat_values(self): # pragma: no cover
"""Return tuple of mean values as found in cube response.
"""Return ndarray of np.float64 values as found in cube response.
Comment on lines -738 to +704
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's my habit, and perhaps more proper, to leave out the "Return " suffix on docstrings of properties (lazy or otherwise) since they behave like attributes. So we document a property like we would an attribute, by stating what it is rather that what it "returns", since returning a value is a characteristic of a function and attributes just "are" their value.

As a side benefit, it opens up first-docstring-line real estate for more descriptive words.


This property must be implemented by each subclass.
"""
Expand All @@ -762,48 +728,71 @@ class _MeanMeasure(_BaseMeasure):

@lazyproperty
def missing_count(self):
"""numeric representing count of missing rows reflected in response."""
return self._cube_dict["result"]["measures"]["mean"].get("n_missing", 0)
"""Optional numeric representing count of missing rows in response."""
return None if self._result is None else self._result.get("n_missing", 0)

@lazyproperty
def _flat_values(self):
"""Return tuple of mean values as found in cube response.
"""Optional 1D np.ndarray of np.float64 mean values as found in cube response.

Mean data may include missing items represented by a dict like
{'?': -1} in the cube response. These are replaced by np.nan in the
returned value.
returned value. In the case of numeric arrays, we can have list of lists.
Returns None if means are not available on the cube.
"""
return tuple(
np.nan if type(x) is dict else x
for x in self._cube_dict["result"]["measures"]["mean"]["data"]
)
if self._result is None:
return None

def dict_to_nan_recursive(x):
if type(x) is dict:
return np.nan
elif type(x) is list:
return [dict_to_nan_recursive(y) for y in x]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@scanny - is this the right way to do this?

The problem I hit is that because of numeric arrays, this can have two different forms:

Mean of non-array: list of floats or dictionaries (indicating missing values)
Mean of array: list of lists of floats or dictionaries

Copy link
Contributor

@scanny scanny Mar 3, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd say those are distinct enough to warrant two different subclasses, where the factory chooses between _MeanMeasure and perhaps something like _NumArrayMeanMeasure. Then each ._flat_values can do its own thing and we avoid using if statements as a substitute for polymorphism.

Btw, The docstring says "Return tuple ..." and the return value here is ndarray (which I think is preferable).

This issue came up in my review of Ernesto's branch here: #251 (comment) where I explained that I think all ._flat_values return type should be ndarray to avoid the unnecessary time and memory required to construct an intermediate tuple object that just gets immediately converted to an ndarray anyway.

So maybe a conflict brewing here, but that happens and Ernesto can figure it out during rebase (although it might be worth a mention to him in Slack). In any case, I think the right solution is to create the _NumArrayMeanMeasure or whatever name is appropriate and then adjust the factory in _Measures or wherever it is to choose the right subclass when constructing the measure. Also that these return ndarray and preferably that all of the four subtypes return ndarray and the _BaseMeasure.raw_cube_array property is modified to avoid constructing an array out of an array (which it looks like you already did).

Also the docstring for _WeightedCountMeasure._flat_values also says tuple.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd say those are distinct enough to warrant two different subclasses, where the factory chooses between _MeanMeasure and perhaps something like _NumArrayMeanMeasure. Then each ._flat_values can do its own thing and we avoid using if statements as a substitute for polymorphism.

Just saw your new set of comments, but the problem I'm hitting as I thought about this is that I don't see a good way to distinguish numeric arrays from not at the time of the factory. Did you have thoughts on that?

My first idea was to use use _BaseMeasure._shape but the shape of a CAT(N categories) X Numeric is the same as a Numeric Array (N subvariables).

The other areas where the code makes decisions about numeric arrays (eg _numeric_array_dimension) are in the Cube class, but I don't see a good way to pass that information along when creating the measures.

else:
return x

return np.array(
tuple(dict_to_nan_recursive(x) for x in self._result["data"]),
dtype=np.float64,
).flatten()

@lazyproperty
def _result(self):
return self._cube_dict.get("result", {}).get("measures", {}).get("mean")


class _UnweightedCountMeasure(_BaseMeasure):
"""Unweighted counts for cube."""

@lazyproperty
def _flat_values(self):
"""tuple of int counts before weighting."""
if (
self._cube_dict["result"]["measures"]
.get("valid_count_unweighted", {})
.get("data")
):
# ---If valid_count are expressed in the cube dict, returns its data.
# ---This condition can happen in case of numeric array cube response.
# ---Under this circumstances the numeric array measures will contain the
# ---mean measure and a valid count measure for the unweighted counts.
return tuple(
self._cube_dict["result"]["measures"]["valid_count_unweighted"]["data"]
)
return tuple(self._cube_dict["result"]["counts"])
"""1D np.ndarray of np.float64 counts before weighting.

Use np.float64s to avoid int overflow bugs and so we can use nan.
"""
result = self._cube_dict["result"]

# ---If valid_count are expressed in the cube dict, returns its data.
# ---This condition can happen in case of numeric array cube response.
# ---Under this circumstances the numeric array measures will contain the
# ---mean measure and a valid count measure for the unweighted counts.
valid_counts = result["measures"].get("valid_count_unweighted", {}).get("data")
return (
np.array(valid_counts, dtype=np.float64)
if valid_counts
else np.array(result["counts"], dtype=np.float64)
)


class _WeightedCountMeasure(_BaseMeasure):
"""Weighted counts for cube."""

@lazyproperty
def _flat_values(self):
"""tuple of numeric counts after weighting."""
return tuple(self._cube_dict["result"]["measures"]["count"]["data"])
"""Optional 1D np.ndarray of np.float64 numeric counts after weighting."""
unweighted_counts = self._cube_dict["result"]["counts"]
weighted_counts = self._cube_dict["result"]["measures"]["count"]["data"]
if unweighted_counts == weighted_counts:
return None

return np.array(weighted_counts, dtype=np.float64)
Loading