Skip to content

Commit

Permalink
Merge 842ea07 into c2211de
Browse files Browse the repository at this point in the history
  • Loading branch information
scanny committed Jun 20, 2019
2 parents c2211de + 842ea07 commit 9795860
Show file tree
Hide file tree
Showing 3 changed files with 232 additions and 16 deletions.
200 changes: 185 additions & 15 deletions src/cr/cube/cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,152 @@
# ---silence numpy floating-point warnings for divide-by-zero and invalid operations;
# ---this is a process-wide numpy setting applied when this module is imported
np.seterr(divide="ignore", invalid="ignore")


class CubeSet(object):
    """Collection of cubes built from one or more cube-responses.

    A lone cube-response is handled exactly like several, provided it arrives wrapped
    in a sequence, so callers can treat single- and multi-cube responses uniformly.

    `cube_responses` is a sequence of cube-response dicts received from Crunch; a
    tabbook cube-response sequence can be passed as it was received. `transforms` is
    a sequence of transforms dicts matched to the cube-responses by position.
    `population` is the estimated target population, used when a population-projection
    measure is requested. `min_base` is an integer minimum sample-size used to flag
    values that are unreliable for lack of sample (base).
    """

    def __init__(self, cube_responses, transforms, population, min_base):
        self._min_base = min_base
        self._population = population
        self._transforms_dicts = transforms
        self._cube_responses = cube_responses

    @lazyproperty
    def can_show_pairwise(self):
        """True when every cube after the first supports pairwise comparison.

        Always False when this set holds fewer than two cubes. Otherwise, each cube
        past the first must have its last two dimension-types in
        `DT.ALLOWED_PAIRWISE_TYPES` and must have at least two dimensions.
        """
        cubes = self._cubes
        if len(cubes) < 2:
            return False

        for cube in cubes[1:]:
            trailing_types = cube.dimension_types[-2:]
            if any(dt not in DT.ALLOWED_PAIRWISE_TYPES for dt in trailing_types):
                return False
            if cube.ndim < 2:
                return False
        return True

    @lazyproperty
    def description(self):
        """str description, taken from the first cube of this set."""
        return self._first_cube.description

    @lazyproperty
    def has_means(self):
        """True if the first cube of this set reports a means measure."""
        return self._first_cube.has_means

    @lazyproperty
    def has_weighted_counts(self):
        """True if the first cube of this set reports itself as weighted."""
        return self._first_cube.is_weighted

    @lazyproperty
    def is_ca_as_0th(self):
        """True for a multi-cube set whose first cube is a categorical-array.

        A "CA-as-0th" tabbook tab is "3D" in the sense that it is "sliced" into one
        table (partition-set) for each of the CA subvariables. Never True for
        a single-cube set.
        """
        if self._is_multi_cube:
            # ---outcome hinges on the first dimension of the row-var (first) cube---
            return self._first_cube.dimension_types[0] == DT.CA_SUBVAR
        return False

    @lazyproperty
    def missing_count(self):
        """The `.missing` value of the first cube of this set."""
        return self._first_cube.missing

    @lazyproperty
    def name(self):
        """str name, taken from the first cube of this set."""
        return self._first_cube.name

    @lazyproperty
    def partition_sets(self):
        """Sequence of cube-partition collections across all cubes of this cube-set.

        The partitions of every cube are transposed, so that the n-th partition-set
        gathers the n-th partition of each cube. For a ca-as-0th tabbook the value
        might look like:

            (
                (_Strand, _Slice, _Slice),
                (_Strand, _Slice, _Slice),
                (_Strand, _Slice, _Slice),
            )

        and for a typical slide it might often look like:

            ((_Slice,))

        Each partition-set represents the partitions of a single "stacked" table.
        A 2D slide has one partition-set holding one _Slice, as in the second example.
        A 3D slide would have multiple partition-sets, each of a single _Slice.
        A tabbook has multiple partitions in each set, the first being a _Strand and
        the rest _Slice objects. Multiple partition-sets only arise for a tabbook in
        the CA-as-0th case.
        """
        partitions_by_cube = [cube.partitions for cube in self._cubes]
        return tuple(zip(*partitions_by_cube))

    @lazyproperty
    def population_fraction(self):
        """The filtered/unfiltered ratio for this cube-set, from its first cube.

        This value is required for properly calculating population on a cube where
        a filter has been applied. It is expected to be 1.0 for an unfiltered cube and
        `np.nan` when the unfiltered count is zero, which would otherwise produce
        a divide-by-zero error.
        """
        return self._first_cube.population_fraction

    @lazyproperty
    def _cubes(self):
        """tuple of Cube objects containing the data for this analysis."""
        return tuple(self._iter_cubes())

    @lazyproperty
    def _first_cube(self):
        """The Cube at position 0 of this set, source of the set-level properties."""
        return self._cubes[0]

    @lazyproperty
    def _is_multi_cube(self):
        """True if more than one cube-response was provided on construction."""
        return len(self._cube_responses) > 1

    def _iter_cubes(self):
        """Generate a Cube object for each of the cube-responses, in order.

        In a multi-cube set, a 0D first cube and a 1D second-or-later cube are
        "inflated" to add their missing rows-dimension. All other cubes are yielded
        as constructed.
        """
        for position, response in enumerate(self._cube_responses):
            is_first = position == 0
            cube = Cube(
                response,
                self._transforms_dicts[position],
                first_cube_of_tab=(self._is_multi_cube and is_first),
                population=self._population,
                mask_size=self._min_base,
            )
            # ---a 0D rows-var cube gets inflated, as does a 1D cols-var cube---
            needs_inflating = self._is_multi_cube and cube.ndim == (
                0 if is_first else 1
            )
            yield cube.inflate() if needs_inflating else cube


class Cube(object):
"""Provides access to individual slices on a cube-result.
Expand Down Expand Up @@ -56,21 +202,6 @@ def __repr__(self):
except Exception:
return super(Cube, self).__repr__()

@lazyproperty
def partitions(self):
    """Sequence of _Slice, _Strand, or _Nub objects from this cube-result.

    One partition is produced by `CubePartition.factory()` for each index in
    `self._slice_idxs`, in that order.
    """
    cube_partitions = []
    for idx in self._slice_idxs:
        partition = CubePartition.factory(
            self,
            slice_idx=idx,
            transforms=self._transforms_dict,
            population=self._population,
            ca_as_0th=self._ca_as_0th,
            mask_size=self._mask_size,
        )
        cube_partitions.append(partition)
    return tuple(cube_partitions)

@lazyproperty
def base_counts(self):
    """Raw unweighted-counts cube array, indexed by `self._valid_idxs`."""
    unweighted_counts = self._measures.unweighted_counts
    return unweighted_counts.raw_cube_array[self._valid_idxs]
Expand Down Expand Up @@ -113,6 +244,30 @@ def has_means(self):
"""True if cube includes a means measure."""
return self._measures.means is not None

def inflate(self):
    """Return new Cube object with rows-dimension added.

    A multi-cube (tabbook) response formed from a function (e.g. mean()) on
    a numeric variable arrives without a rows-dimension. The new cube is built from
    a dict in which a single-category "Mean" categorical dimension has been placed
    ahead of the existing dimensions.

    This cube's own cube-dict is left untouched: only the containers on the path to
    "dimensions" are copied (shallowly), so inflating never alters the dict this cube
    was built on and calling `.inflate()` repeatedly cannot stack extra dimensions
    onto it.
    """
    source_dict = self._cube_dict
    rows_dimension = {
        "references": {"alias": "mean", "name": "mean"},
        "type": {
            "categories": [{"id": 1, "missing": False, "name": "Mean"}],
            "class": "categorical",
        },
    }
    # ---copy-on-write along result -> dimensions; all other items are shared---
    result = dict(source_dict["result"])
    result["dimensions"] = [rows_dimension] + list(result["dimensions"])
    inflated_dict = dict(source_dict)
    inflated_dict["result"] = result
    return Cube(
        inflated_dict,
        self._transforms_dict,
        self._first_cube_of_tab,
        self._population,
        self._mask_size,
    )

@lazyproperty
def is_weighted(self):
"""True if cube response contains weighted data."""
Expand Down Expand Up @@ -140,6 +295,21 @@ def ndim(self):
"""int count of dimensions for this cube."""
return len(self.dimensions)

@lazyproperty
def partitions(self):
    """Sequence of _Slice, _Strand, or _Nub objects from this cube-result.

    One partition is produced by `CubePartition.factory()` for each index in
    `self._slice_idxs`, in that order.
    """
    cube_partitions = []
    for idx in self._slice_idxs:
        partition = CubePartition.factory(
            self,
            slice_idx=idx,
            transforms=self._transforms_dict,
            population=self._population,
            ca_as_0th=self._ca_as_0th,
            mask_size=self._mask_size,
        )
        cube_partitions.append(partition)
    return tuple(cube_partitions)

@lazyproperty
def population_fraction(self):
"""The filtered/unfiltered ratio for cube response.
Expand Down
21 changes: 21 additions & 0 deletions src/cr/cube/cubepart.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,22 @@ def ndim(self):
"""int count of dimensions for this partition."""
return len(self._dimensions)

@lazyproperty
def shape(self):
    """Tuple of int vector counts for this partition.

    Not to be confused with `numpy.ndarray.shape`: this represents the count of rows
    and columns, respectively, in this partition. It does not necessarily match the
    shape of any underlying `numpy.ndarray` object that may arise in the
    implementation of the cube partition. In particular, any count in the shape can
    be zero.

    A _Slice has a shape like (2, 3), representing (row-count, col-count). A _Strand
    has a shape like (5,), which represents its row-count. The shape of a _Nub is
    unconditionally () (an empty tuple).

    This base implementation always raises `NotImplementedError`.
    """
    message = "must be implemented by each subclass"
    raise NotImplementedError(message)


class _Slice(CubePartition):
"""2D cube partition.
Expand Down Expand Up @@ -153,6 +169,11 @@ def columns_margin(self):
def counts(self):
    """np.ndarray built from the `.values` of each row of this slice's matrix."""
    values_by_row = [row.values for row in self._matrix.rows]
    return np.array(values_by_row)

@lazyproperty
def description(self):
    """str description of this slice, taken from its rows-dimension."""
    rows_dimension = self._rows_dimension
    return rows_dimension.description

@lazyproperty
def inserted_column_idxs(self):
return tuple(
Expand Down
27 changes: 26 additions & 1 deletion src/cr/cube/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,17 +339,22 @@ def factory(cls, cube, dimensions, slice_idx):
counts = cube.counts
base_counts = cube.base_counts
counts_with_missings = cube.counts_with_missings
dimension_types = cube.dimension_types[-2:]

# For cubes with means, create one of the means-matrix types
if cube.has_means:
if cube.ndim == 3:
base_counts = base_counts[slice_idx]
counts = counts[slice_idx]
if dimension_types == (DT.MR, DT.MR):
# TODO: Potentially address this case, which didn't arise yet
raise NotImplementedError("MR x MR with means is not implemented.")
if dimension_types[1] == DT.MR:
return _CatXMrMeansMatrix(dimensions, counts, base_counts)
if dimensions[0].dimension_type == DT.MR:
return _MrXCatMeansMatrix(dimensions, counts, base_counts)
return _CatXCatMeansMatrix(dimensions, counts, base_counts)

dimension_types = cube.dimension_types[-2:]
if cube.ndim > 2:
base_counts = base_counts[slice_idx]
counts = counts[slice_idx]
Expand Down Expand Up @@ -726,6 +731,26 @@ def _zscores(self):
)


class _CatXMrMeansMatrix(_CatXMrMatrix):
    """CAT x MR matrix for a cube with a means measure.

    Carries an array of means in place of real counts. `dimensions` and `base_counts`
    are passed through to the base class unchanged; `means` is retained for building
    the row vectors.
    """

    def __init__(self, dimensions, means, base_counts):
        # ---a means matrix has no real counts, so the base class gets a placeholder
        # ---of matching shape. It is zero-filled rather than `np.empty()`, which
        # ---returns uninitialized memory and would make anything derived from counts
        # ---vary from run to run.
        counts = np.zeros(means.shape)
        super(_CatXMrMeansMatrix, self).__init__(dimensions, counts, base_counts)
        self._means = means

    @lazyproperty
    def rows(self):
        """tuple of _MeansWithMrVector, one per valid rows-dimension element."""
        return tuple(
            # ---the means are taken with [:, 0] because the values are oriented like
            # ---columns (0th is selected while 1st is other)
            _MeansWithMrVector(element, base_counts, means[:, 0])
            for element, base_counts, means in zip(
                self.rows_dimension.valid_elements, self._base_counts, self._means
            )
        )


class _MrXMrMatrix(_MatrixWithMR):
"""Represents MR x MR slices.
Expand Down

0 comments on commit 9795860

Please sign in to comment.