Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sort by value spike #218

Closed
wants to merge 54 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
7c42cdf
fix: docstrings
scanny Nov 9, 2020
752b701
fix: suppress targeted divide-by-zero warnings
scanny Nov 9, 2020
923d603
test: add unit test for _Slice.scale_means_column
scanny Nov 9, 2020
917913a
rfctr: small Dimension implementation improvements
scanny Nov 9, 2020
47c2c6c
rfctr: condense CubeSet._cubes
scanny Aug 6, 2020
bf010ef
rfctr: remove unused Dimension.sort
scanny Aug 20, 2020
41e45ad
rfctr: rename _BaseInsertedVector
scanny Aug 20, 2020
1471efd
xfail: XFAILs for _BaseInsertedVector.insertion_id
scanny Aug 21, 2020
fcdc49a
matrix: add _BaseInsertedVector.insertion_id
scanny Aug 20, 2020
7eaefa9
dim: add _Subtotal.insertion_id
scanny Aug 20, 2020
f48a7f8
dim: add _Subtotal.insertion_id fallback
scanny Aug 20, 2020
aa65cb1
xfail: add XFAIL for PayloadOrderCollator
scanny Aug 21, 2020
4ce62d8
sort: add _BaseAnchoredCollator.display_order
scanny Aug 21, 2020
bd95cf5
sort: add _BaseAnchoredCollator._display_order
scanny Aug 21, 2020
60cf5fc
sort: add _BaseAnchoredCollator._base_element_orderings
scanny Aug 21, 2020
1066b13
sort: add PayloadOrderCollator._element_order_descriptors
scanny Aug 21, 2020
e47d57e
sort: add _BaseCollator._element_ids
scanny Aug 21, 2020
3e1fbbe
sort: add _BaseCollator._insertion_orderings
scanny Aug 21, 2020
b39a133
sort: add _BaseCollator._subtotals
scanny Aug 21, 2020
3813c07
sort: add _BaseCollator._insertion_position()
scanny Aug 21, 2020
5cac167
sort: add _BaseAnchoredCollator._element_positions_by_id
scanny Aug 21, 2020
d254dea
xfail: add XFAIL for ExplicitOrderCollator
scanny Aug 21, 2020
f8109ae
sort: add ExplicitOrderCollator._element_order_descriptors
scanny Aug 21, 2020
2ca9d6b
sort: add _BaseCollator._order_dict
scanny Aug 21, 2020
8a4d22c
xfail: add XFAIL for MarginalCollator
scanny Aug 22, 2020
e41455e
sort: add MarginalCollator.display_order()
scanny Aug 22, 2020
ecaa87e
sort: add _BaseSortByValueCollator._display_order
scanny Aug 22, 2020
a94436b
sort: add _BaseSortByValueCollator._top_subtotal_idxs
scanny Aug 22, 2020
eaf44c2
sort: add _BaseSortByValueCollator._descending
scanny Aug 22, 2020
ae5ec6b
sort: add _BaseSortByValueCollator._subtotal_idxs
scanny Aug 22, 2020
9d763df
sort: add MarginalCollator._subtotal_values
scanny Aug 22, 2020
c696187
sort: add MarginalCollator._marginal_propname
scanny Aug 23, 2020
987251d
sort: add _BaseSortByValueCollator._top_exclusion_idxs
scanny Aug 23, 2020
b22381c
sort: add _BaseSortByValueCollator._iter_exclusion_idxs()
scanny Aug 23, 2020
05101e5
sort: add _BaseSortByValueCollator._element_idxs_by_id
scanny Aug 23, 2020
1f0b47c
sort: add _BaseSortByValueCollator._body_idxs
scanny Aug 23, 2020
11a1f1f
sort: add _BaseSortByValueCollator._bottom_exclusion_idxs
scanny Aug 23, 2020
35f82a9
sort: add MarginalCollator._element_values
scanny Aug 23, 2020
ca502be
sort: add _BaseSortByValueCollator._bottom_subtotal_idxs
scanny Aug 23, 2020
3751e30
xfail: add XFAIL for OpposingElementCollator
scanny Aug 23, 2020
2be6fd1
sort: add OpposingElementCollator.display_order()
scanny Aug 23, 2020
8b8b25d
sort: add OpposingElementCollator._subtotal_values
scanny Aug 23, 2020
3f16be1
sort: add _BaseSortByValueCollator._measure_propname
scanny Aug 23, 2020
856585b
sort: add OpposingElementCollator._opposing_vector
scanny Aug 23, 2020
acbcd28
sort: add OpposingElementCollator._element_values
scanny Aug 23, 2020
ce71c55
xfail: add XFAIL for OpposingSubtotalCollator
scanny Aug 24, 2020
726f229
sort: add OpposingSubtotalCollator.display_order()
scanny Aug 24, 2020
9e1c61b
sort: add OpposingSubtotalCollator._subtotal_values
scanny Aug 24, 2020
393ddd5
sort: add OpposingSubtotalCollator._opposing_subtotal
scanny Aug 24, 2020
aec660e
sort: add OpposingSubtotalCollator._element_values
scanny Aug 24, 2020
e07b2c7
spike: ExplicitOrderCollator
scanny Aug 21, 2020
a5e574b
xfail: add XFAIL for sort-by-OPPOSING_ELEMENT
scanny Jul 30, 2020
410bdc6
xfail: XFAIL tests for sort-by-subtotal/marginal
scanny Aug 20, 2020
ebe0f2c
spike: sort-by-value
scanny Jul 31, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
652 changes: 652 additions & 0 deletions src/cr/cube/collator.py

Large diffs are not rendered by default.

39 changes: 20 additions & 19 deletions src/cr/cube/cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,26 @@ def population_fraction(self):
@lazyproperty
def _cubes(self):
"""Sequence of Cube objects containing data for this analysis."""
return tuple(self._iter_cubes())

def iter_cubes():
"""Generate a Cube object for each of cube_responses.

0D cube-responses and 1D second-and-later cubes are "inflated" to add their
missing row dimension.
"""
for idx, cube_response in enumerate(self._cube_responses):
cube = Cube(
cube_response,
cube_idx=idx if self._is_multi_cube else None,
transforms=self._transforms_dicts[idx],
population=self._population,
mask_size=self._min_base,
)
# --- numeric-mean cubes require inflation to restore their
# --- rows-dimension, others don't
yield cube.inflate() if self._is_numeric_mean else cube

return tuple(iter_cubes())

@lazyproperty
def _is_multi_cube(self):
Expand Down Expand Up @@ -166,24 +185,6 @@ def _is_numeric_mean(self):
# --- construction is low-overhead because all Cube properties are lazy.
return Cube(self._cube_responses[0]).ndim == 0

def _iter_cubes(self):
"""Generate a Cube object for each of cube_responses.

0D cube-responses and 1D second-and-later cubes are "inflated" to add their
missing row dimension.
"""
for idx, cube_response in enumerate(self._cube_responses):
cube = Cube(
cube_response,
cube_idx=idx if self._is_multi_cube else None,
transforms=self._transforms_dicts[idx],
population=self._population,
mask_size=self._min_base,
)
# --- all numeric-mean cubes require inflation to restore their
# --- rows-dimension, others don't
yield cube.inflate() if self._is_numeric_mean else cube


class Cube(object):
"""Provides access to individual slices on a cube-result.
Expand Down
16 changes: 8 additions & 8 deletions src/cr/cube/cubepart.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,18 +554,18 @@ def scale_mean_pairwise_indices_alt(self):

@lazyproperty
def scale_means_column(self):
"""1D float64 ndarray of column scale means
"""1D float64 ndarray of scale mean for each row (making a summary "column").

The calculation is based on multiply of the numeric values by the
row_proportions and divide by the rows_margin.
Each scale mean is based on the numeric values of the *columns-dimension*
elements.
"""
if np.all(np.isnan(self._columns_dimension_numeric_values)):
column_numeric_values = self._columns_dimension_numeric_values

if np.all(np.isnan(column_numeric_values)):
return None

inner = np.nansum(
self._columns_dimension_numeric_values * self.row_proportions, axis=1
)
not_a_nan_index = ~np.isnan(self._columns_dimension_numeric_values)
inner = np.nansum(column_numeric_values * self.row_proportions, axis=1)
not_a_nan_index = ~np.isnan(column_numeric_values)
denominator = np.sum(self.row_proportions[:, not_a_nan_index], axis=1)
return inner / denominator

Expand Down
179 changes: 52 additions & 127 deletions src/cr/cube/dimension.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import numpy as np

from cr.cube.enums import DIMENSION_TYPE as DT
from cr.cube.enums import COLLATION_METHOD as CM, DIMENSION_TYPE as DT
from cr.cube.util import lazyproperty


Expand Down Expand Up @@ -261,7 +261,7 @@ class Dimension(object):
def __init__(self, dimension_dict, dimension_type, dimension_transforms=None):
self._dimension_dict = dimension_dict
self._dimension_type = dimension_type
self._dimension_transforms_arg = dimension_transforms
self._dimension_transforms_dict = dimension_transforms or {}

@lazyproperty
def alias(self):
Expand Down Expand Up @@ -293,6 +293,14 @@ def apply_transforms(self, dimension_transforms):
self._dimension_dict, self._dimension_type, dimension_transforms
)

@lazyproperty
def collation_method(self):
"""Member of COLLATION_METHOD specifying ordering of dimension elements."""
method_keyword = self.order_dict.get("type")
if method_keyword is None:
return CM.PAYLOAD_ORDER
return CM(method_keyword)

@lazyproperty
def description(self):
"""str description of this dimension."""
Expand All @@ -314,29 +322,12 @@ def dimension_type(self):
return self._dimension_type

@lazyproperty
def display_order(self):
"""Sequence of int element indices specifying display order of elements.

The sequence includes only valid elements; missing elements do not appear.
Further, each index represents the document-order position of the element in the
sequence of valid elements; missing elements are skipped in the assignment of
indexes. The returned sequence is exhaustive; all valid elements are
represented.

The sequence reflects the resolved cascade of any *explicit* ordering
transforms, but does *not* reflect any *sort* transforms, which cannot be
resolved by the dimension. Use the `.sort` property to access any sort transform
that may apply.

Example with explicit-order transform:

(3, 0, 2, 1, 4)

Example with no explicit-order transform:
def element_ids(self):
"""tuple of int element-id for each valid element in this dimension.

(0, 1, 2, 3, 4)
Element-ids appear in the order defined in the cube-result.
"""
return self.valid_elements.display_order
return tuple(e.element_id for e in self.valid_elements)

@lazyproperty
def name(self):
Expand Down Expand Up @@ -373,6 +364,14 @@ def numeric_values(self):
"""
return tuple(element.numeric_value for element in self.valid_elements)

@lazyproperty
def order_dict(self):
"""dict "order": field in dimensions.transforms parsed from JSON payload.

Value is `{}` if no "order": field is present.
"""
return self._dimension_transforms_dict.get("order", {})

@lazyproperty
def prune(self):
"""True if empty elements should be automatically hidden on this dimension."""
Expand All @@ -385,35 +384,24 @@ def prune(self):
def shape(self):
return len(self.all_elements)

@lazyproperty
def sort(self):
"""A _BaseSort-subclass object or None, describing the applied sort method.

This value is None if no sort transform was specified for this dimension.
Currently that is its only possible value. The returned sort object describes
the sort method which can include sorting on the value of an opposing element or
on the margin and specify ascending or descending order.
"""
return None # pragma: no cover

@lazyproperty
def subtotals(self):
"""_Subtotals sequence object for this dimension.

Each item in the sequence is a _Subtotal object specifying a subtotal, including
its addends and anchor.
"""
# ---insertions in dimension-transforms override those on dimension itself---
insertion_dicts = self._dimension_transforms_dict.get("insertions")
if insertion_dicts is not None:
return _Subtotals(insertion_dicts, self.valid_elements, self.prune)
# --- insertions in dimension-transforms override those on dimension itself ---
if "insertions" in self._dimension_transforms_dict:
return _Subtotals(
self._dimension_transforms_dict["insertions"],
self.valid_elements,
self.prune,
)

# ---otherwise, insertions defined as default transforms apply---
view = self._dimension_dict.get("references", {}).get("view", {})
# ---view can be both None and {}, thus the edge case.---
insertion_dicts = (
[] if view is None else view.get("transform", {}).get("insertions", [])
)
# --- otherwise, insertions defined in cube as default transforms apply ---
view = self._dimension_dict.get("references", {}).get("view") or {}
insertion_dicts = view.get("transform", {}).get("insertions", [])
return _Subtotals(insertion_dicts, self.valid_elements, self.prune)

@lazyproperty
Expand All @@ -426,19 +414,6 @@ def valid_elements(self):
"""
return self.all_elements.valid_elements

@lazyproperty
def _dimension_transforms_dict(self):
"""dict complying with dimension-transforms schema for this dimension.

This value derives from the `dimension_transforms` argument passed on
construction. When that argument is not specified, this value is an empty dict.
"""
return (
self._dimension_transforms_arg
if self._dimension_transforms_arg is not None
else {}
)


class _BaseElements(Sequence):
"""Base class for element sequence containers."""
Expand Down Expand Up @@ -509,7 +484,7 @@ def __init__(self, type_dict, dimension_transforms_dict):
@lazyproperty
def valid_elements(self):
"""_ValidElements object containing only non-missing elements."""
return _ValidElements(self._elements, self._dimension_transforms_dict)
return _ValidElements(self._elements)

@lazyproperty
def _element_dicts(self):
Expand Down Expand Up @@ -570,72 +545,14 @@ class _ValidElements(_BaseElements):
directly.
"""

def __init__(self, all_elements, dimension_transforms_dict):
def __init__(self, all_elements):
self._all_elements = all_elements
self._dimension_transforms_dict = dimension_transforms_dict

@lazyproperty
def display_order(self):
"""Sequence of int element-idx reflecting order in which to display elements.

This order reflects the application of any explicit element-order transforms,
including resolution of any cascade. It does *not* reflect the results of
a *sort* transform, which can only be resolved at a higher level, where vector
values are known.
"""
return (
self._explicit_order
if self._explicit_order
else tuple(range(len(self._elements)))
)

@lazyproperty
def _elements(self):
"""tuple containing actual sequence of element objects."""
"""tuple containing valid (non-missing) element objects in payload order."""
return tuple(element for element in self._all_elements if not element.missing)

@lazyproperty
def _explicit_order(self):
"""Sequence of int element-idx or None, reflecting explicit-order transform.

This value is None if no explicit-order transform is specified. Otherwise, it is
an exhaustive collection of (valid) element offsets, in the order specified (and
in some cases implied) by the order transform.
"""
# ---get order transform if any, aborting if no explicit order transform---
order_dict = self._dimension_transforms_dict.get("order", {})
order_type = order_dict.get("type")
ordered_element_ids = order_dict.get("element_ids")
if order_type != "explicit" or not isinstance(ordered_element_ids, list):
return None

# ---list like [0, 1, 2, -1], perhaps ["0001", "0002", etc.], reflecting element
# ---ids in the order they appear in the cube result. We'll use this to map
# ---element-id to its index in the valid-elements sequence.
cube_result_order = tuple(element.element_id for element in self)
# ---this is a copy of the same, but we're going to mutate this one. This is
# ---required to implement the "no-duplicates" behavior.
remaining_element_ids = list(cube_result_order)

# ---we'll collect the results in this---
ordered_idxs = []
# ---append idx of each element mentioned by id in transform, in order. Remove
# ---each element-id from remaining as we go to keep track of dups and leftovers
for element_id in ordered_element_ids:
# ---An element-id appearing in transform but not in dimension is ignored.
# ---Also, a duplicated element-id is only used on first encounter.
if element_id not in remaining_element_ids:
continue
ordered_idxs.append(cube_result_order.index(element_id))
remaining_element_ids.remove(element_id)

# ---any remaining elements are tacked onto the end of the list in the order
# ---they originally appeared in the cube-result.
for element_id in remaining_element_ids:
ordered_idxs.append(cube_result_order.index(element_id))

return tuple(ordered_idxs)


class _Element(object):
"""A category or subvariable of a dimension.
Expand Down Expand Up @@ -854,11 +771,8 @@ def iter_for_anchor(self, anchor):

@lazyproperty
def _anchors(self):
"""List of int or str indicating element under which to insert this subtotal."""
return list(
_Subtotal(subtotal_dict, self._valid_elements, self._prune).anchor
for subtotal_dict in self._iter_valid_subtotal_dicts()
)
"""Sequence of int or str anchor for each subtotal."""
return tuple(s.anchor for s in self._subtotals)

@lazyproperty
def _element_ids(self):
Expand Down Expand Up @@ -896,18 +810,24 @@ def _iter_valid_subtotal_dicts(self):
def _subtotals(self):
"""Composed tuple storing actual sequence of _Subtotal objects."""
return tuple(
_Subtotal(subtotal_dict, self._valid_elements, self._prune)
for subtotal_dict in self._iter_valid_subtotal_dicts()
_Subtotal(subtotal_dict, self._valid_elements, self._prune, idx + 1)
for idx, subtotal_dict in enumerate(self._iter_valid_subtotal_dicts())
)


class _Subtotal(object):
"""A subtotal insertion on a cube dimension."""
"""A subtotal insertion on a cube dimension.

`fallback_insertion_id` is a fallback unique identifier for this insertion, until
real insertion-ids can be added. Its value is just the index+1 of this subtotal
within the insertions transform collection.
"""

def __init__(self, subtotal_dict, valid_elements, prune):
def __init__(self, subtotal_dict, valid_elements, prune, fallback_insertion_id):
self._subtotal_dict = subtotal_dict
self._valid_elements = valid_elements
self._prune = prune
self._fallback_insertion_id = fallback_insertion_id

@lazyproperty
def anchor(self):
Expand Down Expand Up @@ -966,6 +886,11 @@ def addend_idxs(self):
dtype=int,
)

@lazyproperty
def insertion_id(self):
"""int unique identifier of this subtotal within this dimension's insertions."""
return self._subtotal_dict.get("insertion_id", self._fallback_insertion_id)

@lazyproperty
def label(self):
"""str display name for this subtotal, suitable for use as label."""
Expand Down
12 changes: 12 additions & 0 deletions src/cr/cube/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

"""Enumerated sets related to cubes."""

from enum import Enum

from cr.cube.util import lazyproperty


Expand Down Expand Up @@ -53,3 +55,13 @@ class DIMENSION_TYPE(object):
ALLOWED_PAIRWISE_TYPES = frozenset(
(BINNED_NUMERIC, CA, CAT, CA_CAT, DATETIME, MR, TEXT)
)


class COLLATION_METHOD(Enum):
"""Enumerated values representing the methods of sorting dimension elements."""

EXPLICIT_ORDER = "explicit"
MARGINAL = "marginal"
OPPOSING_ELEMENT = "opposing_element"
OPPOSING_SUBTOTAL = "opposing_subtotal"
PAYLOAD_ORDER = "payload_order"
Loading