Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Differentiated dimension-types #113

Merged
merged 41 commits into from
Nov 1, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
c38a066
rfctr: improve deprecation warning
scanny Oct 22, 2018
7e2fad6
rfctr: normalize ordering of new CubeSlice methods
scanny Oct 29, 2018
828549e
cube: make CrunchCube._all_dimensions private
scanny Oct 27, 2018
fbe683a
cube: remove dead CrunchCube.ca_dim_ind
scanny Oct 27, 2018
e060402
cube: make CrunchCube.col_direction_axis private
scanny Oct 27, 2018
8a49d08
cube: make CrunchCube.data private
scanny Oct 27, 2018
1978f4c
cube: make CrunchCube.flat_values private
scanny Oct 27, 2018
afe5478
cube: make CrunchCube.prune_indices private
scanny Oct 27, 2018
1b2afe4
cube: make .valid_indices_with_selections private
scanny Oct 27, 2018
f695725
dim: extract DIMENSION_TYPE enumeration
scanny Oct 28, 2018
10a93ad
cube: parameterize CrunchCube unit tests
scanny Oct 28, 2018
c481b8f
dim: add xfail integration test to drive TDD
scanny Oct 28, 2018
94a0ebc
dim: add sequence behaviors to _BaseDimensions
scanny Oct 27, 2018
4b9f479
dim: add AllDimensions._dimensions
scanny Oct 26, 2018
6425eb9
dim: add _DimensionFactory.iter_dimensions()
scanny Oct 26, 2018
7538608
dim: add _DimensionFactory._iter_dimensions()
scanny Oct 26, 2018
54b476e
dim: add _DimensionFactory._raw_dimensions
scanny Oct 26, 2018
f0d08d5
dim: add _RawDimension.dimension_dict
scanny Oct 26, 2018
fc065fb
dim: add _RawDimension.dimension_type
scanny Oct 27, 2018
3c76720
dim: add _RawDimension._base_type
scanny Oct 27, 2018
252466f
dim: add _RawDimension._resolve_categorical()
scanny Oct 27, 2018
e036050
dim: add _RawDimension._is_array_cat
scanny Oct 27, 2018
b8bc449
dim: add _RawDimension._has_selected_category
scanny Oct 27, 2018
7cc9814
dim: reimplement Dimension.dimension_type
scanny Oct 27, 2018
eea9c34
dim: add _RawDimension._resolve_array_type()
scanny Oct 27, 2018
6ad3053
dim: add _RawDimension._next_raw_dimension
scanny Oct 27, 2018
903dcce
dim: add _RawDimension._alias
scanny Oct 27, 2018
fc5f12d
dim: add AllDimensions.apparent_dimensions
scanny Oct 26, 2018
355cb48
dim: add _ApparentDimensions._dimensions
scanny Oct 27, 2018
caa5424
dim: integrate new AllDimensions implementation
scanny Oct 27, 2018
821a37d
cube: reimplement CrunchCube.is_univariate_ca
scanny Oct 28, 2018
b671c3d
cube: reimplement CrunchCube.univariate_ca_main_axis
scanny Oct 28, 2018
d8db2b5
test: rework CrunchCube.dimensions integration test
scanny Oct 28, 2018
5562c9b
test: rework logical-univariate integration test
scanny Oct 28, 2018
159e9d1
test: rework cat-x-logical test
scanny Oct 28, 2018
6ed25b1
rfctr: remove CrunchCube.is_mr_selections()
scanny Oct 28, 2018
34aa402
rfctr: remove Dimension.is_selections
scanny Oct 29, 2018
b3d883d
rfctr: remove CrunchCube.mr_selections_indices
scanny Oct 29, 2018
4f3461b
rfctr: remove Dimension.alias
scanny Oct 29, 2018
b374ec1
cube: make CrunchCube.dim_types immutable
scanny Oct 29, 2018
c7073c9
rfctr: improve docstrings and naming
scanny Oct 27, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions src/cr/cube/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +0,0 @@
# encoding: utf-8

'''Shared constants'''

ITEM_DIMENSION_TYPES = ('categorical_array', 'multiple_response')
398 changes: 192 additions & 206 deletions src/cr/cube/crunch_cube.py

Large diffs are not rendered by default.

129 changes: 52 additions & 77 deletions src/cr/cube/cube_slice.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np
from tabulate import tabulate

from cr.cube.enum import DIMENSION_TYPE as DT
from cr.cube.measures.scale_means import ScaleMeans
from cr.cube.util import lazyproperty, compress_pruned, memoize

Expand All @@ -26,7 +27,7 @@ class CubeSlice(object):

def __init__(self, cube, index, ca_as_0th=False):

if ca_as_0th and cube.dim_types[0] != 'categorical_array':
if ca_as_0th and cube.dim_types[0] != DT.CA_SUBVAR:
msg = (
'Cannot set CA as 0th for cube that '
'does not have CA items as the 0th dimension.'
Expand Down Expand Up @@ -78,81 +79,11 @@ def __repr__(self):
text += '\n' + body
return text

def _apply_pruning_mask(self, res, prune):
    """Return `res`, masked with this slice's pruning mask when requested.

    `res` is returned unchanged when `prune` is falsy, or when the pruned
    slice array (`self.as_array(prune=True)`) is not a masked array.
    Otherwise `res` is wrapped in a masked array that uses the mask of the
    pruned slice array.
    """
    if not prune:
        return res

    pruned = self.as_array(prune=True)
    # `np.ma.MaskedArray` is the public name of `np.ma.core.MaskedArray`.
    if not isinstance(pruned, np.ma.MaskedArray):
        return res

    return np.ma.masked_array(res, mask=pruned.mask)

def _prepare_index_baseline(self, axis):
    """Return the normalized baseline used when indexing along `axis`.

    The baseline starts as the margin taken in the direction opposite to
    `axis` (with missing values included) and is then reduced and
    normalized. Which reduction applies depends on `self.mr_dim_ind`,
    because MR-containing cubes produce margins of different shapes
    depending on the margin direction.
    """
    # The margin of the opposite direction of the index axis is needed in
    # order to end up with the right shape of numerator vs denominator.
    baseline = self.margin(axis=(1 - axis), include_missing=True)
    mr_dim_ind = self.mr_dim_ind

    # The sums below (the denominators of the percentage margins) must be
    # taken across the correct axis for each margin shape.
    if axis == mr_dim_ind:
        row_totals = np.sum(baseline, axis=1)[:, None]
        return (baseline / row_totals)[:, 0]

    if isinstance(mr_dim_ind, tuple) and axis in mr_dim_ind:
        total = np.sum(baseline, axis=(axis + 1))
        if axis == 0:
            return baseline[:, 0, 0] / total[:, 0]
        return baseline[0, :, 0] / total[0]

    if axis == 0 and mr_dim_ind is not None:
        first_column = baseline[:, 0]
        return first_column / np.sum(first_column)

    # No special MR handling applies: use the margin itself when it is at
    # most 1-D, otherwise only its first entry along the leading axis.
    if len(baseline.shape) > 1:
        baseline = baseline[0]
    baseline = baseline / np.sum(baseline)
    return baseline / np.sum(baseline, axis=0)

@lazyproperty
def ca_dim_ind(self):
    """Return the index of the CA items dimension within this slice.

    The value is derived from the parent cube's `ca_dim_ind`. `None` is
    returned when the cube has no such index, or when the cube is 3D and
    the items dimension is the 0th (slicing) dimension, which belongs to
    no single slice.

    Examples:

    * CA(items) x CAT => 0
    * CAT x CA(items) => 1
    * CAT x CA(items) x CAT => 0 (items is the 0th dimension of each
      slice)
    * CA(items) x CAT x CAT => None (the items dimension is itself used
      for slicing the cube)
    """
    cube = self._cube
    cube_index = cube.ca_dim_ind
    if cube_index is None:
        return None

    if cube.ndim != 3:
        # Not a 3D cube: the cube's index is used for the slice as-is.
        return cube_index

    # 3D cube: the 0th dimension is consumed by slicing, so any other
    # dimension shifts down by one position inside the slice.
    return None if cube_index == 0 else cube_index - 1

@lazyproperty
def ca_main_axis(self):
"""For univariate CA, the main axis is the categorical axis"""
try:
ca_ind = self.dim_types.index('categorical_array')
ca_ind = self.dim_types.index(DT.CA_SUBVAR)
return 1 - ca_ind
except ValueError:
return None
Expand All @@ -162,6 +93,11 @@ def col_dim_ind(self):
"""Return 1 if not categorical array as 0th, 0 otherwise."""
return 1 if not self.ca_as_0th else 0

@lazyproperty
def dim_types(self):
    """DIMENSION_TYPE member for each dimension of this slice, as a tuple.

    These are the trailing (at most two) entries of the parent cube's
    `dim_types`.
    """
    cube_dim_types = self._cube.dim_types
    return cube_dim_types[-2:]

@memoize
def get_shape(self, prune=False):
"""Tuple of array dimensions' lengths.
Expand Down Expand Up @@ -192,11 +128,11 @@ def has_ca(self):
"""Check if the cube slice has the CA dimension.

This is used to distinguish between slices that are considered 'normal'
(like CAT x CAT), that might be a part of te 3D cube that has 0th dim
(like CAT x CAT), that might be a part of the 3D cube that has 0th dim
as the CA items (subvars). In such a case, we still need to process
the slices 'normally', and not address the CA items constraints.
"""
return 'categorical_array' in self.dim_types
return DT.CA_SUBVAR in self.dim_types

@lazyproperty
def has_mr(self):
Expand All @@ -205,7 +141,7 @@ def has_mr(self):
This property needs to be overridden, because we don't care about the
0th dimension (and if it's an MR) in the case of a 3D cube.
"""
return 'multiple_response' in self.dim_types
return DT.MR in self.dim_types

def index_table(self, axis=None, baseline=None, prune=False):
"""Return index percentages for a given axis and baseline.
Expand Down Expand Up @@ -237,7 +173,7 @@ def is_double_mr(self):
account, since it's only the tabs dimension, and mustn't affect the
properties of the slices.
"""
return self.dim_types == ['multiple_response'] * 2
return self.dim_types == (DT.MR, DT.MR)

def labels(self, hs_dims=None, prune=False):
"""Get labels for the cube slice, and perform pruning by slice."""
Expand All @@ -260,7 +196,7 @@ def prune_dimension_labels(labels, prune_indices):
labels = [
prune_dimension_labels(dim_labels, dim_prune_inds)
for dim_labels, dim_prune_inds in
zip(labels, self.prune_indices(transforms=hs_dims))
zip(labels, self._prune_indices(transforms=hs_dims))
]
return labels

Expand Down Expand Up @@ -352,6 +288,16 @@ def table_name(self):
table_name = self._cube.labels()[0][self._index]
return '%s: %s' % (title, table_name)

def _apply_pruning_mask(self, res, prune):
    """Return `res`, masked with this slice's pruning mask when requested.

    `res` is returned unchanged when `prune` is falsy, or when the pruned
    slice array (`self.as_array(prune=True)`) is not a masked array.
    Otherwise `res` is wrapped in a masked array that uses the mask of the
    pruned slice array.
    """
    if not prune:
        return res

    pruned = self.as_array(prune=True)
    # `np.ma.MaskedArray` is the public name of `np.ma.core.MaskedArray`.
    if not isinstance(pruned, np.ma.MaskedArray):
        return res

    return np.ma.masked_array(res, mask=pruned.mask)

def _call_cube_method(self, method, *args, **kwargs):
kwargs = self._update_args(kwargs)
result = getattr(self._cube, method)(*args, **kwargs)
Expand All @@ -361,6 +307,35 @@ def _call_cube_method(self, method, *args, **kwargs):
return result
return self._update_result(result)

def _prepare_index_baseline(self, axis):
    """Return the normalized baseline used when indexing along `axis`.

    The baseline starts as the margin taken in the direction opposite to
    `axis` (with missing values included) and is then reduced and
    normalized. Which reduction applies depends on `self.mr_dim_ind`,
    because MR-containing cubes produce margins of different shapes
    depending on the margin direction.
    """
    # The margin of the opposite direction of the index axis is needed in
    # order to end up with the right shape of numerator vs denominator.
    baseline = self.margin(axis=(1 - axis), include_missing=True)
    mr_dim_ind = self.mr_dim_ind

    # The sums below (the denominators of the percentage margins) must be
    # taken across the correct axis for each margin shape.
    if axis == mr_dim_ind:
        row_totals = np.sum(baseline, axis=1)[:, None]
        return (baseline / row_totals)[:, 0]

    if isinstance(mr_dim_ind, tuple) and axis in mr_dim_ind:
        total = np.sum(baseline, axis=(axis + 1))
        if axis == 0:
            return baseline[:, 0, 0] / total[:, 0]
        return baseline[0, :, 0] / total[0]

    if axis == 0 and mr_dim_ind is not None:
        first_column = baseline[:, 0]
        return first_column / np.sum(first_column)

    # No special MR handling applies: use the margin itself when it is at
    # most 1-D, otherwise only its first entry along the leading axis.
    if len(baseline.shape) > 1:
        baseline = baseline[0]
    baseline = baseline / np.sum(baseline)
    return baseline / np.sum(baseline, axis=0)

def _update_args(self, kwargs):
if self._cube.ndim < 3:
# If cube is 2D it doesn't actually have slices (itself is a slice).
Expand Down
Loading