Skip to content

Commit

Permalink
Merge branch 'sentry-cannot-broadcast-161845317'
Browse files Browse the repository at this point in the history
This branch fixes the Sentry error from ticket 161845317 and makes some
nice refactoring improvements along the way.
  • Loading branch information
scanny committed Nov 14, 2018
2 parents 543ee6c + eb3d79e commit 55963e8
Show file tree
Hide file tree
Showing 9 changed files with 14,714 additions and 2,771 deletions.
3 changes: 1 addition & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ testpaths =
show-source = True
max-line-length = 80
ignore =
# E123: closing bracket does not match indentation of opening bracket's line
E123
W504 # line break after binary operator (e.g. 'and')

[isort]
line_length = 80
Expand Down
113 changes: 26 additions & 87 deletions src/cr/cube/crunch_cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
import warnings

import numpy as np
from scipy.stats import norm
from scipy.stats.contingency import expected_freq

from cr.cube.cube_slice import CubeSlice
from cr.cube.dimension import AllDimensions
Expand Down Expand Up @@ -72,15 +70,9 @@ def __repr__(self):
"""
try:
dimensionality = ' x '.join(dt.name for dt in self.dim_types)
slice_reprs = (
'\n'.join(
'slices[%d]: %s' % (idx, repr(s))
for idx, s in enumerate(self.slices)
)
)
return (
"%s(name='%s', dim_types='%s')\n%s" %
(type(self).__name__, self.name, dimensionality, slice_reprs)
"%s(name='%s', dim_types='%s')" %
(type(self).__name__, self.name, dimensionality)
)
except Exception:
return super(CrunchCube, self).__repr__()
Expand Down Expand Up @@ -620,26 +612,21 @@ def hs_dims_for_den(hs_dims, axis):
return res

def pvals(self, weighted=True, prune=False, hs_dims=None):
    """Return ndarray with calculated p-vals.

    This function calculates statistically significant results for
    categorical contingency tables. The values are calculated for 2D tables
    only. For 3D cubes, the slices' results are stacked together and
    returned as an ndarray.

    :param weighted: Use weighted counts for zscores
    :param prune: Prune based on unweighted counts
    :param hs_dims: Include headers and subtotals (as NaN values)
    :returns: 2 or 3 Dimensional ndarray, representing the p-values for
        each cell of the table-like representation of the crunch cube.
    """
    # ---delegate the actual calculation to each slice; a 2D cube has a
    # ---single slice, so its result is returned directly---
    res = [s.pvals(weighted, prune, hs_dims) for s in self.slices]
    return np.array(res) if self.ndim == 3 else res[0]

@lazyproperty
def row_direction_axis(self):
Expand Down Expand Up @@ -706,35 +693,20 @@ def univariate_ca_main_axis(self):
return self.dim_types.index(DT.CA_CAT)

def zscore(self, weighted=True, prune=False, hs_dims=None):
    """Return ndarray with cube's zscore measurements.

    Zscore is a measure of statistical significance of observed vs.
    expected counts. It's only applicable to 2D contingency tables.
    For 3D cubes, the measures of separate slices are stacked together
    and returned as the result.

    :param weighted: Use weighted counts for zscores
    :param prune: Prune based on unweighted counts
    :param hs_dims: Include headers and subtotals (as NaN values)
    :returns zscore: ndarray representing zscore measurements
    """
    # ---delegate the actual calculation to each slice; a 2D cube has a
    # ---single slice, so its result is returned directly---
    res = [s.zscore(weighted, prune, hs_dims) for s in self.slices]
    return np.array(res) if self.ndim == 3 else res[0]

def _adjust_axis(self, axis):
"""Return raw axis/axes corresponding to apparent axis/axes.
Expand Down Expand Up @@ -911,31 +883,6 @@ def _calculate_constraints_sum(cls, prop_table, prop_margin, axis):
# (because of the inner matrix dimensions).
return np.dot(prop_margin, V)

def _calculate_std_res(self, counts, total, colsum, rowsum, slice_):
has_mr_or_ca = set(slice_.dim_types) & DT.ARRAY_TYPES
if has_mr_or_ca:
if (not self.is_double_mr and
(self.mr_dim_ind == 0 or
self.mr_dim_ind == 1 and self.ndim == 3)):
total = total[:, np.newaxis]
rowsum = rowsum[:, np.newaxis]

expected = rowsum * colsum / total
variance = (
rowsum * colsum * (total - rowsum) * (total - colsum) /
total ** 3
)
res = (counts - expected) / np.sqrt(variance)
else:
expected_counts = expected_freq(counts)
residuals = counts - expected_counts
variance = (
np.outer(rowsum, colsum) *
np.outer(total - rowsum, total - colsum) / total ** 3
)
res = residuals / np.sqrt(variance)
return res

@lazyproperty
def _col_direction_axis(self):
return self.ndim - 2
Expand Down Expand Up @@ -1077,14 +1024,6 @@ def iter_insertions():

return [insertion for insertion in iter_insertions()]

def _intersperse_hs_in_std_res(self, hs_dims, res):
for dim, inds in enumerate(self.inserted_hs_indices()):
for i in inds:
if dim not in hs_dims:
continue
res = np.insert(res, i, np.nan, axis=(dim - self.ndim))
return res

def _is_axis_allowed(self, axis):
"""Check if axis are allowed.
Expand Down
111 changes: 106 additions & 5 deletions src/cr/cube/cube_slice.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

import numpy as np
from tabulate import tabulate
from scipy.stats import norm
from scipy.stats.contingency import expected_freq

from cr.cube.enum import DIMENSION_TYPE as DT
from cr.cube.measures.scale_means import ScaleMeans
Expand Down Expand Up @@ -174,7 +176,7 @@ def index_table(self, axis=None, baseline=None, prune=False):

indexes = proportions / baseline * 100

return self._apply_pruning_mask(indexes, prune)
return self._apply_pruning_mask(indexes) if prune else indexes

@lazyproperty
def is_double_mr(self):
Expand Down Expand Up @@ -299,16 +301,94 @@ def table_name(self):
table_name = self._cube.labels()[0][self._index]
return '%s: %s' % (title, table_name)

def _apply_pruning_mask(self, res, prune):
if not prune:
return res
def pvals(self, weighted=True, prune=False, hs_dims=None):
    """Return 2D ndarray with calculated p-vals.

    This function calculates statistically significant results for
    categorical contingency tables. The values are calculated for 2D tables
    only.

    :param weighted: Use weighted counts for zscores
    :param prune: Prune based on unweighted counts
    :param hs_dims: Include headers and subtotals (as NaN values)
    :returns: 2D ndarray, representing the p-values for each cell of the
        table-like representation of the crunch cube.
    """
    stats = self.zscore(weighted=weighted, prune=prune, hs_dims=hs_dims)
    # ---two-tailed p-value from the standard normal distribution---
    p_values = 2 * (1 - norm.cdf(np.abs(stats)))

    return self._apply_pruning_mask(p_values, hs_dims) if prune else p_values

def zscore(self, weighted=True, prune=False, hs_dims=None):
    """Return ndarray with slice's zscore measurements.

    Zscore is a measure of statistical significance of observed vs.
    expected counts. It's only applicable to 2D contingency tables.

    :param weighted: Use weighted counts for zscores
    :param prune: Prune based on unweighted counts
    :param hs_dims: Include headers and subtotals (as NaN values)
    :returns zscore: ndarray representing zscore measurements
    """
    counts = self.as_array(weighted=weighted)
    total = self.margin(weighted=weighted)
    colsum = self.margin(axis=0, weighted=weighted)
    rowsum = self.margin(axis=1, weighted=weighted)
    zscore = self._calculate_std_res(
        counts, total, colsum, rowsum,
    )

    if hs_dims:
        zscore = self._intersperse_hs_in_std_res(hs_dims, zscore)

    if prune:
        return self._apply_pruning_mask(zscore, hs_dims)

    return zscore

def _apply_pruning_mask(self, res, hs_dims=None):
    """Return *res* with this slice's pruning mask applied.

    :param res: ndarray of measurement values to be masked
    :param hs_dims: Include headers and subtotals when computing the mask
    :returns: masked ndarray, or *res* unchanged when nothing is pruned
    """
    # ---hs_dims must be forwarded so the mask's shape matches a result
    # ---that has H&S rows/cols interspersed (stale non-hs_dims call
    # ---removed; it discarded the hs_dims-aware array)---
    array = self.as_array(prune=True, include_transforms_for_dims=hs_dims)

    if not isinstance(array, np.ma.core.MaskedArray):
        return res

    return np.ma.masked_array(res, mask=array.mask)

def _array_type_std_res(self, counts, total, colsum, rowsum):
    """Return ndarray containing standard residuals for array values.

    The shape of the return value is the same as that of *counts*.

    Array variables require special processing because of the underlying
    math: the variable dimensions are mutually independent, so standard
    residuals are calculated for each of them separately and then stacked
    together in the resulting array.
    """
    if self.mr_dim_ind == 0:
        # --Broadcasting cannot be done automatically here. "Inflate" the
        # --1D margins into Nx1 columns so the subsequent multiplication
        # --happens column-wise: (rowsum * colsum) / total.
        total = total[:, np.newaxis]
        rowsum = rowsum[:, np.newaxis]

    expected = rowsum * colsum / total
    variance = (
        rowsum * colsum * (total - rowsum) * (total - colsum) / total ** 3
    )
    return (counts - expected) / np.sqrt(variance)

def _calculate_std_res(self, counts, total, colsum, rowsum):
    """Return ndarray containing standard residuals.

    The shape of the return value is the same as that of *counts*.
    """
    # ---MR/CA (array) dimensions need special-cased math---
    has_array_dim = bool(set(self.dim_types) & DT.ARRAY_TYPES)
    if has_array_dim:
        return self._array_type_std_res(counts, total, colsum, rowsum)
    return self._scalar_type_std_res(counts, total, colsum, rowsum)

def _call_cube_method(self, method, *args, **kwargs):
kwargs = self._update_args(kwargs)
result = getattr(self._cube, method)(*args, **kwargs)
Expand All @@ -318,6 +398,14 @@ def _call_cube_method(self, method, *args, **kwargs):
return result
return self._update_result(result)

def _intersperse_hs_in_std_res(self, hs_dims, res):
    """Return *res* with NaNs inserted at inserted-H&S positions.

    Only dimensions listed in *hs_dims* receive insertions; negative axis
    indexing (dim - ndim) keeps the behavior correct for stacked results.
    """
    for dim, insertion_inds in enumerate(self.inserted_hs_indices()):
        if dim not in hs_dims:
            continue
        for idx in insertion_inds:
            res = np.insert(res, idx, np.nan, axis=(dim - self.ndim))
    return res

def _prepare_index_baseline(self, axis):
# First get the margin of the opposite direction of the index axis.
# We need this in order to end up with the right shape of the
Expand Down Expand Up @@ -347,6 +435,19 @@ def _prepare_index_baseline(self, axis):
baseline = baseline / np.sum(baseline)
return baseline / np.sum(baseline, axis=0)

def _scalar_type_std_res(self, counts, total, colsum, rowsum):
    """Return ndarray containing standard residuals for category values.

    The shape of the return value is the same as that of *counts*.
    """
    residuals = counts - expected_freq(counts)
    # ---variance of each cell under the independence model---
    variance = (
        np.outer(rowsum, colsum)
        * np.outer(total - rowsum, total - colsum)
        / total ** 3
    )
    return residuals / np.sqrt(variance)

def _update_args(self, kwargs):
if self._cube.ndim < 3:
# If cube is 2D it doesn't actually have slices (itself is a slice).
Expand Down
Loading

0 comments on commit 55963e8

Please sign in to comment.