Skip to content

Commit

Permalink
Merge pull request #217 from Crunch-io/handle-smoothing-transform-173…
Browse files Browse the repository at this point in the history
…943536

Handle smoothing transform 173943536
  • Loading branch information
ernestoarbitrio committed Sep 21, 2020
2 parents 57de896 + 3d4be27 commit b3fbd03
Show file tree
Hide file tree
Showing 50 changed files with 8,337 additions and 2,694 deletions.
47 changes: 47 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,52 @@
# History of Changes

#### 1.11.31
- Use bases instead of margin for MR `standard_error` calculation

#### 1.11.30
- Fix `standard_error` calculation for MR types

#### 1.11.29
- Fix `standard_error` denominator for `Strand`

#### 1.11.28
- Fix collapsed `scale-mean-pairwise-indices`

#### 1.11.27
- Standard deviation and standard error for `Strand`

#### 1.11.26
- Fix `pairwise_indices()` array collapse when all values empty

#### 1.11.25
- Expose two-level pairwise-t-test

#### 1.11.24
- Bug fix for scale_median calculation

#### 1.11.23
- Expose population fraction in cube partitions

#### 1.11.22
- Additional summary measures for scale (`std_dev`, `std_error`, `median`)

#### 1.11.21
- Fix slicing for CA + single col filter

#### 1.11.20
- Fix cube title payload discrepancy

#### 1.11.19
- Fix problem where pre-ordering anchor-idx was used for locating inserted subtotal vectors
- Enable handling of filter-only multitable-template placeholders.
- New measures: table and columns standard deviation and standard error

#### 1.11.18
- Fix wrong proportions and base values when explicit order is expressed

#### 1.11.17
- Fix incorrect means values after hiding

#### 1.11.16
- New base for t_stats overlaps

Expand Down
49 changes: 3 additions & 46 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ The detailed description can be found

## Changes

### 1.12.1
- Smoothing on column percentages and column index

### 1.11.37
- PR 216: Document matrix.py classes and properties

Expand All @@ -130,51 +133,5 @@ The detailed description can be found
#### 1.11.32
- Handle hidden option for insertions

#### 1.11.31
- Use bases instead of margin for MR `standard_error` calculation

#### 1.11.30
- Fix `standard_error` calculation for MR types

#### 1.11.29
- Fix `standard_error` denominator for `Strand`

#### 1.11.28
- Fix collapsed `scale-mean-pairwise-indices`

#### 1.11.27
- Standard deviation and standard error for `Strand`

#### 1.11.26
- Fix `pairwise_indices()` array collapse when all values empty

#### 1.11.25
- Expose two-level pairwise-t-test

#### 1.11.24
- Bug fix for scale_median calculation

#### 1.11.23
- Expose population fraction in cube partitions

#### 1.11.22
- Additional summary measures for scale (`std_dev`, `std_error`, `median`)

#### 1.11.21
- Fix slicing for CA + single col filter

#### 1.11.20
- Fix cube title payload discrepancy

#### 1.11.19
- Fix problem where pre-ordering anchor-idx was used for locating inserted subtotal vectors
- Enable handling of filter-only multitable-template placeholders.
- New measures: table and columns standard deviation and standard error

#### 1.11.18
- Fix wrong proportions and base values when explicit order is expressed

#### 1.11.17
- Fix incorrect means values after hiding

For a complete list of changes see [history](https://github.com/Crunch-io/crunch-cube/blob/master/HISTORY.md).
2 changes: 1 addition & 1 deletion src/cr/cube/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@

"""Initialization module for crunch-cube package."""

__version__ = "1.11.37"
__version__ = "1.12.1"

# NOTE: We'll be switching to 2.0.0 once we throw out the old cube and slice
2 changes: 1 addition & 1 deletion src/cr/cube/cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ def _cube_dict(self):

@lazyproperty
def _is_single_filter_col_cube(self):
    """Value of the "is_single_col_cube" flag of the cube result, False if absent.

    A truthy value indicates this is a single column filter cube.
    """
    cube_result = self._cube_dict["result"]
    return cube_result.get("is_single_col_cube", False)

def _measure(self, weighted):
Expand Down
1 change: 0 additions & 1 deletion src/cr/cube/cube_slice.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,6 @@ def _array_type_std_res(self, counts, total, colsum, rowsum):
are calculated for each of them separately, and then stacked together
in the resulting array.
"""
# __import__("ipdb").set_trace()
if self.mr_dim_ind == 0:
# --This is a special case where broadcasting cannot be
# --automatically done. We need to "inflate" the single dimensional
Expand Down
18 changes: 13 additions & 5 deletions src/cr/cube/cubepart.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@ def columns_std_err(self):

@lazyproperty
def counts(self):
    """np.ndarray built by stacking the `.counts` of each row of the matrix.

    One entry per row in `self._matrix.rows`, in row order. Presumably 2D
    np.float64 of weighted counts -- depends on what each row provides; confirm
    against the matrix row classes.
    """
    counts_by_row = [row.counts for row in self._matrix.rows]
    return np.array(counts_by_row)

@lazyproperty
Expand Down Expand Up @@ -865,10 +866,12 @@ def _transform_dicts(self):
item is an empty dict (`{}`) when no transforms are specified for that
dimension.
"""
return (
self._transforms_dict.get("rows_dimension", {}),
self._transforms_dict.get("columns_dimension", {}),
)
transforms_dict = self._transforms_dict
rows_dimension_dict = transforms_dict.get("rows_dimension", {})
columns_dimension_dict = transforms_dict.get("columns_dimension", {})
if "smoothing" in transforms_dict:
columns_dimension_dict["smoothing"] = transforms_dict["smoothing"]
return (rows_dimension_dict, columns_dimension_dict)


class _Strand(CubePartition):
Expand All @@ -892,6 +895,7 @@ def bases(self):

@lazyproperty
def counts(self):
    """tuple of the `.count` of each row of this strand's stripe, in row order."""
    row_counts = []
    for row in self._stripe.rows:
        row_counts.append(row.count)
    return tuple(row_counts)

@lazyproperty
Expand Down Expand Up @@ -1177,7 +1181,11 @@ def _rows_dimension(self):
@lazyproperty
def _row_transforms_dict(self):
    """dict of transforms for the single (rows) dimension of this strand.

    Starts from the "rows_dimension" item of the transforms dict (empty when that
    item is absent). A top-level "smoothing" transform, when present, is added
    under the "smoothing" key so the rows dimension can apply it.
    """
    transforms_dict = self._transforms_dict
    # --- work on a (shallow) copy so that adding the "smoothing" item below never
    # --- writes into the transforms dict supplied by the caller.
    rows_dimension_dict = dict(transforms_dict.get("rows_dimension", {}))
    if "smoothing" in transforms_dict:
        rows_dimension_dict["smoothing"] = transforms_dict["smoothing"]
    return rows_dimension_dict

@lazyproperty
def _stripe(self):
Expand Down
157 changes: 157 additions & 0 deletions src/cr/cube/dimension.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""Provides the Dimension class."""

import sys
import warnings

if sys.version_info >= (3, 3):
from collections.abc import Sequence
Expand Down Expand Up @@ -385,6 +386,19 @@ def prune(self):
def shape(self):
return len(self.all_elements)

@lazyproperty
def smooth(self):
    """Callable that applies this dimension's smoothing to an array of values.

    When smoothing is active for this dimension (see `._show_smoothing`) this is
    the one-sided moving-average function configured with `._smoothing_window`.
    Otherwise it is a pass-through function returning its argument unchanged.
    """

    def null_smooth(values):
        return values

    if not self._show_smoothing:
        return null_smooth

    window = self._smoothing_window
    return _SingleSideMovingAvgSmoother.smoothing_function(window)

@lazyproperty
def sort(self):
"""A _BaseSort-subclass object or None, describing the applied sort method.
Expand Down Expand Up @@ -439,6 +453,49 @@ def _dimension_transforms_dict(self):
else {}
)

@lazyproperty
def _is_cat_date(self):
    """True when every non-missing category of this dimension has a "date" value.

    False when the dimension-dict type has no (or empty) "categories". Only
    meaningful when the dimension is known to be categorical (has base-type
    `categorical`).
    """
    categories = self._dimension_dict["type"].get("categories", [])
    if not categories:
        return False
    for category in categories:
        # --- missing categories do not take part in the check ---
        if category.get("missing", False):
            continue
        if not category.get("date"):
            return False
    return True

@lazyproperty
def _show_smoothing(self):
    """True when a smoothing transform is active for this dimension.

    All of the following must hold, checked in this order:

    * a non-empty "smoothing" transform is present for this dimension,
    * the dimension is a categorical date (`._is_cat_date`),
    * the transform is not switched off; its "show" item defaults to True.
    """
    smoothing = self._dimension_transforms_dict.get("smoothing")
    return bool(
        smoothing and self._is_cat_date and smoothing.get("show", True)
    )

@lazyproperty
def _smoothing_window(self):
    """Size of the moving window, or None when no smoothing transform is present.

    This is the number of observations used for calculating each smoothed value;
    every window has this same fixed size. Taken from the "window" item of the
    smoothing transform, defaulting to 3 when that item is absent.
    """
    smoothing = self._dimension_transforms_dict.get("smoothing")
    return smoothing.get("window", 3) if smoothing else None


class _BaseElements(Sequence):
"""Base class for element sequence containers."""
Expand Down Expand Up @@ -976,3 +1033,103 @@ def label(self):
def prune(self):
"""True if this subtotal should not appear when empty."""
return self._prune


class _SingleSideMovingAvgSmoother(object):
    """Create and configure smoothing function for one-sided moving average.

    `window` is the number of consecutive values averaged to produce each smoothed
    value.
    """

    def __init__(self, window):
        self._window = window

    @classmethod
    def smoothing_function(cls, window):
        """Return function that smooths a sequence of numeric values with `window`."""
        return cls(window)._smoothing_function

    @lazyproperty
    def _smoothing_function(self):
        """function that returns an array of smoothed values."""

        def smooth(values):
            """Return 1D/2D float64 ndarray of smoothed values, left-padded with NaN.

            `values` is a 1D or 2D ndarray; smoothing runs along its last axis.

            Given a series of numbers and a fixed subset size, the first element of
            the moving average is obtained by taking the average of the initial
            fixed subset of the number series. Then the subset is modified by
            `shifting forward` the values. A moving average is commonly used with
            time series data to smooth out short-term fluctuations and highlight
            longer-term trends or cycles.

            The examples below show 1D and 2D array rolling mean calculations with
            window sizes of 2 and 3, respectively.

            [window = 2]
            ------------------------------------------------------------------
                x    |  smooth(x)             x        |       smooth(x)
            ---------+--------------     -------------+-----------------------
                1    |    NaN              1  3  2  3  |  NaN  2.0   2.5   2.5
                2    |    1.5              2  3  3  2  |  NaN  2.5   3.0   2.5
                3    |    2.5              3  2  4  4  |  NaN  2.5   3.0   4.0
                4    |    3.5              4  1  5  1  |  NaN  2.5   3.0   3.0

            [window = 3]
            ------------------------------------------------------------------
                x    |  smooth(x)             x        |       smooth(x)
            ---------+--------------     -------------+-----------------------
                1    |    NaN              1  3  2  3  |  NaN  NaN   2.0   2.67
                2    |    NaN              2  3  3  2  |  NaN  NaN   2.67  2.67
                3    |    2                3  2  4  4  |  NaN  NaN   3.0   3.33
                4    |    3                4  1  5  1  |  NaN  NaN   3.33  2.33

            Each smoothed value is the average of the value at that position and the
            `window - 1` values preceding it along the last axis. The first
            `window - 1` positions have no full window and are filled with NaN, so
            the result has the same shape as `values`.

            When the window is not valid for `values` (see `._valid_window()`), a
            `UserWarning` is issued and `values` is returned unchanged.
            """
            total_period = values.shape[-1]
            if not self._valid_window(total_period):
                warnings.warn(
                    "No smoothing performed. Window (value: {}) parameter is not "
                    "valid: window must be less than equal to the total period "
                    "(value: {}) and positive".format(self._window, total_period),
                    UserWarning,
                )
                return values
            smoothed_values = self._smoother(values)
            # --- the "valid" moving average is shorter than the original values
            # --- along the last axis; pad the difference on the left with NaN so
            # --- each smoothed value lines up with the last item of its window.
            offset = total_period - smoothed_values.shape[-1]
            additional_nans = np.full(values.shape[:-1] + (offset,), np.nan)
            return np.concatenate([additional_nans, smoothed_values], axis=-1)

        return smooth

    def _smoother(self, values):
        """Return np.ndarray of moving averages of `values` along its last axis.

        `values` is a 1D or 2D ndarray. The last axis of the result has
        `values.shape[-1] - window + 1` items because only full windows produce a
        value; no NaN padding is added here.

        The moving average is computed with np.convolve
        (https://numpy.org/doc/stable/reference/generated/numpy.convolve.html),
        which returns the discrete, linear convolution of two one-dimensional
        sequences, so a 2D array is processed one row at a time.

        A moving average is a form of a convolution often used in time series
        analysis to smooth out noise in data by replacing a data point with the
        average of neighboring values in a moving window. A moving average is
        essentially a low-pass filter because it removes short-term fluctuations
        to highlight a deeper underlying trend.
        """
        w = self._window
        # --- convolving with `w` ones sums each window; dividing by `w` averages ---
        kernel = np.ones(w)
        if values.ndim == 1:
            return np.convolve(values, kernel, mode="valid") / w
        return np.array(
            [np.convolve(row, kernel, mode="valid") / w for row in values]
        )

    def _valid_window(self, total_period):
        """Return True when the window can be applied to `total_period` values.

        The window must be positive and no greater than `total_period` (the size
        of the last dimension of the values), because a moving window cannot
        cover more elements than there are to smooth.
        """
        return 0 < self._window <= total_period
Loading

0 comments on commit b3fbd03

Please sign in to comment.