Skip to content

Commit

Permalink
feat: add row and population MoE
Browse files Browse the repository at this point in the history
* drive by TDD (add failing tests)
* implement functionality as separate properties
  • Loading branch information
slobodan-ilic committed Nov 11, 2020
1 parent 9745bb1 commit a2379f7
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 3 deletions.
47 changes: 47 additions & 0 deletions src/cr/cube/cubepart.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,18 @@ def population_counts(self):
self.table_proportions * self._population * self._cube.population_fraction
)

@lazyproperty
def population_moe(self):
    """2D np.float64 ndarray of population margin-of-error (MoE) for table percents.

    Computed as `Z_975 * (population * population_fraction) * table_std_err`, so
    each value is scaled by the filtered population size and reads as an estimated
    population count, analogous to the `population_counts` property. A value is
    np.nan wherever the corresponding `table_std_err` value is np.nan.
    """
    filtered_population = self._population * self._cube.population_fraction
    # Scale the critical value by the population first, then apply it per cell.
    z_scaled_population = Z_975 * filtered_population
    return z_scaled_population * self.table_std_err

@lazyproperty
def pvals(self):
    """ndarray built from the `pvals` of each row in `self._matrix.rows`."""
    per_row_pvals = [row.pvals for row in self._matrix.rows]
    return np.array(per_row_pvals)
Expand Down Expand Up @@ -526,6 +538,22 @@ def rows_dimension_type(self):
def rows_margin(self):
return np.array([row.margin for row in self._matrix.rows])

@lazyproperty
def rows_percentages_moe(self):
    """2D np.float64 ndarray of margin-of-error (MoE) for rows percentages.

    Values are on the percentage scale (3.5% is represented as 3.5, not 0.035)
    because `rows_std_err` is multiplied by 100 as well as by `Z_975`. A value is
    np.nan wherever the corresponding `rows_std_err` value is np.nan.
    """
    percent_scale = Z_975 * 100
    return percent_scale * self.rows_std_err

@lazyproperty
def rows_std_err(self):
    """2D np.float64 ndarray of standard errors for row percentages.

    Computed cell by cell as `sqrt(rows_variance / rows_margin)`.
    """
    margin = self.rows_margin
    # A 1D margin (one value per row) needs a trailing axis so it broadcasts
    # across the columns. A 2D margin is used as-is: adding an axis to it would
    # give shape (rows, 1, cols) and broadcast the result into a 3D array.
    # NOTE(review): the 2D case is reported for X_MR slices -- confirm with a test.
    if margin.ndim == 1:
        margin = margin[:, None]
    return np.sqrt(self._rows_variance / margin)

This comment has been minimized.

Copy link
@malecki

malecki Nov 19, 2020

Contributor

Why does this have to be sliced [:, None]?
We are seeing some curious behavior of row se and moe that I think could be coming from something off here.

This comment has been minimized.

Copy link
@scanny

scanny Nov 19, 2020

Contributor

The docstring doesn't mention it (because there isn't one, fixed in spike I believe), but .rows_margin can be either 1D or 2D. I believe 2D arises in the X_MR case (CAT_X_MR or MR_X_MR). I'm not seeing a test for these cases but maybe it got added in another commit. I suppose I'd be thinking there would be two cases, like do it this way when 1D and that way when 2D, but maybe the slicing takes care of that.


@lazyproperty
def scale_mean_pairwise_indices(self):
"""Sequence of column-idx tuples indicating pairwise-t result of scale-means.
Expand Down Expand Up @@ -900,6 +928,12 @@ def _columns_variance(self):
self.counts / self.columns_margin * (1 - self.counts / self.columns_margin)
)

@lazyproperty
def _rows_variance(self):
    """ndarray of variances for row percentages.

    Computed cell by cell as `p * (1 - p)` where `p = counts / rows_margin`.
    """
    margin = self.rows_margin
    # A 1D margin (one value per row) needs a trailing axis so it broadcasts
    # across the columns. A 2D margin is used as-is: adding an axis to it would
    # give shape (rows, 1, cols) and broadcast the result into a 3D array.
    # NOTE(review): the 2D case is reported for X_MR slices -- confirm with a test.
    if margin.ndim == 1:
        margin = margin[:, None]
    return self.counts / margin * (1 - self.counts / margin)

@lazyproperty
def _dimensions(self):
"""tuple of (rows_dimension, columns_dimension) Dimension objects."""
Expand Down Expand Up @@ -1006,6 +1040,19 @@ def population_counts(self):
* self._cube.population_fraction
)

@lazyproperty
def population_moe(self):
    """1D np.float64 ndarray of population margin-of-error (MoE) for table percents.

    Computed as `Z_975 * (population * population_fraction) * standard_error`, so
    each value is scaled by the filtered population size and reads as an estimated
    population count, analogous to the `population_counts` property. A value is
    np.nan wherever the corresponding `standard_error` value is np.nan.
    """
    filtered_population = self._population * self._cube.population_fraction
    # Scale the critical value by the population first, then apply it per element.
    z_scaled_population = Z_975 * filtered_population
    return z_scaled_population * self.standard_error

@lazyproperty
def row_base(self):
    """ndarray built from the `base` of each row in `self._stripe.rows`."""
    per_row_bases = [row.base for row in self._stripe.rows]
    return np.array(per_row_bases)
Expand Down
25 changes: 23 additions & 2 deletions tests/integration/test_cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,17 +477,20 @@ def test_proportions_text(self):
)

def test_std_dev_err_moe_univariate_cat_axis_none(self):
    """Std-dev, std-err and both MoE measures for a univariate categorical strand."""
    # A population of 1000 is supplied so `population_moe` comes out as ten
    # times the percentage-scale `table_percentages_moe` asserted below.
    strand = Cube(CR.UNIVARIATE_CATEGORICAL, population=1000).partitions[0]
    np.testing.assert_almost_equal(
        strand.standard_deviation, [0.47140452, 0.47140452]
    )
    np.testing.assert_almost_equal(strand.standard_error, [0.1217161, 0.1217161])
    np.testing.assert_almost_equal(
        strand.table_percentages_moe, [23.8559221, 23.8559221]
    )
    np.testing.assert_almost_equal(
        strand.population_moe, [238.55922104, 238.55922104]
    )

def test_std_dev_err_numeric(self):
strand = Cube(CR.VOTER_REGISTRATION).partitions[0]
strand = Cube(CR.VOTER_REGISTRATION, population=1000).partitions[0]
np.testing.assert_almost_equal(
strand.standard_deviation, [0.31902194, 0.30655342, 0.09949874]
)
Expand All @@ -497,6 +500,9 @@ def test_std_dev_err_numeric(self):
np.testing.assert_almost_equal(
strand.table_percentages_moe, [1.9772822, 1.9000029, 0.6166883]
)
np.testing.assert_almost_equal(
strand.population_moe, [19.77282169, 19.0000289, 6.16688276]
)

def test_std_dev_err_datetime(self):
strand = Cube(CR.SIMPLE_DATETIME).partitions[0]
Expand Down Expand Up @@ -858,13 +864,28 @@ def test_calculate_various_measures_axis_0(self):
[11.5249326, 7.2633194, 5.0491687, 6.5859452, 8.8723517, 14.7331947],
[11.5249326, 7.2633194, 5.0491687, 6.5859452, 8.8723517, 14.7331947],
]
expected_row_percentages_moe = [
[2.17593262, 3.33242829, 4.18778361, 3.71672761, 3.08030997, 1.41567652],
[2.34602515, 3.42712442, 4.29055665, 3.54381017, 2.34602515, 1.95365402],
]
expected_table_percentages_moe = [
[1.10701312, 1.75249771, 2.36182549, 1.99602358, 1.6037868, 0.71269548],
[1.19745296, 1.81024584, 2.47465565, 1.88321449, 1.19745296, 0.99024628],
]

np.testing.assert_almost_equal(slice_.table_std_dev, expected_table_std_dev)
np.testing.assert_almost_equal(slice_.table_std_err, expected_table_std_err)
np.testing.assert_almost_equal(slice_.columns_std_dev, expected_col_std_dev)
np.testing.assert_almost_equal(slice_.columns_std_err, expected_col_std_err)
np.testing.assert_almost_equal(
slice_.columns_percentages_moe, expected_col_percentages_moe
)
np.testing.assert_almost_equal(
slice_.rows_percentages_moe, expected_row_percentages_moe
)
np.testing.assert_almost_equal(
slice_.table_percentages_moe, expected_table_percentages_moe
)
np.testing.assert_almost_equal(slice_.zscores, expected_zscore)

def test_pvals(self):
Expand Down
26 changes: 25 additions & 1 deletion tests/integration/test_cubepart.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def it_provides_values_for_cat_x_cat(self):
assert slice_.variable_name == "v7"

def it_provides_values_for_cat_x_cat_pruning_hs(self):
slice_ = Cube(CR.CAT_X_CAT_PRUNING_HS).partitions[0]
slice_ = Cube(CR.CAT_X_CAT_PRUNING_HS, population=1000).partitions[0]

np.testing.assert_array_equal(
slice_.unweighted_counts,
Expand Down Expand Up @@ -207,6 +207,30 @@ def it_provides_values_for_cat_x_cat_pruning_hs(self):
[2.10138857, 2.95581821, 2.10138857, 2.10138857, 0.0, 0.0],
],
)
np.testing.assert_almost_equal(
slice_.rows_percentages_moe,
[
[12.68831687, 9.85619767, 12.04005472, 9.50403295, 0.0, 3.21895077],
[15.64864029, 17.27994745, 17.91592841, 16.23527759, 0.0, 9.22752652],
[69.23282609, 69.23282609, 0.0, 69.23282609, 0.0, 0.0],
[26.21465158, 28.33974473, 28.24934405, 22.87877357, 0.0, 22.61789555],
[19.31744591, 22.72065706, 24.58087449, 22.72065706, 0.0, 0.0],
[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
[54.13770267, 54.13770267, 54.13770267, 54.13770267, 0.0, 0.0],
],
)
np.testing.assert_almost_equal(
slice_.population_moe,
[
[94.74253424, 102.30317355, 85.14161791, 64.011466, 0.0, 21.0138857],
[54.85480609, 83.25703304, 69.10626987, 57.75633485, 0.0, 30.001713],
[21.64685834, 21.64685834, 0.0, 21.0138857, 0.0, 0.0],
[36.71419866, 54.64165404, 41.95117344, 30.43818589, 0.0, 30.001713],
[36.71419866, 66.64317283, 57.75633485, 46.23600101, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[21.0138857, 29.55818215, 21.0138857, 21.0138857, 0.0, 0.0],
],
)
np.testing.assert_almost_equal(
slice_.zscores,
[
Expand Down

0 comments on commit a2379f7

Please sign in to comment.