feat: add row and population MoE

* drive by TDD (add failing tests) * implement functionality as separate properties
Crunch-io · Nov 11, 2020 · a2379f7 · malecki · Nov 19, 2020 · scanny
1 parent 9745bb1
commit a2379f7
Show file tree

Hide file tree

Showing 3 changed files with 95 additions and 3 deletions.
diff --git a/src/cr/cube/cubepart.py b/src/cr/cube/cubepart.py
@@ -459,6 +459,18 @@ def population_counts(self):
             self.table_proportions * self._population * self._cube.population_fraction
         )
 
+    @lazyproperty
+    def population_moe(self):
+        """2D np.float64 ndarray of population margin-of-error (MoE) for table percents.
+
+        The values are represented as population estimates, analogous to the
+        `population_counts` property, i.e. as actual estimated counts of the population.
+        The values can be np.nan when the corresponding percentage is also np.nan, which
+        happens when the respective table margin is 0.
+        """
+        total_filtered_population = self._population * self._cube.population_fraction
+        return Z_975 * total_filtered_population * self.table_std_err
+
     @lazyproperty
     def pvals(self):
         return np.array([row.pvals for row in self._matrix.rows])
@@ -526,6 +538,22 @@ def rows_dimension_type(self):
     def rows_margin(self):
         return np.array([row.margin for row in self._matrix.rows])
 
+    @lazyproperty
+    def rows_percentages_moe(self):
+        """2D np.float64 ndarray of margin-of-error (MoE) for rows percentages.
+
+        The values are represented as percentages, analogous to the `row_percentages`
+        property. This means that a value of 3.5% is represented as 3.5 (not 0.035).
+        The values can be np.nan when the corresponding percentage is also np.nan, which
+        happens when the respective row margin is 0.
+        """
+        return Z_975 * 100 * self.rows_std_err
+
+    @lazyproperty
+    def rows_std_err(self):
+        """2D np.float64 ndarray of standard errors for row proportions."""
+        return np.sqrt(self._rows_variance / self.rows_margin[:, None])
+
     @lazyproperty
     def scale_mean_pairwise_indices(self):
         """Sequence of column-idx tuples indicating pairwise-t result of scale-means.
@@ -900,6 +928,12 @@ def _columns_variance(self):
             self.counts / self.columns_margin * (1 - self.counts / self.columns_margin)
         )
 
+    @lazyproperty
+    def _rows_variance(self):
+        """ndarray of variances for row proportions."""
+        margin = self.rows_margin[:, None]
+        return self.counts / margin * (1 - self.counts / margin)
+
     @lazyproperty
     def _dimensions(self):
         """tuple of (rows_dimension, columns_dimension) Dimension objects."""
@@ -1006,6 +1040,19 @@ def population_counts(self):
             * self._cube.population_fraction
         )
 
+    @lazyproperty
+    def population_moe(self):
+        """1D np.float64 ndarray of population margin-of-error (MoE) for table percents.
+
+        The values are represented as population estimates, analogous to the
+        `population_counts` property. This means that the values will be presented as
+        actual estimated counts of the population. The values can be np.nan when the
+        corresponding percentage is also np.nan, which happens when the respective
+        table margin is 0.
+        """
+        total_filtered_population = self._population * self._cube.population_fraction
+        return Z_975 * total_filtered_population * self.standard_error
+
     @lazyproperty
     def row_base(self):
         return np.array([row.base for row in self._stripe.rows])

diff --git a/tests/integration/test_cube.py b/tests/integration/test_cube.py
@@ -477,17 +477,20 @@ def test_proportions_text(self):
         )
 
     def test_std_dev_err_moe_univariate_cat_axis_none(self):
-        strand = Cube(CR.UNIVARIATE_CATEGORICAL).partitions[0]
+        strand = Cube(CR.UNIVARIATE_CATEGORICAL, population=1000).partitions[0]
         np.testing.assert_almost_equal(
             strand.standard_deviation, [0.47140452, 0.47140452]
         )
         np.testing.assert_almost_equal(strand.standard_error, [0.1217161, 0.1217161])
         np.testing.assert_almost_equal(
             strand.table_percentages_moe, [23.8559221, 23.8559221]
         )
+        np.testing.assert_almost_equal(
+            strand.population_moe, [238.55922104, 238.55922104]
+        )
 
     def test_std_dev_err_numeric(self):
-        strand = Cube(CR.VOTER_REGISTRATION).partitions[0]
+        strand = Cube(CR.VOTER_REGISTRATION, population=1000).partitions[0]
         np.testing.assert_almost_equal(
             strand.standard_deviation, [0.31902194, 0.30655342, 0.09949874]
         )
@@ -497,6 +500,9 @@ def test_std_dev_err_numeric(self):
         np.testing.assert_almost_equal(
             strand.table_percentages_moe, [1.9772822, 1.9000029, 0.6166883]
         )
+        np.testing.assert_almost_equal(
+            strand.population_moe, [19.77282169, 19.0000289, 6.16688276]
+        )
 
     def test_std_dev_err_datetime(self):
         strand = Cube(CR.SIMPLE_DATETIME).partitions[0]
@@ -858,13 +864,28 @@ def test_calculate_various_measures_axis_0(self):
             [11.5249326, 7.2633194, 5.0491687, 6.5859452, 8.8723517, 14.7331947],
             [11.5249326, 7.2633194, 5.0491687, 6.5859452, 8.8723517, 14.7331947],
         ]
+        expected_row_percentages_moe = [
+            [2.17593262, 3.33242829, 4.18778361, 3.71672761, 3.08030997, 1.41567652],
+            [2.34602515, 3.42712442, 4.29055665, 3.54381017, 2.34602515, 1.95365402],
+        ]
+        expected_table_percentages_moe = [
+            [1.10701312, 1.75249771, 2.36182549, 1.99602358, 1.6037868, 0.71269548],
+            [1.19745296, 1.81024584, 2.47465565, 1.88321449, 1.19745296, 0.99024628],
+        ]
+
         np.testing.assert_almost_equal(slice_.table_std_dev, expected_table_std_dev)
         np.testing.assert_almost_equal(slice_.table_std_err, expected_table_std_err)
         np.testing.assert_almost_equal(slice_.columns_std_dev, expected_col_std_dev)
         np.testing.assert_almost_equal(slice_.columns_std_err, expected_col_std_err)
         np.testing.assert_almost_equal(
             slice_.columns_percentages_moe, expected_col_percentages_moe
         )
+        np.testing.assert_almost_equal(
+            slice_.rows_percentages_moe, expected_row_percentages_moe
+        )
+        np.testing.assert_almost_equal(
+            slice_.table_percentages_moe, expected_table_percentages_moe
+        )
         np.testing.assert_almost_equal(slice_.zscores, expected_zscore)
 
     def test_pvals(self):

diff --git a/tests/integration/test_cubepart.py b/tests/integration/test_cubepart.py
@@ -53,7 +53,7 @@ def it_provides_values_for_cat_x_cat(self):
         assert slice_.variable_name == "v7"
 
     def it_provides_values_for_cat_x_cat_pruning_hs(self):
-        slice_ = Cube(CR.CAT_X_CAT_PRUNING_HS).partitions[0]
+        slice_ = Cube(CR.CAT_X_CAT_PRUNING_HS, population=1000).partitions[0]
 
         np.testing.assert_array_equal(
             slice_.unweighted_counts,
@@ -207,6 +207,30 @@ def it_provides_values_for_cat_x_cat_pruning_hs(self):
                 [2.10138857, 2.95581821, 2.10138857, 2.10138857, 0.0, 0.0],
             ],
         )
+        np.testing.assert_almost_equal(
+            slice_.rows_percentages_moe,
+            [
+                [12.68831687, 9.85619767, 12.04005472, 9.50403295, 0.0, 3.21895077],
+                [15.64864029, 17.27994745, 17.91592841, 16.23527759, 0.0, 9.22752652],
+                [69.23282609, 69.23282609, 0.0, 69.23282609, 0.0, 0.0],
+                [26.21465158, 28.33974473, 28.24934405, 22.87877357, 0.0, 22.61789555],
+                [19.31744591, 22.72065706, 24.58087449, 22.72065706, 0.0, 0.0],
+                [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
+                [54.13770267, 54.13770267, 54.13770267, 54.13770267, 0.0, 0.0],
+            ],
+        )
+        np.testing.assert_almost_equal(
+            slice_.population_moe,
+            [
+                [94.74253424, 102.30317355, 85.14161791, 64.011466, 0.0, 21.0138857],
+                [54.85480609, 83.25703304, 69.10626987, 57.75633485, 0.0, 30.001713],
+                [21.64685834, 21.64685834, 0.0, 21.0138857, 0.0, 0.0],
+                [36.71419866, 54.64165404, 41.95117344, 30.43818589, 0.0, 30.001713],
+                [36.71419866, 66.64317283, 57.75633485, 46.23600101, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                [21.0138857, 29.55818215, 21.0138857, 21.0138857, 0.0, 0.0],
+            ],
+        )
         np.testing.assert_almost_equal(
             slice_.zscores,
             [